In [1]:
import sys

 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# setting path
sys.path.append('../src')
from hybrid_fs.hybrid_fs import MRMRFeatureSelection

In [2]:
# Read the SECOM CSV from ../data (path is relative to this notebook) and show
# the first five rows as a quick sanity check of the columns.
df = pd.read_csv(filepath_or_buffer='../data/uci-secom.csv')
df.head(n=5)

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,...,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1


In [3]:
# Label column, and the predictor columns: everything except the timestamp
# and the label itself.
target = 'Pass/Fail'
features = df.drop(columns=['Time', target]).columns.tolist()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

In [4]:
# Mean-impute missing values. The per-column means are computed once, from the
# raw training split only, and reused for both splits so the test set is filled
# with exactly the statistic the training set was filled with (no test-set
# information enters the imputation, and the means are not recomputed from an
# already-imputed frame).
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

In [5]:
# Recode the labels from {-1, 1} to {0, 1} so that 1 is the positive class.
# 0 is mapped to itself so that re-running this cell leaves already-recoded
# labels untouched; without it, Series.map would turn every 0 into NaN on a
# second execution because 0 has no entry in the mapping.
label_map = {-1: 0, 0: 0, 1: 1}
y_train = y_train.map(label_map)
y_test = y_test.map(label_map)


In [6]:
# Class counts of the recoded training labels; the imbalance seen here is what
# the class weights in the model cells compensate for.
y_train.value_counts()

0    1173
1      80
Name: Pass/Fail, dtype: int64

In [7]:
# Negative-to-positive ratio in the training labels, derived from the data
# rather than from numbers copied out of the previous cell's output, so it
# stays correct if the split or the dataset changes. The counts are converted
# to plain ints so the cell displays an ordinary Python float.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
n_negative / n_positive

14.6625

In [8]:
# Class counts of the held-out test labels, for comparison with the training split.
y_test.value_counts()

0    290
1     24
Name: Pass/Fail, dtype: int64

In [9]:
# Baseline: class-weighted logistic regression trained on every feature column.
# The positive class (label 1) is weighted 15x, roughly the ~14.7:1
# negative-to-positive ratio observed in y_train.
# NOTE(review): the features are not scaled and the solver's default iteration
# budget is used -- confirm the fit converges before trusting these numbers.
clf = LogisticRegression(class_weight={0: 1, 1: 15})
clf.fit(X_train, y_train)

# Hard labels feed the threshold-based metrics; the positive-class probability
# (second column of predict_proba) feeds ROC-AUC.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

print("Number of Features: ", len(features))

# Evaluate on the held-out test split. Precision, recall and F1 use
# scikit-learn's default average='binary', i.e. they score label 1 only.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
rocauc = roc_auc_score(y_test, y_pred_proba)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC", rocauc)



Number of Features:  590
Accuracy: 0.6496815286624203
Precision: 0.0784313725490196
Recall: 0.3333333333333333
F1 Score: 0.12698412698412698
ROC-AUC 0.555316091954023


In [11]:
# Run the project's MRMR feature selector on the training split only, asking
# for up to 30 features. The recorded run took about a minute and a half.
mrmr_selector = MRMRFeatureSelection()
selected_features = mrmr_selector.mrmr_classification(
    X_train,
    y_train,
    max_features=30,
)

  0%|          | 0/30 [00:00<?, ?it/s]

100%|██████████| 30/30 [01:33<00:00,  3.10s/it]


In [None]:
# Show the chosen column names, then how many were chosen, one per line.
print(selected_features, len(selected_features), sep="\n")

['41', '497', '477', '138', '65', '33', '510', '40', '288', '443', '137', '541', '309', '416', '64', '127', '408', '411', '28', '126', '91', '277', '184', '130', '212', '37', '413', '25', '563', '447']
30


In [None]:
# Class-weighted logistic regression restricted to the columns chosen by the
# MRMR selector. Same weighting as the all-features baseline: label 1 counts 15x.
# The selected columns are sliced once and reused for fitting and prediction.
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

clf = LogisticRegression(class_weight={0: 1, 1: 15})
clf.fit(X_train_selected, y_train)

# Hard labels feed the threshold-based metrics; the positive-class probability
# (second column of predict_proba) feeds ROC-AUC.
y_pred = clf.predict(X_test_selected)
y_pred_proba = clf.predict_proba(X_test_selected)[:, 1]

print("Number of Features: ", len(selected_features))

# Evaluate on the held-out test split. Precision, recall and F1 use
# scikit-learn's default average='binary', i.e. they score label 1 only.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
rocauc = roc_auc_score(y_test, y_pred_proba)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC", rocauc)



Number of Features:  30
Accuracy: 0.678343949044586
Precision: 0.1262135922330097
Recall: 0.5416666666666666
F1 Score: 0.20472440944881887
ROC-AUC 0.6227011494252874
