In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [2]:
# Load dataset
df = pd.read_csv("../data/train_set.csv")  # Update path if needed
X = df.drop(columns=["ID", "CLASS"])
y = df["CLASS"]

In [3]:
np.isinf(X).sum().sum()

4

In [4]:
X.replace([np.inf, -np.inf], np.nan, inplace=True)

In [5]:
from sklearn.feature_selection import VarianceThreshold

# Remove features with variance below 0.01 (adjust as needed) - almost constant ones
selector = VarianceThreshold(threshold=0.01)
X_reduced = selector.fit_transform(X)

In [6]:
# Get mask of selected features
mask = selector.get_support()
selected_columns_vt = X.columns[mask]
X_reduced_df = pd.DataFrame(X_reduced, columns=selected_columns_vt)
X_reduced_df.isnull().sum().sum()

1860

In [7]:
# check each colum with missing values
missing_per_feature = X_reduced_df.isnull().sum()
print(missing_per_feature[missing_per_feature > 0].sort_values(ascending=False))


Feature_1724    116
Feature_1725    116
Feature_1731    116
Feature_1730    116
Feature_1729    116
Feature_1728    116
Feature_1727    116
Feature_1726    116
Feature_1733    116
Feature_1722    116
Feature_1721    116
Feature_1719    116
Feature_1716    116
Feature_1715    116
Feature_1714    116
Feature_1712    116
Feature_90        2
Feature_72        2
dtype: int64


In [8]:
# Impute with mean because there are low number of inf values
X_reduced_df[['Feature_90', 'Feature_72']] = X_reduced_df[['Feature_90', 'Feature_72']].fillna(X_reduced_df[['Feature_90', 'Feature_72']].mean())

In [9]:
# check again after filling missing columns with mean values
missing_per_feature = X_reduced_df.isnull().sum()
print(missing_per_feature[missing_per_feature > 0].sort_values(ascending=False))


Feature_1712    116
Feature_1714    116
Feature_1715    116
Feature_1716    116
Feature_1719    116
Feature_1721    116
Feature_1722    116
Feature_1724    116
Feature_1725    116
Feature_1726    116
Feature_1727    116
Feature_1728    116
Feature_1729    116
Feature_1730    116
Feature_1731    116
Feature_1733    116
dtype: int64


In [10]:
cols_to_drop = missing_per_feature[missing_per_feature > 0].index # get column names of data with missing values

In [11]:
data_dropped_cols = X_reduced_df.drop(columns=cols_to_drop) #drop columns with missing values


In [12]:
# Compute correlation matrix (absolute value)
corr_matrix = data_dropped_cols.corr().abs()
# Upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop_from_corr = [column for column in upper.columns if any(upper[column] > 0.9)]
df_reduced = data_dropped_cols.drop(columns=to_drop_from_corr)
df_reduced

Unnamed: 0,Feature_1,Feature_3,Feature_7,Feature_10,Feature_11,Feature_13,Feature_14,Feature_21,Feature_27,Feature_55,...,Feature_3028,Feature_3048,Feature_3077,Feature_3109,Feature_3124,Feature_3140,Feature_3188,Feature_3204,Feature_3220,Feature_3236
0,18281.541667,9409.650391,2.803803,564.936250,179.125654,44.140704,18.149861,0.213534,-2.134874e+04,124.538689,...,69.471170,487.714844,4.194865,2.998731,47.809475,52.135694,34.822393,32.099384,28.957580,28.154838
1,20010.083333,8303.049072,2.338398,31.291507,122.447882,27.150254,14.165947,0.761767,3.672210e+05,63.580803,...,61.297834,541.331303,3.812588,2.746598,46.657519,54.141836,32.359908,30.100056,28.665010,27.934229
2,27260.125000,12189.649414,2.782842,11.965643,241.904940,47.572298,22.613842,0.164498,1.030497e+06,80.279137,...,66.123893,526.392135,4.396606,2.877551,50.655665,56.333640,33.117770,30.148227,28.948552,27.904807
3,41938.125000,17866.433594,3.060655,8.966286,226.260911,34.835854,29.188142,0.306770,1.476299e+06,72.866798,...,70.872825,524.944231,4.480287,2.747198,54.488171,58.659785,33.817017,30.898766,29.198077,27.870588
4,41274.125000,14315.041992,2.478506,34.898671,208.619270,45.788460,20.180269,0.327623,6.473812e+05,150.760040,...,63.694752,570.384010,3.966163,2.272514,53.290092,61.551127,34.674877,31.877879,31.136644,28.846909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,46787.916667,18052.070312,2.874885,23.499143,278.217541,48.569993,21.849623,0.259688,5.628597e+05,52.751225,...,77.684697,557.262952,3.286749,2.684472,60.121041,70.775633,32.748150,30.297602,28.787507,28.787507
311,8420.354167,4292.039795,2.145061,25.293867,93.618836,18.100903,13.007549,0.695202,1.754243e+05,63.232126,...,66.458778,512.079268,3.984145,2.989855,48.822271,51.780383,34.542655,32.628572,30.582970,29.625473
312,37262.750000,13950.793945,2.585819,37.474634,202.610464,38.141866,28.135131,0.272207,-3.544258e+05,-87.768852,...,68.660111,544.905633,3.626360,2.589302,49.460357,56.497616,31.404197,29.512201,26.865256,26.865256
313,25081.833333,11689.275391,2.820962,18.321132,204.320826,47.209078,25.232333,0.165380,8.314115e+04,44.233105,...,56.532470,537.040408,2.934316,2.768214,45.832172,53.025453,32.585463,31.186746,29.180584,29.180584


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_reduced)# data_dropped_cols


In [42]:

k = 200
selector = SelectKBest(score_func=f_classif, k=k)
X_selected_array = selector.fit_transform(X_scaled, y)
# Get boolean mask of selected features
mask = selector.get_support()

# Get the selected feature names from original DataFrame
selected_features = df_reduced.columns[mask]

# Convert numpy array back to DataFrame with selected columns
X_selected = pd.DataFrame(X_selected_array, columns=selected_features)

print(f"Selected {k} features:")
print(selected_features.tolist())




Selected 200 features:
['Feature_1', 'Feature_3', 'Feature_10', 'Feature_11', 'Feature_13', 'Feature_14', 'Feature_21', 'Feature_27', 'Feature_55', 'Feature_57', 'Feature_59', 'Feature_61', 'Feature_64', 'Feature_66', 'Feature_95', 'Feature_97', 'Feature_100', 'Feature_102', 'Feature_103', 'Feature_112', 'Feature_113', 'Feature_114', 'Feature_115', 'Feature_116', 'Feature_117', 'Feature_120', 'Feature_121', 'Feature_125', 'Feature_134', 'Feature_135', 'Feature_137', 'Feature_240', 'Feature_517', 'Feature_521', 'Feature_523', 'Feature_524', 'Feature_554', 'Feature_570', 'Feature_581', 'Feature_588', 'Feature_716', 'Feature_773', 'Feature_777', 'Feature_795', 'Feature_803', 'Feature_809', 'Feature_826', 'Feature_847', 'Feature_863', 'Feature_879', 'Feature_915', 'Feature_931', 'Feature_991', 'Feature_1007', 'Feature_1023', 'Feature_1034', 'Feature_1075', 'Feature_1146', 'Feature_1187', 'Feature_1195', 'Feature_1285', 'Feature_1286', 'Feature_1288', 'Feature_1291', 'Feature_1304', 'Featur

In [56]:
# Model-based selection (RF) on f_classif features
import xgboost as xgb

# sfm_f = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold="mean")

sfm_f = SelectFromModel(xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'),
        threshold='mean')
sfm_f.fit(X_selected, y)
X_train = sfm_f.transform(X_selected)

Parameters: { "use_label_encoder" } are not used.



In [57]:
# Assuming X_selected is a pandas DataFrame with column names
selected_mask = sfm_f.get_support()
selected_features = X_selected.columns[selected_mask]

print("Selected features:", len(selected_features), list(selected_features))


Selected features: 68 ['Feature_3', 'Feature_11', 'Feature_21', 'Feature_27', 'Feature_55', 'Feature_57', 'Feature_61', 'Feature_64', 'Feature_95', 'Feature_100', 'Feature_120', 'Feature_121', 'Feature_125', 'Feature_137', 'Feature_517', 'Feature_581', 'Feature_716', 'Feature_826', 'Feature_1007', 'Feature_1023', 'Feature_1075', 'Feature_1187', 'Feature_1285', 'Feature_1316', 'Feature_1351', 'Feature_1356', 'Feature_1629', 'Feature_1646', 'Feature_1674', 'Feature_1677', 'Feature_1678', 'Feature_1680', 'Feature_1681', 'Feature_1737', 'Feature_1741', 'Feature_1757', 'Feature_2142', 'Feature_2206', 'Feature_2209', 'Feature_2222', 'Feature_2262', 'Feature_2392', 'Feature_2406', 'Feature_2410', 'Feature_2429', 'Feature_2458', 'Feature_2463', 'Feature_2472', 'Feature_2473', 'Feature_2487', 'Feature_2524', 'Feature_2535', 'Feature_2545', 'Feature_2593', 'Feature_2607', 'Feature_2620', 'Feature_2623', 'Feature_2662', 'Feature_2664', 'Feature_2679', 'Feature_2718', 'Feature_2907', 'Feature_2925

In [58]:
X_train_df = pd.DataFrame(X_train, columns=selected_features)
X_train_df

Unnamed: 0,Feature_3,Feature_11,Feature_21,Feature_27,Feature_55,Feature_57,Feature_61,Feature_64,Feature_95,Feature_100,...,Feature_2664,Feature_2679,Feature_2718,Feature_2907,Feature_2925,Feature_2967,Feature_3109,Feature_3140,Feature_3188,Feature_3220
0,-0.768458,-0.428091,-0.747417,-0.390353,0.205056,-0.385229,-0.815394,-0.767716,0.013030,-0.077508,...,-0.413453,0.189342,-0.144508,-0.381207,-0.259868,-0.104992,1.594895,-0.484487,0.617847,-0.288034
1,-0.980408,-1.472364,1.498038,-0.082340,-0.392835,0.206795,-0.110663,-0.603884,1.121934,-1.139035,...,-0.413453,1.155240,-0.145574,1.695309,-0.390861,0.169802,0.664808,-0.300151,-0.350134,-0.421546
2,-0.235997,0.728602,-0.948256,0.443427,-0.229053,0.742487,0.342377,0.542939,0.360922,-0.077508,...,-0.339312,-0.804071,0.175159,0.352278,-0.578093,0.919149,1.147876,-0.098756,-0.052225,-0.292154
3,0.851293,0.440365,-0.365538,0.796807,-0.301755,0.667233,0.745080,0.313574,0.279922,-0.077508,...,-0.413453,-0.517150,-0.182116,-0.528754,-0.363280,-0.373752,0.667021,0.114984,0.222643,-0.178286
4,0.171085,0.115322,-0.280130,0.139738,0.462241,0.102453,0.543729,0.248041,-0.686907,-0.077508,...,-0.356351,0.084710,-0.003855,0.043430,-0.441582,-0.058681,-1.084025,0.380657,0.559860,0.706358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,0.886849,1.397653,-0.558378,0.072739,-0.499054,-0.003312,-0.865731,-1.029847,1.564673,-0.077508,...,6.351456,-2.602131,-0.157295,-0.545625,0.180954,-0.634662,0.435635,1.228258,-0.197520,-0.365645
311,-1.748648,-2.003532,1.225403,-0.234374,-0.396255,0.252318,-0.790225,-0.014090,0.698011,-0.077508,...,-0.413453,-0.957210,0.314899,2.843584,-0.604404,2.120941,1.562151,-0.517135,0.507884,0.453695
312,0.101320,0.004611,-0.507103,-0.654377,-1.877311,-0.626680,-0.009988,-0.112389,-1.565583,-0.077508,...,-0.117949,-0.309610,-0.563627,-0.725531,-0.031575,-0.693964,0.084565,-0.083689,-0.725815,-1.242844
313,-0.331835,0.036124,-0.944644,-0.307525,-0.582602,-0.255512,-0.966407,-1.685174,3.741769,-2.200562,...,1.574901,-1.165303,-0.973084,1.139053,-0.342652,-0.274538,0.744546,-0.402731,-0.261470,-0.186269


In [59]:
X_train_df.isnull().sum().sum()

0

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import numpy as np

# Step 1: Define pipeline
pipeline = Pipeline([
    ('kbest', SelectKBest(score_func=mutual_info_classif)),
    ('model_select', SelectFromModel(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))),
    ('clf', LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42))
])

# Step 2: Define hyperparameter grid
param_grid = {
    'kbest__k': [30, 50, 80, 100, 150, 200, 'all'],  # Choose top K features
    'model_select__estimator__n_estimators': [50, 100],
    'model_select__estimator__max_depth': [3, 5],
    'model_select__estimator__learning_rate': [0.01, 0.1],
    'clf__C': [0.01, 0.1, 1, 10]
}

# Step 3: Set up stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Step 4: Grid search with appropriate scoring
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',  # You can also try 'f1_macro' or 'f1_weighted'
    cv=cv,
    n_jobs=-1,
    return_train_score=True
)

# Step 5: Fit to your data (replace X_selected and y with actual data)
grid_search.fit(X_scaled, y)

# Step 6: Print results
print("Best hyperparameters:", grid_search.best_params_)
print(f"Best CV F1-score: {grid_search.best_score_:.4f}")

best_index = grid_search.best_index_
best_train_score = grid_search.cv_results_['mean_train_score'][best_index]
best_val_score = grid_search.cv_results_['mean_test_score'][best_index]
print(f"Train F1-score: {best_train_score:.4f}")
print(f"Validation F1-score: {best_val_score:.4f}")

# Step 7: Feature info
selected_features_mask = grid_search.best_estimator_.named_steps['model_select'].get_support()
print(f"Number of features selected by XGBoost: {np.sum(selected_features_mask)}")


Parameters: { "use_label_encoder" } are not used.



Best hyperparameters: {'clf__C': 0.01, 'kbest__k': 30, 'model_select__estimator__learning_rate': 0.01, 'model_select__estimator__max_depth': 5, 'model_select__estimator__n_estimators': 50}
Best CV F1-score: 0.6510
Train F1-score: 0.7085
Validation F1-score: 0.6510
Number of features selected by XGBoost: 14


In [85]:
# Get the fitted SelectKBest step from the pipeline
kbest = grid_search.best_estimator_.named_steps['kbest']
model_select = grid_search.best_estimator_.named_steps['model_select']
# Get boolean mask of selected features
mask_kbest = kbest.get_support()
mask_model_select = model_select.get_support()

# Get indices of selected features
selected_kbest = df_reduced.columns[mask_kbest]
selected_kbest = df_reduced[selected_kbest]
selected_kbest

selected_last = selected_kbest.columns[mask_model_select]
selected_last = selected_kbest[selected_last]
selected_last



Unnamed: 0,Feature_61,Feature_95,Feature_521,Feature_723,Feature_937,Feature_1677,Feature_1857,Feature_1860,Feature_2103,Feature_2201,Feature_2438,Feature_2464,Feature_2657,Feature_2749
0,4.0,-0.558479,56.292635,3.981212,876.179062,281.598389,32.226097,0.627324,0.469227,0.488325,8.268759,0.256198,51.0,0.62500
1,18.0,1.104781,40.097999,4.533407,468.436000,388.114365,80.189146,0.673492,0.941113,0.581282,6.774942,0.236111,20.5,0.81250
2,27.0,-0.036672,33.751130,4.762717,679.118972,1120.868408,260.135448,0.506439,0.968841,0.452132,7.995593,0.190083,36.0,0.75000
3,35.0,-0.158165,45.947029,4.917973,803.770768,922.462036,200.288068,0.479605,0.932353,0.427720,8.267753,0.294118,45.0,0.70000
4,31.0,-1.608325,59.465986,4.357734,719.658469,815.000549,143.505150,0.426656,0.917459,0.362268,7.954038,0.289256,39.0,0.75000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,3.0,1.768850,43.026335,4.654269,1075.097726,183.560440,17.742101,0.899713,1.000000,0.592529,7.930649,0.281250,33.0,1.00000
311,4.5,0.468933,38.888791,4.351076,1147.969506,563.062332,76.689680,0.717851,0.558859,0.556475,6.841871,0.244898,25.0,0.53125
312,20.0,-2.926261,89.474444,4.430717,1352.363570,475.671509,57.238233,0.691734,1.000000,0.271969,8.468504,0.382716,41.0,1.00000
313,1.0,5.034305,23.065945,3.683846,996.094482,50.296597,3.416901,0.980708,1.000000,0.897813,7.340294,0.440000,54.0,1.00000


In [82]:
df_reduced.columns[mask_kbest]

Index(['Feature_61', 'Feature_95', 'Feature_135', 'Feature_137', 'Feature_521',
       'Feature_723', 'Feature_819', 'Feature_937', 'Feature_1377',
       'Feature_1674', 'Feature_1677', 'Feature_1835', 'Feature_1857',
       'Feature_1860', 'Feature_2103', 'Feature_2156', 'Feature_2201',
       'Feature_2438', 'Feature_2463', 'Feature_2464', 'Feature_2472',
       'Feature_2480', 'Feature_2588', 'Feature_2657', 'Feature_2749',
       'Feature_2782', 'Feature_2910', 'Feature_2935', 'Feature_2988',
       'Feature_3124'],
      dtype='object')

In [86]:
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, f1_score, confusion_matrix

# # Initialize the model
# model = LogisticRegression(max_iter=5000, class_weight='balanced', random_state=42)

# # Train the model
# model.fit(X_train_df, y) #  X_train_selected

# # (Optional) Predict on training data to check performance
# y_pred = model.predict(X_train_df)
# y_proba = model.predict_proba(X_train_df)[:, 1]  # Probability for positive class

y_pred = grid_search.predict(X_scaled)#X_test_scaled
y_proba = grid_search.predict_proba(X_scaled)[:, 1]  # Probability for positive class

# Calculate metrics
accuracy = accuracy_score(y, y_pred)
roc_auc = roc_auc_score(y, y_proba)
recall = recall_score(y, y_pred)  # Sensitivity / True Positive Rate
f1 = f1_score(y, y_pred)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)

print(f"Training Accuracy: {accuracy:.4f}")
print(f"Training AUROC: {roc_auc:.4f}")
print(f"Training Sensitivity (Recall): {recall:.4f}")
print(f"Training Specificity: {specificity:.4f}")
print(f"Training F1-score: {f1:.4f}")


Training Accuracy: 0.6413
Training AUROC: 0.7014
Training Sensitivity (Recall): 0.6855
Training Specificity: 0.6126
Training F1-score: 0.6007


In [87]:
test_set = pd.read_csv("../data/test_set.csv")
test_set = test_set.drop(['ID'], axis=1)


In [88]:
# separate data into features (X) and targets (y)
X_test = test_set.drop(['CLASS'], axis=1)
y_test = test_set['CLASS']


In [89]:
X_test_reduced = X_test[df_reduced.columns]
X_test_reduced

Unnamed: 0,Feature_1,Feature_3,Feature_7,Feature_10,Feature_11,Feature_13,Feature_14,Feature_21,Feature_27,Feature_55,...,Feature_3028,Feature_3048,Feature_3077,Feature_3109,Feature_3124,Feature_3140,Feature_3188,Feature_3204,Feature_3220,Feature_3236
0,15385.458333,6357.250488,2.108410,3.165713,128.889395,17.482995,14.444362,0.836478,6.184611e+05,32.561707,...,63.316359,548.989774,4.033346,2.736062,49.823700,57.258646,35.419265,31.969241,31.210169,29.952211
1,19539.729167,8168.700928,2.326599,11.257600,137.432916,26.914001,15.199072,0.629148,2.523413e+05,33.128307,...,64.173557,550.913085,3.843136,2.735155,56.183303,63.778178,33.974920,33.372518,31.625858,31.625858
2,34867.125000,14382.916992,2.786669,100.627321,247.727673,39.253779,22.395359,0.239072,1.862830e+05,148.010773,...,76.696824,543.634711,3.557548,2.518558,53.827111,67.636049,34.608313,32.041778,31.065053,28.805248
3,46112.083333,16512.630859,2.655353,8.614636,247.103217,45.856962,21.074495,0.315604,-1.135713e+06,168.510284,...,71.907854,564.798315,3.491748,2.380053,50.038946,63.254723,31.587256,29.016363,27.601655,27.601655
4,37229.000000,12981.727539,2.407654,10.869511,215.216170,28.984571,27.545766,0.364418,-6.012861e+06,172.120468,...,67.682200,547.746810,3.755803,2.485419,51.188984,57.835219,32.217626,30.790878,28.207070,28.207070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,41560.708333,16007.148438,2.758722,26.477994,208.252251,30.902906,25.962195,0.411078,-5.680349e+05,170.233109,...,78.602217,552.780687,3.449619,2.570321,53.821157,67.802711,33.736069,32.332433,30.470757,30.470757
96,47127.500000,15617.785156,2.475250,23.197246,218.158199,35.586120,25.525543,0.398650,1.558956e+06,63.255630,...,63.055799,569.967248,3.471397,2.299036,43.433289,56.481908,33.620432,30.983333,29.421751,28.104362
97,44099.625000,17311.683594,2.867908,26.133394,286.567619,45.846735,19.225935,0.284940,1.107357e+06,28.839373,...,62.919753,552.024919,4.143399,2.716388,43.308113,52.985252,31.363993,28.836150,27.852157,27.051511
98,29193.833333,11772.097656,2.567491,16.133637,222.139596,44.890517,14.535200,0.309183,9.106550e+05,61.907932,...,63.068743,552.552193,3.494078,2.418554,47.240634,54.409788,31.971078,29.565275,28.454736,28.454736


In [90]:
X_test_scaled = scaler.transform(X_test_reduced)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_reduced.columns)
X_test_scaled


Unnamed: 0,Feature_1,Feature_3,Feature_7,Feature_10,Feature_11,Feature_13,Feature_14,Feature_21,Feature_27,Feature_55,...,Feature_3028,Feature_3048,Feature_3077,Feature_3109,Feature_3124,Feature_3140,Feature_3188,Feature_3204,Feature_3220,Feature_3236
0,-0.877826,-1.353092,-1.522036,-0.064567,-1.353681,-1.656094,-1.136455,1.804039,0.116814,-0.697078,...,-0.187381,0.190744,0.580626,0.625942,0.060841,-0.013761,0.852472,0.486426,0.739911,0.843720
1,-0.704306,-1.006140,-0.806696,-0.063082,-1.196269,-0.997195,-1.019986,0.954858,-0.173403,-0.691521,...,-0.117913,0.255317,0.220845,0.622598,0.980888,0.585291,0.284713,1.120098,0.929605,1.823290
2,-0.064094,0.184086,0.701657,-0.046688,0.835885,-0.135073,0.090567,-0.642818,-0.225767,0.435276,...,0.896980,0.010953,-0.319345,-0.176399,0.640016,0.939775,0.533694,0.519181,0.673688,0.172414
3,0.405598,0.591996,0.271133,-0.063567,0.824379,0.326260,-0.113273,-0.329356,-1.273690,0.636340,...,0.508879,0.721498,-0.443806,-0.687328,0.091980,0.537193,-0.653856,-0.846993,-0.906796,-0.532038
4,0.034559,-0.084288,-0.540956,-0.063153,0.236868,-0.852534,0.885393,-0.129424,-5.139721,0.671750,...,0.166429,0.149013,0.055654,-0.298647,0.258357,0.039218,-0.406064,-0.045683,-0.630521,-0.177694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.215491,0.495179,0.610030,-0.060290,0.108560,-0.718509,0.641011,0.061684,-0.823701,0.653238,...,1.051395,0.318019,-0.523493,0.014547,0.639155,0.955088,0.190823,0.650431,0.402488,1.147220
96,0.448011,0.420603,-0.319341,-0.060892,0.291074,-0.391315,0.573626,0.010784,0.862328,-0.396024,...,-0.208497,0.895039,-0.482300,-0.986189,-0.863664,-0.085132,0.145367,0.041223,-0.076215,-0.237809
97,0.321539,0.745040,0.968001,-0.060353,1.551501,0.325546,-0.398548,-0.454949,0.504353,-0.733588,...,-0.219522,0.292645,0.788791,0.553367,-0.881773,-0.406425,-0.741619,-0.928371,-0.792482,-0.854032
98,-0.301063,-0.315972,-0.016925,-0.062188,0.364431,0.258739,-1.122437,-0.355655,0.348431,-0.409243,...,-0.207448,0.310348,-0.439399,-0.545304,-0.312853,-0.275531,-0.502980,-0.599123,-0.517502,-0.032738


In [65]:
X_test = X_test_scaled[selected_features]
X_test

Unnamed: 0,Feature_3,Feature_11,Feature_21,Feature_27,Feature_55,Feature_57,Feature_61,Feature_64,Feature_95,Feature_100,...,Feature_2664,Feature_2679,Feature_2718,Feature_2907,Feature_2925,Feature_2967,Feature_3109,Feature_3140,Feature_3188,Feature_3220
0,-1.353092,-1.353681,1.804039,0.116814,-0.697078,0.764580,0.996770,0.100593,-0.159105,-0.077508,...,-0.413453,0.015735,0.100570,2.761211,-0.644264,3.183964,0.625942,-0.013761,0.852472,0.739911
1,-1.006140,-1.196269,0.954858,-0.173403,-0.691521,0.033455,0.719911,-0.030473,-1.475441,-0.077508,...,-0.397602,1.100181,-0.842980,1.188219,-0.377773,-0.516636,0.622598,0.585291,0.284713,0.929605
2,0.184086,0.835885,-0.642818,-0.225767,0.435276,-0.196830,-0.664380,-1.029847,0.823530,-0.077508,...,-0.413453,-0.059458,0.034505,0.159850,-0.338480,0.229701,-0.176399,0.939775,0.533694,0.673688
3,0.591996,0.824379,-0.329356,-1.273690,0.636340,-1.064160,-1.570462,-1.095380,-1.805099,-0.077508,...,-0.413453,0.817131,-0.422973,-0.583501,0.394350,-0.590317,-0.687328,0.537193,-0.653856,-0.906796
4,-0.084288,0.236868,-0.129424,-5.139721,0.671750,-5.025208,-1.822151,-0.440052,-0.583780,-0.077508,...,0.012170,0.492225,-0.491347,-0.746860,0.559565,-0.732082,-0.298647,0.039218,-0.406064,-0.630521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.495179,0.108560,0.061684,-0.823701,0.653238,-0.746948,-0.714718,-0.767716,-0.649264,-0.077508,...,1.475819,-0.491828,-0.234015,-0.661399,0.704195,-0.622559,0.014547,0.955088,0.190823,0.402488
96,0.420603,0.291074,0.010784,0.862328,-0.396024,0.605818,1.147783,0.018677,0.129439,-0.077508,...,-0.413453,1.128749,0.275485,-0.715977,-0.332803,-0.444629,-0.986189,-0.085132,0.145367,-0.076215
97,0.745040,1.551501,-0.454949,0.504353,-0.733588,0.375206,0.594067,0.542939,0.020949,-0.077508,...,-0.413453,0.171127,0.636392,-0.341959,-0.504477,0.349632,0.553367,-0.406425,-0.741619,-0.792482
98,-0.315972,0.364431,-0.355655,0.348431,-0.409243,0.551237,0.040350,-0.014090,0.430885,-0.077508,...,-0.323979,0.724065,-0.491347,0.201468,-0.475800,-0.152025,-0.545304,-0.275531,-0.502980,-0.517502


In [66]:
X_test.isnull().sum().sum()

0

In [91]:
X_test_scaled

Unnamed: 0,Feature_1,Feature_3,Feature_7,Feature_10,Feature_11,Feature_13,Feature_14,Feature_21,Feature_27,Feature_55,...,Feature_3028,Feature_3048,Feature_3077,Feature_3109,Feature_3124,Feature_3140,Feature_3188,Feature_3204,Feature_3220,Feature_3236
0,-0.877826,-1.353092,-1.522036,-0.064567,-1.353681,-1.656094,-1.136455,1.804039,0.116814,-0.697078,...,-0.187381,0.190744,0.580626,0.625942,0.060841,-0.013761,0.852472,0.486426,0.739911,0.843720
1,-0.704306,-1.006140,-0.806696,-0.063082,-1.196269,-0.997195,-1.019986,0.954858,-0.173403,-0.691521,...,-0.117913,0.255317,0.220845,0.622598,0.980888,0.585291,0.284713,1.120098,0.929605,1.823290
2,-0.064094,0.184086,0.701657,-0.046688,0.835885,-0.135073,0.090567,-0.642818,-0.225767,0.435276,...,0.896980,0.010953,-0.319345,-0.176399,0.640016,0.939775,0.533694,0.519181,0.673688,0.172414
3,0.405598,0.591996,0.271133,-0.063567,0.824379,0.326260,-0.113273,-0.329356,-1.273690,0.636340,...,0.508879,0.721498,-0.443806,-0.687328,0.091980,0.537193,-0.653856,-0.846993,-0.906796,-0.532038
4,0.034559,-0.084288,-0.540956,-0.063153,0.236868,-0.852534,0.885393,-0.129424,-5.139721,0.671750,...,0.166429,0.149013,0.055654,-0.298647,0.258357,0.039218,-0.406064,-0.045683,-0.630521,-0.177694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.215491,0.495179,0.610030,-0.060290,0.108560,-0.718509,0.641011,0.061684,-0.823701,0.653238,...,1.051395,0.318019,-0.523493,0.014547,0.639155,0.955088,0.190823,0.650431,0.402488,1.147220
96,0.448011,0.420603,-0.319341,-0.060892,0.291074,-0.391315,0.573626,0.010784,0.862328,-0.396024,...,-0.208497,0.895039,-0.482300,-0.986189,-0.863664,-0.085132,0.145367,0.041223,-0.076215,-0.237809
97,0.321539,0.745040,0.968001,-0.060353,1.551501,0.325546,-0.398548,-0.454949,0.504353,-0.733588,...,-0.219522,0.292645,0.788791,0.553367,-0.881773,-0.406425,-0.741619,-0.928371,-0.792482,-0.854032
98,-0.301063,-0.315972,-0.016925,-0.062188,0.364431,0.258739,-1.122437,-0.355655,0.348431,-0.409243,...,-0.207448,0.310348,-0.439399,-0.545304,-0.312853,-0.275531,-0.502980,-0.599123,-0.517502,-0.032738


In [92]:
# # (Optional) Predict on training data to check performance
# y_pred_test = model.predict(X_test)#X_test_scaled
# y_proba_test = model.predict_proba(X_test)[:, 1]  # Probability for positive class

y_pred_test = grid_search.predict(X_test_scaled)#X_test_scaled
y_proba_test = grid_search.predict_proba(X_test_scaled)[:, 1]  # Probability for positive class


# Calculate metrics
accuracy_test = accuracy_score(y_test, y_pred_test)
roc_auc_test = roc_auc_score(y_test, y_proba_test)
recall_test = recall_score(y_test, y_pred_test)  # Sensitivity / True Positive Rate
f1_test = f1_score(y_test, y_pred_test)
tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_pred_test).ravel()
specificity_test = tn_test / (tn_test + fp_test)

print(f"Testing Accuracy: {accuracy_test:.4f}")
print(f"Testing AUROC: {roc_auc_test:.4f}")
print(f"Testing Sensitivity (Recall): {recall_test:.4f}")
print(f"Testing Specificity: {specificity_test:.4f}")
print(f"Testing F1-score: {f1_test:.4f}")

Testing Accuracy: 0.6600
Testing AUROC: 0.6905
Testing Sensitivity (Recall): 0.5952
Testing Specificity: 0.7069
Testing F1-score: 0.5952


