In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm

from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.feature_selection import RFECV, mutual_info_classif, SelectKBest, f_classif
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.tree import ExtraTreeClassifier
from sklearn.preprocessing import LabelEncoder

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from pprint import pprint
import os

# Show every column when a DataFrame is rendered (no column truncation)
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Prefix for the results CSV file names written by the evaluation cells
experiment_name = 'multi-models'

In [2]:
# Read the train and test CSVs from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# (rows, columns) of the raw train and test frames
train.shape, test.shape

((19219, 35), (12814, 28))

In [4]:
# Got function from https://www.kaggle.com/code/thomasmeiner/ps4e3-eda-feature-engineering-model

def reformat_data(df: pd.DataFrame) -> pd.DataFrame:
    """Turn the seven binary fault columns into one ``target`` column.

    For each fault column, the rows where that column equals 1 are kept and
    labelled with the column's name. The per-fault pieces are concatenated in
    the order of the fault list, so the result is grouped by fault class and
    keeps the original index labels. Rows with no positive fault do not appear
    in the result; rows positive for several faults appear once per fault.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing all seven fault columns plus any other columns.

    Returns
    -------
    pd.DataFrame
        The non-fault columns followed by a string ``target`` column.
    """
    target_cols = [
        "Pastry", #4
        "Z_Scratch", #6
        "K_Scatch", #2
        "Stains", #5
        "Dirtiness", #1
        "Bumps", #0
        "Other_Faults", #3
    ]
    feature_cols = df.drop(target_cols, axis=1).columns.to_list()

    # One labelled slice per fault: positives only, class name as the target value
    labelled_slices = [
        df.loc[df[fault] == 1, feature_cols].assign(target=fault)
        for fault in target_cols
    ]

    return pd.concat(labelled_slices)

In [5]:
# Collapse the binary fault columns into a single `target` column, then put the
# rows back in id order (reformat_data returns them grouped by fault class).
train = reformat_data(train).sort_values(by='id', ascending=True)
train.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
0,0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,Stains
1,1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,Other_Faults
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,K_Scatch
3,3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,K_Scatch
4,4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,Other_Faults


In [6]:
# Name of the label column produced by reformat_data
TARGET = 'target'

In [7]:
# Fit the encoder on the class-name strings, then overwrite the target column
# with the integer codes. The fitted encoder is kept for use elsewhere.
label_encoder = LabelEncoder().fit(train[TARGET])
train[TARGET] = label_encoder.transform(train[TARGET])

train.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
0,0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,5
1,1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,3
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,2
3,3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,2
4,4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,3


In [8]:
# Feature matrix: every column except the label and the row identifier
X = train.drop(columns=[TARGET, 'id'])
y = train[TARGET]

# Shared stratified 10-fold splitter with a fixed seed, reused by every CV call
n_splits = 10
sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [9]:
# Candidate estimators, all seeded with random_state=5. The list order is kept
# because result tables are indexed by position in this list.
models = [LGBMClassifier(n_jobs=-1, random_state=5)] + [
    estimator_cls(random_state=5)
    for estimator_cls in (
        XGBClassifier,
        RandomForestClassifier,
        AdaBoostClassifier,
        BaggingClassifier,
        ExtraTreesClassifier,
        HistGradientBoostingClassifier,
    )
]

- Remove Correlated Features

In [10]:
# Drop one feature from every highly correlated pair (|Spearman rho| >= 0.9).
# X holds features only (TARGET and id were removed when X was built), so the
# target plays no part in this filter.
df_no_corr = X.copy()
correlation_matrix_spear = df_no_corr.corr(method='spearman').abs()

# Keep only the strict upper triangle so each pair is looked at once
upper_mask = np.triu(np.ones(correlation_matrix_spear.shape), k=1).astype(bool)
upper_spear = correlation_matrix_spear.where(upper_mask)

# A column is dropped when it reaches the threshold against any earlier column
high_corr_mask = (upper_spear >= 0.9).any(axis=0).to_numpy()
to_drop_spear = upper_spear.columns[high_corr_mask].tolist()

df_reduced_spear = df_no_corr.drop(columns=to_drop_spear)

# Names of the surviving (low-correlation) features
low_corr_feats_spear = df_reduced_spear.columns.tolist()

with open('low_corr_spear.txt', 'w') as f:
    f.write(f'{low_corr_feats_spear}\n')

# Report how many columns were removed; neither frame contains the target
print(f"Dropped {len(to_drop_spear)} highly correlated features.\nOld Shape of the dataset was {df_no_corr.shape}\nNew shape of the dataset is {df_reduced_spear.shape}")

Dropped 9 highly correlated features.
Old Shape of the dataset was (18422, 27)
New shape of the dataset is (18422, 18)


- Feature Importances

In [19]:
# Per model: average feature_importances_ over the CV folds, save the ranking
# to CSV, and keep the features whose average importance is above zero.
# Models without feature_importances_ fall back to the full column list.
feat_importance_features = {}

for model in models:
    model_name = model.__class__.__name__

    try:
        # Running sum of the per-fold importances
        feature_importances = np.zeros(df_reduced_spear.shape[1])

        for train_index, test_index in sk10.split(df_reduced_spear, y):
            X_train = df_reduced_spear.iloc[train_index]
            X_test = df_reduced_spear.iloc[test_index]
            y_train = y.iloc[train_index]
            y_test = y.iloc[test_index]

            model.fit(X_train, y_train)

            # Add this fold's importances to the total
            feature_importances += model.feature_importances_

        feature_importances /= n_splits

        feature_importances_dict = dict(zip(df_reduced_spear.columns, feature_importances))

        # Two-column table (feature name, averaged importance), highest first
        df = pd.DataFrame(
            list(feature_importances_dict.items()),
            columns=['Feature', 'Avg_Feat_Importance'],
        )
        df.sort_values(by='Avg_Feat_Importance', ascending=False, inplace=True)

        # Save to CSV
        df.to_csv(f'{model_name}_feature_importances.csv')

        fi_threshold = 0
        above_threshold = df['Avg_Feat_Importance'] > fi_threshold
        fi_feats = df.loc[above_threshold, 'Feature'].tolist()

        feat_importance_features[model_name] = fi_feats
        print(f'Done with {model_name}')

    except AttributeError:
        # The attribute is only looked up after the first fold has been fitted
        feat_importance_features[model_name] = list(df_reduced_spear.columns)
        print(f'{model_name} does not have feature_importances_')

Done with LGBMClassifier
Done with XGBClassifier
Done with RandomForestClassifier
Done with AdaBoostClassifier
BaggingClassifier does not have feature_importances_
Done with ExtraTreesClassifier
HistGradientBoostingClassifier does not have feature_importances_


In [20]:
# Save the per-model feature-importance selections as a pretty-printed dict
with open('featimp_features.txt', 'w') as f:
    pprint(feat_importance_features, f)

- SelectKBest with ANOVA F-test (f_classif)

In [16]:
# For every model, sweep k = 1..n_features with SelectKBest(f_classif) and keep
# the k whose selected columns give the best mean cross-validated ROC AUC (OvR).
# NOTE(review): the selector is fitted on all rows before cross-validation, so
# held-out folds influence which columns are chosen - confirm this is acceptable.
best_features_list = []
kbest_features = {}

for model in models:
    model_name = model.__class__.__name__

    # Candidate pool: the correlation-filtered columns.
    # Alternative pool: feat_importance_features[model_name]
    features = list(df_reduced_spear.columns)

    # No candidate features for this model: move on to the next one
    if not features:
        continue

    X_kbest = X[features]
    best_k, best_score, best_features = 0, 0, []

    for k in range(1, len(features) + 1):
        print(f'currently running {k} features on {model_name}')

        # Keep the k highest-scoring columns and remember their names
        selector = SelectKBest(f_classif, k=k)
        X_new = selector.fit_transform(X_kbest, y)
        selected_features = X_kbest.columns[selector.get_support()]

        # Score this subset with the shared CV splitter
        roc_auc_scores = cross_validate(model, X_new, y, cv=sk10, scoring='roc_auc_ovr', n_jobs=-1)
        mean_roc_auc_scores = roc_auc_scores['test_score'].mean()

        # Strict '>' means the smallest k wins on ties
        if mean_roc_auc_scores > best_score:
            best_k, best_score = k, mean_roc_auc_scores
            best_features = list(selected_features)

    best_features_list.append({
        'k': best_k,
        'Selected Features': best_features,
        'ROC AUC Score': best_score,
        'Model Name': model_name,
    })

    kbest_features[model_name] = best_features

# One row per model, best score first
best_features_df = pd.DataFrame(best_features_list).sort_values(by='ROC AUC Score', ascending=False)

currently running 1 features on LGBMClassifier
currently running 2 features on LGBMClassifier
currently running 3 features on LGBMClassifier
currently running 4 features on LGBMClassifier
currently running 5 features on LGBMClassifier
currently running 6 features on LGBMClassifier
currently running 7 features on LGBMClassifier
currently running 8 features on LGBMClassifier
currently running 9 features on LGBMClassifier
currently running 10 features on LGBMClassifier
currently running 11 features on LGBMClassifier
currently running 12 features on LGBMClassifier
currently running 13 features on LGBMClassifier
currently running 14 features on LGBMClassifier
currently running 15 features on LGBMClassifier
currently running 16 features on LGBMClassifier
currently running 17 features on LGBMClassifier
currently running 18 features on LGBMClassifier
currently running 1 features on XGBClassifier
currently running 2 features on XGBClassifier
currently running 3 features on XGBClassifier
current

In [17]:
# Save the per-model SelectKBest selections as a pretty-printed dict
with open('kbest_features.txt', 'w') as f:
    pprint(kbest_features, f)

In [18]:
# Best k, selected features and CV ROC AUC per model, highest score first
best_features_df

Unnamed: 0,k,Selected Features,ROC AUC Score,Model Name
6,14,"[X_Minimum, Pixels_Areas, X_Perimeter, Minimum...",0.887207,HistGradientBoostingClassifier
0,16,"[X_Minimum, Pixels_Areas, X_Perimeter, Minimum...",0.887172,LGBMClassifier
1,12,"[X_Minimum, Pixels_Areas, X_Perimeter, Minimum...",0.882923,XGBClassifier
2,17,"[X_Minimum, Pixels_Areas, X_Perimeter, Minimum...",0.879708,RandomForestClassifier
5,18,"[X_Minimum, Y_Minimum, Pixels_Areas, X_Perimet...",0.873584,ExtraTreesClassifier
4,13,"[X_Minimum, Pixels_Areas, X_Perimeter, Minimum...",0.825518,BaggingClassifier
3,16,"[X_Minimum, Pixels_Areas, X_Perimeter, Minimum...",0.768518,AdaBoostClassifier


- RFECV

In [28]:
# Recursive feature elimination with CV, starting from each model's
# SelectKBest subset. Maps model class name -> features RFECV kept.
rfecv_features = {}

for alg in models:
    MLA_name = alg.__class__.__name__

    features = kbest_features[MLA_name]

    # No features were selected for this model: move on to the next one
    if not features:
        continue

    X_rfecv = X[features]

    try:
        print(f'Starting with {MLA_name}')

        # Remove one feature per step, scoring each subset with the shared CV splitter
        selector = RFECV(alg, cv=sk10, step=1, scoring='roc_auc_ovr', verbose=2)
        selector.fit(X_rfecv, y)

        selected_features = list(X_rfecv.columns[selector.support_])
        rfecv_features[MLA_name] = selected_features

        print(f'Done with {MLA_name}', end='\n\n')

    except ValueError:
        # On ValueError, keep the incoming feature list unchanged for this model
        rfecv_features[MLA_name] = features
        print(f'{MLA_name} does not have coef_ or feature_importances_', end='\n\n')

Starting with LGBMClassifier
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting esti

In [29]:
# Save the per-model RFECV selections as a pretty-printed dict
with open('rfecv_features.txt', 'w') as f:
    pprint(rfecv_features, f)

- SFS

In [31]:
# Backward sequential feature selection, starting from each model's RFECV
# subset. Maps model class name -> features SFS kept.
sfs_features = {}

for alg in models:
    MLA_name = alg.__class__.__name__

    try:
        # Alternative starting pool: feat_importance_features[MLA_name]
        features = rfecv_features[MLA_name]

        # No features were selected for this model: move on to the next one
        if not features:
            continue

        X_sfs = X[features]

        print(f'Running backward feature selection with {MLA_name}')

        # forward=False + k_features='best': eliminate features one at a time
        # and keep the best-scoring subset
        sfs = SFS(
            alg,
            k_features='best',
            forward=False,
            floating=False,
            scoring='roc_auc_ovr',
            verbose=2,
            n_jobs=-1,
            cv=sk10,
        )
        sfs.fit(X_sfs, y)

        # Translate the selected column positions back to column names
        selected_sfs_idx = list(sfs.k_feature_idx_)
        selected_sfs_feats = X_sfs.columns[selected_sfs_idx]
        sfs_features[MLA_name] = list(selected_sfs_feats)

        print(f'Done with {MLA_name}', end='\n\n')

    except KeyError:
        print(f'{MLA_name} not in the dictionary.')

# LGBM 12 features - 0.8874701640374246

Running backward feature selection with LGBMClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   49.0s remaining:   42.8s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.5min finished

[2024-03-02 00:25:20] Features: 14/1 -- score: 0.8870751097766492[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  14 | elapsed:   46.4s remaining:   46.4s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:  1.4min finished

[2024-03-02 00:26:44] Features: 13/1 -- score: 0.887115235751871[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:   44.2s remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:  1.2min finished

[2024-03-02 00:27:58] Features: 12/1 -- score: 0.8873020849825289[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 

Done with LGBMClassifier

Running backward feature selection with XGBClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:  4.3min remaining:  8.6min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  7.0min finished

[2024-03-02 00:41:58] Features: 11/1 -- score: 0.883421611982761[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:  3.9min remaining: 17.5min
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:  4.1min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:  6.2min finished

[2024-03-02 00:48:07] Features: 10/1 -- score: 0.8833993781694703[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  3.9min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.5min finished

[2024-03-02 00:53:40] Features: 9/1 -- score: 0.8822716646499149[Parallel(n_jobs=-1)]: Using backend Lo

Done with XGBClassifier

Running backward feature selection with RandomForestClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  17 | elapsed:  3.7min remaining:  2.0min
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:  4.4min finished

[2024-03-02 01:15:26] Features: 16/1 -- score: 0.8797427048587174[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  16 | elapsed:  3.0min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  3.0min finished

[2024-03-02 01:18:25] Features: 15/1 -- score: 0.8804682219369477[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.6min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.9min finished

[2024-03-02 01:21:16] Features: 14/1 -- score: 0.8803470642794707[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  14

Done with RandomForestClassifier

Running backward feature selection with AdaBoostClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  14 | elapsed:   24.5s remaining:   24.5s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:   43.5s finished

[2024-03-02 01:39:43] Features: 13/1 -- score: 0.7746782691501986[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:   22.3s remaining:   35.8s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   39.1s finished

[2024-03-02 01:40:23] Features: 12/1 -- score: 0.777839897389148[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:   20.9s remaining:   41.9s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   36.1s finished

[2024-03-02 01:40:59] Features: 11/1 -- score: 0.7885161951538457[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 

Done with AdaBoostClassifier

Running backward feature selection with BaggingClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:   26.2s remaining:   42.0s
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:   46.0s finished

[2024-03-02 01:44:36] Features: 12/1 -- score: 0.8267622282380757[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:   24.0s remaining:   48.2s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:   39.5s finished

[2024-03-02 01:45:16] Features: 11/1 -- score: 0.8282774221645305[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:   22.0s remaining:  1.7min
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:   24.2s remaining:    9.0s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:   36.3s finished

[2024-03-02 01:45:53] Features: 10/1 -- score: 0.8265954353197271[Parallel(n_jobs=-1)]: Using backend 

Done with BaggingClassifier

Running backward feature selection with ExtraTreesClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  17 | elapsed:  2.4min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed:  2.8min finished

[2024-03-02 01:51:06] Features: 16/1 -- score: 0.8768819908862158[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  16 | elapsed:  2.1min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  2.3min finished

[2024-03-02 01:53:21] Features: 15/1 -- score: 0.8754591346614913[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.2min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.1min finished

[2024-03-02 01:55:28] Features: 14/1 -- score: 0.8753157290320731[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  14

Done with ExtraTreesClassifier

Running backward feature selection with HistGradientBoostingClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  14 | elapsed:   53.0s remaining:   53.0s
[Parallel(n_jobs=-1)]: Done  14 out of  14 | elapsed:  1.6min finished

[2024-03-02 02:12:32] Features: 13/1 -- score: 0.8870079776406999[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  13 | elapsed:   50.0s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  13 out of  13 | elapsed:  1.4min finished

[2024-03-02 02:13:58] Features: 12/1 -- score: 0.887019597723242[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed:   47.9s remaining:  1.6min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.3min finished

[2024-03-02 02:15:16] Features: 11/1 -- score: 0.8877296602336584[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 

Done with HistGradientBoostingClassifier



[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   12.2s finished

[2024-03-02 02:21:26] Features: 1/1 -- score: 0.7665165683344928

In [32]:
# Save the per-model SFS selections as a pretty-printed dict
with open('sfs_features.txt', 'w') as f:
    pprint(sfs_features, f)

In [21]:
def evaluate_models_roc(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank by test ROC AUC.

    Parameters
    ----------
    models : iterable of estimators
        Each estimator must provide ``get_params()``; its class name is the key
        used to look up its feature list.
    X : pd.DataFrame
        Feature frame; columns are selected with ``X[features]``.
    y : array-like
        Target passed to ``cross_validate``.
    important_features : dict
        Maps estimator class name to a list of column names. An estimator whose
        name is missing or whose list is empty is not fitted; it is reported
        with zero scores instead.
    cv_split : cross-validation splitter
        Passed to ``cross_validate`` as ``cv``.
    experiment_name : str
        Results are written to ``f'{experiment_name}_results.csv'``.

    Returns
    -------
    pd.DataFrame
        One row per model, sorted by 'MLA Test ROC AUC' (descending). The
        columns are always the six listed in ``result_columns``, even when
        ``models`` is empty or every model is skipped.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Test ROC AUC Std',
                      'MLA Time']

    def evaluate_model(alg):
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report zeros under the SAME keys as a real run so
        # the row lines up with the result columns (and stays sortable).
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': 0,
                'MLA Test ROC AUC': 0,
                'MLA Test ROC AUC Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='roc_auc_ovr',
                                    return_train_score=True,
                                    n_jobs=-1)

        # Mean fit time per fold, formatted as minutes and seconds
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': cv_results['train_score'].mean(),
            'MLA Test ROC AUC': cv_results['test_score'].mean(),
            'MLA Test ROC AUC Std': cv_results['test_score'].std(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    results_list = []

    # Models are evaluated concurrently; results are collected in submission
    # order, so row positions match the order of `models`.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        for future in futures:
            results_list.append(future.result())

    # Explicit columns keep the schema fixed even when there are no rows
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=False, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [22]:
# Baseline: every model is evaluated on all columns of X
baseline_features = {
    model.__class__.__name__: list(X.columns)
    for model in models
}

In [23]:
# Cross-validated ROC AUC for each model on the full feature set
baseline_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=baseline_features,
    cv_split=sk10,
    experiment_name=experiment_name,
)
baseline_models

Done with LGBMClassifier.
Done with BaggingClassifier.
Done with AdaBoostClassifier.
Done with RandomForestClassifier.
Done with ExtraTreesClassifier.
Done with HistGradientBoostingClassifier.
Done with XGBClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.960699,0.886802,0.002795,0 min 7.20 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.984409,0.886012,0.002457,0 min 7.73 sec
1,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.994039,0.88199,0.003418,1 min 2.28 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.877453,0.002847,0 min 16.49 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.872837,0.002675,0 min 8.35 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.999629,0.823856,0.005831,0 min 7.69 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.759459,0.756858,0.012197,0 min 3.93 sec


In [24]:
# Every model is evaluated on the correlation-filtered columns
no_corr_features = {
    model.__class__.__name__: list(df_reduced_spear.columns)
    for model in models
}

In [25]:
# Cross-validated ROC AUC for each model after dropping correlated features
no_corr_models = evaluate_models_roc(
    models=models,
    X=df_reduced_spear,
    y=y,
    important_features=no_corr_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_corr',
)
no_corr_models

Done with RandomForestClassifier.
Done with AdaBoostClassifier.
Done with HistGradientBoostingClassifier.
Done with ExtraTreesClassifier.
Done with XGBClassifier.
Done with LGBMClassifier.
Done with BaggingClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.958243,0.887165,0.002962,0 min 5.79 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.982443,0.885717,0.002586,0 min 5.66 sec
1,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.992867,0.882526,0.002277,0 min 43.38 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.879269,0.001812,0 min 12.82 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.873584,0.00303,0 min 7.14 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.999626,0.82422,0.003279,0 min 4.76 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.765191,0.761636,0.01379,0 min 2.89 sec


In [26]:
# Seed the global NumPy and stdlib RNGs before evaluating.
# NOTE(review): the estimators and the splitter already carry fixed
# random_state values - confirm these global seeds are still needed.
np.random.seed(42)
random.seed(42)

# Cross-validated ROC AUC using each model's feature-importance selection
feat_importance_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=feat_importance_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_featimp',
)
feat_importance_models

Done with AdaBoostClassifier.
Done with HistGradientBoostingClassifier.
Done with BaggingClassifier.
Done with LGBMClassifier.
Done with ExtraTreesClassifier.
Done with RandomForestClassifier.
Done with XGBClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.958243,0.887165,0.002962,0 min 5.92 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.9823,0.885443,0.002803,0 min 5.58 sec
1,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.992826,0.88124,0.002708,0 min 38.88 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.878704,0.003609,0 min 12.36 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.872309,0.002279,0 min 7.13 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.999626,0.82422,0.003279,0 min 4.81 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.765191,0.761636,0.01379,0 min 2.87 sec


In [27]:
# Seed the global NumPy and stdlib RNGs before evaluating.
# NOTE(review): the estimators and the splitter already carry fixed
# random_state values - confirm these global seeds are still needed.
np.random.seed(42)
random.seed(42)

# Cross-validated ROC AUC using each model's SelectKBest selection
kbest_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=kbest_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_kbest',
)
kbest_models

Done with ExtraTreesClassifier.
Done with AdaBoostClassifier.
Done with XGBClassifier.
Done with BaggingClassifier.
Done with LGBMClassifier.
Done with HistGradientBoostingClassifier.
Done with RandomForestClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.955614,0.887207,0.003033,0 min 5.31 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.980523,0.887172,0.003243,0 min 5.06 sec
1,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.986529,0.882923,0.003383,0 min 28.26 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.879708,0.003137,0 min 11.56 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.873584,0.00303,0 min 6.88 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.99959,0.825518,0.007205,0 min 3.11 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.771378,0.768518,0.015698,0 min 2.68 sec


In [30]:
# Seed the global NumPy and stdlib RNGs before evaluating.
# NOTE(review): the estimators and the splitter already carry fixed
# random_state values - confirm these global seeds are still needed.
np.random.seed(42)
random.seed(42)

# Cross-validated ROC AUC using each model's RFECV selection
rfecv_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=rfecv_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_rfecv',
)
rfecv_models

Done with LGBMClassifier.
Done with XGBClassifier.
Done with ExtraTreesClassifier.
Done with HistGradientBoostingClassifier.
Done with BaggingClassifier.
Done with RandomForestClassifier.
Done with AdaBoostClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.980453,0.887673,0.003201,0 min 5.00 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.955614,0.887207,0.003033,0 min 4.94 sec
1,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.986529,0.882923,0.003383,0 min 28.09 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.879708,0.003137,0 min 11.66 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.874068,0.004031,0 min 6.92 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.99959,0.825518,0.007205,0 min 2.94 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.772502,0.770144,0.016261,0 min 2.26 sec


In [33]:
# Seed the global NumPy and stdlib RNGs before evaluating.
# NOTE(review): the estimators and the splitter already carry fixed
# random_state values - confirm these global seeds are still needed.
np.random.seed(42)
random.seed(42)

# Cross-validated ROC AUC using each model's SFS selection
sfs_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=sfs_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_sfs',
)
sfs_models

Done with AdaBoostClassifier.
Done with HistGradientBoostingClassifier.
Done with XGBClassifier.
Done with RandomForestClassifier.
Done with BaggingClassifier.
Done with ExtraTreesClassifier.
Done with LGBMClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.953919,0.88773,0.002992,0 min 4.63 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.980453,0.887673,0.003201,0 min 4.70 sec
1,XGBClassifier,"{'objective': 'multi:softprob', 'use_label_enc...",0.985274,0.883422,0.003786,0 min 26.03 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.880468,0.002408,0 min 9.03 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.876882,0.003447,0 min 6.89 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.999592,0.828277,0.004752,0 min 2.78 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.791082,0.788856,0.006228,0 min 1.87 sec
