In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm

from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.feature_selection import RFECV, mutual_info_classif, SelectKBest, f_classif
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.tree import ExtraTreeClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from pprint import pprint
import os

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

experiment_name = 'multi-models_with_original_dataset'

In [2]:
train = pd.read_csv('train.csv')
original = pd.read_csv('Fault.csv', delimiter='\t')
test = pd.read_csv('test.csv')

In [3]:
train.shape, original.shape, test.shape

((19219, 35), (1940, 34), (12814, 28))

In [4]:
# Remove the id column and preview the dataset.
# The frame is rebound (no inplace=True) and errors='ignore' is passed so the
# cell is idempotent: re-running it once 'id' is already gone no longer raises
# KeyError.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [5]:
test.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,127,1656,0,1,150,0.3877,0.4896,0.3273,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,132,1354,0,1,40,0.1629,0.4136,0.0938,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,134,1360,0,1,40,0.0609,0.6234,0.4762,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,140,1690,1,0,100,0.4439,0.3333,0.8182,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,134,1688,1,0,60,0.8191,0.2619,0.4286,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


In [6]:
original_dataset_cols = list(train.columns)
original_dataset_cols

['X_Minimum',
 'X_Maximum',
 'Y_Minimum',
 'Y_Maximum',
 'Pixels_Areas',
 'X_Perimeter',
 'Y_Perimeter',
 'Sum_of_Luminosity',
 'Minimum_of_Luminosity',
 'Maximum_of_Luminosity',
 'Length_of_Conveyer',
 'TypeOfSteel_A300',
 'TypeOfSteel_A400',
 'Steel_Plate_Thickness',
 'Edges_Index',
 'Empty_Index',
 'Square_Index',
 'Outside_X_Index',
 'Edges_X_Index',
 'Edges_Y_Index',
 'Outside_Global_Index',
 'LogOfAreas',
 'Log_X_Index',
 'Log_Y_Index',
 'Orientation_Index',
 'Luminosity_Index',
 'SigmoidOfAreas',
 'Pastry',
 'Z_Scratch',
 'K_Scatch',
 'Stains',
 'Dirtiness',
 'Bumps',
 'Other_Faults']

In [7]:
original.columns = original_dataset_cols
original.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,645,651,2538079,2538108,108,10,30,11397,84,123,1687,1,0,80,0.7647,0.3793,0.2069,0.0036,0.6,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
1,829,835,1553913,1553931,71,8,19,7972,99,125,1623,1,0,100,0.971,0.3426,0.3333,0.0037,0.75,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
2,853,860,369370,369415,176,13,45,18996,99,126,1353,0,1,290,0.7287,0.4413,0.1556,0.0052,0.5385,1.0,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
3,1289,1306,498078,498335,2409,60,260,246930,37,126,1353,0,1,185,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0,1,0,0,0,0,0,0
4,430,441,100250,100337,630,20,87,62357,64,127,1387,0,1,40,0.62,0.3417,0.1264,0.0079,0.55,1.0,1.0,2.7993,1.0414,1.9395,0.8736,-0.2267,0.9874,1,0,0,0,0,0,0


In [8]:
# Concat the train and original dataset
combined_df = pd.concat([train, original], axis=0).reset_index(drop=True)
combined_df.tail()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
21154,249,277,325780,325796,273,54,22,35033,119,141,1360,0,1,40,0.3662,0.3906,0.5714,0.0206,0.5185,0.7273,0.0,2.4362,1.4472,1.2041,-0.4286,0.0026,0.7254,0,0,0,0,0,0,1
21155,144,175,340581,340598,287,44,24,34599,112,133,1360,0,1,40,0.2118,0.4554,0.5484,0.0228,0.7046,0.7083,0.0,2.4579,1.4914,1.2305,-0.4516,-0.0582,0.8173,0,0,0,0,0,0,1
21156,145,174,386779,386794,292,40,22,37572,120,140,1360,0,1,40,0.2132,0.3287,0.5172,0.0213,0.725,0.6818,0.0,2.4654,1.4624,1.1761,-0.4828,0.0052,0.7079,0,0,0,0,0,0,1
21157,137,170,422497,422528,419,97,47,52715,117,140,1360,0,1,40,0.2015,0.5904,0.9394,0.0243,0.3402,0.6596,0.0,2.6222,1.5185,1.4914,-0.0606,-0.0171,0.9919,0,0,0,0,0,0,1
21158,1261,1281,87951,87967,103,26,22,11682,101,133,1360,1,0,80,0.1162,0.6781,0.8,0.0147,0.7692,0.7273,0.0,2.0128,1.301,1.2041,-0.2,-0.1139,0.5296,0,0,0,0,0,0,1


In [9]:
# Got function from https://www.kaggle.com/code/thomasmeiner/ps4e3-eda-feature-engineering-model

def reformat_data(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the seven one-hot fault columns into a single ``target`` column.

    For every fault column, the rows where that column equals 1 are selected
    and labelled with the column's name. The per-fault slices are then stacked
    in the order the fault columns are listed below. As a consequence:

    * a row with no positive fault column does not appear in the result;
    * a row with several positive fault columns appears once per positive column;
    * the original row index is kept as-is (it is not reset).

    Args:
        df: Frame holding all seven fault columns plus the feature columns.

    Returns:
        A new frame made of the feature columns followed by a ``target``
        column containing the fault name as a string. ``df`` is not modified.

    Raises:
        KeyError: If any of the fault columns is missing from ``df``.
    """
    # Trailing numbers: position of each name in alphabetical order
    # (presumably the code the LabelEncoder assigns later on — verify).
    target_cols = [
        "Pastry", #4
        "Z_Scratch", #6
        "K_Scatch", #2
        "Stains", #5
        "Dirtiness", #1
        "Bumps", #0
        "Other_Faults", #3
    ]
    feature_cols = df.drop(columns=target_cols).columns.to_list()

    per_fault_frames = []
    for fault in target_cols:
        # Feature values of the rows that are positive for this fault
        positives = df.loc[df[fault] == 1, feature_cols].copy()
        # Append the class name as the last column, under the uniform name "target"
        positives.insert(positives.shape[1], "target", fault, allow_duplicates=True)
        per_fault_frames.append(positives)

    return pd.concat(per_fault_frames)

In [10]:
combined_df = reformat_data(combined_df)
combined_df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
7,1673,1687,294065,294091,571,38,57,53142,77,110,1692,0,1,300,0.1491,0.4326,0.9643,0.0142,0.5686,0.7179,1.0,2.7528,1.3802,1.7559,0.0357,-0.2661,0.9408,Pastry
24,1538,1549,849219,849235,275,19,32,28986,71,117,1626,1,0,70,0.1494,0.399,0.375,0.0088,0.6316,1.0,1.0,2.4393,1.0792,1.5051,0.625,-0.2988,0.633,Pastry
35,1310,1316,435871,435916,153,16,32,17101,104,132,1352,0,1,40,0.0532,0.3854,0.3333,0.0044,0.375,0.9688,1.0,2.1847,0.7782,1.5051,0.6667,-0.0916,0.4025,Pastry
60,765,774,6571361,6571375,59,9,12,6682,77,133,1360,0,1,100,0.3613,0.3571,0.28,0.0052,0.7778,1.0,1.0,1.7708,0.8451,1.0792,0.72,-0.1522,0.1892,Pastry
67,1677,1686,1319063,1319076,91,10,15,5608,57,95,1692,1,0,70,0.0024,0.3583,0.6667,0.0047,0.8,1.0,1.0,1.959,0.9031,1.1761,0.3333,-0.3868,0.266,Pastry


In [11]:
TARGET = 'target'

In [12]:
# Replace the fault names in the target column with LabelEncoder's integer
# codes. The fitted encoder keeps the name <-> code mapping in
# label_encoder.classes_.
label_encoder = LabelEncoder()
combined_df[TARGET] = label_encoder.fit_transform(combined_df[TARGET])

combined_df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
7,1673,1687,294065,294091,571,38,57,53142,77,110,1692,0,1,300,0.1491,0.4326,0.9643,0.0142,0.5686,0.7179,1.0,2.7528,1.3802,1.7559,0.0357,-0.2661,0.9408,4
24,1538,1549,849219,849235,275,19,32,28986,71,117,1626,1,0,70,0.1494,0.399,0.375,0.0088,0.6316,1.0,1.0,2.4393,1.0792,1.5051,0.625,-0.2988,0.633,4
35,1310,1316,435871,435916,153,16,32,17101,104,132,1352,0,1,40,0.0532,0.3854,0.3333,0.0044,0.375,0.9688,1.0,2.1847,0.7782,1.5051,0.6667,-0.0916,0.4025,4
60,765,774,6571361,6571375,59,9,12,6682,77,133,1360,0,1,100,0.3613,0.3571,0.28,0.0052,0.7778,1.0,1.0,1.7708,0.8451,1.0792,0.72,-0.1522,0.1892,4
67,1677,1686,1319063,1319076,91,10,15,5608,57,95,1692,1,0,70,0.0024,0.3583,0.6667,0.0047,0.8,1.0,1.0,1.959,0.9031,1.1761,0.3333,-0.3868,0.266,4


In [13]:
combined_df.shape, test.shape

((20362, 28), (12814, 28))

In [14]:
X = combined_df.drop([TARGET], axis=1)
y = combined_df[TARGET]

n_splits = 10
sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [15]:
models = [
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    RandomForestClassifier(random_state=5),
    AdaBoostClassifier(random_state=5),
    BaggingClassifier(random_state=5),
    ExtraTreesClassifier(random_state=5),
    HistGradientBoostingClassifier(random_state=5),
    ]

- Remove Correlated Features

In [None]:
# Remove highly correlated features (a feature is dropped when it is strongly
# correlated with any feature that appears to its left in X).
# NOTE: X contains the features only (TARGET was dropped when X was built), so
# correlation with the target plays no part in this filter.
corr_threshold = 0.9  # absolute Spearman correlation at/above which a feature is dropped

df_no_corr = X.copy()
correlation_matrix_spear = df_no_corr.corr(method='spearman').abs()

# Keep only the strict upper triangle: each pair is inspected once and the
# diagonal (self-correlation) is ignored
upper_mask = np.triu(np.ones(correlation_matrix_spear.shape, dtype=bool), k=1)
upper_spear = correlation_matrix_spear.where(upper_mask)

# Columns whose correlation with any earlier column reaches the threshold
to_drop_spear = [
    column
    for column in upper_spear.columns
    if (upper_spear[column] >= corr_threshold).any()
]

# Drop features
df_reduced_spear = df_no_corr.drop(columns=to_drop_spear)

# Names of the retained (low-correlation) features
low_corr_feats_spear = list(df_reduced_spear.columns)

with open('low_corr_spear.txt', 'w') as f:
    f.write(str(low_corr_feats_spear))
    f.write('\n')

# Report the effect of the filter (both shapes are feature-only; TARGET is not in X)
print(f"Dropped {len(to_drop_spear)} highly correlated features.\nOld Shape of the dataset was {df_no_corr.shape}\nNew shape of the dataset is {df_reduced_spear.shape}")

- Feature Importances

In [None]:
# Maps model class name -> features kept for that model
feat_importance_features = {}

# A feature is kept when its fold-averaged importance is strictly above this value
fi_threshold = 0

for model in models:
    model_name = model.__class__.__name__

    # Running sum of the per-fold importances for this model
    feature_importances = np.zeros(X.shape[1])
    has_importances = True

    # Fit on the training part of every fold and add that fold's importances to the total
    for train_index, _ in sk10.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # Only the attribute lookup is guarded, so an AttributeError raised
        # anywhere else (fit, pandas, CSV output) is not mistaken for
        # "this estimator exposes no importances".
        try:
            fold_importances = model.feature_importances_
        except AttributeError:
            has_importances = False
            break

        feature_importances += fold_importances

    if not has_importances:
        # Nothing to rank by: fall back to keeping every feature
        feat_importance_features[model_name] = list(X.columns)
        print(f'{model_name} does not have feature_importances_')
        continue

    feature_importances /= n_splits

    # One row per feature, most important first
    importances_df = pd.DataFrame(
        {'Feature': X.columns, 'Avg_Feat_Importance': feature_importances}
    ).sort_values(by='Avg_Feat_Importance', ascending=False)

    # Save to CSV
    importances_df.to_csv(f'{model_name}_feature_importances.csv')

    fi_feats = importances_df.loc[
        importances_df['Avg_Feat_Importance'] > fi_threshold, 'Feature'
    ].tolist()

    feat_importance_features[model_name] = fi_feats
    print(f'Done with {model_name}')

In [None]:
with open('featimp_features.txt', mode='w') as f:
    pprint(feat_importance_features, stream=f)

- SelectKBest with f_classif

In [None]:
# One summary record per model (best k, chosen features, CV score)
best_features_list = []
# Maps model class name -> features chosen by SelectKBest for that model
kbest_features = {}

for model in models:
    model_name = model.__class__.__name__

    # Candidate pool for this model. Select whichever source had the better CV
    # score generally; also consider the computational expense / accuracy balance.
    features = feat_importance_features[model_name]
    # features = list(df_reduced_spear.columns)

    if len(features) == 0:
        # No candidate features for this model. Still record an empty entry so
        # that code indexing kbest_features by model name does not raise KeyError.
        kbest_features[model_name] = []
        continue

    X_kbest = X[features]
    best_score = 0
    best_k = 0
    best_features = []

    # Try every k from 1 to the number of candidate features
    for k in range(1, len(features) + 1):
        print(f'currently running {k} features on {model_name}')

        # NOTE(review): the selector is fitted on all rows before
        # cross-validation, so the CV scores used to pick k can be optimistic;
        # putting selector + model in a Pipeline would avoid that.
        selector = SelectKBest(f_classif, k=k)
        X_new = selector.fit_transform(X_kbest, y)

        # Names of the k selected features
        selected_features = X_kbest.columns[selector.get_support()]

        # Score the model on the selected features
        roc_auc_scores = cross_validate(model, X_new, y, cv=sk10, scoring='roc_auc_ovr', n_jobs=-1)
        mean_roc_auc_scores = roc_auc_scores['test_score'].mean()

        if mean_roc_auc_scores > best_score:
            best_k = k
            best_score = mean_roc_auc_scores
            best_features = list(selected_features)

    best_features_list.append({'k': best_k,
                    'Selected Features': best_features,
                    'ROC AUC Score': best_score,
                    'Model Name': model_name})

    kbest_features[model_name] = best_features

# Explicit columns keep the frame well-formed (and sortable) even when every
# model was skipped and best_features_list is empty.
best_features_df = pd.DataFrame(
    best_features_list,
    columns=['k', 'Selected Features', 'ROC AUC Score', 'Model Name'],
).sort_values(by='ROC AUC Score', ascending=False)

In [None]:
with open('kbest_features.txt', mode='w') as f:
    pprint(kbest_features, stream=f)

In [None]:
best_features_df

- RFECV

In [None]:
# Initialize empty dictionary for RFECV features
rfecv_features = {}

for alg in models:
    # set name
    MLA_name = alg.__class__.__name__
		
    features = kbest_features[MLA_name]

    # incase there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue
	
    X_rfecv = X[features]

    try:
        print(f'Starting with {MLA_name}')
        # Create the RFECV object and rank each feature
        selector = RFECV(alg, cv=sk10, step=1, scoring='roc_auc_ovr', verbose=2)
        selector = selector.fit(X_rfecv, y)

        selected_features = list(X_rfecv.columns[selector.support_])

        rfecv_features[MLA_name] = selected_features

        print(f'Done with {MLA_name}', end='\n\n')
    
    except ValueError:
        rfecv_features[MLA_name] = features
        print(f'{MLA_name} does not have coef_ or feature_importances_', end='\n\n')

In [None]:
with open('rfecv_features.txt', mode='w') as f:
    pprint(rfecv_features, stream=f)

- SFS

In [16]:
rfecv_features = {'AdaBoostClassifier': ['Steel_Plate_Thickness', 'Log_Y_Index', 'Maximum_of_Luminosity', 'Outside_X_Index', 'Luminosity_Index', 'Edges_Index', 'Empty_Index', 'Orientation_Index', 'Y_Perimeter', 'LogOfAreas', 'Length_of_Conveyer', 'X_Maximum', 'Log_X_Index', 'Edges_Y_Index', 'X_Minimum', 'SigmoidOfAreas', 'Sum_of_Luminosity', 'Y_Maximum', 'TypeOfSteel_A300'],
 'BaggingClassifier': ['X_Minimum', 'X_Maximum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas'],
 'ExtraTreesClassifier': ['LogOfAreas', 'Log_X_Index', 'Length_of_Conveyer', 'Minimum_of_Luminosity', 'X_Minimum', 'Steel_Plate_Thickness', 'SigmoidOfAreas', 'X_Maximum', 'Edges_Y_Index', 'Edges_Index', 'Outside_X_Index', 'Orientation_Index', 'Log_Y_Index', 'Pixels_Areas', 'Square_Index', 'Sum_of_Luminosity', 'Edges_X_Index', 'Empty_Index', 'Luminosity_Index', 'X_Perimeter', 'Y_Perimeter', 'Maximum_of_Luminosity', 'Y_Maximum', 'Y_Minimum', 'TypeOfSteel_A300'],
 'HistGradientBoostingClassifier': ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas'],
 'LGBMClassifier': ['Empty_Index', 'Edges_Index', 'Luminosity_Index', 'Length_of_Conveyer', 'Minimum_of_Luminosity', 'Edges_X_Index', 'Steel_Plate_Thickness', 'X_Minimum', 'Y_Minimum', 'Sum_of_Luminosity', 'Orientation_Index', 'X_Maximum', 'Square_Index', 'Outside_X_Index', 'X_Perimeter', 'Pixels_Areas', 'Maximum_of_Luminosity', 'SigmoidOfAreas', 'Log_Y_Index', 'Y_Maximum', 'Edges_Y_Index', 'Y_Perimeter', 'Log_X_Index', 'LogOfAreas', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Outside_Global_Index'],
 'RandomForestClassifier': ['Outside_X_Index', 'Log_X_Index', 'Pixels_Areas', 'Sum_of_Luminosity', 'LogOfAreas', 'Minimum_of_Luminosity', 'X_Minimum', 'Length_of_Conveyer', 'X_Perimeter', 'SigmoidOfAreas', 'X_Maximum', 'Steel_Plate_Thickness', 'Edges_Index', 'Luminosity_Index', 'Empty_Index', 'Orientation_Index', 'Square_Index', 'Y_Maximum', 'Edges_X_Index', 'Y_Perimeter', 'Log_Y_Index', 'Maximum_of_Luminosity', 'Edges_Y_Index', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Outside_Global_Index']}

In [34]:
# NOTE(review): this assignment rebinds rfecv_features to an XGBClassifier-only
# dict, replacing any entries defined earlier for the other models. Code that
# looks models up in rfecv_features afterwards will find XGBClassifier only —
# confirm this is intended before a top-to-bottom run.
rfecv_features = {'XGBClassifier': ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']}

In [None]:
# Show the features stored in rfecv_features for every model, and say so when
# a model has no entry.
for model in models:
    model_name = type(model).__name__
    if model_name in rfecv_features:
        print(rfecv_features[model_name])
    else:
        print(f'{model_name} not in the dictionary')

In [35]:
# Maps model class name -> features kept by backward sequential selection
sfs_features = {}

for alg in models:
    MLA_name = alg.__class__.__name__

    # Alternative candidate pools:
    # features = kbest_features[MLA_name]
    # features = feat_importance_features[MLA_name]

    # Explicit membership test instead of a try/except KeyError around the
    # whole body: a KeyError raised while selecting columns or fitting would
    # otherwise be reported as a missing dictionary entry.
    if MLA_name not in rfecv_features:
        print(f'{MLA_name} not in the dictionary.')
        continue

    features = rfecv_features[MLA_name]

    # In case there are no candidate features, go to the next model
    if len(features) == 0:
        continue

    X_sfs = X[features]

    print(f'Running backward feature selection with {MLA_name}')

    # forward=False + k_features='best': start from all candidates, remove one
    # feature per step, and keep the subset with the best CV score
    sfs = SFS(alg,
        k_features='best',
        forward=False,
        floating=False,
        scoring='roc_auc_ovr',
        verbose=2,
        n_jobs=-1,
        cv=sk10)

    sfs = sfs.fit(X_sfs, y)

    # Positions of the selected features within X_sfs, mapped back to names
    selected_sfs_idx = list(sfs.k_feature_idx_)
    sfs_features[MLA_name] = list(X_sfs.columns[selected_sfs_idx])

    print(f'Done with {MLA_name}', end='\n\n')

LGBMClassifier not in the dictionary.
Running backward feature selection with XGBClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed: 14.9min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed: 18.0min finished

[2024-03-06 22:30:48] Features: 17/1 -- score: 0.8934256763654176[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  17 | elapsed: 13.9min remaining:  7.6min
[Parallel(n_jobs=-1)]: Done  17 out of  17 | elapsed: 16.2min finished

[2024-03-06 22:47:02] Features: 16/1 -- score: 0.8933680525182373[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  16 | elapsed: 13.1min remaining:  7.8min
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed: 13.2min finished

[2024-03-06 23:00:17] Features: 15/1 -- score: 0.8929681933768467[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15

Done with XGBClassifier

RandomForestClassifier not in the dictionary.
AdaBoostClassifier not in the dictionary.
BaggingClassifier not in the dictionary.
ExtraTreesClassifier not in the dictionary.
HistGradientBoostingClassifier not in the dictionary.


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   30.6s finished

[2024-03-07 00:12:16] Features: 1/1 -- score: 0.7787491075259737

In [36]:
with open('sfs_features_xgboost.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [37]:
sfs_features = {'AdaBoostClassifier': ['Steel_Plate_Thickness',
                        'Log_Y_Index',
                        'Maximum_of_Luminosity',
                        'Edges_Index',
                        'Empty_Index',
                        'Length_of_Conveyer',
                        'Log_X_Index'],
 'BaggingClassifier': ['X_Minimum',
                       'X_Maximum',
                       'Y_Maximum',
                       'Pixels_Areas',
                       'Sum_of_Luminosity',
                       'Minimum_of_Luminosity',
                       'Maximum_of_Luminosity',
                       'Length_of_Conveyer',
                       'TypeOfSteel_A300',
                       'TypeOfSteel_A400',
                       'Steel_Plate_Thickness',
                       'Edges_Index',
                       'Empty_Index',
                       'Outside_X_Index',
                       'Outside_Global_Index',
                       'Log_X_Index',
                       'Log_Y_Index',
                       'Orientation_Index'],
 'ExtraTreesClassifier': ['Length_of_Conveyer',
                          'X_Minimum',
                          'Steel_Plate_Thickness',
                          'Edges_Y_Index',
                          'Edges_Index',
                          'Outside_X_Index',
                          'Orientation_Index',
                          'Log_Y_Index',
                          'Pixels_Areas',
                          'Empty_Index',
                          'Luminosity_Index',
                          'Maximum_of_Luminosity',
                          'Y_Minimum',
                          'TypeOfSteel_A300'],
 'HistGradientBoostingClassifier': ['X_Minimum',
                                    'Y_Minimum',
                                    'Pixels_Areas',
                                    'Y_Perimeter',
                                    'Minimum_of_Luminosity',
                                    'Maximum_of_Luminosity',
                                    'Length_of_Conveyer',
                                    'TypeOfSteel_A300',
                                    'Steel_Plate_Thickness',
                                    'Edges_Index',
                                    'Empty_Index',
                                    'Square_Index',
                                    'Edges_Y_Index',
                                    'Outside_Global_Index',
                                    'Log_X_Index',
                                    'Luminosity_Index'],
 'LGBMClassifier': ['Empty_Index',
                    'Edges_Index',
                    'Luminosity_Index',
                    'Length_of_Conveyer',
                    'Minimum_of_Luminosity',
                    'Steel_Plate_Thickness',
                    'Sum_of_Luminosity',
                    'Orientation_Index',
                    'X_Maximum',
                    'Square_Index',
                    'X_Perimeter',
                    'Pixels_Areas',
                    'Maximum_of_Luminosity',
                    'SigmoidOfAreas',
                    'Log_Y_Index',
                    'Edges_Y_Index',
                    'Log_X_Index',
                    'TypeOfSteel_A300',
                    'TypeOfSteel_A400',
                    'Outside_Global_Index'],
 'RandomForestClassifier': ['Log_X_Index',
                            'Sum_of_Luminosity',
                            'Minimum_of_Luminosity',
                            'X_Minimum',
                            'Length_of_Conveyer',
                            'SigmoidOfAreas',
                            'Steel_Plate_Thickness',
                            'Edges_Index',
                            'Luminosity_Index',
                            'Empty_Index',
                            'Orientation_Index',
                            'Y_Maximum',
                            'Y_Perimeter',
                            'Edges_Y_Index',
                            'TypeOfSteel_A300'],
'XGBClassifier': ['X_Minimum',
                   'Y_Minimum',
                   'Pixels_Areas',
                   'X_Perimeter',
                   'Minimum_of_Luminosity',
                   'Maximum_of_Luminosity',
                   'Length_of_Conveyer',
                   'TypeOfSteel_A300',
                   'Steel_Plate_Thickness',
                   'Edges_Index',
                   'Empty_Index',
                   'Square_Index',
                   'Outside_X_Index',
                   'Edges_Y_Index',
                   'Outside_Global_Index',
                   'Orientation_Index',
                   'Luminosity_Index']}


In [20]:
def evaluate_models_roc(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate every model on its own feature subset and rank by ROC AUC.

    Parameters
    ----------
    models : iterable
        Estimators to evaluate. Each one is looked up in ``important_features``
        by its class name and must expose ``get_params()``.
    X : pandas.DataFrame
        Feature table; it is sliced with ``X[features]`` per model.
    y : array-like
        Target passed unchanged to ``cross_validate``.
    important_features : dict
        Maps estimator class name -> list of column names. A model with no
        entry (or an empty list) is skipped and reported with zero scores.
    cv_split : cross-validation splitter
        Passed as ``cv`` to ``cross_validate``.
    experiment_name : str
        Prefix of the CSV written as ``f'{experiment_name}_results.csv'``.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by ``'MLA Test ROC AUC'`` in descending
        order. The same table is written to the CSV file.
    """
    # Single source of truth for the result schema: both the "skipped" and the
    # "evaluated" rows are built from these keys, so the sort column below is
    # always present (even when every model is skipped or `models` is empty).
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Test ROC AUC Std',
                      'MLA Time']

    def evaluate_model(alg):
        """Return one result row (dict keyed by ``result_columns``) for ``alg``."""
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': 0,
                'MLA Test ROC AUC': 0,
                'MLA Test ROC AUC Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='roc_auc_ovr',
                                    return_train_score=True,
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': cv_results['train_score'].mean(),
            'MLA Test ROC AUC': cv_results['test_score'].mean(),
            'MLA Test ROC AUC Std': cv_results['test_score'].std(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    # Models are evaluated concurrently; `map` yields the rows in the same
    # order as `models` and re-raises any worker exception here.
    with ThreadPoolExecutor(max_workers=10) as executor:
        results_list = list(executor.map(evaluate_model, models))

    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    MLA_compare = MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=False)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [None]:
# Baseline experiment: every model is given the full set of columns in X.
# Each model gets its own list object, keyed by the estimator's class name.
baseline_features = {
    type(model).__name__: list(X.columns)
    for model in models
}

In [None]:
# Score all models on the baseline (all-columns) feature mapping.
baseline_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=baseline_features,
    cv_split=sk10,
    experiment_name=experiment_name,
)
baseline_models

In [None]:
# Every model is given the columns of df_reduced_spear, keyed by the
# estimator's class name (one independent list per model).
no_corr_features = {
    type(model).__name__: list(df_reduced_spear.columns)
    for model in models
}

In [None]:
# Score all models on the df_reduced_spear columns; results go to the '_corr' CSV.
no_corr_models = evaluate_models_roc(
    models=models,
    X=df_reduced_spear,
    y=y,
    important_features=no_corr_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_corr',
)
no_corr_models

In [None]:
# Reset both global RNGs (stdlib and NumPy) before the run for reproducibility
random.seed(42)
np.random.seed(42)

# Score each model on the columns listed for it in feat_importance_features.
feat_importance_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=feat_importance_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_featimp',
)
feat_importance_models

In [None]:
# Reset both global RNGs (stdlib and NumPy) before the run for reproducibility
random.seed(42)
np.random.seed(42)

# Score each model on the columns listed for it in kbest_features.
kbest_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=kbest_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_kbest',
)
kbest_models

In [None]:
# Reset both global RNGs (stdlib and NumPy) before the run for reproducibility
random.seed(42)
np.random.seed(42)

# Score each model on the columns listed for it in rfecv_features.
rfecv_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=rfecv_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_rfecv',
)
rfecv_models

In [38]:
# Reset both global RNGs (stdlib and NumPy) before the run for reproducibility
random.seed(42)
np.random.seed(42)

# Score each model on the columns listed for it in sfs_features.
sfs_models = evaluate_models_roc(
    models=models,
    X=X,
    y=y,
    important_features=sfs_features,
    cv_split=sk10,
    experiment_name=f'{experiment_name}_sfs',
)
sfs_models

Done with BaggingClassifier.
Done with LGBMClassifier.
Done with ExtraTreesClassifier.
Done with AdaBoostClassifier.
Done with HistGradientBoostingClassifier.
Done with RandomForestClassifier.
Done with XGBClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.979978,0.897758,0.003937,0 min 6.05 sec
6,HistGradientBoostingClassifier,"{'categorical_features': None, 'early_stopping...",0.956406,0.897722,0.004217,0 min 5.58 sec
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.99153,0.893426,0.005765,0 min 45.89 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.999999,0.891984,0.00443,0 min 11.80 sec
5,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.999999,0.887401,0.005769,0 min 7.02 sec
4,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",0.999628,0.843235,0.007434,0 min 4.81 sec
3,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.801962,0.801268,0.010524,0 min 1.75 sec


# Ensembling

In [39]:
# One estimator per ensemble member, all sharing the same random seed.
ENSEMBLE_SEED = 5

model1 = LGBMClassifier(random_state=ENSEMBLE_SEED, n_jobs=-1)
model2 = XGBClassifier(random_state=ENSEMBLE_SEED)
model3 = RandomForestClassifier(random_state=ENSEMBLE_SEED)
model4 = AdaBoostClassifier(random_state=ENSEMBLE_SEED)
model5 = BaggingClassifier(random_state=ENSEMBLE_SEED)
model6 = ExtraTreesClassifier(random_state=ENSEMBLE_SEED)
model7 = HistGradientBoostingClassifier(random_state=ENSEMBLE_SEED)

- Features for Competition dataset (Experiment Set 1)

In [None]:
# model1_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
# model2_feats = ['X_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index']
# model3_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Luminosity_Index']
# model4_feats = ['X_Perimeter', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']
# model5_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Orientation_Index']
# model6_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
# model7_feats = ['X_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index']

- Features for Competition + Original dataset before SFS (Experiment Set 2)

In [None]:
# model1_feats = ['Empty_Index', 'Edges_Index', 'Luminosity_Index', 'Length_of_Conveyer', 'Minimum_of_Luminosity', 'Edges_X_Index', 'Steel_Plate_Thickness', 'X_Minimum', 'Y_Minimum', 'Sum_of_Luminosity', 'Orientation_Index', 'X_Maximum', 'Square_Index', 'Outside_X_Index', 'X_Perimeter', 'Pixels_Areas', 'Maximum_of_Luminosity', 'SigmoidOfAreas', 'Log_Y_Index', 'Y_Maximum', 'Edges_Y_Index', 'Y_Perimeter', 'Log_X_Index', 'LogOfAreas', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Outside_Global_Index']
# model2_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']
# model3_feats = ['Outside_X_Index', 'Log_X_Index', 'Pixels_Areas', 'Sum_of_Luminosity', 'LogOfAreas', 'Minimum_of_Luminosity', 'X_Minimum', 'Length_of_Conveyer', 'X_Perimeter', 'SigmoidOfAreas', 'X_Maximum', 'Steel_Plate_Thickness', 'Edges_Index', 'Luminosity_Index', 'Empty_Index', 'Orientation_Index', 'Square_Index', 'Y_Maximum', 'Edges_X_Index', 'Y_Perimeter', 'Log_Y_Index', 'Maximum_of_Luminosity', 'Edges_Y_Index', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Outside_Global_Index']
# model4_feats = ['Steel_Plate_Thickness', 'Log_Y_Index', 'Maximum_of_Luminosity', 'Outside_X_Index', 'Luminosity_Index', 'Edges_Index', 'Empty_Index', 'Pixels_Areas', 'Orientation_Index', 'Y_Perimeter', 'X_Perimeter', 'LogOfAreas', 'Length_of_Conveyer', 'Minimum_of_Luminosity', 'X_Maximum', 'Log_X_Index', 'Edges_Y_Index', 'X_Minimum', 'SigmoidOfAreas', 'Sum_of_Luminosity', 'Y_Maximum', 'Outside_Global_Index', 'TypeOfSteel_A300', 'Square_Index', 'TypeOfSteel_A400', 'Edges_X_Index']
# model5_feats = ['X_Minimum', 'X_Maximum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas']
# model6_feats = ['LogOfAreas', 'Log_X_Index', 'Length_of_Conveyer', 'Minimum_of_Luminosity', 'X_Minimum', 'Steel_Plate_Thickness', 'SigmoidOfAreas', 'X_Maximum', 'Edges_Y_Index', 'Edges_Index', 'Outside_X_Index', 'Orientation_Index', 'Log_Y_Index', 'Pixels_Areas', 'Square_Index', 'Sum_of_Luminosity', 'Edges_X_Index', 'Empty_Index', 'Luminosity_Index', 'X_Perimeter', 'Y_Perimeter', 'Maximum_of_Luminosity', 'Y_Maximum', 'Y_Minimum', 'TypeOfSteel_A300']
# model7_feats = ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index', 'SigmoidOfAreas']

- Features for Competition + Original dataset with SFS except for XGBoost (Experiment Set 2)

In [25]:
# model1_feats = ['Empty_Index', 'Edges_Index', 'Luminosity_Index', 'Length_of_Conveyer', 'Minimum_of_Luminosity', 'Steel_Plate_Thickness', 'Sum_of_Luminosity', 'Orientation_Index', 'X_Maximum', 'Square_Index', 'X_Perimeter', 'Pixels_Areas', 'Maximum_of_Luminosity', 'SigmoidOfAreas', 'Log_Y_Index', 'Edges_Y_Index', 'Log_X_Index', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Outside_Global_Index']
# # model2_feats = 
# model3_feats = ['Log_X_Index', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'X_Minimum', 'Length_of_Conveyer', 'SigmoidOfAreas', 'Steel_Plate_Thickness', 'Edges_Index', 'Luminosity_Index', 'Empty_Index', 'Orientation_Index', 'Y_Maximum', 'Y_Perimeter', 'Edges_Y_Index', 'TypeOfSteel_A300']
# model4_feats = ['Steel_Plate_Thickness', 'Log_Y_Index', 'Maximum_of_Luminosity', 'Edges_Index', 'Empty_Index', 'Length_of_Conveyer', 'Log_X_Index']
# model5_feats = ['X_Minimum', 'X_Maximum', 'Y_Maximum', 'Pixels_Areas', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Outside_Global_Index', 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index']
# model6_feats = ['Length_of_Conveyer', 'X_Minimum', 'Steel_Plate_Thickness', 'Edges_Y_Index', 'Edges_Index', 'Outside_X_Index', 'Orientation_Index', 'Log_Y_Index', 'Pixels_Areas', 'Empty_Index', 'Luminosity_Index', 'Maximum_of_Luminosity', 'Y_Minimum', 'TypeOfSteel_A300']
# model7_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Y_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Log_X_Index', 'Luminosity_Index']

- Features for Competition + Original dataset down to SFS for all models (Experiment Set 2)

In [40]:
# Feature subset used by each ensemble member (modelN_feats pairs with modelN).

# model1: LGBMClassifier
model1_feats = [
    'Empty_Index', 'Edges_Index', 'Luminosity_Index', 'Length_of_Conveyer',
    'Minimum_of_Luminosity', 'Steel_Plate_Thickness', 'Sum_of_Luminosity',
    'Orientation_Index', 'X_Maximum', 'Square_Index', 'X_Perimeter',
    'Pixels_Areas', 'Maximum_of_Luminosity', 'SigmoidOfAreas', 'Log_Y_Index',
    'Edges_Y_Index', 'Log_X_Index', 'TypeOfSteel_A300', 'TypeOfSteel_A400',
    'Outside_Global_Index',
]

# model2: XGBClassifier
model2_feats = [
    'X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter',
    'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
    'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index',
    'Square_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index',
    'Orientation_Index', 'Luminosity_Index',
]

# model3: RandomForestClassifier
model3_feats = [
    'Log_X_Index', 'Sum_of_Luminosity', 'Minimum_of_Luminosity', 'X_Minimum',
    'Length_of_Conveyer', 'SigmoidOfAreas', 'Steel_Plate_Thickness',
    'Edges_Index', 'Luminosity_Index', 'Empty_Index', 'Orientation_Index',
    'Y_Maximum', 'Y_Perimeter', 'Edges_Y_Index', 'TypeOfSteel_A300',
]

# model4: AdaBoostClassifier
model4_feats = [
    'Steel_Plate_Thickness', 'Log_Y_Index', 'Maximum_of_Luminosity',
    'Edges_Index', 'Empty_Index', 'Length_of_Conveyer', 'Log_X_Index',
]

# model5: BaggingClassifier
model5_feats = [
    'X_Minimum', 'X_Maximum', 'Y_Maximum', 'Pixels_Areas', 'Sum_of_Luminosity',
    'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
    'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
    'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Outside_Global_Index',
    'Log_X_Index', 'Log_Y_Index', 'Orientation_Index',
]

# model6: ExtraTreesClassifier
model6_feats = [
    'Length_of_Conveyer', 'X_Minimum', 'Steel_Plate_Thickness', 'Edges_Y_Index',
    'Edges_Index', 'Outside_X_Index', 'Orientation_Index', 'Log_Y_Index',
    'Pixels_Areas', 'Empty_Index', 'Luminosity_Index', 'Maximum_of_Luminosity',
    'Y_Minimum', 'TypeOfSteel_A300',
]

# model7: HistGradientBoostingClassifier
model7_feats = [
    'X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Y_Perimeter',
    'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
    'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index',
    'Square_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Log_X_Index',
    'Luminosity_Index',
]

In [42]:
# Per-fold held-out probability predictions for each model, plus the matching targets.
model1_results, model2_results, model3_results, model4_results, model5_results, model6_results, model7_results, y_test_list = [], [], [], [], [], [], [], []

# Column subsets of X, one per model.
X_lgbm = X[model1_feats]
X_xgb = X[model2_feats]
X_rf = X[model3_feats]
X_ada = X[model4_feats]
X_bag = X[model5_feats]
X_extrat = X[model6_feats]
X_hist = X[model7_feats]

# (estimator, its feature view, the list collecting its per-fold predictions),
# listed in the order the models are fitted within each fold.
fold_plan = [
    (model1, X_lgbm, model1_results),
    (model2, X_xgb, model2_results),
    (model3, X_rf, model3_results),
    (model4, X_ada, model4_results),
    (model5, X_bag, model5_results),
    (model6, X_extrat, model6_results),
    (model7, X_hist, model7_results),
]

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit on the training rows of the fold, predict probabilities on the held-out rows.
    for estimator, X_subset, fold_probas in fold_plan:
        estimator.fit(X_subset.iloc[train_index], y_train)
        fold_probas.append(estimator.predict_proba(X_subset.iloc[test_index]))

    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

Done with fold 1.
Done with fold 2.
Done with fold 3.
Done with fold 4.
Done with fold 5.
Done with fold 6.
Done with fold 7.
Done with fold 8.
Done with fold 9.
Done with fold 10.


In [43]:
# Random search over blending weights: each trial draws one weight per model
# and scores the weighted blend of the stored per-fold predictions.
model1_weights, model2_weights, model3_weights, model4_weights, model5_weights, model6_weights, model7_weights, scores = [], [], [], [], [], [], [], []

N_WEIGHT_TRIALS = 20000

# Dedicated, seeded generator so the sampled weights (and therefore the
# winning row) are reproducible without touching the global NumPy RNG state.
weight_rng = np.random.RandomState(42)

weight_logs = (model1_weights, model2_weights, model3_weights, model4_weights,
               model5_weights, model6_weights, model7_weights)

# One tuple of the seven models' prediction arrays per fold. Iterating these
# together with y_test_list follows however many folds were actually stored
# instead of assuming exactly 10.
fold_predictions = list(zip(model1_results, model2_results, model3_results, model4_results,
                            model5_results, model6_results, model7_results))

for i in tqdm(range(N_WEIGHT_TRIALS)):
    # One uniform [0, 1) weight per model.
    trial_weights = weight_rng.random_sample(size=len(weight_logs))

    for weight_log, weight in zip(weight_logs, trial_weights):
        weight_log.append(weight)

    scores_in = []

    for fold_preds, y_fold in zip(fold_predictions, y_test_list):
        weighted_pred = sum(weight * pred for weight, pred in zip(trial_weights, fold_preds))
        # Rescale each row to sum to 1 before scoring.
        weighted_pred_normalized = weighted_pred / np.sum(weighted_pred, axis=1, keepdims=True)
        scores_in.append(roc_auc_score(y_fold, weighted_pred_normalized, multi_class='ovr'))

    # Trial score = mean ROC AUC across folds.
    scores.append(np.mean(scores_in))

  0%|          | 0/20000 [00:00<?, ?it/s]

In [44]:
# One row per weight trial, best score first; index reset so row 0 is the winner.
results_df = (
    pd.DataFrame({
        'model_1': model1_weights,
        'model_2': model2_weights,
        'model_3': model3_weights,
        'model_4': model4_weights,
        'model_5': model5_weights,
        'model_6': model6_weights,
        'model_7': model7_weights,
        'score': scores,
    })
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)
results_df.head(10)

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,score
0,0.698985,0.45624,0.531145,0.146376,0.139772,0.583415,0.98501,0.902473
1,0.559022,0.30181,0.433673,0.075449,0.116961,0.51758,0.975691,0.902455
2,0.918062,0.337989,0.449073,0.234843,0.084851,0.512557,0.868723,0.902452
3,0.545957,0.402629,0.482854,0.175717,0.078166,0.443152,0.933761,0.902448
4,0.772595,0.20301,0.55952,0.201157,0.175968,0.334191,0.921509,0.902446
5,0.773935,0.334659,0.511749,0.181744,0.060183,0.263307,0.819449,0.902444
6,0.396317,0.393288,0.335559,0.115729,0.079242,0.30883,0.667064,0.902428
7,0.639602,0.455706,0.457779,0.016306,0.174064,0.488514,0.822653,0.902426
8,0.768137,0.497433,0.79563,0.26104,0.122253,0.425164,0.943631,0.902425
9,0.663485,0.412257,0.475086,0.204215,0.099234,0.578763,0.805495,0.902421


# Get Submission (Ensemble)

In [45]:
# Refit every ensemble member on all rows of its own feature subset.
for estimator, X_subset in (
    (model1, X_lgbm),
    (model2, X_xgb),
    (model3, X_rf),
    (model4, X_ada),
    (model5, X_bag),
    (model6, X_extrat),
    (model7, X_hist),
):
    estimator.fit(X_subset, y)

In [54]:
# Weights of the top-ranked trial (row label 0 of results_df).
top_weights = results_df.loc[0]

# (weight column, fitted estimator, feature list) for each ensemble member.
ensemble_members = [
    ('model_1', model1, model1_feats),
    ('model_2', model2, model2_feats),
    ('model_3', model3, model3_feats),
    ('model_4', model4, model4_feats),
    ('model_5', model5, model5_feats),
    ('model_6', model6, model6_feats),
    ('model_7', model7, model7_feats),
]

# Weighted sum of each model's predicted probabilities on the test set.
ensemble_pred = sum(
    top_weights[weight_col] * estimator.predict_proba(test[feats])
    for weight_col, estimator, feats in ensemble_members
)

ensemble_df = pd.DataFrame(ensemble_pred)

# Rescale each row to sum to 1.
# If all models predict 0, instead of getting NaN, fill in 0
row_totals = ensemble_df.sum(axis=1)
ensemble_df = ensemble_df.div(row_totals, axis=0).fillna(0)
ensemble_df.columns = label_encoder.classes_

In [55]:
# Preview the first five rows of the ensemble probabilities.
ensemble_df.head(n=5)

Unnamed: 0,Bumps,Dirtiness,K_Scatch,Other_Faults,Pastry,Stains,Z_Scratch
0,0.407706,0.110497,0.021503,0.970669,1.998177,0.001584,0.030806
1,0.578527,0.474291,0.048646,1.309888,1.063548,0.014306,0.051736
2,1.155716,0.030397,0.232398,1.894214,0.022261,0.02001,0.185947
3,1.360948,0.09918,0.011702,1.628763,0.36788,0.032596,0.039873
4,2.236575,0.036326,0.01226,1.130867,0.057298,0.03369,0.033927


In [56]:
# Take the id column from the sample submission and place it in front of the
# ensemble probabilities (the two are joined column-wise on their index).
submission = pd.read_csv('sample_submission.csv')
id_column = submission['id']
submission_df = pd.concat([id_column, ensemble_df], axis='columns')
submission_df.head()

Unnamed: 0,id,Bumps,Dirtiness,K_Scatch,Other_Faults,Pastry,Stains,Z_Scratch
0,19219,0.407706,0.110497,0.021503,0.970669,1.998177,0.001584,0.030806
1,19220,0.578527,0.474291,0.048646,1.309888,1.063548,0.014306,0.051736
2,19221,1.155716,0.030397,0.232398,1.894214,0.022261,0.02001,0.185947
3,19222,1.360948,0.09918,0.011702,1.628763,0.36788,0.032596,0.039873
4,19223,2.236575,0.036326,0.01226,1.130867,0.057298,0.03369,0.033927


In [57]:
# Name the file after the best CV score in results_df (row 0 after the
# descending sort) instead of a hard-coded number, so the filename cannot go
# stale when the weight search is re-run.
best_cv_score = results_df['score'].iloc[0]
submission_df.to_csv(f'submission_ensemble_{best_cv_score:.6f}.csv', index=False)