In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm

from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.feature_selection import RFECV, SelectKBest, f_classif
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.inspection import permutation_importance

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from pprint import pprint
import os

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

experiment_name = 'multi-models_with_original_dataset'

In [2]:
# Load the three input tables from the working directory.
# Fault.csv (bound to `original`) is tab-separated; the other two use pandas'
# default comma separator.
train = pd.read_csv('train.csv')
original = pd.read_csv('Fault.csv', delimiter='\t')
test = pd.read_csv('test.csv')

In [3]:
train.shape, original.shape, test.shape

((19219, 35), (1940, 34), (12814, 28))

In [4]:
# Remove the id column and preview the dataset.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: executing it a second time is a no-op rather than a KeyError
# on the already-removed column.
train = train.drop(columns='id', errors='ignore')
train.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [5]:
test.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,127,1656,0,1,150,0.3877,0.4896,0.3273,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,132,1354,0,1,40,0.1629,0.4136,0.0938,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,134,1360,0,1,40,0.0609,0.6234,0.4762,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,140,1690,1,0,100,0.4439,0.3333,0.8182,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,134,1688,1,0,60,0.8191,0.2619,0.4286,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


In [6]:
# Column names of train at this point (id already dropped above); reused
# below to relabel the columns of `original`.
original_dataset_cols = list(train.columns)

In [7]:
# Reference ordering of the feature columns. The feature-selection cells
# filter their selections through this list so the resulting feature lists
# always come out in this order.
features_list = ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
                 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
                 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
                 'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
                 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
                 'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
                 'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
                 'SigmoidOfAreas']

In [8]:
# Relabel the columns of `original` positionally with train's column names.
# NOTE(review): this assumes Fault.csv stores its columns in exactly the same
# order as train (features first, then the seven fault flags) — confirm
# against the file, since a different order would silently mislabel columns.
original.columns = original_dataset_cols
original.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,645,651,2538079,2538108,108,10,30,11397,84,123,1687,1,0,80,0.7647,0.3793,0.2069,0.0036,0.6,0.9667,1.0,2.0334,0.7782,1.4624,0.7931,-0.1756,0.2984,1,0,0,0,0,0,0
1,829,835,1553913,1553931,71,8,19,7972,99,125,1623,1,0,100,0.971,0.3426,0.3333,0.0037,0.75,0.9474,1.0,1.8513,0.7782,1.2553,0.6667,-0.1228,0.215,1,0,0,0,0,0,0
2,853,860,369370,369415,176,13,45,18996,99,126,1353,0,1,290,0.7287,0.4413,0.1556,0.0052,0.5385,1.0,1.0,2.2455,0.8451,1.6532,0.8444,-0.1568,0.5212,1,0,0,0,0,0,0
3,1289,1306,498078,498335,2409,60,260,246930,37,126,1353,0,1,185,0.0695,0.4486,0.0662,0.0126,0.2833,0.9885,1.0,3.3818,1.2305,2.4099,0.9338,-0.1992,1.0,1,0,0,0,0,0,0
4,430,441,100250,100337,630,20,87,62357,64,127,1387,0,1,40,0.62,0.3417,0.1264,0.0079,0.55,1.0,1.0,2.7993,1.0414,1.9395,0.8736,-0.2267,0.9874,1,0,0,0,0,0,0


In [9]:
# Stack the original rows underneath the train rows and renumber the index 0..n-1
combined_df = pd.concat([train, original], ignore_index=True)

In [10]:
# Got function from https://www.kaggle.com/code/thomasmeiner/ps4e3-eda-feature-engineering-model

def reformat_data(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the seven one-hot fault columns into a single ``target`` column.

    For every fault column, the rows flagged with 1 are kept and labelled with
    that column's name; the per-fault frames are then concatenated in the
    order listed below. Consequences visible in the code: a row flagged for
    several faults appears once per fault, a row with no fault flag is
    dropped, and the original index labels are carried over unchanged.

    Returns a frame holding the non-target columns followed by ``target``.
    """
    fault_names = (
        "Pastry",        # 4
        "Z_Scratch",     # 6
        "K_Scatch",      # 2
        "Stains",        # 5
        "Dirtiness",     # 1
        "Bumps",         # 0
        "Other_Faults",  # 3
    )
    feature_cols = df.drop(list(fault_names), axis=1).columns.to_list()

    # One frame per fault: positives only, labelled with the fault's name.
    per_fault_frames = [
        df.loc[df[fault] == 1, feature_cols].assign(target=fault)
        for fault in fault_names
    ]
    return pd.concat(per_fault_frames)

In [11]:
# def reformat_data(df: pd.DataFrame) -> pd.DataFrame:
#     target_cols = [
#         "Pastry",
#         "Z_Scratch",
#         "K_Scatch",
#         "Stains",
#         "Dirtiness",
#         "Bumps",
#         "Other_Faults",
#     ]
#     non_target_cols = df.drop(target_cols, axis=1).columns.to_list()

#     binary_dfs = []

#     for col in target_cols:
#         temp_df = df.loc[:, non_target_cols + [col]]
#         temp_df = temp_df.loc[temp_df[col] == 1].copy() # keep positives only
#         temp_df[col] = col # target value is class name now
#         temp_df = temp_df.rename(columns={col: "target"}) # make target col name uniform for final concat
#         binary_dfs.append(temp_df)

#     # collect non_defect rows
#     temp_df = df.loc[
#         (df["Pastry"] == 0) &
#         (df["Z_Scratch"] == 0) &
#         (df["K_Scatch"] == 0) &
#         (df["Stains"] == 0) &
#         (df["Dirtiness"] == 0) &
#         (df["Bumps"] == 0) &
#         (df["Other_Faults"] == 0)
#     ]
#     temp_df = temp_df.loc[: , non_target_cols]
#     temp_df["target"] = "No defect"
#     binary_dfs.append(temp_df)

#     reformatted_df = pd.concat(binary_dfs)
#     return reformatted_df

In [12]:
# Collapse the one-hot fault flags into a single 'target' column.
# NOTE(review): combined_df is rebound to its own transform, so this cell is
# not re-runnable — on a second execution reformat_data can no longer find
# the fault columns it drops and raises KeyError.
combined_df = reformat_data(combined_df)
combined_df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
7,1673,1687,294065,294091,571,38,57,53142,77,110,1692,0,1,300,0.1491,0.4326,0.9643,0.0142,0.5686,0.7179,1.0,2.7528,1.3802,1.7559,0.0357,-0.2661,0.9408,Pastry
24,1538,1549,849219,849235,275,19,32,28986,71,117,1626,1,0,70,0.1494,0.399,0.375,0.0088,0.6316,1.0,1.0,2.4393,1.0792,1.5051,0.625,-0.2988,0.633,Pastry
35,1310,1316,435871,435916,153,16,32,17101,104,132,1352,0,1,40,0.0532,0.3854,0.3333,0.0044,0.375,0.9688,1.0,2.1847,0.7782,1.5051,0.6667,-0.0916,0.4025,Pastry
60,765,774,6571361,6571375,59,9,12,6682,77,133,1360,0,1,100,0.3613,0.3571,0.28,0.0052,0.7778,1.0,1.0,1.7708,0.8451,1.0792,0.72,-0.1522,0.1892,Pastry
67,1677,1686,1319063,1319076,91,10,15,5608,57,95,1692,1,0,70,0.0024,0.3583,0.6667,0.0047,0.8,1.0,1.0,1.959,0.9031,1.1761,0.3333,-0.3868,0.266,Pastry


In [13]:
TARGET = 'target'

In [14]:
# Learn the class-name -> integer mapping, then overwrite the target column
# with the integer codes.
label_encoder = LabelEncoder().fit(combined_df[TARGET])
combined_df[TARGET] = label_encoder.transform(combined_df[TARGET])

combined_df.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,target
7,1673,1687,294065,294091,571,38,57,53142,77,110,1692,0,1,300,0.1491,0.4326,0.9643,0.0142,0.5686,0.7179,1.0,2.7528,1.3802,1.7559,0.0357,-0.2661,0.9408,4
24,1538,1549,849219,849235,275,19,32,28986,71,117,1626,1,0,70,0.1494,0.399,0.375,0.0088,0.6316,1.0,1.0,2.4393,1.0792,1.5051,0.625,-0.2988,0.633,4
35,1310,1316,435871,435916,153,16,32,17101,104,132,1352,0,1,40,0.0532,0.3854,0.3333,0.0044,0.375,0.9688,1.0,2.1847,0.7782,1.5051,0.6667,-0.0916,0.4025,4
60,765,774,6571361,6571375,59,9,12,6682,77,133,1360,0,1,100,0.3613,0.3571,0.28,0.0052,0.7778,1.0,1.0,1.7708,0.8451,1.0792,0.72,-0.1522,0.1892,4
67,1677,1686,1319063,1319076,91,10,15,5608,57,95,1692,1,0,70,0.0024,0.3583,0.6667,0.0047,0.8,1.0,1.0,1.959,0.9031,1.1761,0.3333,-0.3868,0.266,4


In [15]:
combined_df.shape, test.shape

((20362, 28), (12814, 28))

In [16]:
# Label vector and feature matrix shared by every experiment below
y = combined_df[TARGET]
X = combined_df.drop(columns=[TARGET])

# Shared stratified splitter. Despite the "sk10" name it runs n_splits (= 3) folds.
n_splits = 3
sk10 = StratifiedKFold(shuffle=True, random_state=5, n_splits=n_splits)

In [24]:
# Four LightGBM configurations. The settings common to all of them live in
# one dict that is spread first, so every resulting dict keeps the same key
# order as before.
# NOTE(review): the booster is given under the key 'boosting' rather than the
# constructor argument 'boosting_type'; the get_params() dump in the results
# table still reports boosting_type='gbdt' for every model — confirm which of
# the two LightGBM actually honours.
lgbm_shared_params = {
    'objective': 'multiclass',
    'num_class': 7,
    'n_jobs': -1,
    'random_state': 5,
    'class_weight': None,
}

lgbm_params_1 = {
    **lgbm_shared_params,
    'boosting': 'gbdt',
    'colsample_bytree': 0.3732513300418112,
    'learning_rate': 0.11407449915410499,
    'max_depth': 41,
    'min_child_samples': 46,
    'min_child_weight': 5.402556610174332,
    'min_split_gain': 0.7031617243606461,
    'n_estimators': 168,
    'num_leaves': 4,
    'reg_alpha': 0.6688125857557925,
    'reg_lambda': 0.3006431833867248,
    'subsample': 0.7700917000679017,
}

lgbm_params_2 = {
    **lgbm_shared_params,
    'boosting': 'dart',
    'colsample_bytree': 0.5772149549047157,
    'learning_rate': 0.33002721528421214,
    'max_depth': 28,
    'min_child_samples': 23,
    'min_child_weight': 7.99839845350044,
    'min_split_gain': 0.9239893181066746,
    'n_estimators': 699,
    'num_leaves': 21,
    'reg_alpha': 0.4760265812563772,
    'reg_lambda': 0.15634739914542023,
    'subsample': 0.4590800039755062,
}

lgbm_params_3 = {
    **lgbm_shared_params,
    'boosting': 'dart',
    'colsample_bytree': 0.31193732162008925,
    'learning_rate': 0.03107980185406725,
    'max_depth': 58,
    'min_child_samples': 26,
    'min_child_weight': 0.9044244579504741,
    'min_split_gain': 0.6988647291560484,
    'n_estimators': 814,
    'num_leaves': 10,
    'reg_alpha': 0.8952949360981323,
    'reg_lambda': 0.12911118443546726,
    'subsample': 0.9423432553265046,
}

lgbm_params_4 = {
    **lgbm_shared_params,
    'boosting': 'dart',
    'colsample_bytree': 0.35885033102300595,
    'learning_rate': 0.21902422874173558,
    'max_depth': 3,
    'min_child_samples': 34,
    'min_child_weight': 4.768878441321062,
    'min_split_gain': 0.8804385788228134,
    'n_estimators': 334,
    'num_leaves': 19,
    'reg_alpha': 0.9922927799932719,
    'reg_lambda': 0.010366800790489683,
    'subsample': 0.5713199333429264,
}

In [25]:
# Estimators compared in every experiment below: one default LightGBM plus the
# four parameter sets above. The other model families are currently disabled.
# Every active entry is an LGBMClassifier, so any dictionary keyed by
# `model.__class__.__name__` collapses all five into a single
# 'LGBMClassifier' entry.
models = [
    LGBMClassifier(n_jobs=-1, random_state=5),
    LGBMClassifier(**lgbm_params_1),
    LGBMClassifier(**lgbm_params_2),
    LGBMClassifier(**lgbm_params_3),
    LGBMClassifier(**lgbm_params_4),
    # XGBClassifier(random_state=5),
    # RandomForestClassifier(random_state=5),
    # ExtraTreesClassifier(random_state=5),
    # HistGradientBoostingClassifier(random_state=5),
    # CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100),
    ]

In [26]:
def evaluate_models_roc(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank by test ROC AUC.

    Parameters:
        models: iterable of estimators; each must expose ``get_params()``.
        X: feature table, indexed as ``X[features]`` with a list of column names.
        y: labels passed straight to ``cross_validate``.
        important_features: dict mapping an estimator's class name to the list
            of columns to use for it. A model with a missing or empty entry is
            not fitted and gets a placeholder row with zero scores.
        cv_split: value for the ``cv`` argument of ``cross_validate``.
        experiment_name: prefix of the output file.

    Returns:
        DataFrame with one row per model, sorted by 'MLA Test ROC AUC'
        (descending). The same table is written to
        ``f'{experiment_name}_results.csv'``.
    """
    # Single source of truth for the result schema: the placeholder row, the
    # real rows and the final frame all use exactly these names, so the sort
    # column always exists (even for an empty or fully skipped model list).
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Test ROC AUC Std',
                      'MLA Time']

    def evaluate_model(alg):
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Nothing to train on: emit a zero-score placeholder instead of fitting.
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': 0,
                'MLA Test ROC AUC': 0,
                'MLA Test ROC AUC Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='roc_auc_ovr',
                                    return_train_score=True,
                                    n_jobs=-1)

        # Report the mean per-fold fit time as "<m> min <s> sec".
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': cv_results['train_score'].mean(),
            'MLA Test ROC AUC': cv_results['test_score'].mean(),
            'MLA Test ROC AUC Std': cv_results['test_score'].std(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    # One thread per model (up to 10); map() yields results in the order of
    # `models`, so each row's index is the model's position in that list.
    with ThreadPoolExecutor(max_workers=10) as executor:
        results_list = list(executor.map(evaluate_model, models))

    MLA_compare = pd.DataFrame(results_list, columns=result_columns)
    MLA_compare = MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=False)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [27]:
# Map each model's class name to the feature list it is evaluated on.
# NOTE(review): X_lgbm is only defined in the "Post Model Features" cell much
# further down (it was executed earlier, as In [17]). On a fresh
# Restart & Run All this cell raises NameError — move that cell above this one
# or build the list from X here.
baseline_features = {}

for model in models:
    model_name = model.__class__.__name__

    baseline_features[model_name] = list(X_lgbm.columns)

In [28]:
%%time

# Cross-validate every model on the baseline feature lists; the table is also
# saved as '<experiment_name>_lgbm_results.csv'.
baseline_models = evaluate_models_roc(models, X, y, baseline_features, sk10, f'{experiment_name}_lgbm')
baseline_models

Done with LGBMClassifier.
Done with LGBMClassifier.
Done with LGBMClassifier.
Done with LGBMClassifier.
Done with LGBMClassifier.
CPU times: total: 46.9 ms
Wall time: 3min 1s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
3,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.933004,0.900265,0.002224,2 min 35.32 sec
4,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.934512,0.899455,0.002012,0 min 34.30 sec
2,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.958327,0.898667,0.001489,1 min 31.75 sec
1,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.921078,0.897596,0.001941,0 min 2.68 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",0.988834,0.893696,0.002014,0 min 4.93 sec


In [None]:
%%time

# Same evaluation saved under the bare experiment name
# ('<experiment_name>_results.csv'); rebinds baseline_models from the cell above.
baseline_models = evaluate_models_roc(models, X, y, baseline_features, sk10, f'{experiment_name}')
baseline_models

- Remove Correlated Features

In [None]:
# Drop one feature out of every highly correlated pair.
# X holds features only (the target was split off into y), so the correlation
# matrix, the drop list and both reported shapes cover feature columns only.
df_no_corr = X.copy()
correlation_matrix_spear = df_no_corr.corr(method='spearman').abs()

# Keep only the strict upper triangle so each pair is looked at once
# (everything on or below the diagonal becomes NaN).
upper_spear = correlation_matrix_spear.where(
    np.triu(np.ones(correlation_matrix_spear.shape, dtype=bool), k=1)
)

# A column is dropped when it correlates at >= 0.9 with any column before it
to_drop_spear = upper_spear.columns[(upper_spear >= 0.9).any(axis=0)].tolist()

df_reduced_spear = df_no_corr.drop(columns=to_drop_spear)

# Surviving (low-correlation) feature names, also persisted to disk
low_corr_feats_spear = df_reduced_spear.columns.tolist()

with open('low_corr_spear.txt', 'w') as f:
    f.write(f'{low_corr_feats_spear}\n')

# Summarise the effect of the drop
print(f"Dropped {len(to_drop_spear)} highly correlated features.\nOld Shape of the dataset was {df_no_corr.shape}\nNew shape of the dataset is {df_reduced_spear.shape}")

In [None]:
%%time

# Every model is evaluated on the same decorrelated column list; the dict is
# keyed by class name, which is the lookup key evaluate_models_roc uses.
no_corr_features = {}

for model in models:
    model_name = model.__class__.__name__

    no_corr_features[model_name] = list(df_reduced_spear.columns)

In [None]:
%%time

# Cross-validate on the decorrelated feature table; results are also saved as
# '<experiment_name>_corr_results.csv'.
no_corr_models = evaluate_models_roc(models, df_reduced_spear, y, no_corr_features, sk10, f'{experiment_name}_corr')
no_corr_models

- Feature Importances

In [None]:
# feat_importance_features = {}

# for model in models:
#     model_name = model.__class__.__name__

#     try:
#         # Initialize array to store feature importances
#         feature_importances = np.zeros(X.shape[1])

#         # Loop through each fold and calculate the feature importances
#         for train_index, test_index in sk10.split(X, y):
#             X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#             y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#             model.fit(X_train, y_train)

#             # Get the feature importances and add them to the total
#             feature_importances += model.feature_importances_

#         feature_importances /= n_splits

#         feature_importances_dict = dict(zip(X.columns, feature_importances))

#         df = pd.DataFrame.from_dict(feature_importances_dict, orient='index')

#         # Resetting index with a name for the column
#         df = df.reset_index().rename(columns={'index': 'Feature', 0: 'Avg_Feat_Importance'})
#         df.sort_values(by='Avg_Feat_Importance', ascending=False, inplace=True)

#         # Save to CSV
#         df.to_csv(f'{model_name}_feature_importances.csv')

#         fi_threshold = 0

#         fi_feats = df[df['Avg_Feat_Importance'] > fi_threshold]['Feature'].tolist()

#         feat_importance_features[model_name] = fi_feats
#         print(f'Done with {model_name}')

#     except AttributeError:
#         feat_importance_features[model_name] = list(X.columns)
#         print(f'{model_name} does not have feature_importances_')

In [None]:
# with open('featimp_features.txt', mode='w') as f:
#     pprint(feat_importance_features, stream=f)

- Permutation Importance

In [None]:
# Add a seeded pure-noise column (uniform on [-2, 2], rounded to 6 decimals).
# It serves as a control: the selection cell further down only keeps features
# whose permutation importance exceeds this column's.
np.random.seed(5)
df_reduced_spear['random_control_feature'] = np.random.uniform(-2, 2, len(df_reduced_spear)).round(6)
df_reduced_spear.shape

In [None]:
%%time

# Dedicated 5-fold splitter for the permutation-importance pass
perm_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

# One list of per-fold importance vectors per class name.
# Every entry of `models` that shares a class name appends to the same list,
# so the later average runs over folds AND over those models together.
perm_importances = {model.__class__.__name__: [] for model in models}

for i, (train_idx, test_idx) in enumerate(perm_cv.split(df_reduced_spear, y)):
    X_train, X_test = df_reduced_spear.iloc[train_idx], df_reduced_spear.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for model in models:
        model_name = model.__class__.__name__
        # Fit on the training fold, then measure importance on the held-out fold
        model.fit(X_train, y_train)
        # Calculate permutation importance
        result = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=5, n_jobs=-1, scoring='roc_auc_ovr')
        perm_importances[model_name].append(result.importances_mean)
        print(f'Done with {model_name}.')
    
    print(f'Done with Fold {i+1}', end='\n\n')

In [None]:
%%time

# Average the collected importance vectors per model name and export one CSV each.
# The target folder is created up front, and the path is built with
# os.path.join so it is portable and matches the 'permutation_importances'
# directory that the reader cell below joins its paths from.
perm_importance_dir = 'permutation_importances'
os.makedirs(perm_importance_dir, exist_ok=True)

for model_name, importances in perm_importances.items():
    avg_importance = np.mean(importances, axis=0)
    importance_df = pd.DataFrame({'Feature': df_reduced_spear.columns, 'Importance': avg_importance})
    importance_df.sort_values(by='Importance', ascending=False, inplace=True)
    # Export to CSV
    csv_path = os.path.join(perm_importance_dir, f'{model_name}_permutation_importance.csv')
    importance_df.to_csv(csv_path, index=False)

print('Done with Permuation Importances', end='\n\n')

In [None]:
# Folder the permutation-importance CSVs are read back from
directory = 'permutation_importances'

# Initialize a dictionary for the features
# (class name -> list of features that beat the random control feature)
perm_important_features = {}

for model in models:
    model_name = model.__class__.__name__
    csv_path = os.path.join(directory, f'{model_name}_permutation_importance.csv')
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)

        # Check for 'random_control_feature' and its importance
        if 'random_control_feature' in df['Feature'].values:
            random_feature_importance = df.loc[df['Feature'] == 'random_control_feature', 'Importance'].iloc[0]
        else:
            random_feature_importance = 0

        # Determine the threshold: the control feature's importance, floored at 0
        threshold = max(0, random_feature_importance)

        # Keep features whose importance is strictly greater than the threshold
        important_feats_filtered = df[df['Importance'] > threshold]['Feature'].tolist()

        # Reorder important_feats based on the predefined features_list
        # (this also discards any name that is not in features_list)
        important_feats_ordered = [feat for feat in features_list if feat in important_feats_filtered]

        # Add to importance dictionary
        perm_important_features[model_name] = important_feats_ordered

    else:
        # No entry is added for this model, so later
        # perm_important_features[model_name] lookups raise KeyError.
        print(f'CSV file for {model_name} not found.')

print('Done getting important features dictionary')

In [None]:
# Persist the selected features as a pretty-printed dict for later reference
with open('perm_important_features_lgbm.txt', mode='w') as f:
    pprint(perm_important_features, stream=f)

In [None]:
%%time

# Set seeds for reproducibility
# NOTE(review): the models and the CV splitter are built with their own
# random_state=5, so these global seeds may have no effect here — confirm.
np.random.seed(42)
random.seed(42)

# Cross-validate on the permutation-selected features; results are also saved
# as '<experiment_name>_permimp_results.csv'.
perm_importance_models = evaluate_models_roc(models, X, y, perm_important_features, sk10, f'{experiment_name}_permimp')
perm_importance_models

- SelectKBest with f_classif

In [None]:
# For each model, sweep k = 1..len(features) with SelectKBest(f_classif) and
# keep the k whose selected columns give the best mean CV ROC AUC.
# best_features_list collects one summary row per model; kbest_features maps
# class name -> winning feature list (models sharing a class name overwrite
# each other, so the last one wins).
best_features_list = []
kbest_features = {}

for model in models:
    model_name = model.__class__.__name__

    # Select whichever one had a better CV score generally
    # Also, consider computational expense and accuracy balance
    
    features = perm_important_features[model_name]
    # features = list(df_reduced_spear.columns)

    # in case there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue
	
    X_kbest = X[features]
    best_score = 0
    best_k = 0
    best_features = []

    # Iterate over k from 1 to number of features
    for k in range(1, len(features) + 1):
        print(f'currently running {k} features on {model_name}')
        # Apply SelectKBest
        # NOTE(review): the selector is fitted on all rows before the
        # cross-validation below, so feature selection sees the validation
        # folds; putting selector + model in one pipeline would avoid that.
        selector = SelectKBest(f_classif, k=k)
        X_new = selector.fit_transform(X_kbest, y)

        # Get the selected feature names
        selected_features = X_kbest.columns[selector.get_support()]

        # Evaluate the model
        # model = LGBMClassifier(n_jobs=-1, random_state=5)
        roc_auc_scores = cross_validate(model, X_new, y, cv=sk10, scoring='roc_auc_ovr', n_jobs=-1)
        mean_roc_auc_scores = roc_auc_scores['test_score'].mean()

        # Strict '>' keeps the smallest k when several k tie on the score
        if mean_roc_auc_scores > best_score:
            best_k = k
            best_score = mean_roc_auc_scores
            best_features = list(selected_features)

    best_features_list.append({'k': best_k,
                    'Selected Features': best_features,
                    'ROC AUC Score': best_score,
                    'Model Name': model_name})
    
    kbest_features[model_name] = best_features

best_features_df = pd.DataFrame(best_features_list)

# Best-scoring model first
best_features_df.sort_values(by='ROC AUC Score', ascending=False, inplace=True)

In [None]:
# Persist the SelectKBest winners as a pretty-printed dict
with open('kbest_features.txt', mode='w') as f:
    pprint(kbest_features, stream=f)

In [None]:
best_features_df

- RFECV

In [None]:
%%time

# Recursive feature elimination with cross-validation, starting from each
# model's permutation-selected features.
# rfecv_features maps class name -> selected features in features_list order.
rfecv_features = {}

for alg in models:
    # set name
    MLA_name = alg.__class__.__name__

    # A model without an entry (e.g. its importance CSV was missing) is
    # treated like one with no important features instead of raising KeyError.
    features = perm_important_features.get(MLA_name, [])

    # in case there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue

    X_rfecv = X[features]

    try:
        print(f'Starting with {MLA_name}')
        # Create the RFECV object and rank each feature
        selector = RFECV(alg, cv=sk10, step=1, scoring='roc_auc_ovr', verbose=2)
        selector = selector.fit(X_rfecv, y)

        selected_features = list(X_rfecv.columns[selector.support_])

        # Reorder selected_features based on the predefined features_list
        selected_features_ordered = [feat for feat in features_list if feat in selected_features]

        rfecv_features[MLA_name] = selected_features_ordered

        print(f'Done with {MLA_name}', end='\n\n')

    except ValueError as err:
        # Fall back to the unreduced input features (in features_list order).
        # The actual error is printed: a ValueError here is not necessarily
        # caused by a missing coef_ / feature_importances_ attribute.
        features_filtered = [feat for feat in features_list if feat in features]
        rfecv_features[MLA_name] = features_filtered
        print(f'RFECV failed for {MLA_name}; keeping its input features. Reason: {err}', end='\n\n')

In [None]:
# Persist the RFECV selections as a pretty-printed dict
with open('rfecv_features.txt', mode='w') as f:
    pprint(rfecv_features, stream=f)

In [None]:
%%time

# Set seeds for reproducibility
# NOTE(review): the models and the CV splitter are built with their own
# random_state=5, so these global seeds may have no effect here — confirm.
np.random.seed(42)
random.seed(42)

# Cross-validate on the RFECV-selected features; results are also saved as
# '<experiment_name>_rfecv_results.csv'.
rfecv_models = evaluate_models_roc(models, X, y, rfecv_features, sk10, f'{experiment_name}_rfecv')
rfecv_models

- SFS

In [None]:
%%time

# Backward sequential feature selection, starting from each model's
# RFECV-selected features.
# sfs_features maps class name -> selected features in features_list order.
sfs_features = {}

for alg in models:
    # set name
    MLA_name = alg.__class__.__name__

    # Only the dictionary lookup is guarded: a KeyError raised later (while
    # slicing X or fitting) is a different problem and must not be reported
    # as a missing dictionary entry.
    try:
        # features = kbest_features[MLA_name]
        # features = feat_importance_features[MLA_name]
        features = rfecv_features[MLA_name]
    except KeyError:
        print(f'{MLA_name} not in the dictionary.')
        continue

    # in case there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue

    X_sfs = X[features]

    print(f'Running backward feature selection with {MLA_name}')

    # forward=False -> start from all features and remove one at a time;
    # k_features='best' -> keep the subset size with the best CV score.
    sfs = SFS(alg,
        k_features='best',
        forward=False,
        floating=False,
        scoring='roc_auc_ovr',
        verbose=2,
        n_jobs=-1,
        cv=sk10)

    sfs = sfs.fit(X_sfs, y)

    # Get the selected features index
    selected_sfs_idx = list(sfs.k_feature_idx_)

    # Get the feature names
    selected_sfs_feats = X_sfs.columns[selected_sfs_idx]

    selected_features = list(selected_sfs_feats)

    # Reorder selected_features based on the predefined features_list
    selected_features_ordered = [feat for feat in features_list if feat in selected_features]

    sfs_features[MLA_name] = selected_features_ordered

    print(f'Done with {MLA_name}', end='\n\n')

In [None]:
# Persist the SFS selections as a pretty-printed dict
with open('sfs_features.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [None]:
%%time

# Set seeds for reproducibility
# NOTE(review): the models and the CV splitter are built with their own
# random_state=5, so these global seeds may have no effect here — confirm.
np.random.seed(42)
random.seed(42)

# Cross-validate on the SFS-selected features; results are also saved as
# '<experiment_name>_sfs_results.csv'.
sfs_models = evaluate_models_roc(models, X, y, sfs_features, sk10, f'{experiment_name}_sfs')
sfs_models

In [None]:
%%time

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

# NOTE(review): feat_importance_features is only assigned in the
# "Feature Importances" cell, which is entirely commented out, so this cell
# raises NameError on a fresh run. Re-enable that cell or remove this one.
feat_importance_models = evaluate_models_roc(models, X, y, feat_importance_features, sk10, f'{experiment_name}_featimp')
feat_importance_models

In [None]:
%%time

# Set seeds for reproducibility
# NOTE(review): the models and the CV splitter are built with their own
# random_state=5, so these global seeds may have no effect here — confirm.
np.random.seed(42)
random.seed(42)

# Cross-validate on the SelectKBest winners; results are also saved as
# '<experiment_name>_kbest_results.csv'.
kbest_models = evaluate_models_roc(models, X, y, kbest_features, sk10, f'{experiment_name}_kbest')
kbest_models

### Post Model Features

In [None]:
# One estimator per model family, all seeded with random_state=5; the numbering
# matches the modelN_feats lists defined in the next cell.
model1 = LGBMClassifier(n_jobs=-1, random_state=5)
model2 = XGBClassifier(random_state=5)
model3 = RandomForestClassifier(random_state=5)
model4 = ExtraTreesClassifier(random_state=5)
model5 = HistGradientBoostingClassifier(random_state=5)
model6 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)

- Features for Competition + Original dataset down to SFS for all models (Experiment Set 2)

In [17]:
# Hard-coded feature subsets, one per model (model1..model6); per the heading
# above, these are the features left after the selection chain down to SFS.
model1_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
model2_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']
model3_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
model4_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index']
model5_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
model6_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']

# Per-model views of X restricted to the lists above.
# NOTE(review): X_lgbm is read by the baseline_features cell that sits much
# higher up in the notebook, so this cell has to run before it.
X_lgbm = X[model1_feats]
X_xgb = X[model2_feats]
X_rf = X[model3_feats]
X_extrat = X[model4_feats]
X_hist = X[model5_feats]
X_cat = X[model6_feats]

# Hyperparameter Tuning

- LGBM

In [19]:
import optuna

def objective(trial):
    """Optuna objective: mean macro one-vs-rest ROC AUC of one LGBM configuration over the `sk10` folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters and to report per-fold progress.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores computed on each fold's held-out part.

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner decides, from the running mean reported after a
        fold, that the trial is not worth finishing.
    """
    # Class weighting ('balanced' and per-class custom weights) and the 'goss'
    # boosting type were part of an earlier search space; they were dropped
    # after observing the other tuning trials.
    param = {
        'objective': 'multiclass',
        'num_class': 7,
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart']), # Selected from observing the other tuning trials
        'class_weight': None, # Selected from observing the other tuning trials
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.8),
        'learning_rate': trial.suggest_float('learning_rate', 0.000001, 0.5),
        'max_depth': trial.suggest_int('max_depth', -1, 64),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.5, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'n_jobs': -1,
        'num_leaves': trial.suggest_int('num_leaves', 2, 150),
        'random_state': 5,
        'reg_alpha': trial.suggest_float('reg_alpha', 0.4, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.5),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        }

    roc_auc_scores = []

    for fold, (train_index, test_index) in enumerate(sk10.split(X_lgbm, y)):
        X_train, X_test = X_lgbm.iloc[train_index], X_lgbm.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = LGBMClassifier(**param)
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit keywords are
        # version-dependent in LightGBM -- confirm against the installed version.
        model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=100, verbose=False)
        preds = model.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test, preds, multi_class='ovr', average='macro')
        roc_auc_scores.append(roc_auc)

        # Without an intermediate report the pruner attached to the study never
        # sees anything to act on; the fold index serves as the pruning step.
        trial.report(np.mean(roc_auc_scores), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(roc_auc_scores)

# Median-based pruner handed to the study
pruner = optuna.pruners.MedianPruner(
    n_startup_trials=5,
    n_warmup_steps=3,
    interval_steps=1,
)

# Higher objective value is better, hence direction='maximize'
study = optuna.create_study(pruner=pruner, direction='maximize')
study.optimize(objective, n_trials=50)

all_trials = study.trials
best_params = study.best_trial.params
print('Number of finished trials:', len(all_trials))
print('Best trial:', best_params)

# 0.893695902

[I 2024-03-29 21:21:50,494] A new study created in memory with name: no-name-9b64a4b6-064b-4f78-818e-e20b760c702c
[I 2024-03-29 21:22:17,010] Trial 0 finished with value: 0.8976615929369792 and parameters: {'boosting': 'dart', 'colsample_bytree': 0.5378419005311236, 'learning_rate': 0.19389439422812849, 'max_depth': 5, 'min_child_samples': 46, 'min_child_weight': 9.712332134025896, 'min_split_gain': 0.5086730611030881, 'n_estimators': 252, 'num_leaves': 36, 'reg_alpha': 0.5735842293464366, 'reg_lambda': 0.11437356348065236, 'subsample': 0.6211258812680587}. Best is trial 0 with value: 0.8976615929369792.
[I 2024-03-29 21:23:06,391] Trial 1 finished with value: 0.8966261881554763 and parameters: {'boosting': 'dart', 'colsample_bytree': 0.7633258782246661, 'learning_rate': 0.45924154140117035, 'max_depth': 15, 'min_child_samples': 75, 'min_child_weight': 9.533023304028491, 'min_split_gain': 0.5840019466567259, 'n_estimators': 463, 'num_leaves': 50, 'reg_alpha': 0.5743634611942869, 'reg_l

Number of finished trials: 50
Best trial: {'boosting': 'dart', 'colsample_bytree': 0.35885033102300595, 'learning_rate': 0.21902422874173558, 'max_depth': 3, 'min_child_samples': 34, 'min_child_weight': 4.768878441321062, 'min_split_gain': 0.8804385788228134, 'n_estimators': 334, 'num_leaves': 19, 'reg_alpha': 0.9922927799932719, 'reg_lambda': 0.010366800790489683, 'subsample': 0.5713199333429264}


In [20]:
# Objective value over the trials of the LGBM study.
# Rendering the Plotly figure inline needs nbformat>=4.2.0 in the kernel environment.
optuna.visualization.plot_optimization_history(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [21]:
# Hyperparameter importances for the LGBM study.
# Rendering the Plotly figure inline needs nbformat>=4.2.0 in the kernel environment.
optuna.visualization.plot_param_importances(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
# Slice plot: objective value against each sampled hyperparameter.
# Rendering the Plotly figure inline needs nbformat>=4.2.0 in the kernel environment.
optuna.visualization.plot_slice(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [23]:
# Parallel-coordinate view of the sampled hyperparameters and the objective value.
# Rendering the Plotly figure inline needs nbformat>=4.2.0 in the kernel environment.
optuna.visualization.plot_parallel_coordinate(study)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

# Ensembling

In [None]:
%%time

# Per-fold held-out probabilities of every model, plus the matching held-out
# targets; the random weight search further down scores blends against them.
model1_results, model2_results, model3_results, model4_results, model5_results, model6_results, y_test_list = [], [], [], [], [], [], []

# (estimator, its column-restricted feature frame, list collecting its fold predictions)
ensemble_members = [
    (model1, X_lgbm, model1_results),
    (model2, X_xgb, model2_results),
    (model3, X_rf, model3_results),
    (model4, X_extrat, model4_results),
    (model5, X_hist, model5_results),
    (model6, X_cat, model6_results),
]

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for estimator, X_model, fold_results in ensemble_members:
        estimator.fit(X_model.iloc[train_index], y_train)
        fold_results.append(estimator.predict_proba(X_model.iloc[test_index]))

    # The weight search indexes y_test_list[j] for every fold, so the held-out
    # targets have to be stored here (this append used to be commented out,
    # leaving the list empty).
    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

In [None]:
%%time

# Random search over blend weights: every iteration draws one weight per model
# from [0, 1), blends the stored fold predictions and records the mean
# one-vs-rest ROC AUC across the folds.
model1_weights, model2_weights, model3_weights, model4_weights, model5_weights, model6_weights, scores = [], [], [], [], [], [], []

# Seeded generator: the best-scoring row of this search decides the submission
# weights, so the draws must be reproducible across kernel restarts.
rng = np.random.default_rng(5)

for i in tqdm(range(20000)):
    weight_1, weight_2, weight_3, weight_4, weight_5, weight_6 = rng.random(6)

    model1_weights.append(weight_1)
    model2_weights.append(weight_2)
    model3_weights.append(weight_3)
    model4_weights.append(weight_4)
    model5_weights.append(weight_5)
    model6_weights.append(weight_6)

    scores_in = []

    for j in range(n_splits):
        weighted_pred = weight_1 * model1_results[j] + weight_2 * model2_results[j] + weight_3 * model3_results[j] + weight_4 * model4_results[j] + weight_5 * model5_results[j] + weight_6 * model6_results[j]
        # Rescale every row to sum to 1 so the blend is a probability vector again
        weighted_pred_normalized = weighted_pred / np.sum(weighted_pred, axis=1, keepdims=True)
        scores_in.append(roc_auc_score(y_test_list[j], weighted_pred_normalized, multi_class='ovr'))

    scores.append(np.mean(scores_in))

In [None]:
# One row per sampled weight combination, best mean CV score first
results_df = pd.DataFrame({
    'model_1': model1_weights,
    'model_2': model2_weights,
    'model_3': model3_weights,
    'model_4': model4_weights,
    'model_5': model5_weights,
    'model_6': model6_weights,
    'score': scores,
})
results_df = results_df.sort_values(by='score', ascending=False)
results_df = results_df.reset_index(drop=True)
results_df.head(10)

# Get Submission (Random Weight Ensemble)

In [None]:
%%time

# Refit every model on all rows of its own feature subset. Each *_final name is
# bound to whatever `fit` returns (for scikit-learn-style estimators that is
# the estimator itself).
(
    model1_final,
    model2_final,
    model3_final,
    model4_final,
    model5_final,
    model6_final,
) = (
    estimator.fit(features, y)
    for estimator, features in (
        (model1, X_lgbm),
        (model2, X_xgb),
        (model3, X_rf),
        (model4, X_extrat),
        (model5, X_hist),
        (model6, X_cat),
    )
)

In [None]:
# (weight column in results_df, fitted model, feature subset it was trained on)
blend_members = [
    ('model_1', model1_final, model1_feats),
    ('model_2', model2_final, model2_feats),
    ('model_3', model3_final, model3_feats),
    ('model_4', model4_final, model4_feats),
    ('model_5', model5_final, model5_feats),
    ('model_6', model6_final, model6_feats),
]

# Row 0 of results_df holds the best-scoring weights; blend the test
# probabilities with them, adding the members in model order.
ensemble_pred = sum(
    results_df[weight_col][0] * fitted.predict_proba(test[feats])
    for weight_col, fitted, feats in blend_members
)

ensemble_df = pd.DataFrame(ensemble_pred)

# Normalise each row by its sum; a row where every model predicted 0 would
# become NaN, so fill those with 0 instead
row_totals = ensemble_df.sum(axis=1)
ensemble_df = ensemble_df.div(row_totals, axis=0).fillna(0)
ensemble_df.columns = label_encoder.classes_

In [None]:
# Preview the row-normalised ensemble probabilities (one column per class label)
ensemble_df.head()

In [None]:
# Take the ids from the sample submission and place the ensemble probabilities next to them
submission = pd.read_csv('sample_submission.csv')
id_column = submission['id']
submission_df = pd.concat(objs=[id_column, ensemble_df], axis='columns')
submission_df.head()

In [None]:
# Write the weighted-ensemble submission without the DataFrame index.
# NOTE(review): the fold count and score are hard-coded in the file name -- update them when the run changes.
submission_df.to_csv('submission_ensemble_3fold_0.900483.csv', index=False)

# Get submission (Stacking)

In [None]:
%%time

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import numpy as np


# First-level learners, named 'model1' ... 'model6' in model order
base_models = [
    (f'model{position}', fitted_model)
    for position, fitted_model in enumerate(
        (model1_final, model2_final, model3_final, model4_final, model5_final, model6_final),
        start=1,
    )
]

# Second-level learner. Alternatives that were swapped in here:
#   LGBMClassifier(n_jobs=-1, random_state=5)
#   XGBClassifier(random_state=5)
#   RandomForestClassifier(random_state=5)
#   ExtraTreesClassifier(random_state=5)
#   HistGradientBoostingClassifier(random_state=5)
#   CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)
final_estimator = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)

stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=final_estimator,
    cv=3,
    passthrough=False,
)

# Class labels used to one-hot encode the held-out targets before scoring
class_labels = np.unique(y)
roc_auc_scores = []

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict_proba(X_test)

    y_test_binarized = label_binarize(y_test, classes=class_labels)
    roc_auc = roc_auc_score(y_test_binarized, y_pred, multi_class='ovr')
    roc_auc_scores.append(roc_auc)

    print(f'Done with fold {i+1}.')

print(f'The average stacking score is {np.mean(roc_auc_scores)}')

- Logistic Reg - 0.886778
- LGBM - 0.885863
- XGB - 0.881636
- RF - 0.883835
- ET - 0.884523
- Hist - 0.886572
- Cat - 0.886183

In [None]:
# Predictions on unseen test data
# NOTE(review): at this point stacking_clf was last fitted inside the CV loop,
# i.e. on the training part of the final fold only -- confirm whether a refit
# on all of X, y is intended before predicting.
y_test_pred = stacking_clf.predict_proba(test)

stacking_df = pd.DataFrame(y_test_pred)

# Preview the stacking predictions built above (this previously displayed
# ensemble_df, the weighted-ensemble frame, by mistake)
stacking_df.head()

In [None]:
%%time

# Start from empty per-model result lists (seven independent lists)
(
    model1_results, model2_results, model3_results,
    model4_results, model5_results, model6_results,
    y_test_list,
) = ([] for _ in range(7))

target_length, test_length = len(y), len(test)
no_classes = len(np.unique(y))


def _zero_probabilities(n_rows):
    """Return an all-zero (n_rows, no_classes) array to be filled with class probabilities."""
    return np.zeros((n_rows, no_classes))


# For every model: one out-of-fold buffer (a row per training sample) and one
# test buffer (a row per test sample), each with one column per class
lgbm_oof_preds, lgbm_test_preds = _zero_probabilities(target_length), _zero_probabilities(test_length)
xgb_oof_preds, xgb_test_preds = _zero_probabilities(target_length), _zero_probabilities(test_length)
rf_oof_preds, rf_test_preds = _zero_probabilities(target_length), _zero_probabilities(test_length)
extrat_oof_preds, extrat_test_preds = _zero_probabilities(target_length), _zero_probabilities(test_length)
hist_oof_preds, hist_test_preds = _zero_probabilities(target_length), _zero_probabilities(test_length)
cat_oof_preds, cat_test_preds = _zero_probabilities(target_length), _zero_probabilities(test_length)

# Column-restricted view of X for each model, in model order
X_lgbm, X_xgb, X_rf, X_extrat, X_hist, X_cat = (
    X[feature_names]
    for feature_names in (
        model1_feats, model2_feats, model3_feats,
        model4_feats, model5_feats, model6_feats,
    )
)


In [None]:

# First-level learners for stacking:
# (estimator, its feature frame, its feature names, OOF buffer, test buffer)
first_level = [
    (model1, X_lgbm, model1_feats, lgbm_oof_preds, lgbm_test_preds),        # LGBM
    (model2, X_xgb, model2_feats, xgb_oof_preds, xgb_test_preds),           # XGBoost
    (model3, X_rf, model3_feats, rf_oof_preds, rf_test_preds),              # Random forest
    (model4, X_extrat, model4_feats, extrat_oof_preds, extrat_test_preds),  # Extra trees
    (model5, X_hist, model5_feats, hist_oof_preds, hist_test_preds),        # Hist gradient boosting
    (model6, X_cat, model6_feats, cat_oof_preds, cat_test_preds),           # CatBoost
]

# The test buffers are accumulated across folds below; zero them first so that
# re-running this cell does not add a second round of predictions on top.
for _, _, _, _, test_buffer in first_level:
    test_buffer[:] = 0

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    for estimator, X_model, feats, oof_buffer, test_buffer in first_level:
        estimator.fit(X_model.iloc[train_index], y_train)

        # Out-of-fold probabilities for the rows held out in this fold
        oof_buffer[test_index] = estimator.predict_proba(X_model.iloc[test_index])

        # Each fold contributes 1/n_splits of its test prediction, so the buffer
        # ends up holding the fold average. `+=` updates the shared array in
        # place, which is what makes the *_test_preds arrays change.
        test_buffer += estimator.predict_proba(test.loc[:, feats]) / sk10.n_splits

    print(f'Done with fold {i+1}.')

In [None]:
def _oof_macro_auc(oof_probabilities):
    """Macro-averaged one-vs-rest ROC AUC of out-of-fold probabilities against y."""
    return roc_auc_score(y, oof_probabilities, multi_class='ovr', average='macro')


lgbm_roc_auc = _oof_macro_auc(lgbm_oof_preds)
xgb_roc_auc = _oof_macro_auc(xgb_oof_preds)
rf_roc_auc = _oof_macro_auc(rf_oof_preds)
extrat_roc_auc = _oof_macro_auc(extrat_oof_preds)
hist_roc_auc = _oof_macro_auc(hist_oof_preds)
cat_roc_auc = _oof_macro_auc(cat_oof_preds)

print("Average LGBM ROC AUC Score:", lgbm_roc_auc)
print("Average XGBoost ROC AUC Score:", xgb_roc_auc)
print("Average Random Forest ROC AUC Score:", rf_roc_auc)
print("Average Extra Trees ROC AUC Score:", extrat_roc_auc)
print("Average Hist Gradient ROC AUC Score:", hist_roc_auc)
print("Average CatBoost ROC AUC Score:", cat_roc_auc)

# Numbers noted from earlier runs:
# 0.89369590207664
# 0.00201442835387733
# 0.886778 - StackingClassifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Second-level inputs: the first-level models' class probabilities placed side
# by side (out-of-fold ones for training, fold-averaged ones for the test set)
oof_blocks = (lgbm_oof_preds, xgb_oof_preds, rf_oof_preds, extrat_oof_preds, hist_oof_preds, cat_oof_preds)
test_blocks = (lgbm_test_preds, xgb_test_preds, rf_test_preds, extrat_test_preds, hist_test_preds, cat_test_preds)
x_train = np.concatenate(oof_blocks, axis=1)
test_stack = np.concatenate(test_blocks, axis=1)

# Second-level learner: a multinomial logistic regression (it could be tuned as well)
stacking_estimator = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)

# Despite its name, `xgb` holds the result of fitting the logistic regression above
xgb = stacking_estimator.fit(x_train, y)
final_predictions = xgb.predict_proba(test_stack)

In [None]:
# Cross-validate a second-level learner on the stacked first-level probabilities
n_stack_rows = x_train.shape[0]
oof_preds = np.zeros((n_stack_rows, no_classes))
test_preds = np.zeros(test_stack.shape[0])

for i, (train_index, test_index) in enumerate(sk10.split(x_train, y)):
    X_train, y_train = x_train[train_index], y.iloc[train_index]
    X_test, y_test = x_train[test_index], y.iloc[test_index]

    # model2 acts as the second-level learner in this run
    model2.fit(X_train, y_train)
    y_pred = model2.predict_proba(X_test)

    # Store this fold's held-out probabilities at their row positions
    oof_preds[test_index] = y_pred

    print(f'Done with fold {i+1}.')

# Macro one-vs-rest ROC AUC over all out-of-fold predictions
roc_auc = roc_auc_score(y, oof_preds, multi_class='ovr', average='macro')
print(f'The stacking score is {roc_auc}')

- Logistic Reg - 0.8883102077923056
- LGBM - 0.8880225088607244
- XGB - 0.8846028966376445
- RF - 
- ET - 
- Hist - 
- Cat - 

In [None]:
# Stacked test probabilities, with the label encoder's classes as column names
final_predictions_df = pd.DataFrame(final_predictions, columns=label_encoder.classes_)

In [None]:
# Take the ids from the sample submission and place the stacking probabilities next to them
submission = pd.read_csv('sample_submission.csv')
id_column = submission['id']
submission_df = pd.concat(objs=[id_column, final_predictions_df], axis='columns')
submission_df.head()

In [None]:
# Write the stacking submission without the DataFrame index.
# NOTE(review): the fold count and score are hard-coded in the file name -- update them when the run changes.
submission_df.to_csv('submission_stacking_3fold_0.88831.csv', index=False)