In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm

from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import KFold, cross_validate
from sklearn.feature_selection import RFECV, SelectKBest, f_regression
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.inspection import permutation_importance

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from pprint import pprint
import os

# Render every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# Prefix handed to evaluate_models(), which uses it to name the result CSV files.
experiment_name = 'all_models'

In [2]:
# Read the train, original and test tables from CSV files in the working directory.
train, original, test = (
    pd.read_csv(csv_name)
    for csv_name in ('df_train.csv', 'original_df.csv', 'df_test.csv')
)

In [3]:
train.shape, original.shape, test.shape

((90615, 11), (4177, 11), (60411, 10))

In [4]:
test.head()

Unnamed: 0,Sex_F,Sex_I,Sex_M,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight
0,0.0,0.0,1.0,0.645,0.475,0.155,1.238,0.6185,0.3125,0.3005
1,0.0,0.0,1.0,0.58,0.46,0.16,0.983,0.4785,0.2195,0.275
2,0.0,0.0,1.0,0.56,0.42,0.14,0.8395,0.3525,0.1845,0.2405
3,0.0,0.0,1.0,0.57,0.49,0.145,0.874,0.3525,0.1865,0.235
4,0.0,1.0,0.0,0.415,0.325,0.11,0.358,0.1575,0.067,0.105


In [5]:
# Canonical column order; later cells filter this list to re-order the feature
# names picked by each selection method.
# NOTE(review): 'Rings' is the target (see TARGET) and is dropped from X, so its
# entry here never matches. 'random_control_feature' is absent, so the
# re-ordering filters always discard it.
features_list = ['Sex_F', 'Sex_I', 'Sex_M', 'Length', 'Diameter', 'Height',
       'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight',
       'Rings']

In [None]:
# # Concat the train and original dataset
# combined_df = pd.concat([train, original], axis=0).reset_index(drop=True)

In [6]:
TARGET = 'Rings'

In [None]:
# combined_df.shape, test.shape

In [7]:
# Separate the target vector from the feature matrix.
y = train[TARGET]
X = train.drop(columns=[TARGET])

# Shared, seeded, shuffled K-fold splitter used by every experiment below.
# It runs `n_splits` folds (3), despite the `sk10` name.
n_splits = 3
sk10 = KFold(n_splits, shuffle=True, random_state=5)

In [8]:
# lgbm_params_1 = 
# lgbm_params_2 = 
# lgbm_params_3 = 
# lgbm_params_4 = 
# hist_params_1 = 
# hist_params_2 = 
# hist_params_3 = 
# hist_params_4 = 
# hist_params_5 = 
# extrat_params_1 = 
# extrat_params_2 = 
# extrat_params_3 = 
# extrat_params_4 = 
# rf_params_1 = 
# rf_params_2 = 
# rf_params_3 = 
# cat_params_1 =
# cat_params_2 = 
# cat_params_3 = 
# cat_params_4 = 

In [9]:
# Candidate regressors compared throughout the notebook, one active instance per
# class, each seeded with random_state=5. The commented entries are tuned
# variants whose parameter dicts are still placeholders.
# NOTE(review): results and feature dictionaries elsewhere are keyed by
# `model.__class__.__name__`, so enabling a second instance of the same class
# would make both share one key.
# NOTE(review): CatBoost's early_stopping_rounds presumably needs an eval_set to
# have any effect; none is passed by the cross-validation calls - confirm.
models = [
    LGBMRegressor(n_jobs=-1, random_state=5),
    # LGBMRegressor(**lgbm_params_1),
    # LGBMRegressor(**lgbm_params_2),
    # LGBMRegressor(**lgbm_params_3),
    # LGBMRegressor(**lgbm_params_4),
    XGBRegressor(random_state=5),
    RandomForestRegressor(random_state=5),
    # RandomForestRegressor(**rf_params_1),
    # RandomForestRegressor(**rf_params_2),
    ExtraTreesRegressor(random_state=5),
    # ExtraTreesRegressor(**extrat_params_1),
    # ExtraTreesRegressor(**extrat_params_2),
    # ExtraTreesRegressor(**extrat_params_3),
    # ExtraTreesRegressor(**extrat_params_4),
    HistGradientBoostingRegressor(random_state=5),
    # HistGradientBoostingRegressor(**hist_params_1),
    # HistGradientBoostingRegressor(**hist_params_2),
    # HistGradientBoostingRegressor(**hist_params_3),
    # HistGradientBoostingRegressor(**hist_params_4),
    # HistGradientBoostingRegressor(**hist_params_5),
    CatBoostRegressor(random_state=5, verbose=False, early_stopping_rounds=100),
    # CatBoostRegressor(**cat_params_1),
    # CatBoostRegressor(**cat_params_2),
    # CatBoostRegressor(**cat_params_3),
    # CatBoostRegressor(**cat_params_4),
    ]

In [10]:
def rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error between targets and predictions.

    Both inputs must have the same length. The value returned is
    sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2)).
    """
    assert len(y_true) == len(y_pred)

    # # Add post processing step if required
    # y_pred_processed = np.floor(y_pred)

    log_gap = np.log1p(y_pred) - np.log1p(y_true)
    return np.sqrt(np.mean(np.square(log_gap)))

# greater_is_better=False makes the scorer return the NEGATED RMSLE, so scores
# coming back from scikit-learn utilities are <= 0 and "higher is better".
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [15]:
def evaluate_models(models, X, y, important_features, cv_split, experiment_name):
    """
    Cross-validate each model on its own feature subset and tabulate the results.

    Parameters
    ----------
    models : iterable of estimators
        Each estimator must expose ``get_params()``; its class name is the key
        used to look up its features.
    X : DataFrame
        Feature table; indexed as ``X[features]`` for every model.
    y : array-like
        Target passed to ``cross_validate``.
    important_features : dict
        Maps estimator class name -> list of column names. A model whose entry
        is missing or empty is skipped (reported with NaN scores).
    cv_split : cross-validation splitter
        Forwarded as ``cv`` to ``cross_validate``.
    experiment_name : str
        Prefix of the output file ``{experiment_name}_results.csv``.

    Returns
    -------
    DataFrame
        One row per model, sorted ascending by 'MLA Test ROC AUC' (best first,
        skipped models last). The table is also written to CSV.

    Notes
    -----
    The column names keep their historical "ROC AUC" wording for compatibility
    with existing result files, but the values are RMSLE (the sign-flipped
    output of ``rmsle_scorer``), so lower is better.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train ROC AUC',
                      'MLA Test ROC AUC',
                      'MLA Test ROC AUC Std',
                      'MLA Time']

    def evaluate_model(alg):
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report the model with the SAME column names as
        # real results. NaN (rather than 0) keeps a skipped model from looking
        # like a perfect score and sorts it to the bottom of the table.
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC AUC': float('nan'),
                'MLA Test ROC AUC': float('nan'),
                'MLA Test ROC AUC Std': float('nan'),
                'MLA Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring=rmsle_scorer,
                                    return_train_score=True,
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # The scorer returns negated RMSLE; flip the sign back for reporting.
        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC AUC': -cv_results['train_score'].mean(),
            'MLA Test ROC AUC': -cv_results['test_score'].mean(),
            'MLA Test ROC AUC Std': cv_results['test_score'].std(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    # Submit every model at once, then collect results in submission order.
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in models]
        results_list = [future.result() for future in futures]

    # Passing `columns` fixes the schema even when `models` is empty.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)

    MLA_compare.sort_values(by=['MLA Test ROC AUC'], ascending=True, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [12]:
# Baseline experiment: every model is evaluated on all columns currently in X.
baseline_features = {
    model.__class__.__name__: list(X.columns)
    for model in models
}

In [13]:
%%time

baseline_models = evaluate_models(models, X, y, baseline_features, sk10, f'{experiment_name}')
baseline_models

Done with LGBMRegressor.
Done with XGBRegressor.
Done with HistGradientBoostingRegressor.
Done with CatBoostRegressor.
Done with ExtraTreesRegressor.
Done with RandomForestRegressor.
CPU times: total: 406 ms
Wall time: 1min 47s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
3,ExtraTreesRegressor,"{'bootstrap': False, 'ccp_alpha': 0.0, 'criter...",0.000648,0.155476,0.000299,0 min 47.63 sec
2,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.05921,0.154395,0.000395,1 min 19.49 sec
1,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.134005,0.151059,0.00053,0 min 12.80 sec
4,HistGradientBoostingRegressor,"{'categorical_features': None, 'early_stopping...",0.145466,0.151014,9.7e-05,0 min 2.46 sec
0,LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",0.145066,0.150627,0.000205,0 min 0.90 sec
5,CatBoostRegressor,"{'loss_function': 'RMSE', 'verbose': False, 'r...",0.140177,0.149725,0.000251,1 min 2.73 sec


- Remove Correlated Features

In [None]:
# Drop one feature out of every pair whose absolute Spearman correlation is >= 0.9.
# Works on a copy of X, which holds features only (the target is kept in y).
df_no_corr = X.copy()
correlation_matrix_spear = df_no_corr.corr(method='spearman').abs()

# Keep only the strict upper triangle so that each pair is inspected once.
upper_mask = np.triu(np.ones(correlation_matrix_spear.shape), k=1).astype(bool)
upper_spear = correlation_matrix_spear.where(upper_mask)

# A column is dropped when it is strongly correlated with any earlier column.
to_drop_spear = [
    column
    for column in upper_spear.columns
    if (upper_spear[column] >= 0.9).any()
]

df_reduced_spear = df_no_corr.drop(columns=to_drop_spear)

# Names of the surviving (low-correlation) features, persisted for later reuse.
low_corr_feats_spear = df_reduced_spear.columns.tolist()

with open('low_corr_spear.txt', 'w') as f:
    f.write(f'{low_corr_feats_spear}\n')

# Report how many columns were removed and the shape before/after.
print(f"Dropped {len(to_drop_spear)} highly correlated features.\nOld Shape of the dataset was {df_no_corr.shape}\nNew shape of the dataset is {df_reduced_spear.shape}")

In [None]:
%%time

# Every model gets the same decorrelated column set.
no_corr_features = {
    model.__class__.__name__: list(df_reduced_spear.columns)
    for model in models
}

In [None]:
%%time

no_corr_models = evaluate_models(models, df_reduced_spear, y, no_corr_features, sk10, f'{experiment_name}_corr')
no_corr_models

- Feature Importances

In [None]:
# feat_importance_features = {}

# for model in models:
#     model_name = model.__class__.__name__

#     try:
#         # Initialize array to store feature importances
#         feature_importances = np.zeros(X.shape[1])

#         # Loop through each fold and calculate the feature importances
#         for train_index, test_index in sk10.split(X, y):
#             X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#             y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#             model.fit(X_train, y_train)

#             # Get the feature importances and them to the total
#             feature_importances += model.feature_importances_

#         feature_importances /= n_splits

#         feature_importances_dict = dict(zip(X.columns, feature_importances))

#         df = pd.DataFrame.from_dict(feature_importances_dict, orient='index')

#         # Resetting index with a name for the column
#         df = df.reset_index().rename(columns={'index': 'Feature', 0: 'Avg_Feat_Importance'})
#         df.sort_values(by='Avg_Feat_Importance', ascending=False, inplace=True)

#         # Save to CSV
#         df.to_csv(f'{model_name}_feature_importances.csv')

#         fi_threshold = 0

#         fi_feats = df[df['Avg_Feat_Importance'] > fi_threshold]['Feature'].tolist()

#         feat_importance_features[model_name] = fi_feats
#         print(f'Done with {model_name}')

#     except AttributeError:
#         feat_importance_features[model_name] = list(X.columns)
#         print(f'{model_name} does not have feature_importances_')

In [None]:
# with open('featimp_features.txt', mode='w') as f:
#     pprint(feat_importance_features, stream=f)

- Permutation Importance

In [16]:
# Add a seeded uniform-noise column to X; its permutation importance is used
# further down as the floor a real feature has to beat.
np.random.seed(5)
control_noise = np.random.uniform(-2, 2, size=len(X))
X['random_control_feature'] = control_noise.round(6)
X.shape

(90615, 11)

In [17]:
%%time

# Dedicated 5-fold splitter for the permutation-importance run.
perm_cv = KFold(n_splits=5, shuffle=True, random_state=5)

# One list of per-fold importance vectors for each model class name.
perm_importances = {}
for model in models:
    perm_importances[model.__class__.__name__] = []

for i, (train_idx, test_idx) in enumerate(perm_cv.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    for model in models:
        model_name = model.__class__.__name__
        model.fit(X_train, y_train)

        # Importance is measured on the held-out fold with the RMSLE scorer.
        result = permutation_importance(
            model,
            X_test,
            y_test,
            scoring=rmsle_scorer,
            n_repeats=5,
            random_state=5,
            n_jobs=-1,
        )
        perm_importances[model_name].append(result.importances_mean)
        print(f'Done with {model_name}.')

    print(f'Done with Fold {i+1}', end='\n\n')

Done with LGBMRegressor.
Done with XGBRegressor.
Done with RandomForestRegressor.
Done with ExtraTreesRegressor.
Done with HistGradientBoostingRegressor.
Done with CatBoostRegressor.
Done with Fold 1

Done with LGBMRegressor.
Done with XGBRegressor.
Done with RandomForestRegressor.
Done with ExtraTreesRegressor.
Done with HistGradientBoostingRegressor.
Done with CatBoostRegressor.
Done with Fold 2

Done with LGBMRegressor.
Done with XGBRegressor.
Done with RandomForestRegressor.
Done with ExtraTreesRegressor.
Done with HistGradientBoostingRegressor.
Done with CatBoostRegressor.
Done with Fold 3

Done with LGBMRegressor.
Done with XGBRegressor.
Done with RandomForestRegressor.
Done with ExtraTreesRegressor.
Done with HistGradientBoostingRegressor.
Done with CatBoostRegressor.
Done with Fold 4

Done with LGBMRegressor.
Done with XGBRegressor.
Done with RandomForestRegressor.
Done with ExtraTreesRegressor.
Done with HistGradientBoostingRegressor.
Done with CatBoostRegressor.
Done with Fol

In [18]:
%%time

# Average the per-fold importances and export one CSV per model.
# The path is built with os.path.join so it works on every OS (a literal
# '.\...' path is Windows-only and contains invalid escape sequences), and the
# output folder is created up front so the first run does not fail.
output_dir = 'permutation_importances'
os.makedirs(output_dir, exist_ok=True)

for model_name, importances in perm_importances.items():
    avg_importance = np.mean(importances, axis=0)
    importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': avg_importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    # Export to CSV
    csv_path = os.path.join(output_dir, f'{model_name}_permutation_importance.csv')
    importance_df.to_csv(csv_path, index=False)

print('Done with Permuation Importances', end='\n\n')

Done with Permuation Importances

CPU times: total: 0 ns
Wall time: 84.2 ms


In [19]:
directory = 'permutation_importances'

# Per-model list of features whose permutation importance beats the threshold.
perm_important_features = {}

for model in models:
    model_name = model.__class__.__name__
    csv_path = os.path.join(directory, f'{model_name}_permutation_importance.csv')

    # No exported importances for this model: report it and move on.
    if not os.path.exists(csv_path):
        print(f'CSV file for {model_name} not found.')
        continue

    df = pd.read_csv(csv_path)

    # Importance of the random control column (0 when the column is absent).
    control_rows = df.loc[df['Feature'] == 'random_control_feature', 'Importance']
    random_feature_importance = control_rows.iloc[0] if len(control_rows) else 0

    # A feature has to beat both zero and the random control to be kept.
    threshold = max(0, random_feature_importance)
    important_feats_filtered = set(df.loc[df['Importance'] > threshold, 'Feature'])

    # Store the survivors in the canonical features_list order.
    perm_important_features[model_name] = [
        feat for feat in features_list if feat in important_feats_filtered
    ]

print('Done getting important features dictionary')

Done getting important features dictionary


In [20]:
with open('perm_important_features.txt', mode='w') as f:
    pprint(perm_important_features, stream=f)

In [21]:
%%time

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

perm_importance_models = evaluate_models(models, X, y, perm_important_features, sk10, f'{experiment_name}_permimp')
perm_importance_models

Done with LGBMRegressor.
Done with HistGradientBoostingRegressor.
Done with XGBRegressor.
Done with CatBoostRegressor.
Done with ExtraTreesRegressor.
Done with RandomForestRegressor.
CPU times: total: 406 ms
Wall time: 1min 48s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
5,CatBoostRegressor,"{'loss_function': 'RMSE', 'verbose': False, 'r...",0.140177,0.149725,0.000251,0 min 55.66 sec
0,LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",0.145066,0.150627,0.000205,0 min 1.42 sec
4,HistGradientBoostingRegressor,"{'categorical_features': None, 'early_stopping...",0.145527,0.150935,6.4e-05,0 min 3.23 sec
1,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.134005,0.151059,0.00053,0 min 15.93 sec
2,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.05921,0.154395,0.000395,1 min 6.42 sec
3,ExtraTreesRegressor,"{'bootstrap': False, 'ccp_alpha': 0.0, 'criter...",0.001807,0.155432,0.000298,0 min 49.31 sec


- SelectKBest with f_regression

In [None]:
# For every model, try SelectKBest(f_regression) with k = 1..n_features and keep
# the k whose cross-validated RMSLE is lowest.
best_features_list = []
kbest_features = {}

for model in models:
    model_name = model.__class__.__name__

    # Select whichever one had a better CV score generally
    # Also, consider computational expense and accuracy balance
    # (.get: a model without an entry is treated like one with no features)
    features = perm_important_features.get(model_name, [])
    # features = list(df_reduced_spear.columns)

    # incase there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue

    X_kbest = X[features]

    # rmsle_scorer yields the NEGATED RMSLE, so every score is <= 0. Starting
    # from -inf (not 0) is what lets the first candidate become the best.
    best_score = -np.inf
    best_k = 0
    best_features = []

    # Iterate over k from 1 to number of features
    for k in range(1, len(features) + 1):
        print(f'currently running {k} features on {model_name}')
        # Apply SelectKBest
        # NOTE(review): the selector is fitted on all rows before CV, so the
        # selection step itself is not cross-validated.
        selector = SelectKBest(f_regression, k=k)
        X_new = selector.fit_transform(X_kbest, y)

        # Get the selected feature names
        selected_features = X_kbest.columns[selector.get_support()]

        # Evaluate the model (higher = better because the score is negated)
        rmsle_scores = cross_validate(model, X_new, y, cv=sk10, scoring=rmsle_scorer, n_jobs=-1)
        mean_rmsle_scores = rmsle_scores['test_score'].mean()

        if mean_rmsle_scores > best_score:
            best_k = k
            best_score = mean_rmsle_scores
            best_features = list(selected_features)

    # Flip the sign back so the table reports the actual (positive) RMSLE.
    best_features_list.append({'k': best_k,
                    'Selected Features': best_features,
                    'RMSLE Score': -best_score,
                    'Model Name': model_name})

    kbest_features[model_name] = best_features

# Explicit columns keep the sort below valid even if every model was skipped.
best_features_df = pd.DataFrame(best_features_list,
                                columns=['k', 'Selected Features', 'RMSLE Score', 'Model Name'])

# Lowest RMSLE (best model) first.
best_features_df.sort_values(by='RMSLE Score', ascending=True, inplace=True)

In [None]:
with open('kbest_features.txt', mode='w') as f:
    pprint(kbest_features, stream=f)

In [None]:
best_features_df

- RFECV

In [None]:
%%time

# Per-model feature lists chosen by recursive feature elimination with CV.
rfecv_features = {}

for alg in models:
    MLA_name = alg.__class__.__name__

    # Start from the features that survived the permutation-importance filter.
    features = perm_important_features[MLA_name]

    # Nothing to eliminate from: go to the next model.
    if not features:
        continue

    X_rfecv = X[features]

    try:
        print(f'Starting with {MLA_name}')
        # Fit RFECV, removing one feature per step and scoring with RMSLE.
        selector = RFECV(alg, cv=sk10, step=1, scoring=rmsle_scorer, verbose=2).fit(X_rfecv, y)

        selected_features = set(X_rfecv.columns[selector.support_])

        # Store the survivors in the canonical features_list order.
        rfecv_features[MLA_name] = [
            feat for feat in features_list if feat in selected_features
        ]

        print(f'Done with {MLA_name}', end='\n\n')

    except ValueError:
        # Fallback on any ValueError raised above: keep the incoming features,
        # re-ordered by features_list.
        rfecv_features[MLA_name] = [
            feat for feat in features_list if feat in features
        ]
        print(f'{MLA_name} does not have coef_ or feature_importances_', end='\n\n')

In [None]:
with open('rfecv_features.txt', mode='w') as f:
    pprint(rfecv_features, stream=f)

In [None]:
%%time

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

rfecv_models = evaluate_models(models, X, y, rfecv_features, sk10, f'{experiment_name}_rfecv')
rfecv_models

- SFS

In [22]:
%%time

# Per-model feature lists chosen by backward sequential feature selection.
sfs_features = {}

for alg in models:
    # set name
    MLA_name = alg.__class__.__name__

    # Check membership explicitly rather than wrapping the whole body in
    # `except KeyError`: a KeyError raised further down (e.g. a column missing
    # from X) is a real error and must not be reported as a missing model.
    if MLA_name not in perm_important_features:
        print(f'{MLA_name} not in the dictionary.')
        continue

    features = perm_important_features[MLA_name]
    # features = kbest_features[MLA_name]
    # features = feat_importance_features[MLA_name]
    # features = rfecv_features[MLA_name]

    # incase there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue

    X_sfs = X[features]

    print(f'Running backward feature selection with {MLA_name}')

    # Backward, non-floating search; 'best' keeps the subset size with the
    # highest CV score (the scorer is negated RMSLE, so higher is better).
    sfs = SFS(alg,
        k_features='best',
        forward=False,
        floating=False,
        scoring=rmsle_scorer,
        verbose=2,
        n_jobs=-1,
        cv=sk10)

    sfs = sfs.fit(X_sfs, y)

    # Translate the selected positional indices back to column names.
    selected_sfs_idx = list(sfs.k_feature_idx_)
    selected_features = list(X_sfs.columns[selected_sfs_idx])

    # Reorder selected_features based on the predefined features_list
    selected_features_ordered = [feat for feat in features_list if feat in selected_features]

    sfs_features[MLA_name] = selected_features_ordered

    print(f'Done with {MLA_name}', end='\n\n')

Running backward feature selection with LGBMRegressor


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    5.0s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.1s finished

[2024-04-02 19:33:41] Features: 9/1 -- score: -0.1505774857817751[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    3.6s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    5.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    5.4s finished

[2024-04-02 19:33:46] Features: 8/1 -- score: -0.1506809363794596[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    3.7s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    4.4s finished

[2024-04-02 19:33:51]

Done with LGBMRegressor

Running backward feature selection with XGBRegressor


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   43.4s remaining:   18.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   59.9s finished

[2024-04-02 19:35:20] Features: 9/1 -- score: -0.15089004363661665[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:   39.4s remaining:   49.2s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   53.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   53.9s finished

[2024-04-02 19:36:14] Features: 8/1 -- score: -0.1507842735894188[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:   36.5s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   38.6s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   38.6s finished

[2024-04-02 19:36:53

Done with XGBRegressor

Running backward feature selection with RandomForestRegressor


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  4.1min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.8min finished

[2024-04-02 19:46:10] Features: 9/1 -- score: -0.15441935024262463[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  3.5min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  5.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  5.4min finished

[2024-04-02 19:51:37] Features: 8/1 -- score: -0.15459442393040407[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  3.3min remaining:  5.5min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  3.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  3.6min finished

[2024-04-02 19:55:1

Done with RandomForestRegressor

Running backward feature selection with ExtraTreesRegressor


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  2.7min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  3.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  3.0min finished

[2024-04-02 20:12:34] Features: 7/1 -- score: -0.15628512569663558[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:  2.4min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:  2.5min finished

[2024-04-02 20:15:03] Features: 6/1 -- score: -0.15737156331777188[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  1.9min finished

[2024-04-02 20:16:58] Features: 5/1 -- score: -0.15942566068120625[Parallel(n_jobs=-1)]: Using backe

Done with ExtraTreesRegressor

Running backward feature selection with HistGradientBoostingRegressor


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    9.7s remaining:   12.2s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   13.5s finished

[2024-04-02 20:20:56] Features: 8/1 -- score: -0.15102402758904832[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    8.8s remaining:   14.8s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    9.8s finished

[2024-04-02 20:21:06] Features: 7/1 -- score: -0.15108858565278072[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   7 | elapsed:    7.7s remaining:    5.8s
[Parallel(n_jobs=-1)]: Done   7 out of   7 | elapsed:    7.9s finished

[2024-04-02 20:21:1

Done with HistGradientBoostingRegressor

Running backward feature selection with CatBoostRegressor


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.9min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.7min finished

[2024-04-02 20:25:54] Features: 9/1 -- score: -0.1496777058106973[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  2.7min remaining:  3.4min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  3.2min finished

[2024-04-02 20:29:09] Features: 8/1 -- score: -0.14978546383553085[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  2.7min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  2.8min finished

[2024-04-02 20:31:54

Done with CatBoostRegressor

CPU times: total: 3min 19s
Wall time: 1h 6min 54s


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   38.8s finished

[2024-04-02 20:40:27] Features: 1/1 -- score: -0.17149652085226685

In [23]:
with open('sfs_features.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [24]:
%%time

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

sfs_models = evaluate_models(models, X, y, sfs_features, sk10, f'{experiment_name}_sfs')
sfs_models

Done with LGBMRegressor.
Done with XGBRegressor.
Done with HistGradientBoostingRegressor.
Done with ExtraTreesRegressor.
Done with CatBoostRegressor.
Done with RandomForestRegressor.
CPU times: total: 625 ms
Wall time: 1min 31s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC AUC,MLA Test ROC AUC,MLA Test ROC AUC Std,MLA Time
5,CatBoostRegressor,"{'loss_function': 'RMSE', 'verbose': False, 'r...",0.140225,0.149678,0.000216,0 min 41.68 sec
0,LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",0.145088,0.150577,0.000155,0 min 1.85 sec
1,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.134222,0.150784,0.000279,0 min 13.62 sec
4,HistGradientBoostingRegressor,"{'categorical_features': None, 'early_stopping...",0.145527,0.150935,6.4e-05,0 min 2.97 sec
2,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.05921,0.154395,0.000395,1 min 25.46 sec
3,ExtraTreesRegressor,"{'bootstrap': False, 'ccp_alpha': 0.0, 'criter...",0.001807,0.155432,0.000298,0 min 49.18 sec


## Single Model Prediction

In [None]:
# Hard-coded feature subset used for the single-model fit and prediction below.
# NOTE(review): this rebinds `sfs_features` from the per-model dict built in the
# SFS section to a plain list. Re-running the earlier
# `evaluate_models(..., sfs_features, ...)` cell after this one would fail,
# because that function calls `.get()` on the mapping.
sfs_features = ['Sex_F', 'Sex_I', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']

In [None]:
model1 = LGBMRegressor(n_jobs=-1, random_state=5)

In [None]:
model1_final = model1.fit(X[sfs_features], y)

In [None]:
pred = model1_final.predict(test[sfs_features])
pred

In [None]:
pred_df = pd.DataFrame(pred, columns=['Rings'])
pred_df.head()

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission_df = pd.concat([submission['id'], pred_df], axis=1)
submission_df.head()

In [None]:
# Write the submission file (id + predicted Rings), without the index column.
# NOTE(review): the file name says "postprocess_asint", but the predictions are
# written exactly as returned by `predict`; no rounding or integer cast is
# applied in the visible cells - confirm which is intended.
submission_df.to_csv('submission_lgbm_postprocess_asint_0.150577.csv', index=False)

### Post Model Features

In [None]:
# model1 = LGBMClassifier(n_jobs=-1, random_state=5)
# model2 = XGBClassifier(random_state=5)
# model3 = RandomForestClassifier(random_state=5)
# model4 = ExtraTreesClassifier(random_state=5)
# model5 = HistGradientBoostingClassifier(random_state=5)
# model6 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)

In [None]:
# NOTE(review): this cell looks like a leftover from a classification notebook.
# None of the *Classifier classes are imported in the import cell at the top,
# and the `*_params_*` dicts exist only as commented-out placeholders, so on a
# fresh kernel this presumably fails with NameError. The rest of the notebook
# models the target with regressors - confirm whether this cell should be
# removed or ported to the *Regressor classes.
model1 = LGBMClassifier(n_jobs=-1, random_state=5)
model1b = LGBMClassifier(**lgbm_params_1)
model1c = LGBMClassifier(**lgbm_params_2)
model1d = LGBMClassifier(**lgbm_params_3)
model1e = LGBMClassifier(**lgbm_params_4)
model2 = XGBClassifier(random_state=5)
model3 = RandomForestClassifier(random_state=5)
model3b = RandomForestClassifier(**rf_params_1)
model3c = RandomForestClassifier(**rf_params_2)
model4 = ExtraTreesClassifier(random_state=5)
model4b = ExtraTreesClassifier(**extrat_params_1)
model4c = ExtraTreesClassifier(**extrat_params_2)
model4d = ExtraTreesClassifier(**extrat_params_3)
model4e = ExtraTreesClassifier(**extrat_params_4)
model5 = HistGradientBoostingClassifier(random_state=5)
model5b = HistGradientBoostingClassifier(**hist_params_1)
model5c = HistGradientBoostingClassifier(**hist_params_2)
model5d = HistGradientBoostingClassifier(**hist_params_3)
model5e = HistGradientBoostingClassifier(**hist_params_4)
model5f = HistGradientBoostingClassifier(**hist_params_5)
model6 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)
# model6b = CatBoostClassifier(**cat_params_1)
# model6c = CatBoostClassifier(**cat_params_2)
# model6d = CatBoostClassifier(**cat_params_3)
model6e = CatBoostClassifier(**cat_params_4)

- Features for Competition + Original dataset down to SFS for all models (Experiment Set 2)

In [None]:
# Per-model feature lists (model1..model6) and the matching column slices of X.
# NOTE(review): none of these column names (X_Minimum, Pixels_Areas, ...) appear
# in features_list or in the test.head() output shown earlier, so the X[...]
# selections below presumably raise KeyError on this dataset. The lists look
# copied from a different project - confirm and replace with this notebook's
# selected features.
model1_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
model2_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']
model3_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
model4_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index']
model5_feats = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Minimum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Orientation_Index', 'Luminosity_Index']
model6_feats = ['X_Minimum', 'Pixels_Areas', 'X_Perimeter', 'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer', 'TypeOfSteel_A300', 'Steel_Plate_Thickness', 'Edges_Index', 'Empty_Index', 'Outside_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'Orientation_Index', 'Luminosity_Index']

X_lgbm = X[model1_feats]
X_xgb = X[model2_feats]
X_rf = X[model3_feats]
X_extrat = X[model4_feats]
X_hist = X[model5_feats]
X_cat = X[model6_feats]

# Hyperparameter Tuning

- LGBM

In [None]:
def objective(trial):
    """Optuna objective for LightGBM on the `X_lgbm` feature view.

    Fits one `LGBMClassifier` per fold of `sk10` (splitter defined elsewhere in
    the notebook) with early stopping on the held-out fold, and scores each fold
    with macro-averaged one-vs-rest ROC AUC.

    Args:
        trial: the `optuna` trial used to sample hyperparameters and to report
            per-fold scores for pruning.

    Returns:
        Mean ROC AUC over all folds.

    Raises:
        optuna.exceptions.TrialPruned: when the study's pruner rejects the trial
            after a fold score has been reported.
    """
    # Imported locally because the notebook only imports estimator classes from
    # lightgbm. The callback replaces the `early_stopping_rounds=` / `verbose=`
    # fit keywords, which newer LightGBM releases no longer accept.
    from lightgbm import early_stopping

    # Raw Parameters for individual tunings
    # NOTE: `class_weight` is only consumed when the 'class_weight' entry in
    # `param` below is uncommented; the suggestions are still recorded on the trial.
    class_weight_option = trial.suggest_categorical('class_weight', ['none', 'balanced', 'custom'])
    if class_weight_option == 'none':
        class_weight = None
    elif class_weight_option == 'balanced':
        class_weight = 'balanced'
    else:
        # For multi-class, you could define a range or specific values to test
        weight_for_class_0 = trial.suggest_float('weight_for_class_0', 0.1, 10.0)
        weight_for_class_1 = trial.suggest_float('weight_for_class_1', 0.1, 10.0)
        weight_for_class_2 = trial.suggest_float('weight_for_class_2', 0.1, 10.0)
        weight_for_class_3 = trial.suggest_float('weight_for_class_3', 0.1, 10.0)
        weight_for_class_4 = trial.suggest_float('weight_for_class_4', 0.1, 10.0)
        weight_for_class_5 = trial.suggest_float('weight_for_class_5', 0.1, 10.0)
        weight_for_class_6 = trial.suggest_float('weight_for_class_6', 0.1, 10.0)
        class_weight = {0: weight_for_class_0, 1: weight_for_class_1, 2: weight_for_class_2, 3: weight_for_class_3, 4: weight_for_class_4, 5: weight_for_class_5, 6: weight_for_class_6}

    param = {
        'objective': 'multiclass',
        'num_class': 7,
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        # 'class_weight': class_weight,
        # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
        # 'learning_rate': trial.suggest_float('learning_rate', 0.000001, 0.5),
        # 'max_depth': trial.suggest_int('max_depth', -1, 64),
        # 'min_child_samples': trial.suggest_int('min_child_samples', 5, 500),
        # 'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0),
        # 'min_split_gain': trial.suggest_float('min_split_gain', 0.5, 1.0),
        # 'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        # 'n_jobs': -1,
        # 'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
        # 'random_state': 5,
        # 'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        # 'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        # 'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        }

    # # Group Parameters after individual tunings (change after testing the individual params)
    # param = {
    #     'objective': 'multiclass',
    #     'num_class': 7,
    #     'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
    #     'class_weight': class_weight,
    #     'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0),
    #     'learning_rate': trial.suggest_float('learning_rate', 0.000001, 0.5),
    #     'max_depth': trial.suggest_int('max_depth', -1, 64),
    #     'min_child_samples': trial.suggest_int('min_child_samples', 5, 500),
    #     'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0),
    #     'min_split_gain': trial.suggest_float('min_split_gain', 0.5, 1.0),
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
    #     'n_jobs': -1,
    #     'num_leaves': trial.suggest_int('num_leaves', 2, 1000),
    #     'random_state': 5,
    #     'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
    #     'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    #     'subsample': trial.suggest_float('subsample', 0.1, 1.0),
    #     }

    roc_auc_scores = []

    for i, (train_index, test_index) in enumerate(sk10.split(X_lgbm, y)):
        X_train, X_test = X_lgbm.iloc[train_index], X_lgbm.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = LGBMClassifier(**param)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            callbacks=[early_stopping(stopping_rounds=100, verbose=False)],
        )
        preds = model.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test, preds, multi_class='ovr', average='macro')
        roc_auc_scores.append(roc_auc)

        # Report the fold score so a pruner attached to the study can actually
        # act on it (same protocol as the other objectives in this notebook).
        trial.report(roc_auc, i)

        # Handle pruning based on the intermediate value
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

    return np.mean(roc_auc_scores)

# Using median pruner. It only acts on intermediate values that the objective
# reports through trial.report().
pruner = optuna.pruners.MedianPruner(n_startup_trials=2, n_warmup_steps=1, interval_steps=1)

# Seed the sampler (same seed as the models' random_state) so that re-running
# this cell explores the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=5)

study = optuna.create_study(direction='maximize', pruner=pruner, sampler=sampler)
study.optimize(objective, n_trials=50)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# 0.893695902

- HistGradient

In [None]:
def objective(trial):
    """Optuna objective for HistGradientBoostingClassifier on `X_hist`.

    Trains one classifier per fold of `sk10` (splitter defined elsewhere in the
    notebook), scores each fold with macro one-vs-rest ROC AUC, reports the fold
    score to the trial and lets the study's pruner abort the trial early.

    Args:
        trial: the `optuna` trial supplying hyperparameter suggestions.

    Returns:
        Mean of the per-fold ROC AUC values.

    Raises:
        optuna.exceptions.TrialPruned: when `trial.should_prune()` is true after
            a fold has been reported.
    """
    # # Raw Parameters for individual tunings
    # param = {
    #     # 'learning_rate': trial.suggest_float('learning_rate', 0.000001, 0.15),
    #     # 'max_iter': trial.suggest_int('max_iter', 50, 5000),
    #     # 'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20, 1000),
    #     # 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 1000),
    #     # 'l2_regularization': trial.suggest_float('l2_regularization', 0.1, 1.0),
    #     # 'max_bins': trial.suggest_int('max_bins', 10, 255),
    #     'max_depth': trial.suggest_int('max_depth', 2, 64),
    #     'random_state': 5,
    # }

    # Joint search over the ranges narrowed by the individual tunings above;
    # max_iter and max_depth are held fixed.
    hist_params = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.02, 0.06),
        max_iter=100,
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 10, 50),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 300, 800),
        l2_regularization=trial.suggest_float('l2_regularization', 0.8, 1.0),
        max_bins=trial.suggest_int('max_bins', 50, 150),
        max_depth=25,
        random_state=5,
    )

    fold_aucs = []
    for fold_no, (fit_idx, holdout_idx) in enumerate(sk10.split(X_hist, y)):
        X_fit, y_fit = X_hist.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X_hist.iloc[holdout_idx], y.iloc[holdout_idx]

        clf = HistGradientBoostingClassifier(**hist_params)
        clf.fit(X_fit, y_fit)
        holdout_proba = clf.predict_proba(X_holdout)

        fold_auc = roc_auc_score(y_holdout, holdout_proba, multi_class='ovr', average='macro')
        fold_aucs.append(fold_auc)

        # Hand the fold score to Optuna, then stop early if the pruner says so.
        trial.report(fold_auc, fold_no)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

        # # Check if performance is below threshold
        # if roc_auc < performance_threshold:
        #     raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

    return np.mean(fold_aucs)

# Median pruner: the first 5 trials are never pruned, and within a trial the
# first reported step is treated as warm-up.
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1, interval_steps=1)

# Seed the sampler (same seed as the models' random_state) so that re-running
# this cell explores the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=5)

study = optuna.create_study(direction='maximize', pruner=pruner, sampler=sampler)
# study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

- ExtraTrees

In [None]:
def objective(trial):
    """Optuna objective for ExtraTreesClassifier on `X_extrat`.

    Trains one classifier per fold of `sk10` (splitter defined elsewhere in the
    notebook), scores each fold with macro one-vs-rest ROC AUC, reports the fold
    score to the trial and lets the study's pruner abort the trial early.

    Args:
        trial: the `optuna` trial supplying hyperparameter suggestions.

    Returns:
        Mean of the per-fold ROC AUC values.

    Raises:
        optuna.exceptions.TrialPruned: when `trial.should_prune()` is true after
            a fold has been reported.
    """
    # # Raw Parameters for individual tunings
    # param = {
    #     # 'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.1),
    #     # 'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample']),
    #     # 'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
    #     # 'max_depth': trial.suggest_int('max_depth', 10, 3000),
    #     # 'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2']),
    #     # 'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 100, 3000, log=True) or None,
    #     # 'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.1),
    #     # 'min_samples_leaf': trial.suggest_int('min_samples_leaf',1, 500),
    #     # 'min_samples_split': trial.suggest_int('min_samples_split', 2, 500),
    #     # 'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
    #     'n_estimators': trial.suggest_int('n_estimators', 50, 3000),
    #     'random_state': 5,
    #     'n_jobs': -1,
    # }

    # Joint search over the ranges narrowed by the individual tunings above;
    # everything not suggested here is pinned to a fixed value.
    extra_trees_params = dict(
        ccp_alpha=0.0,
        class_weight=None,
        criterion='gini',
        max_depth=None,
        max_features='sqrt',
        # `or None` never triggers for this range (the suggestion is >= 1000).
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 1000, 3000, log=True) or None,
        min_impurity_decrease=0.0,
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 100),
        min_weight_fraction_leaf=trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.002),
        n_estimators=trial.suggest_int('n_estimators', 2500, 3000),
        random_state=5,
        n_jobs=-1,
    )

    fold_aucs = []
    for fold_no, (fit_idx, holdout_idx) in enumerate(sk10.split(X_extrat, y)):
        X_fit, y_fit = X_extrat.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X_extrat.iloc[holdout_idx], y.iloc[holdout_idx]

        clf = ExtraTreesClassifier(**extra_trees_params)
        clf.fit(X_fit, y_fit)
        holdout_proba = clf.predict_proba(X_holdout)

        fold_auc = roc_auc_score(y_holdout, holdout_proba, multi_class='ovr', average='macro')
        fold_aucs.append(fold_auc)

        # Hand the fold score to Optuna, then stop early if the pruner says so.
        trial.report(fold_auc, fold_no)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

        # # Check if performance is below threshold
        # if roc_auc < performance_threshold:
        #     raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

    return np.mean(fold_aucs)

# Median pruner: the first 2 trials are never pruned; no warm-up steps, so
# pruning can trigger from the first reported fold.
pruner = optuna.pruners.MedianPruner(n_startup_trials=2, n_warmup_steps=0, interval_steps=1)

# Seed the sampler (same seed as the models' random_state) so that re-running
# this cell explores the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=5)

study = optuna.create_study(direction='maximize', pruner=pruner, sampler=sampler)
# study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

- RandomForest

In [None]:
def objective(trial):
    """Optuna objective for RandomForestClassifier on `X_rf`.

    Trains one classifier per fold of `sk10` (splitter defined elsewhere in the
    notebook), scores each fold with macro one-vs-rest ROC AUC, reports the fold
    score to the trial and lets the study's pruner abort the trial early.

    Args:
        trial: the `optuna` trial supplying hyperparameter suggestions.

    Returns:
        Mean of the per-fold ROC AUC values.

    Raises:
        optuna.exceptions.TrialPruned: when `trial.should_prune()` is true after
            a fold has been reported.
    """
    # # # Raw Parameters for individual tunings
    # param = {
    #     # 'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.1),
    #     # 'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced', 'balanced_subsample']),
    #     # 'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    #     # 'max_depth': trial.suggest_int('max_depth', 10, 1000, log=True) or None,
    #     # 'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
    #     # 'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 10, 1000, log=True) or None,
    #     # 'max_samples': trial.suggest_float('max_samples', 0.1, 1.0),
    #     # 'min_impurity_decrease': trial.suggest_float('min_impurity_decrease', 0.0, 0.05),
    #     # 'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 100),
    #     # 'min_samples_split': trial.suggest_int('min_samples_split', 2, 200),
    #     # 'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.1),
    #     'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
    #     'random_state': 5,
    #     'n_jobs': -1,
    # }

    # Joint search over the ranges narrowed by the individual tunings above;
    # everything not suggested here is pinned to a fixed value.
    forest_params = dict(
        ccp_alpha=0.0,
        class_weight=None,
        max_depth=trial.suggest_int('max_depth', 10, 30, log=True),
        # `or None` never triggers for this range (the suggestion is >= 300).
        max_leaf_nodes=trial.suggest_int('max_leaf_nodes', 300, 700, log=True) or None,
        max_samples=trial.suggest_float('max_samples', 0.1, 1.0),
        min_impurity_decrease=0.0,
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_weight_fraction_leaf=0.0,
        max_features='log2',
        criterion='entropy',
        n_estimators=1000,
        random_state=5,
        n_jobs=-1,
    )

    fold_aucs = []
    for fold_no, (fit_idx, holdout_idx) in enumerate(sk10.split(X_rf, y)):
        X_fit, y_fit = X_rf.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X_rf.iloc[holdout_idx], y.iloc[holdout_idx]

        clf = RandomForestClassifier(**forest_params)
        clf.fit(X_fit, y_fit)
        holdout_proba = clf.predict_proba(X_holdout)

        fold_auc = roc_auc_score(y_holdout, holdout_proba, multi_class='ovr', average='macro')
        fold_aucs.append(fold_auc)

        # Hand the fold score to Optuna, then stop early if the pruner says so.
        trial.report(fold_auc, fold_no)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

        # # Check if performance is below threshold
        # if roc_auc < performance_threshold:
        #     raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

    return np.mean(fold_aucs)

# Median pruner: the first 5 trials are never pruned, and within a trial the
# first reported step is treated as warm-up.
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1, interval_steps=1)

# Seed the sampler (same seed as the models' random_state) so that re-running
# this cell explores the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=5)

study = optuna.create_study(direction='maximize', pruner=pruner, sampler=sampler)
# study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

- CatBoost

In [None]:
def objective(trial):
    """Optuna objective for CatBoostClassifier on the `X_cat` feature view.

    Trains one classifier per fold of `sk10` (splitter defined elsewhere in the
    notebook), scores each fold with macro one-vs-rest ROC AUC, reports the fold
    score to the trial and lets the study's pruner abort the trial early.

    The folds are drawn from `X_cat` (the `model6_feats` columns), i.e. the same
    feature set CatBoost is trained on in the ensembling cells, so the tuned
    parameters apply to the model that is actually ensembled.

    Args:
        trial: the `optuna` trial supplying hyperparameter suggestions.

    Returns:
        Mean of the per-fold ROC AUC values.

    Raises:
        optuna.exceptions.TrialPruned: when `trial.should_prune()` is true after
            a fold has been reported.
    """
    # Raw Parameters for individual tunings
    param = {
        'iterations': trial.suggest_int('iterations', 50, 2000),
        # 'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        # 'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.3, log=True),
        # 'depth': trial.suggest_int('max_depth', 1, 10),
        # 'subsample': trial.suggest_float('subsample', 0.05, 1),
        # 'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.05, 1.0),
        # 'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        # 'l2_leaf_reg': trial.suggest_float('l2_reg', 1e-2, 10),
        # 'random_strength': trial.suggest_float('random_strength', 1e-2, 10),
        # 'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_state': 5,
        'verbose': False,
    }

    # Group Parameters after individual tunings (change after testing the individual params)
    # NOTE(review): 'iterations' and 'n_estimators' look like aliases of the same
    # CatBoost setting -- confirm before enabling both in the block below.
    # param = {
    #     'iterations': trial.suggest_int('iterations', 50, 2000),
    #     'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    #     'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.3, log=True),
    #     'depth': trial.suggest_int('max_depth', 1, 10),
    #     'subsample': trial.suggest_float('subsample', 0.05, 1),
    #     'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.05, 1.0),
    #     'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
    #     'l2_leaf_reg': trial.suggest_float('l2_reg', 1e-2, 10),
    #     'random_strength': trial.suggest_float('random_strength', 1e-2, 10),
    #     'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
    #     'random_state': 5,
    #     'verbose': False,
    # }

    roc_auc_scores = []

    # Split and slice the CatBoost feature view (previously the RandomForest
    # view `X_rf` was used here by mistake).
    for i, (train_index, test_index) in enumerate(sk10.split(X_cat, y)):
        X_train, X_test = X_cat.iloc[train_index], X_cat.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = CatBoostClassifier(**param)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_test)
        roc_auc = roc_auc_score(y_test, preds, multi_class='ovr', average='macro')
        roc_auc_scores.append(roc_auc)

        # Report intermediate objective value
        trial.report(roc_auc, i)

        # Handle pruning based on the intermediate value
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

        # # Check if performance is below threshold
        # if roc_auc < performance_threshold:
        #     raise optuna.exceptions.TrialPruned('ROC score lower than threshold.')

    return np.mean(roc_auc_scores)

# Median pruner: the first 5 trials are never pruned, and within a trial the
# first reported step is treated as warm-up.
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=1, interval_steps=1)

# Seed the sampler (same seed as the models' random_state) so that re-running
# this cell explores the same sequence of trials.
sampler = optuna.samplers.TPESampler(seed=5)

study = optuna.create_study(direction='maximize', pruner=pruner, sampler=sampler)
# study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Slice plot for `study`. Every tuning cell above rebinds `study`, so this shows
# whichever tuning cell was executed last.
optuna.visualization.plot_slice(study)

In [None]:
# Optimization history for `study` (the most recently executed tuning cell).
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameter importances for `study` (the most recently executed tuning cell).
optuna.visualization.plot_param_importances(study)

In [None]:
# Parallel-coordinate plot for `study` (the most recently executed tuning cell).
optuna.visualization.plot_parallel_coordinate(study)

# Ensembling

In [None]:
%%time

# One list of out-of-fold probability arrays per ensemble member (filled fold by
# fold below), plus the matching held-out labels. The model6b/6c/6d lists are
# created but stay empty because those members are disabled.
(
    model1_results, model1b_results, model1c_results, model1d_results, model1e_results,
    model2_results,
    model3_results, model3b_results, model3c_results,
    model4_results, model4b_results, model4c_results, model4d_results, model4e_results,
    model5_results, model5b_results, model5c_results, model5d_results, model5e_results, model5f_results,
    model6_results, model6b_results, model6c_results, model6d_results, model6e_results,
    y_test_list,
) = ([] for _ in range(26))


for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    # The same row split is applied to every model's feature view.
    X_train_lgbm, X_test_lgbm = X_lgbm.iloc[train_index], X_lgbm.iloc[test_index]
    X_train_xgb, X_test_xgb = X_xgb.iloc[train_index], X_xgb.iloc[test_index]
    X_train_rf, X_test_rf = X_rf.iloc[train_index], X_rf.iloc[test_index]
    X_train_extrat, X_test_extrat = X_extrat.iloc[train_index], X_extrat.iloc[test_index]
    X_train_hist, X_test_hist = X_hist.iloc[train_index], X_hist.iloc[test_index]
    X_train_cat, X_test_cat = X_cat.iloc[train_index], X_cat.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # (progress banner, training view, held-out view, ((estimator, result list), ...))
    # Families and members are listed in the order they are fitted.
    fold_plan = (
        (None, X_train_lgbm, X_test_lgbm, (
            (model1, model1_results),
            (model1b, model1b_results),
            (model1c, model1c_results),
            (model1d, model1d_results),
            (model1e, model1e_results),
        )),
        ('Running XGBoost', X_train_xgb, X_test_xgb, (
            (model2, model2_results),
        )),
        ('Running Random Forest', X_train_rf, X_test_rf, (
            (model3, model3_results),
            (model3b, model3b_results),
            (model3c, model3c_results),
        )),
        ('Running ExtraTrees', X_train_extrat, X_test_extrat, (
            (model4, model4_results),
            (model4b, model4b_results),
            (model4c, model4c_results),
            (model4d, model4d_results),
            (model4e, model4e_results),
        )),
        ('Running Hist Gradient', X_train_hist, X_test_hist, (
            (model5, model5_results),
            (model5b, model5b_results),
            (model5c, model5c_results),
            (model5d, model5d_results),
            (model5e, model5e_results),
            (model5f, model5f_results),
        )),
        # model6b / model6c / model6d are intentionally left out.
        ('Running CatBoost', X_train_cat, X_test_cat, (
            (model6, model6_results),
            (model6e, model6e_results),
        )),
    )

    for banner, X_fit, X_holdout, members in fold_plan:
        if banner is not None:
            print(banner)
        for estimator, oof_store in members:
            estimator.fit(X_fit, y_train)
            oof_store.append(estimator.predict_proba(X_holdout))

    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

In [None]:
%%time

# model1_weights, model1b_weights, model1c_weights, model1d_weights, model1e_weights, model2_weights, model3_weights, model3b_weights, model3c_weights, model4_weights, model4b_weights, model4c_weights, model4d_weights, model4e_weights, model5_weights, model5b_weights, model5c_weights, model5d_weights, model5e_weights, model5f_weights, model6_weights, model6b_weights, model6c_weights, model6d_weights, model6e_weights, scores = [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []

# Weight history per ensemble member (one entry per random draw) and the mean
# cross-validated score of each draw.
(
    model1_weights, model1b_weights, model1c_weights, model1d_weights, model1e_weights,
    model2_weights,
    model3_weights, model3b_weights, model3c_weights,
    model4_weights, model4b_weights, model4c_weights, model4d_weights, model4e_weights,
    model5_weights, model5b_weights, model5c_weights, model5d_weights, model5e_weights, model5f_weights,
    model6_weights, model6e_weights,
    scores,
) = ([] for _ in range(23))

# (per-fold out-of-fold predictions, weight history) for every active member.
# model6b / model6c / model6d are excluded from the ensemble.
ensemble_members = [
    (model1_results, model1_weights),
    (model1b_results, model1b_weights),
    (model1c_results, model1c_weights),
    (model1d_results, model1d_weights),
    (model1e_results, model1e_weights),
    (model2_results, model2_weights),
    (model3_results, model3_weights),
    (model3b_results, model3b_weights),
    (model3c_results, model3c_weights),
    (model4_results, model4_weights),
    (model4b_results, model4b_weights),
    (model4c_results, model4c_weights),
    (model4d_results, model4d_weights),
    (model4e_results, model4e_weights),
    (model5_results, model5_weights),
    (model5b_results, model5b_weights),
    (model5c_results, model5c_weights),
    (model5d_results, model5d_weights),
    (model5e_results, model5e_weights),
    (model5f_results, model5f_weights),
    (model6_results, model6_weights),
    (model6e_results, model6e_weights),
]

# Local seeded generator: the random search is reproducible and the global
# NumPy RNG state is left untouched.
rng = np.random.default_rng(5)

for i in tqdm(range(20)):
    # One weight in [0, 1) per member for this draw.
    draw = rng.random(len(ensemble_members))
    for (_, weight_log), weight in zip(ensemble_members, draw):
        weight_log.append(weight)

    # Reset per draw so each entry of `scores` reflects this draw only,
    # not a running mean over all previous draws.
    scores_in = []

    for j in range(n_splits):
        # Sum over *all* members. The previous multi-line expression was not
        # parenthesized, so only the first member's term was ever assigned.
        weighted_pred = sum(
            weight * fold_preds[j]
            for (fold_preds, _), weight in zip(ensemble_members, draw)
        )

        # Rescale each row so the blended class scores sum to 1.
        weighted_pred_normalized = weighted_pred / np.sum(weighted_pred, axis=1, keepdims=True)

        scores_in.append(roc_auc_score(y_test_list[j], weighted_pred_normalized, multi_class='ovr'))

    scores.append(np.mean(scores_in))

In [None]:
# Stack each member's per-fold out-of-fold predictions into a single array
# (folds in the order they were produced), and do the same for the true labels.
oof_by_member = (
    model1_results, model1b_results, model1c_results, model1d_results, model1e_results,
    model2_results,
    model3_results, model3b_results, model3c_results,
    model4_results, model4b_results, model4c_results, model4d_results, model4e_results,
    model5_results, model5b_results, model5c_results, model5d_results, model5e_results, model5f_results,
    model6_results, model6e_results,
)
all_predictions = list(map(np.concatenate, oof_by_member))
all_true_labels = np.concatenate(y_test_list)


In [None]:
from optuna.samplers import RandomSampler, TPESampler

# Define the objective function
def objective(trial):
    """Optuna objective for the ensemble blend weights.

    Suggests one weight in [-1, 1] per entry of `all_predictions` (parameters
    named `w0`, `w1`, ...), forms the weighted sum of the stacked out-of-fold
    predictions, divides every row by its own sum, and scores the result against
    `all_true_labels`.

    Args:
        trial: the `optuna` trial supplying the weight suggestions.

    Returns:
        Macro-averaged one-vs-rest ROC AUC of the blended predictions.
    """
    n_members = len(all_predictions)
    weights = [trial.suggest_float(f'w{i}', -1, 1) for i in range(n_members)]

    # Accumulate in place into a zero array shaped like the first member's predictions.
    blended = np.zeros_like(all_predictions[0])
    for member_idx in range(n_members):
        blended += weights[member_idx] * all_predictions[member_idx]

    # Row-wise rescale so each row sums to 1.
    # NOTE(review): weights may be negative, so a row total can be negative or
    # close to zero -- confirm this is acceptable for the scorer.
    row_totals = np.sum(blended, axis=1, keepdims=True)
    blended_normalized = blended / row_totals

    return roc_auc_score(all_true_labels, blended_normalized, multi_class='ovr', average='macro')

# def switch_sampler(study, trials):
#     if len(study.trials) == 250:
#         study.sampler = TPESampler(seed=5)
#     # elif len(study.trials) == 50:
#     #     study.sampler = TPESampler()
#     # elif len(study.trials) == 100:
#     #     study.sampler = TPESampler()
#     # elif len(study.trials) == 150:
#     #     study.sampler = TPESampler()

# sampler = RandomSampler(seed=5)

# # Create an Optuna study and optimize the objective
# study = optuna.create_study(direction='maximize', sampler=sampler)
# study.optimize(objective, n_trials=500, callbacks=[switch_sampler])

# Create an Optuna study and optimize the objective.
# The sampler is seeded (same seed as the models' random_state) so that the
# weight search returns the same result on every run.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=5))
study.optimize(objective, n_trials=500)

# Print the optimal weights found
print("Optimal weights:", study.best_params)

In [None]:
# Best weight vector found by the study (keys w0, w1, ... in suggestion order).
optimal_weights = study.best_params

# Single-row frame: one column per weight, followed by the score it achieved.
optuna_results_df = pd.DataFrame([{**optimal_weights, 'score': study.best_value}])

# Rename the positional w* columns to the member names used by results_df.
# The order must match the order in which all_predictions was assembled.
optuna_results_df.columns = [
    'model_1', 'model_1b', 'model_1c', 'model_1d', 'model_1e',
    'model_2',
    'model_3', 'model_3b', 'model_3c',
    'model_4', 'model_4b', 'model_4c', 'model_4d', 'model_4e',
    'model_5', 'model_5b', 'model_5c', 'model_5d', 'model_5e', 'model_5f',
    'model_6', 'model_6e',
    'score',
]

# Only one row exists (the best trial), so no sorting is needed.
optuna_results_df.head()

In [None]:
# One row per random weight draw: the weight given to each member and the
# resulting score, best-scoring draw first. model6b/6c/6d are excluded.
results_df = (
    pd.DataFrame({
        'model_1': model1_weights,
        'model_1b': model1b_weights,
        'model_1c': model1c_weights,
        'model_1d': model1d_weights,
        'model_1e': model1e_weights,
        'model_2': model2_weights,
        'model_3': model3_weights,
        'model_3b': model3b_weights,
        'model_3c': model3c_weights,
        'model_4': model4_weights,
        'model_4b': model4b_weights,
        'model_4c': model4c_weights,
        'model_4d': model4d_weights,
        'model_4e': model4e_weights,
        'model_5': model5_weights,
        'model_5b': model5b_weights,
        'model_5c': model5c_weights,
        'model_5d': model5d_weights,
        'model_5e': model5e_weights,
        'model_5f': model5f_weights,
        'model_6': model6_weights,
        'model_6e': model6e_weights,
        'score': scores,
    })
    .sort_values(by='score', ascending=False)
    .reset_index(drop=True)
)
results_df.head(10)

In [None]:
# Persist the sorted random-weight table (weights per member + score) without the index column.
results_df.to_csv('random_weights_normalized.csv', index=False)

# Get Submission (Random Weight Ensemble)

In [None]:
%%time

# Refit every active ensemble member on the full training data, family by
# family, keeping whatever each fit() call returns under the *_final name.

print('Running LGBM')
model1_final, model1b_final, model1c_final, model1d_final, model1e_final = [
    estimator.fit(X_lgbm, y)
    for estimator in (model1, model1b, model1c, model1d, model1e)
]

print('Running XGBoost')
model2_final = model2.fit(X_xgb, y)

print('Running Random Forest')
model3_final, model3b_final, model3c_final = [
    estimator.fit(X_rf, y)
    for estimator in (model3, model3b, model3c)
]

print('Running ExtraTrees')
model4_final, model4b_final, model4c_final, model4d_final, model4e_final = [
    estimator.fit(X_extrat, y)
    for estimator in (model4, model4b, model4c, model4d, model4e)
]

print('Running HistGradient')
model5_final, model5b_final, model5c_final, model5d_final, model5e_final, model5f_final = [
    estimator.fit(X_hist, y)
    for estimator in (model5, model5b, model5c, model5d, model5e, model5f)
]

print('Running CatBoost')
# model6b / model6c / model6d are intentionally left out.
model6_final, model6e_final = [
    estimator.fit(X_cat, y)
    for estimator in (model6, model6e)
]

In [None]:
%%time

# (weight column in results_df, fitted estimator, feature columns it was trained on)
# model6b / model6c / model6d are intentionally left out.
blend_members = [
    ('model_1', model1_final, model1_feats),
    ('model_1b', model1b_final, model1_feats),
    ('model_1c', model1c_final, model1_feats),
    ('model_1d', model1d_final, model1_feats),
    ('model_1e', model1e_final, model1_feats),
    ('model_2', model2_final, model2_feats),
    ('model_3', model3_final, model3_feats),
    ('model_3b', model3b_final, model3_feats),
    ('model_3c', model3c_final, model3_feats),
    ('model_4', model4_final, model4_feats),
    ('model_4b', model4b_final, model4_feats),
    ('model_4c', model4c_final, model4_feats),
    ('model_4d', model4d_final, model4_feats),
    ('model_4e', model4e_final, model4_feats),
    ('model_5', model5_final, model5_feats),
    ('model_5b', model5b_final, model5_feats),
    ('model_5c', model5c_final, model5_feats),
    ('model_5d', model5d_final, model5_feats),
    ('model_5e', model5e_final, model5_feats),
    ('model_5f', model5f_final, model5_feats),
    ('model_6', model6_final, model6_feats),
    ('model_6e', model6e_final, model6_feats),
]

# Weighted sum of test-set probabilities, using the weights stored in row 0 of results_df.
ensemble_pred = sum(
    results_df[weight_col][0] * fitted.predict_proba(test[feature_cols])
    for weight_col, fitted, feature_cols in blend_members
)

ensemble_df = pd.DataFrame(ensemble_pred)

# Rescale each row to sum to 1. If all models predict 0, instead of getting NaN, fill in 0
row_totals = ensemble_df.sum(axis=1)
ensemble_df = ensemble_df.div(row_totals, axis=0).fillna(0)
ensemble_df.columns = label_encoder.classes_

In [None]:
%%time

optuna_ensemble_pred = (
                optuna_results_df['model_1'][0] * model1_final.predict_proba(test[model1_feats]) +
                optuna_results_df['model_1b'][0] * model1b_final.predict_proba(test[model1_feats]) +
                optuna_results_df['model_1c'][0] * model1c_final.predict_proba(test[model1_feats]) +
                optuna_results_df['model_1d'][0] * model1d_final.predict_proba(test[model1_feats]) +
                optuna_results_df['model_1e'][0] * model1e_final.predict_proba(test[model1_feats]) +
                optuna_results_df['model_2'][0] * model2_final.predict_proba(test[model2_feats]) +
                optuna_results_df['model_3'][0] * model3_final.predict_proba(test[model3_feats]) +
                optuna_results_df['model_3b'][0] * model3b_final.predict_proba(test[model3_feats]) +
                optuna_results_df['model_3c'][0] * model3c_final.predict_proba(test[model3_feats]) +
                optuna_results_df['model_4'][0] * model4_final.predict_proba(test[model4_feats]) +
                optuna_results_df['model_4b'][0] * model4b_final.predict_proba(test[model4_feats]) +
                optuna_results_df['model_4c'][0] * model4c_final.predict_proba(test[model4_feats]) +
                optuna_results_df['model_4d'][0] * model4d_final.predict_proba(test[model4_feats]) +
                optuna_results_df['model_4e'][0] * model4e_final.predict_proba(test[model4_feats]) +
                optuna_results_df['model_5'][0] * model5_final.predict_proba(test[model5_feats]) +
                optuna_results_df['model_5b'][0] * model5b_final.predict_proba(test[model5_feats]) +
                optuna_results_df['model_5c'][0] * model5c_final.predict_proba(test[model5_feats]) +
                optuna_results_df['model_5d'][0] * model5d_final.predict_proba(test[model5_feats]) +
                optuna_results_df['model_5e'][0] * model5e_final.predict_proba(test[model5_feats]) +
                optuna_results_df['model_5f'][0] * model5f_final.predict_proba(test[model5_feats]) +
                optuna_results_df['model_6'][0] * model6_final.predict_proba(test[model6_feats]) +
                # optuna_results_df['model_6b'][0] * model6b_final.predict_proba(test[model6_feats]) +
                # optuna_results_df['model_6c'][0] * model6c_final.predict_proba(test[model6_feats]) +
                # optuna_results_df['model_6d'][0] * model6d_final.predict_proba(test[model6_feats]) +
                optuna_results_df['model_6e'][0] * model6e_final.predict_proba(test[model6_feats])
                 )

optuna_ensemble_df = pd.DataFrame(optuna_ensemble_pred)

# If all models predict 0, instead of getting NaN, fill in 0
optuna_ensemble_df = optuna_ensemble_df.div(optuna_ensemble_df.sum(axis=1), axis=0).fillna(0)
optuna_ensemble_df.columns = label_encoder.classes_

In [None]:
# Preview the first rows of the row-normalised blend that uses the optuna_results_df weights.
optuna_ensemble_df.head()

In [None]:
# Preview the first rows of the row-normalised blend that uses the results_df weights.
ensemble_df.head()

In [None]:
# Take the id column from the sample submission and place the Optuna-weighted
# class probabilities next to it (rows are matched on the index).
submission = pd.read_csv('sample_submission.csv')
id_column = submission.loc[:, 'id']
submission_df = pd.concat(objs=[id_column, optuna_ensemble_df], axis='columns')
submission_df.head()

In [None]:
# Write the current submission_df to CSV without the pandas index column.
# NOTE(review): the number in the file name looks like a recorded CV score — confirm it
# still matches the weights used before re-running.
submission_df.to_csv('submission_optuna_weights_ensemble_3fold_0.902197.csv', index=False)

# Get submission (Stacking)

In [None]:
%%time

from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import numpy as np


roc_auc_scores = []

# Define the base models
base_models = [
    ('model1', model1_final),
    ('model2', model2_final),
    ('model3', model3_final),
    ('model4', model4_final),
    ('model5', model5_final),
    ('model6', model6_final)
]

# Initialize the Stacking Classifier with LogisticRegression as the final estimator
final_estimator = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
# final_estimator = LGBMClassifier(n_jobs=-1, random_state=5)
# final_estimator = XGBClassifier(random_state=5)
# final_estimator = RandomForestClassifier(random_state=5)
# final_estimator = ExtraTreesClassifier(random_state=5)
# final_estimator = HistGradientBoostingClassifier(random_state=5)
# final_estimator = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100)

stacking_clf = StackingClassifier(estimators=base_models, final_estimator=final_estimator, passthrough=False, cv=3)

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    stacking_clf.fit(X_train, y_train)
    y_pred = stacking_clf.predict_proba(X_test)

    # Assuming your classes are 0, 1, 2, etc., adjust as necessary
    y_test_binarized = label_binarize(y_test, classes=np.unique(y))
    roc_auc = roc_auc_score(y_test_binarized, y_pred, multi_class='ovr')

    roc_auc_scores.append(roc_auc)

    print(f'Done with fold {i+1}.')
    
print(f'The average stacking score is {np.mean(roc_auc_scores)}')

- Logistic Reg - 0.886778
- LGBM - 0.885863
- XGB - 0.881636
- RF - 0.883835
- ET - 0.884523
- Hist - 0.886572
- Cat - 0.886183

In [None]:
# Predictions on unseen test data.
# NOTE(review): stacking_clf was last fitted inside the cross-validation loop, i.e. on
# the training part of the final fold only. Refit it on the full X, y first if these
# probabilities are ever meant for a submission.
y_test_pred = stacking_clf.predict_proba(test)

# Label the probability columns the same way as the other prediction frames.
stacking_df = pd.DataFrame(y_test_pred, columns=label_encoder.classes_)

# Preview the stacking predictions built in this cell.
stacking_df.head()

In [None]:
%%time

model1_results, model2_results, model3_results, model4_results, model5_results, model6_results, y_test_list = [], [], [], [], [], [], []

# # Placeholder for OOF predictions for each model
# # Assuming you have a dataset with N samples
# N = len(y)  # y_train is your target variable array
# oof_preds1 = np.zeros((N, 1))
# oof_preds2 = np.zeros((N, 1))
# oof_preds3 = np.zeros((N, 1))
# oof_preds4 = np.zeros((N, 1))
# oof_preds5 = np.zeros((N, 1))
# oof_preds6 = np.zeros((N, 1))

# # Similarly, for test predictions, accumulate them over folds
# # Assuming you have a test set with M samples
# M = len(test)  # x_test needs to be defined by you
# test_preds1 = np.zeros((M, 1))
# test_preds2 = np.zeros((M, 1))
# test_preds3 = np.zeros((M, 1))
# test_preds4 = np.zeros((M, 1))
# test_preds5 = np.zeros((M, 1))
# test_preds6 = np.zeros((M, 1))

target_length = len(y)
no_classes = len(np.unique(y))
test_length = len(test)

# Initialize arrays for OOF and test predictions with dimensions for multiclass for each model
lgbm_oof_preds = np.zeros((target_length, no_classes))
lgbm_test_preds = np.zeros((test_length, no_classes))

xgb_oof_preds = np.zeros((target_length, no_classes))
xgb_test_preds = np.zeros((test_length, no_classes))

rf_oof_preds = np.zeros((target_length, no_classes))
rf_test_preds = np.zeros((test_length, no_classes))

extrat_oof_preds = np.zeros((target_length, no_classes))
extrat_test_preds = np.zeros((test_length, no_classes))

hist_oof_preds = np.zeros((target_length, no_classes))
hist_test_preds = np.zeros((test_length, no_classes))

cat_oof_preds = np.zeros((target_length, no_classes))
cat_test_preds = np.zeros((test_length, no_classes))

X_lgbm = X[model1_feats]
X_xgb = X[model2_feats]
X_rf = X[model3_feats]
X_extrat = X[model4_feats]
X_hist = X[model5_feats]
X_cat = X[model6_feats]


In [None]:

from sklearn.base import clone

# One row per first-level model:
# (estimator, training matrix on its feature subset, feature list, OOF buffer, test buffer)
level0_models = [
    (model1, X_lgbm, model1_feats, lgbm_oof_preds, lgbm_test_preds),        # LGBM
    (model2, X_xgb, model2_feats, xgb_oof_preds, xgb_test_preds),           # XGBoost
    (model3, X_rf, model3_feats, rf_oof_preds, rf_test_preds),              # Random Forest
    (model4, X_extrat, model4_feats, extrat_oof_preds, extrat_test_preds),  # Extra Trees
    (model5, X_hist, model5_feats, hist_oof_preds, hist_test_preds),        # Hist Gradient
    (model6, X_cat, model6_feats, cat_oof_preds, cat_test_preds),           # CatBoost
]

# The test buffers are running sums. Zero them here so that re-running this cell
# does not stack a second round of predictions on top of the first.
for _, _, _, _, test_buffer in level0_models:
    test_buffer[:] = 0

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    y_train = y.iloc[train_index]

    for estimator, X_model, feats, oof_buffer, test_buffer in level0_models:
        # Fit an unfitted copy with the same hyper-parameters. The notebook earlier
        # assigns modelN_final = modelN.fit(...); if fit returns the estimator itself
        # (scikit-learn convention), refitting modelN here would silently replace that
        # full-data fit with a single-fold fit.
        fold_model = clone(estimator)
        fold_model.fit(X_model.iloc[train_index], y_train)

        # Out-of-fold probabilities go to the rows held out in this fold.
        oof_buffer[test_index] = fold_model.predict_proba(X_model.iloc[test_index])

        # Each fold contributes 1/n_splits of its test prediction, so the buffer
        # ends up holding the average over all folds.
        test_buffer += fold_model.predict_proba(test.loc[:, feats]) / sk10.n_splits

    print(f'Done with fold {i+1}.')

In [None]:
# Per-class alternative, kept for reference:
# roc_auc_scores = [roc_auc_score((y == class_id).astype(int), oof_preds[:, class_id], multi_class='ovr') for class_id in range(no_classes)]


def _macro_ovr_auc(oof_matrix):
    """Return the macro-averaged one-vs-rest ROC AUC of `oof_matrix` against `y`."""
    return roc_auc_score(y, oof_matrix, multi_class='ovr', average='macro')


# Score each model's out-of-fold probabilities and report them one by one.
lgbm_roc_auc = _macro_ovr_auc(lgbm_oof_preds)
print("Average LGBM ROC AUC Score:", lgbm_roc_auc)

xgb_roc_auc = _macro_ovr_auc(xgb_oof_preds)
print("Average XGBoost ROC AUC Score:", xgb_roc_auc)

rf_roc_auc = _macro_ovr_auc(rf_oof_preds)
print("Average Random Forest ROC AUC Score:", rf_roc_auc)

extrat_roc_auc = _macro_ovr_auc(extrat_oof_preds)
print("Average Extra Trees ROC AUC Score:", extrat_roc_auc)

hist_roc_auc = _macro_ovr_auc(hist_oof_preds)
print("Average Hist Gradient ROC AUC Score:", hist_roc_auc)

cat_roc_auc = _macro_ovr_auc(cat_oof_preds)
print("Average CatBoost ROC AUC Score:", cat_roc_auc)

# 0.89369590207664
# 0.00201442835387733
# 0.886778 - StackingClassifier

In [None]:
from sklearn.linear_model import LogisticRegression

# Second-level inputs: the six first-level models' class-probability blocks placed
# side by side (LGBM, XGBoost, Random Forest, Extra Trees, Hist Gradient, CatBoost).
# x_train uses the out-of-fold predictions, test_stack the accumulated test predictions.
x_train = np.concatenate(
    (lgbm_oof_preds, xgb_oof_preds, rf_oof_preds, extrat_oof_preds, hist_oof_preds, cat_oof_preds),
    axis=1,
)
test_stack = np.concatenate(
    (lgbm_test_preds, xgb_test_preds, rf_test_preds, extrat_test_preds, hist_test_preds, cat_test_preds),
    axis=1,
)

# The second-level learner is a multinomial logistic regression. Any other
# classifier can be swapped in here, and it can be tuned like the first level.
stacking_estimator = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')

# Fit the meta-learner on all out-of-fold rows, then predict the stacked test matrix.
stacking_estimator.fit(x_train, y)
final_predictions = stacking_estimator.predict_proba(test_stack)

In [None]:
from sklearn.base import clone

# Estimator whose performance as a second-level learner is measured here.
# Point this at any other classifier (e.g. stacking_estimator) to compare meta-learners.
meta_candidate = model2

# Out-of-fold class probabilities of the meta-learner, one row per training sample.
oof_preds = np.zeros((x_train.shape[0], no_classes))

for i, (train_index, test_index) in enumerate(sk10.split(x_train, y)):
    X_train, X_test = x_train[train_index], x_train[test_index]
    y_train = y.iloc[train_index]

    # Fit an unfitted copy so the first-level model object itself is not
    # overwritten by a fit on the stacked probability features.
    fold_model = clone(meta_candidate)
    fold_model.fit(X_train, y_train)

    # Assign predictions for this fold to the appropriate rows of oof_preds.
    oof_preds[test_index, :] = fold_model.predict_proba(X_test)

    print(f'Done with fold {i+1}.')

# Calculate ROC AUC on the OOF predictions
roc_auc = roc_auc_score(y, oof_preds, multi_class='ovr', average='macro')
print(f'The stacking score is {roc_auc}')

- Logistic Reg - 0.8883102077923056
- LGBM - 0.8880225088607244
- XGB - 0.8846028966376445
- RF - 
- ET - 
- Hist - 
- Cat - 

In [None]:
# Wrap the meta-learner's test probabilities in a frame whose columns are the
# label encoder's class names.
final_predictions_df = pd.DataFrame(final_predictions, columns=label_encoder.classes_)

In [None]:
# Pair the ids from the sample submission with the stacked class probabilities
# (rows are matched on the index).
submission = pd.read_csv('sample_submission.csv')
submission_ids = submission.loc[:, 'id']
submission_df = pd.concat(objs=[submission_ids, final_predictions_df], axis='columns')
submission_df.head()

In [None]:
# Write the current submission_df to CSV without the pandas index column.
# NOTE(review): the number in the file name looks like a recorded stacking score — confirm
# it matches the meta-learner actually used before re-running.
submission_df.to_csv('submission_stacking_3fold_0.88831.csv', index=False)