In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import pandas as pd
import numpy as np
import random
from tqdm.notebook import tqdm
import optuna

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_validate, RandomizedSearchCV

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor
from category_encoders import TargetEncoder

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from pprint import pprint

pd.set_option('display.max_columns', None)

experiment_name = 'cat_lgbm_ensemble'

In [2]:
train = pd.read_csv(r'.\train.csv')
test = pd.read_csv(r'.\test.csv')

train.head(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0


In [3]:
TARGET = 'Exited'
binary_cols = ['Gender', 'HasCrCard', 'IsActiveMember']
categorical_cols = ['Geography', 'NumOfProducts']
text_cols = ['Surname']
drop_cols = ['id']

In [4]:
numerical_cols = train.drop(categorical_cols + binary_cols + drop_cols + [TARGET], axis=1).select_dtypes(include=np.number).columns
numerical_cols

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'EstimatedSalary'],
      dtype='object')

# LGBM Preprocess

In [5]:
transformer_lgbm = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', drop='if_binary'), binary_cols),
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist'), categorical_cols),
    (TargetEncoder(), text_cols),
    (StandardScaler(), numerical_cols),
    remainder='passthrough')

df_to_ohe_lgbm = train.drop(drop_cols, axis=1)
test_ohe_lgbm = test.drop('id', axis=1)

# Split the features and the target variable
X = df_to_ohe_lgbm.drop(TARGET, axis=1)
y = df_to_ohe_lgbm[TARGET]

# Fit the transformer_lgbm
transformer_lgbm.fit(X, y)

transformed_lgbm = transformer_lgbm.transform(X)

# Get the transformed_lgbm feature names
transformed_lgbm_feat_names = [name.split('__')[-1] for name in transformer_lgbm.get_feature_names_out()]

# Create DataFrame of the transformed_lgbm features
df_to_ohe_transformed_lgbm = pd.DataFrame(transformed_lgbm, columns=transformed_lgbm_feat_names)
df_to_ohe_transformed_lgbm.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
100819,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.188406,0.84839,0.156618,1.339108,-0.007253,-0.883163,0.624065


In [6]:
transformed_new_data_lgbm = transformer_lgbm.transform(test_ohe_lgbm)

# Create DataFrame of the transformed features
test_transformed_lgbm = pd.DataFrame(transformed_new_data_lgbm, columns=transformed_lgbm_feat_names)
test_transformed_lgbm.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
76336,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.251497,0.595524,-0.929482,-1.141952,1.418187,1.408093,-1.310315


In [7]:
# Reset indexes before assignment to reassign indices to both DataFrames, ensuring they are aligned
df_to_ohe_transformed_lgbm.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
df_to_ohe_transformed_lgbm[TARGET] = train[TARGET]

df_to_ohe_transformed_lgbm.head()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited
0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.318008,-0.239126,0.144135,-0.578074,-0.719973,-0.883163,1.369486,0
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.326667,0.800755,-0.367706,-0.578074,-1.432694,-0.883163,-1.254085,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.222052,0.035085,0.268974,0.211354,1.774548,-0.883163,1.437422,0
3,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.181991,0.692068,-0.941966,-0.465299,-1.076334,1.486918,-0.557018,0
4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.146341,1.038788,0.743362,-0.578074,-0.007253,-0.883163,-1.93877,0


# CatBoost Preprocess

In [8]:
transformer_cat = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', drop='if_binary'), binary_cols),
    (OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist'), categorical_cols),
    (TargetEncoder(), text_cols),
    (MaxAbsScaler(), numerical_cols),
    remainder='passthrough')

df_to_ohe_cat = train.drop(drop_cols, axis=1)
test_ohe_cat = test.drop('id', axis=1)

# transformed = transformer_cat.fit_transform(df_to_ohe_cat)

# Split the features and the target variable
X = df_to_ohe_cat.drop(TARGET, axis=1)
y = df_to_ohe_cat[TARGET]

# Fit the transformer_cat
transformer_cat.fit(X, y)

transformed_cat = transformer_cat.transform(X)

# Get the transformed_cat feature names
transformed_cat_feat_names = [name.split('__')[-1] for name in transformer_cat.get_feature_names_out()]

# Create DataFrame of the transformed_cat features
df_to_ohe_transformed_cat = pd.DataFrame(transformed_cat, columns=transformed_cat_feat_names)
df_to_ohe_transformed_cat.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
141866,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.160653,0.990443,0.804706,0.456522,0.3,0.0,0.474393


In [9]:
transformed_new_data_cat = transformer_cat.transform(test_ohe_cat)

# Create DataFrame of the transformed features
test_transformed_cat = pd.DataFrame(transformed_new_data_cat, columns=transformed_cat_feat_names)
test_transformed_cat.sample()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary
100551,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.216381,0.998459,0.88,0.293478,0.2,0.0,0.715275


In [10]:
# Reset indexes before assignment to reassign indices to both DataFrames, ensuring they are aligned
df_to_ohe_transformed_cat.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)
df_to_ohe_transformed_cat[TARGET] = train[TARGET]

df_to_ohe_transformed_cat.head()

Unnamed: 0,Gender_Male,HasCrCard_1.0,IsActiveMember_1.0,Geography_France,Geography_Germany,Geography_Spain,NumOfProducts_1,NumOfProducts_2,NumOfProducts_3,NumOfProducts_4,Surname,CustomerId,CreditScore,Age,Tenure,Balance,EstimatedSalary,Exited
0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.318008,0.9911,0.785882,0.358696,0.3,0.0,0.907284,0
1,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.326667,0.995794,0.737647,0.358696,0.1,0.0,0.247527,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.222052,0.992338,0.797647,0.434783,1.0,0.0,0.924368,0
3,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.181991,0.995304,0.683529,0.369565,0.2,0.593398,0.42282,0
4,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.146341,0.996869,0.842353,0.358696,0.5,0.0,0.075347,0


# Model Pre-Works

- Mutual Information Features

In [11]:
cat_features = ['Age',
    'NumOfProducts_2',
    'IsActiveMember_1.0',
    'Surname',
    'Geography_Germany',
    'NumOfProducts_3',
    'Balance',
    'Gender_Male',
    'HasCrCard_1.0',
    'EstimatedSalary',
    'CreditScore',
    'NumOfProducts_4',
    'Geography_Spain']

lgbm_features = ['Age',
    'NumOfProducts_2',
    'NumOfProducts_1',
    'IsActiveMember_1.0',
    'Surname',
    'Geography_Germany',
    'Balance',
    'Gender_Male',
    'HasCrCard_1.0',
    'EstimatedSalary',
    'CreditScore',
    'NumOfProducts_4',
    'Tenure']

- Cross Validation Scheme

In [12]:
sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

- LGBM Feature Selection

In [13]:
X_lgbm = df_to_ohe_transformed_lgbm[lgbm_features]
y_lgbm = df_to_ohe_transformed_lgbm[TARGET]

test_features_lgbm = test_transformed_lgbm.copy()

- CatBoost Feature Selection

In [14]:
X_cat = df_to_ohe_transformed_cat[cat_features]
y_cat = df_to_ohe_transformed_cat[TARGET]

test_features_cat = test_transformed_cat.copy()

- Models Setup (LGBM Models)

In [15]:
params_1 = {'n_estimators': 3414, 'max_depth': 40, 'num_leaves': 25, 'learning_rate': 0.0062645811308150124, 'reg_alpha': 8.873443530076086, 'reg_lambda': 5.885447781455638, 'subsample': 0.7884773833791413, 'colsample_bytree': 0.5089526054590882, 'random_state':5}
params_2 = {'n_estimators': 3124, 'max_depth': 34, 'num_leaves': 102, 'learning_rate': 0.005412223162683512, 'reg_alpha': 3.507801184150934, 'reg_lambda': 8.456634587433385, 'subsample': 0.7681541520182114, 'colsample_bytree': 0.3465406359524291, 'random_state':5}
params_3 = {'subsample': 0.487015585813217, 'reg_alpha': 0.534558240211952, 'num_leaves': 70, 'learning_rate': 0.11404268301840845, 'colsample_bytree': 0.5130103309743103, 'random_state':5}
params_4 = {'subsample': 0.4583793258901043, 'reg_alpha': 6.898808410934859, 'num_leaves': 27, 'learning_rate': 0.16883582860379132, 'colsample_bytree': 0.48557488149384453, 'random_state':5}

model1 = LGBMClassifier(n_jobs=-1, **params_1)
model2 = LGBMClassifier(n_jobs=-1, **params_2)
model3 = LGBMClassifier(n_jobs=-1, **params_3)
model4 = LGBMClassifier(n_jobs=-1, **params_4)

- Models Setup (CatBoost Models)

In [16]:
params_5 = {'learning_rate': 0.1312687813101137, 'iterations': 890, 'depth': 4, 'random_strength': 18, 'bagging_temperature': 0.6148156931768988, 'l2_leaf_reg': 9.92558710415279, 'colsample_bylevel': 0.6541946738300729, 'min_data_in_leaf': 62}
params_6 = {'learning_rate': 0.12269692235899617, 'iterations': 866, 'depth': 4, 'random_strength': 20, 'bagging_temperature': 0.3339550649036276, 'l2_leaf_reg': 8.148875418094395, 'colsample_bylevel': 0.7768533359939037, 'min_data_in_leaf': 69}
params_7 = {'learning_rate': 0.06255691338662156, 'iterations': 1256, 'depth': 5, 'random_strength': 12, 'bagging_temperature': 0.5861400278890232, 'l2_leaf_reg': 8.327302510475775, 'colsample_bylevel': 0.6677190482303746, 'min_data_in_leaf': 77}

model5 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100, eval_metric='AUC', **params_5)
model6 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100, eval_metric='AUC', **params_6)
model7 = CatBoostClassifier(random_state=5, verbose=False, early_stopping_rounds=100, eval_metric='AUC', **params_7)

# Model Fitting

- LGBM

In [17]:
model1_results, model2_results, model3_results, model4_results, model5_results, model6_results, model7_results, y_test_list = [], [], [], [], [], [], [], []

for i, (train_index, test_index) in enumerate(sk.split(X_lgbm, y_lgbm)):
    X_train_lgbm, X_test_lgbm = X_lgbm.iloc[train_index], X_lgbm.iloc[test_index]
    X_train_cat, X_test_cat = X_cat.iloc[train_index], X_cat.iloc[test_index]
    y_train, y_test = y_lgbm.iloc[train_index], y_lgbm.iloc[test_index]

    model1.fit(X_train_lgbm, y_train)
    model_1_pred_proba = model1.predict_proba(X_test_lgbm)[:, 1]
    model1_results.append(model_1_pred_proba)

    model2.fit(X_train_lgbm, y_train)
    model_2_pred_proba = model2.predict_proba(X_test_lgbm)[:, 1]
    model2_results.append(model_2_pred_proba)

    model3.fit(X_train_lgbm, y_train)
    model_3_pred_proba = model3.predict_proba(X_test_lgbm)[:, 1]
    model3_results.append(model_3_pred_proba)

    model4.fit(X_train_lgbm, y_train)
    model_4_pred_proba = model4.predict_proba(X_test_lgbm)[:, 1]
    model4_results.append(model_4_pred_proba)

    model5.fit(X_train_cat, y_train)
    model_5_pred_proba = model5.predict_proba(X_test_cat)[:, 1]
    model5_results.append(model_5_pred_proba)

    model6.fit(X_train_cat, y_train)
    model_6_pred_proba = model6.predict_proba(X_test_cat)[:, 1]
    model6_results.append(model_6_pred_proba)

    model7.fit(X_train_cat, y_train)
    model_7_pred_proba = model7.predict_proba(X_test_cat)[:, 1]
    model7_results.append(model_7_pred_proba)
    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

Done with fold 1.
Done with fold 2.
Done with fold 3.
Done with fold 4.
Done with fold 5.
Done with fold 6.
Done with fold 7.
Done with fold 8.
Done with fold 9.
Done with fold 10.


# Ensembling

- Weighted Ensembling

In [18]:
model1_weights, model2_weights, model3_weights, model4_weights, model5_weights, model6_weights, model7_weights, scores = [], [], [], [], [], [], [], []

for i in tqdm(range(5000)):
    weight_1 = np.random.random_sample(size=1)[0]
    weight_2 = np.random.random_sample(size=1)[0]
    weight_3 = np.random.random_sample(size=1)[0]
    weight_4 = np.random.random_sample(size=1)[0]
    weight_5 = np.random.random_sample(size=1)[0]
    weight_6 = np.random.random_sample(size=1)[0]
    weight_7 = np.random.random_sample(size=1)[0]

    model1_weights.append(weight_1)
    model2_weights.append(weight_2)
    model3_weights.append(weight_3)
    model4_weights.append(weight_4)
    model5_weights.append(weight_4)
    model6_weights.append(weight_4)
    model7_weights.append(weight_4)

    scores_in = []

    for j in range(10):
        weighted_pred = weight_1 * model1_results[j] + weight_2 * model2_results[j] + weight_3 * model3_results[j] + weight_4 * model4_results[j] + weight_5 * model5_results[j] + weight_6 * model6_results[j] + weight_7 * model7_results[j]
        scores_in.append(roc_auc_score(y_test_list[j], weighted_pred))

    scores.append(np.mean(scores_in))

  0%|          | 0/5000 [00:00<?, ?it/s]

In [19]:
results_df = pd.DataFrame()
results_df['model_1'] = model1_weights
results_df['model_2'] = model2_weights
results_df['model_3'] = model3_weights
results_df['model_4'] = model4_weights
results_df['model_5'] = model5_weights
results_df['model_6'] = model6_weights
results_df['model_7'] = model7_weights
results_df['score'] = scores
results_df = results_df.sort_values(by='score', ascending=False).reset_index(drop=True)
results_df.head(10)

Unnamed: 0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,score
0,0.320669,0.948636,0.348367,0.036099,0.036099,0.036099,0.036099,0.898304
1,0.41807,0.889776,0.25535,0.015707,0.015707,0.015707,0.015707,0.898301
2,0.433888,0.928093,0.334309,0.078873,0.078873,0.078873,0.078873,0.898301
3,0.141888,0.913882,0.510163,0.041475,0.041475,0.041475,0.041475,0.898299
4,0.382556,0.938201,0.7757,0.050528,0.050528,0.050528,0.050528,0.898299
5,0.626826,0.864845,0.309714,0.003094,0.003094,0.003094,0.003094,0.898299
6,0.33153,0.937383,0.412242,0.009118,0.009118,0.009118,0.009118,0.898299
7,0.526378,0.71367,0.286068,0.050308,0.050308,0.050308,0.050308,0.898299
8,0.405129,0.881968,0.312517,0.058482,0.058482,0.058482,0.058482,0.898298
9,0.796939,0.809409,0.638827,0.002963,0.002963,0.002963,0.002963,0.898297


- Optuna Weights Ensembling

In [20]:
def objective(trial):

    roc_auc_scores = []

    # Suggest weights for ach model
    weight1 = trial.suggest_float('weight1', 0, 1)
    weight2 = trial.suggest_float('weight2', 0, 1)
    weight3 = trial.suggest_float('weight3', 0, 1)
    weight4 = trial.suggest_float('weight4', 0, 1)
    weight5 = trial.suggest_float('weight5', 0, 1)
    weight6 = trial.suggest_float('weight6', 0, 1)
    weight7 = trial.suggest_float('weight7', 0, 1)

    for j in range(10):
        final_pred = weight1 * model1_results[j] + weight2 * model2_results[j] + weight3 * model3_results[j] + weight4 * model4_results[j] + weight5 * model5_results[j] + weight6 * model6_results[j] + weight7 * model7_results[j]

        # Compute and store ROC AUC score for the current fold
        roc_auc_scores.append(roc_auc_score(y_test_list[j], final_pred))

    return np.mean(roc_auc_scores)

n_trials = 5000

# Initialize progress bar
progress_bar = tqdm(total=n_trials, desc='Ensembling', position=0)

# Callback function to update progress bar
def callback(study, trial):
    progress_bar.update(1)

# Create study with study with pruner
# pruner = optuna.pruners.MedianPruner(n_startup_trials=3, n_warmup_steps=3)
study = optuna.create_study(study_name='ensembling', direction='maximize')
study.optimize(objective, n_trials=n_trials, callbacks=[callback])

best_weights = study.best_params
print(best_weights)

progress_bar.close()

Ensembling:   0%|          | 0/5000 [00:00<?, ?it/s]

[I 2024-01-18 07:47:44,766] A new study created in memory with name: ensembling
[I 2024-01-18 07:47:44,842] Trial 0 finished with value: 0.8982239045161048 and parameters: {'weight1': 0.6063026198928508, 'weight2': 0.12723132472615828, 'weight3': 0.6663477015425698, 'weight4': 0.532658279457278, 'weight5': 0.4680949175455682, 'weight6': 0.3025456995263822, 'weight7': 0.9443461697994782}. Best is trial 0 with value: 0.8982239045161048.
[I 2024-01-18 07:47:44,920] Trial 1 finished with value: 0.8982128846514399 and parameters: {'weight1': 0.532164064950811, 'weight2': 0.01618292108354702, 'weight3': 0.5554090796041276, 'weight4': 0.2836823436551238, 'weight5': 0.5738919967709194, 'weight6': 0.8057652014692781, 'weight7': 0.3991445155579221}. Best is trial 0 with value: 0.8982239045161048.
[I 2024-01-18 07:47:44,970] Trial 2 finished with value: 0.8982035872435041 and parameters: {'weight1': 0.6199602221626169, 'weight2': 0.03970834833260184, 'weight3': 0.47935108745610655, 'weight4': 0.6

{'weight1': 0.3701508300577049, 'weight2': 0.9488082612036773, 'weight3': 0.44646418829669776, 'weight4': 0.036670235296457854, 'weight5': 0.8555983069610744, 'weight6': 0.3298182467435091, 'weight7': 0.31364436868957835}


In [23]:
optuna_weights = {'weight1': 0.3701508300577049,
                  'weight2': 0.9488082612036773,
                  'weight3': 0.44646418829669776,
                  'weight4': 0.036670235296457854,
                  'weight5': 0.8555983069610744,
                  'weight6': 0.3298182467435091,
                  'weight7': 0.31364436868957835}

# Final Predictions

In [21]:
model1_final = model1.fit(X_lgbm, y_lgbm)
model2_final = model2.fit(X_lgbm, y_lgbm)
model3_final = model3.fit(X_lgbm, y_lgbm)
model4_final = model4.fit(X_lgbm, y_lgbm)
model5_final = model5.fit(X_cat, y_cat)
model6_final = model6.fit(X_cat, y_cat)
model7_final = model7.fit(X_cat, y_cat)

In [24]:
test_features_lgbm

# # Using random weights results
# ensemble_pred = (
#     results_df['model_1'][0] * model1_final.predict_proba(test_features_lgbm[lgbm_features]) +
#     results_df['model_2'][0] * model2_final.predict_proba(test_features_lgbm[lgbm_features]) +
#     results_df['model_3'][0] * model3_final.predict_proba(test_features_lgbm[lgbm_features]) +
#     results_df['model_4'][0] * model4_final.predict_proba(test_features_lgbm[lgbm_features]) +
#     results_df['model_5'][0] * model5_final.predict_proba(test_features_cat[cat_features]) +
#     results_df['model_6'][0] * model6_final.predict_proba(test_features_cat[cat_features]) +
#     results_df['model_7'][0] * model7_final.predict_proba(test_features_cat[cat_features])
# )

# Using Optuna Weights results
ensemble_pred = (
    optuna_weights['weight1'] * model1_final.predict_proba(test_features_lgbm[lgbm_features]) +
    optuna_weights['weight2'] * model2_final.predict_proba(test_features_lgbm[lgbm_features]) +
    optuna_weights['weight3'] * model3_final.predict_proba(test_features_lgbm[lgbm_features]) +
    optuna_weights['weight4'] * model4_final.predict_proba(test_features_lgbm[lgbm_features]) +
    optuna_weights['weight5'] * model5_final.predict_proba(test_features_cat[cat_features]) +
    optuna_weights['weight6'] * model6_final.predict_proba(test_features_cat[cat_features]) +
    optuna_weights['weight7'] * model7_final.predict_proba(test_features_cat[cat_features])
)

ensemble_df = pd.DataFrame(ensemble_pred)
ensemble_df = ensemble_df.div(ensemble_df.sum(axis=1), axis=0)
ensemble_df.head()

Unnamed: 0,0,1
0,0.979267,0.020733
1,0.15726,0.84274
2,0.979041,0.020959
3,0.7833,0.2167
4,0.616465,0.383535


In [25]:
# Select the probability of 1 column and create submission df
ensemble_df = ensemble_df.iloc[:, 1]

submission = pd.read_csv('sample_submission.csv')
submission.loc[:, 'Exited'] = ensemble_df.values
submission.head()

Unnamed: 0,id,Exited
0,165034,0.020733
1,165035,0.84274
2,165036,0.020959
3,165037,0.2167
4,165038,0.383535


In [None]:
submission.to_csv('submission_optunaensemble_0.898307cv.csv', index=False)