In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgbm 
import catboost as cb
from sklearn.model_selection import train_test_split
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.metrics import log_loss, roc_auc_score
import optuna
from catboost import CatBoostClassifier
import tensorflow as tf 
from sklearn.linear_model import LogisticRegression
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import tensorflow_probability as tfp

In [None]:
# Load the Playground Series S4E1 competition files and the additional
# bank-customer-churn dataset from their Kaggle input mounts.
# `sample` is the submission template that is filled in and written out at
# the end of the notebook.
train = pd.read_csv("/kaggle/input/playground-series-s4e1/train.csv")
original_train = pd.read_csv("/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv")
test = pd.read_csv("/kaggle/input/playground-series-s4e1/test.csv")
sample = pd.read_csv("/kaggle/input/playground-series-s4e1/sample_submission.csv")

In [None]:
# Summary statistics of the competition training frame, transposed so each
# column is shown as a row.
train.describe().T

In [None]:
# Same summary for the additional dataset, for side-by-side comparison.
original_train.describe().T

In [None]:
# Report the cardinality of every column in the competition training frame.
for column_name, unique_count in train.nunique().items():
    print (f"The number of unique values in {column_name} are {unique_count}")

In [None]:
# Report the cardinality of every column in the additional dataset.
for column_name, unique_count in original_train.nunique().items():
    print (f"The number of unique values in {column_name} are {unique_count}")

In [None]:
# Column dtypes and non-null counts of the competition training frame.
train.info()

In [None]:
# Append the additional dataset to the competition training data.
# `RowNumber` is removed first; errors="ignore" keeps the cell re-runnable
# after the column has already been dropped.
original_train = original_train.drop(columns=['RowNumber'], errors='ignore')
train = pd.concat([train, original_train], ignore_index=True)
# drop_duplicates() returns a new frame, so the result must be assigned back;
# calling it without assignment leaves `train` unchanged.
train = train.drop_duplicates(ignore_index=True)
train.shape

In [None]:
# Drop rows with missing values, ignoring the `id` column.
# NOTE(review): rows appended from `original_train` presumably have no `id`
# (NaN after the concat), so an unrestricted dropna() would discard every one
# of them. `id` is removed before modelling, so it is excluded from the check.
train.dropna(subset=[col for col in train.columns if col != 'id'], inplace=True)

In [None]:
# Per-column count of remaining missing values.
train.isna().sum()

# BASIC EDA

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# everything else as numerical.
categorical_columns = list(train.select_dtypes(include=['object']).columns)
numerical_columns = list(train.select_dtypes(exclude=['object']).columns)

In [None]:
# Display the names of the object-dtype (categorical) columns.
categorical_columns

In [None]:
# Overlay the KDE of every numerical column on a single set of axes.
# NOTE(review): columns on very different scales may be hard to compare on
# shared axes — confirm this overview is still useful.
sns.kdeplot(train[numerical_columns])

In [None]:
def plot_kde_for_all_columns(df, num_cols=2):
    """Draw one filled KDE plot per column of ``df`` on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted; each column is passed to
        ``sns.kdeplot`` on its own axes.
    num_cols : int, default 2
        Number of subplot columns in the grid. The number of rows is derived
        from the number of DataFrame columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when ``df``
        has no columns.

    Raises
    ------
    ValueError
        If ``num_cols`` is smaller than 1.
    """
    if num_cols < 1:
        raise ValueError("num_cols must be at least 1")

    columns = df.columns
    # An empty frame would request a 0-row subplot grid, which matplotlib
    # rejects; there is nothing to plot, so return before touching any state.
    if len(columns) == 0:
        return

    sns.set(style="whitegrid")
    num_rows = math.ceil(len(columns) / num_cols)
    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid.
    fig, axes = plt.subplots(
        num_rows, num_cols, figsize=(12, 2 * num_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, column in zip(axes, columns):
        sns.kdeplot(df[column], ax=ax, fill=True)
        ax.set_title(f'KDE Plot for {column}')
        ax.set_xlabel(column)

    # Remove the unused trailing axes when the grid has more cells than columns.
    for unused_ax in axes[len(columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

In [None]:
# One KDE subplot per numerical column of the training frame.
plot_kde_for_all_columns(train[numerical_columns])

In [None]:
# Preview the first rows only instead of rendering the whole frame.
train.head()

# Approach 1 

## Drop the id column (CustomerId and Surname are kept) and then label-encode the categorical columns

In [None]:
# Alternative (disabled): also drop the CustomerId and Surname columns.
# train.drop(['id', 'CustomerId', 'Surname'], axis=1, inplace = True)
# test.drop(['id', 'CustomerId', 'Surname'], axis=1, inplace = True)

In [None]:
# Remove the `id` column from both frames before encoding and modelling.
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [None]:
# Recompute the object-dtype column names now that `id` has been removed.
categorical_columns = list(train.select_dtypes(include=['object']).columns)

In [None]:
# Label-encode every categorical column with ONE mapping shared by train and
# test. Fitting a fresh encoder on each frame separately assigns integer codes
# from each frame's own sorted vocabulary, so the same category could receive
# different codes in train and test.
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    # Fit on the union of values so that categories seen only in test are
    # still encodable.
    label_encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = label_encoder.transform(train[col])
    test[col] = label_encoder.transform(test[col])
    # Keep the fitted encoder so codes can be mapped back with
    # inverse_transform if needed.
    label_encoders[col] = label_encoder

# Modelling

In [None]:
# Separate the target from the features without mutating `train`, so the cell
# can be re-run safely (pop() removes the column and fails on a second run).
y = train['Exited']
X = train.drop(columns=['Exited'])

In [None]:
# Hold out 30% of the labelled rows for validation.
# random_state makes the split (and every score computed on it) reproducible;
# stratify keeps the class ratio of `y` the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: validation ROC AUC (higher is better).

    Samples a hyper-parameter set from ``trial``, fits an ``XGBClassifier`` on
    the module-level ``X_train``/``y_train`` and scores it on
    ``X_test``/``y_test``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation split.
    """
    params = {
        # Binary task: `num_class` and `mlogloss` belong to the multi-class
        # objectives and must not be combined with binary:logistic.
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'eta': trial.suggest_float('eta', 1e-8, 1.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        # Both sampling ratios must be strictly positive, so the lower bound
        # is 0.1 rather than 0.0.
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    # Column 1 holds the probability of the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_pred_proba)

# Toggle: set `run` to 1 to launch the XGBoost hyper-parameter search.
run = 0

if run == 1:
    # objective() returns a ROC AUC score, which is higher-is-better, so the
    # study has to maximize it; minimizing would select the worst trial.
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=30)
    print('Best trial:')
    trial = study.best_trial

    print('Value: {}'.format(trial.value))
    print('Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

In [None]:


def objective(trial):
    """Optuna objective for LightGBM: validation ROC AUC (higher is better).

    Trains a booster on the module-level ``X_train``/``y_train`` with early
    stopping on ``X_test``/``y_test`` and scores its predictions on the same
    validation split. The returned value is a higher-is-better score, matching
    both the ``'auc'`` training metric and a study created with
    ``direction="maximize"``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``num_leaves``, ``feature_fraction`` and
        ``bagging_fraction``.

    Returns
    -------
    float
        ROC AUC of the predicted scores on the validation split.
    """
    parameters = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': 'true',
        'boosting': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 31, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.8, step=0.1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.8, step=0.1),
        'bagging_freq': 20,
        'learning_rate': 0.01,
        'verbose': -1,
    }

    train_data = lgbm.Dataset(data=X_train, label=y_train)
    # reference=train_data makes the validation set reuse the training bins.
    valid_data = lgbm.Dataset(data=X_test, label=y_test, reference=train_data)
    model = lgbm.train(
        parameters,
        train_set=train_data,
        valid_sets=[valid_data],
        num_boost_round=5000,
        callbacks=[lgbm.early_stopping(stopping_rounds=1000)],
    )
    ypred = model.predict(X_test)
    # Return AUC rather than log-loss: a maximizing study fed with log-loss
    # would pick the worst model.
    return roc_auc_score(y_test, ypred)

# Toggle for the LightGBM hyper-parameter search. As written the search is
# disabled: `run` is set to 2 but the guard checks for 3.
run = 2
if run == 3 :
    # NOTE(review): direction is "maximize", so `objective` must return a
    # higher-is-better score (e.g. AUC), not a loss — confirm before enabling.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials = 50)

Below are the parameters for xgboost.

In [None]:
# Fixed XGBoost hyper-parameters used for the final model.
xgb_params = {
    # booster type and tree shape
    'booster': 'gbtree',
    'max_depth': 3,
    'n_estimators': 137,
    # learning rate and split threshold
    'eta': 0.17374299923922656,
    'gamma': 1.2505690952357777e-06,
    'min_child_weight': 2.650197692280842,
    # L2 / L1 regularisation
    'lambda': 0.8611971458776956,
    'alpha': 3.3684132992886347e-07,
    # column and row sampling
    'colsample_bytree': 0.8361517621930924,
    'subsample': 0.645735940099335,
}

In [None]:
# Build the XGBoost classifier from the fixed parameters and fit it on the
# full labelled set `X` (which includes the rows held out as X_test).
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X,y)

In [None]:
# Sanity-check prediction only.
# NOTE(review): X_test is a subset of X, which xgb_model was fitted on, so
# these predictions are not an unbiased validation estimate.
y_preds = xgb_model.predict(X_test)

In [None]:
# Confirm one prediction was produced per validation row.
y_preds.shape

In [None]:
# Fixed LightGBM hyper-parameters used for the final model.
lgb_params = dict(
    n_estimators=5000,
    max_depth=50,
    learning_rate=0.03,
    min_child_weight=0.81,
    min_child_samples=190,
    subsample=0.88,
    subsample_freq=2,
    random_state=42,
    colsample_bytree=0.62,
    num_leaves=15,
)

In [None]:
# Build the LightGBM classifier from the fixed parameters and fit it on the
# full labelled set `X`.
lgbm_model = lgbm.LGBMClassifier(**lgb_params)
lgbm_model.fit(X,y)

In [None]:
# CatBoost with default hyper-parameters, fitted on the full labelled set.
# verbose=0 silences the per-iteration training log, which otherwise floods
# the notebook output (here and again when the ensemble refits the model).
cb_model = CatBoostClassifier(verbose=0)
cb_model.fit(X,y)

## Neural Network

In [None]:
# Seed Keras (and the RNGs it manages) so the run is repeatable.
keras.utils.set_random_seed(42)

# Training callbacks: stop when val_loss stalls (restoring the best weights),
# shrink the learning rate on plateaus, and abort if the loss becomes NaN.
callbacks_list = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30, verbose=2, mode='min', restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=3, min_lr=0.00001),
    tf.keras.callbacks.TerminateOnNaN(),
]

# Small fully connected binary classifier. The input is declared with
# keras.Input instead of passing `input_shape` to the first layer.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dense(16, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])


def loss_fn(y_true, y_pred):
    """Binary cross-entropy between the labels and predicted probabilities."""
    return tf.keras.losses.BinaryCrossentropy()(y_true, y_pred)


def metric_fn(y_true, y_pred):
    """Stateless binary accuracy of the predictions.

    Uses the functional ``binary_accuracy`` rather than calling a shared
    ``BinaryAccuracy`` metric object: such an object accumulates state on
    every call and would never be reset between batches, epochs, or the
    training and validation passes.
    """
    return tf.keras.metrics.binary_accuracy(y_true, y_pred)


model.compile(
    optimizer=tf.keras.optimizers.Adam(0.013, beta_1=0.5),
    loss=loss_fn,
    metrics=[metric_fn],
)

# No class_weight is passed: the value previously read from the LightGBM model
# was None (lgb_params sets no class weights), and reading it made this cell
# depend on an unrelated model having been created first.
history = model.fit(
    X_train.astype('float32'), y_train.astype('float32'),
    validation_data=(X_test.astype('float32'), y_test.astype('float32')),
    epochs=1,
    callbacks=callbacks_list,
    verbose=1,
)

## Stacking Ensemble

In [None]:
# Optional stacking experiment. It is disabled as written: `run` is the
# integer 90, so the comparison with the string 'stacker' is never true.
run = 90
if run == 'stacker':
    base_learners = [('m1', xgb_model), ('m2', lgbm_model)]
    stacker = StackingClassifier(
        estimators=base_learners,
        final_estimator=LogisticRegression(),
    )
    stacker.fit(X_train, y_train)
    l1 = stacker.predict_proba(X_test)
    # Validation log-loss of the stacked class probabilities.
    print(log_loss(y_test, l1))

## Voting Ensemble

In [None]:
# Soft-voting ensemble of the three boosted models, weighted 0.2 / 0.4 / 0.4
# (XGBoost / LightGBM / CatBoost), fitted on the full labelled set.
ensemble_members = [('m1', xgb_model), ('m2', lgbm_model), ('m3', cb_model)]
voter = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[0.2, 0.4, 0.4],
)
voter.fit(X, y)

# Submission

In [None]:
# Class probabilities of the voting ensemble for the competition test set.
y_preds =voter.predict_proba(test)

In [None]:
# Column 1 of predict_proba is the probability of the positive class.
sample['Exited'] = y_preds[:,1]

In [None]:
# Write the submission file without the DataFrame index column.
sample.to_csv('submission.csv', index=False)