In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgbm
from catboost import CatBoostClassifier
import optuna
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
df= pd.read_csv('final_data.csv')

In [3]:
df= df.drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,family_members_in_bank,country__Germany,country__Spain,Credit_score_range,balance_bins,estimated_salary_bins,credit_score_by_customer
0,619,0,3.7612,2,0.0,1,1,1,101348.88,1,1,0,0,0,0,5,3777
1,608,0,3.73767,1,83807.86,1,0,1,112542.58,0,17,0,1,0,1,5,3777
2,502,0,3.7612,8,159660.8,3,1,0,113931.57,1,8,0,0,0,9,5,3777
3,699,0,3.688879,1,0.0,2,0,0,93826.63,0,14,0,0,2,0,4,1064
4,850,0,3.78419,2,125510.82,1,1,1,79084.1,0,20,0,1,3,5,3,2408


In [5]:
X= df.drop('Exited', axis=1)
y= df.Exited

In [6]:
oversample= ADASYN()
X, y= oversample.fit_resample(X, y)

In [7]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.25, random_state=10, stratify=y)

In [8]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

In [9]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  3


In [10]:
# remove correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((12036, 13), (4012, 13))

In [11]:
sfs = sfs(ExtraTreesClassifier(n_estimators=100, n_jobs=4, random_state=10), 
           k_features=8, # the more features we want, the longer it will take to run
           forward=False, 
           floating=False, # see the docs for more details in this parameter
           verbose=2, # this indicates how much to print out intermediate steps
           scoring='f1',
           cv=2)

sfs = sfs.fit(np.array(X_train), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:   10.9s finished

[2022-03-09 17:58:51] Features: 12/8 -- score: 0.8517834728863589[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    9.8s finished

[2022-03-09 17:59:01] Features: 11/8 -- score: 0.8554178484193125[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    8.7s finished

[2022-03-09 17:59:10] Features: 10/8 -- score: 0.853987078466029[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

In [12]:
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember',
       'EstimatedSalary', 'country__Germany', 'credit_score_by_customer'],
      dtype='object')

In [13]:
X = X[selected_feat]

In [16]:
xgb_param= {'booster': 'dart',
 'lambda': 1.9208316054245495e-06,
 'alpha': 8.317686375849313e-08,
 'max_depth': 6,
 'eta': 0.2646954221898419,
 'gamma': 0.00015552718621314075,
 'grow_policy': 'lossguide',
 'sample_type': 'uniform',
 'normalize_type': 'forest',
 'rate_drop': 1.1593898294259455e-07,
 'skip_drop': 0.000203700988137663}

In [17]:
def my_folds(model_name, params,metric):
    n_folds = 10
    subbed = []
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=10)


    for fold, (train_idx, test_idx) in tqdm(enumerate(kf.split(X_train, y_train))):
        print('=============== Fold No:',fold+1,'===============')
        X_tr, X_tst = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_tr, y_tst = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
        model = model_name(**params,eval_metric= metric, n_estimators= 2500)
        model.fit(X_tr, y_tr,eval_set=[(X_tst, y_tst)], early_stopping_rounds=150, verbose=100)
        print(accuracy_score(y_tst, model.predict(X_tst)))
        subbed.append(accuracy_score(y_tst, model.predict(X_tst)))
        pred = model.predict(X_test)
    print(np.mean(subbed))
    return pred

In [21]:
pred= my_folds(XGBClassifier, xgb_param, 'error')

0it [00:00, ?it/s]

[0]	validation_0-error:0.18355
[100]	validation_0-error:0.08223
[200]	validation_0-error:0.08306
[260]	validation_0-error:0.08306
0.9194352159468439
[0]	validation_0-error:0.18937
[100]	validation_0-error:0.10050
[200]	validation_0-error:0.09635
[300]	validation_0-error:0.09552
[390]	validation_0-error:0.09967
0.9078073089700996
[0]	validation_0-error:0.21013
[100]	validation_0-error:0.11794
[200]	validation_0-error:0.12043
[290]	validation_0-error:0.11130
0.8920265780730897
[0]	validation_0-error:0.17857
[100]	validation_0-error:0.08472
[200]	validation_0-error:0.08389
[300]	validation_0-error:0.08056
[400]	validation_0-error:0.08223
[486]	validation_0-error:0.08056
0.9227574750830565
[0]	validation_0-error:0.19269
[100]	validation_0-error:0.11379
[200]	validation_0-error:0.10963
[300]	validation_0-error:0.11296
[379]	validation_0-error:0.11296
0.893687707641196
[0]	validation_0-error:0.18355
[100]	validation_0-error:0.09468
[200]	validation_0-error:0.09552
[288]	validation_0-error:0.

In [22]:
xgb_pred= pd.DataFrame()
xgb_pred['Prediction'] = pred
xgb_pred.to_csv('xgb_prediction.csv', index=False)