In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [3]:
# Lets combine train and test file

In [4]:
train.shape

(165034, 14)

In [5]:
test.shape

(110023, 13)

In [6]:
data = pd.concat([train,test],axis=0,ignore_index=True)

### Missing Values

In [7]:
data.isnull().sum()

id                      0
CustomerId              0
Surname                 0
CreditScore             0
Geography               0
Gender                  0
Age                     0
Tenure                  0
Balance                 0
NumOfProducts           0
HasCrCard               0
IsActiveMember          0
EstimatedSalary         0
Exited             110023
dtype: int64

### Encoding

In [8]:
data.head(2)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0.0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0.0


In [9]:
# droping id, customerId, surname
data.drop(columns=['id','CustomerId','Surname'],inplace = True)

In [10]:
data.head(2)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0.0
1,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0.0


In [11]:
data['Geography'].value_counts()

France     157386
Spain       60126
Germany     57545
Name: Geography, dtype: int64

In [12]:
data_e = pd.get_dummies(data,drop_first=True,dtype=int)

In [13]:
data_e.head(2)

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,668,33.0,3,0.0,2,1.0,0.0,181449.97,0.0,0,0,1
1,627,33.0,1,0.0,2,1.0,1.0,49503.5,0.0,0,0,1


In [14]:
x_pred = data_e[data_e['Exited'].isnull()]

In [15]:
x_pred.drop(columns=['Exited'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_pred.drop(columns=['Exited'],inplace=True)


In [16]:
remaining = data_e[data_e['Exited'].notnull()]

In [17]:
remaining

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,668,33.0,3,0.00,2,1.0,0.0,181449.97,0.0,0,0,1
1,627,33.0,1,0.00,2,1.0,1.0,49503.50,0.0,0,0,1
2,678,40.0,10,0.00,2,1.0,0.0,184866.69,0.0,0,0,1
3,581,34.0,2,148882.54,1,1.0,1.0,84560.88,0.0,0,0,1
4,716,33.0,5,0.00,2,1.0,1.0,15068.83,0.0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
165029,667,33.0,2,0.00,1,1.0,1.0,131834.75,0.0,0,1,0
165030,792,35.0,3,0.00,1,0.0,0.0,131834.45,0.0,0,0,1
165031,565,31.0,5,0.00,1,1.0,1.0,127429.56,0.0,0,0,1
165032,554,30.0,7,161533.00,1,0.0,1.0,71173.03,0.0,0,1,0


In [18]:
# train test split
from sklearn.model_selection import train_test_split

In [19]:
x = remaining.drop(columns=['Exited'])
y = remaining['Exited']

In [20]:
x_train,x_test,y_train,y_test  = train_test_split(x,y,train_size=0.7,random_state=2,stratify = y)

### Descision Tree

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,roc_auc_score,roc_curve
from sklearn.model_selection import GridSearchCV

In [22]:
params = {'max_depth':[3,5,7,9,11,13,15],
         'min_samples_split':[10,20,30,40,50]}

In [23]:
gscv = GridSearchCV(estimator=DecisionTreeClassifier(),
                   param_grid=params,
                   scoring = 'roc_auc',
                   cv=10,verbose=1)

In [24]:
gscv.fit(x_train,y_train)

Fitting 10 folds for each of 35 candidates, totalling 350 fits


In [25]:
gscv.best_params_

{'max_depth': 7, 'min_samples_split': 50}

In [26]:
gscv.best_score_

0.8822901178067385

In [27]:
model = DecisionTreeClassifier(**gscv.best_params_)

In [28]:
model.fit(x_train,y_train)

In [29]:
pred = model.predict(x_test)
prob = model.predict_proba(x_test)[:,1]

In [30]:
print(classification_report(y_test,pred))
print(roc_auc_score(y_test,prob))

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     39035
         1.0       0.73      0.55      0.63     10476

    accuracy                           0.86     49511
   macro avg       0.81      0.75      0.77     49511
weighted avg       0.85      0.86      0.85     49511

0.8811489911272489


In [31]:
# Lets make prediction for the submission
pred_prob = model.predict_proba(x_pred)[:,1]

In [32]:
pred_prob

array([0.01511783, 0.83928571, 0.03498779, ..., 0.01710965, 0.14580741,
       0.13875749])

In [33]:
submission['Exited']=pred_prob

In [34]:
submission

Unnamed: 0,id,Exited
0,165034,0.015118
1,165035,0.839286
2,165036,0.034988
3,165037,0.306208
4,165038,0.287117
...,...,...
110018,275052,0.055152
110019,275053,0.055152
110020,275054,0.017110
110021,275055,0.145807


In [35]:
# submission.to_csv('submission1.csv',index=False)

In [36]:
def model_validation(model,xtrain,ytrain,xtest,ytest):
    global m
    m = model
    m.fit(xtrain,ytrain)
    pred = m.predict(xtest) #hard prediction(0,1)
    prob = m.predict_proba(xtest)[:,1]  #soft prediction(probablity of 1)
    
    print(classification_report(y_test,pred))
    print(roc_auc_score(y_test,prob))
    
    pred_prob = model.predict_proba(x_pred)[:,1]

In [38]:
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier
model_validation(RandomForestClassifier(max_depth=4),
                x_train,y_train,x_test,y_test)

              precision    recall  f1-score   support

         0.0       0.84      0.98      0.91     39035
         1.0       0.83      0.31      0.45     10476

    accuracy                           0.84     49511
   macro avg       0.83      0.65      0.68     49511
weighted avg       0.84      0.84      0.81     49511

0.8754853842458279


In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

In [40]:
model_validation(BaggingClassifier(estimator=KNeighborsClassifier(),n_estimators=100),
                x_train,y_train,x_test,y_test)

              precision    recall  f1-score   support

         0.0       0.79      0.94      0.86     39035
         1.0       0.30      0.09      0.14     10476

    accuracy                           0.76     49511
   macro avg       0.55      0.52      0.50     49511
weighted avg       0.69      0.76      0.71     49511

0.5819109418207966


In [43]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier

In [44]:
model_validation(BaggingClassifier(estimator=GaussianNB(),n_estimators=100),
                x_train_b,y_train_b,x_test,y_test)

NameError: name 'x_train_b' is not defined

In [45]:
from sklearn.ensemble import AdaBoostClassifier

In [46]:
model_validation(AdaBoostClassifier(n_estimators=200),x_train,y_train,x_test,y_test)

              precision    recall  f1-score   support

         0.0       0.88      0.95      0.92     39035
         1.0       0.74      0.53      0.62     10476

    accuracy                           0.86     49511
   macro avg       0.81      0.74      0.77     49511
weighted avg       0.85      0.86      0.85     49511

0.8798277989231719


In [47]:
model_validation(XGBClassifier(n_estimators = 50,eta = 1,gamma=3,reg_lambda=0),x_train,y_train ,x_test,y_test)

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.91     39035
         1.0       0.73      0.55      0.62     10476

    accuracy                           0.86     49511
   macro avg       0.81      0.75      0.77     49511
weighted avg       0.85      0.86      0.85     49511

0.8803758808400428


In [48]:
param = {'n_estimators':[20,50,70,100,150,180,200],
        'eta':[2,1,0,0.5,0.1,0.05],
        'max_depth':[3,5,7,9,11,13]}

In [51]:
gscv = GridSearchCV(estimator=XGBClassifier(),
                   param_grid=param,
                   scoring='roc_auc',
                   cv= 3,
                   verbose = 1)

In [52]:
gscv.fit(x_train,y_train)

Fitting 3 folds for each of 252 candidates, totalling 756 fits


In [54]:
gscv.best_params_

{'eta': 0.05, 'max_depth': 5, 'n_estimators': 200}

In [55]:
model_validation(XGBClassifier(**gscv.best_params_),x_train,y_train ,x_test,y_test)

              precision    recall  f1-score   support

         0.0       0.89      0.95      0.92     39035
         1.0       0.73      0.56      0.64     10476

    accuracy                           0.86     49511
   macro avg       0.81      0.75      0.78     49511
weighted avg       0.86      0.86      0.86     49511

0.8877936653123539


In [56]:
pred_prob = model.predict_proba(x_pred)[:,1]


In [57]:
pred_prob

array([0.01511783, 0.83928571, 0.03498779, ..., 0.01710965, 0.14580741,
       0.13875749])

In [58]:
submission['Exited']=pred_prob

In [59]:
submission

Unnamed: 0,id,Exited
0,165034,0.015118
1,165035,0.839286
2,165036,0.034988
3,165037,0.306208
4,165038,0.287117
...,...,...
110018,275052,0.055152
110019,275053,0.055152
110020,275054,0.017110
110021,275055,0.145807


In [60]:
submission.to_csv('submission2.csv',index=False)