In [1]:
#
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

#Machine Learning Libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

from sklearn import metrics
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_auc_score, roc_curve, recall_score


In [2]:
# Read the churn dataset from a CSV file located relative to the notebook.

path = 'Churn_Modelling.csv'
df = pd.read_csv(filepath_or_buffer=path)
# NOTE(review): a cell renders only its last expression, so this preview is
# evaluated but never shown; only the shape below is displayed.
df.head(n=5)
df.shape

(10000, 14)

In [3]:
# Integer-encode the text columns 'Geography' and 'Gender' in place on `df`.
#
# One LabelEncoder is kept per column in `encoders`. A single shared encoder
# is refit on every iteration and therefore only remembers the classes of the
# last column, which makes the codes of the earlier columns impossible to map
# back to their original labels (e.g. via inverse_transform).
cat_var = df[['Geography','Gender']]
encoders = {}
for x in cat_var:  # iterating a DataFrame yields its column names
    encoders[x] = LabelEncoder()
    df[x] = encoders[x].fit_transform(df[x])
# Backward compatible: `le` is still the encoder that was fitted last.
le = encoders[x]
df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography            int32
Gender               int32
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

## Feature Selection

In [4]:
# Remove the row number, customer id and surname columns before modelling.
# The result is rebound to `df` (instead of mutating in place) and missing
# columns are ignored, so re-running this cell does not raise a KeyError
# once the columns are already gone.
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [5]:
# Separate the 'Exited' target column (Y) from the remaining predictor columns (X).
Y = df['Exited']
X = df.drop(columns='Exited')

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=40
)
# Print the shape of each partition as a quick sanity check.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(8000, 10) (2000, 10) (8000,) (2000,)


## Model Building

### Baseline Creation

In [29]:
# Number of training rows whose label is 0 (explicit label-based lookup).
Y_train.value_counts().loc[0]

6347

In [72]:
# Majority-class baseline: always predict the most frequent training label.
# `majority` holds the per-class counts of the training labels.
majority = Y_train.value_counts()
print(majority)

# Label with the highest training count, and the test accuracy obtained by
# predicting that label for every test row. Compare each model's test
# accuracy against `baseline_score` to see what it adds over this trivial rule.
majority_class = majority.idxmax()
baseline_score = (Y_test == majority_class).mean()

0    6347
1    1653
Name: Exited, dtype: int64


### Logistic Regression

In [7]:
# Logistic regression with the estimator's default settings.
log_reg = LogisticRegression()
log_reg.fit(X_train, Y_train)

# Predict on both partitions, then score each against its true labels.
train_LR, test_LR = log_reg.predict(X_train), log_reg.predict(X_test)
LRScoretrain = metrics.accuracy_score(Y_train, train_LR)
LRScoretest = metrics.accuracy_score(Y_test, test_LR)

### Naive Bayes

In [8]:
# Gaussian Naive Bayes fitted on the training partition.
NB = GaussianNB()
NB.fit(X_train, Y_train)

# Class predictions for the rows seen during fitting and for the held-out rows.
NBtrain = NB.predict(X_train)
NBtest = NB.predict(X_test)

# Accuracy on each partition.
NBScoretrain, NBScoretest = (
    accuracy_score(Y_train, NBtrain),
    accuracy_score(Y_test, NBtest),
)

### Decision Tree

In [9]:
# Decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because tree
# construction involves randomness; without it the scores below can differ
# from one run of the notebook to the next.
DT = DecisionTreeClassifier(random_state=40).fit(X_train, Y_train)

# Predictions and accuracy on the training and held-out partitions.
trainDT = DT.predict(X_train)
testDT = DT.predict(X_test)
DTScoretrain = accuracy_score(Y_train, trainDT)
DTScoretest = accuracy_score(Y_test, testDT)

### Random Forest

In [12]:
# Random forest with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the
# forest is built with random sampling; without it the scores below are not
# reproducible across runs of the notebook.
RF = RandomForestClassifier(random_state=40).fit(X_train, Y_train)

# Predictions and accuracy on the training and held-out partitions.
RFtrain = RF.predict(X_train)
RFtest = RF.predict(X_test)
RFScoretrain = accuracy_score(Y_train, RFtrain)
RFScoretest = accuracy_score(Y_test, RFtest)

### Support Vector Machine

In [51]:
# Support vector classifier with default settings, fitted on the training rows.
SVM = SVC()
SVM.fit(X_train, Y_train)

# Predict both partitions and compute the matching accuracies.
SVMtrain, SVMtest = SVM.predict(X_train), SVM.predict(X_test)
SVMtrainscore = accuracy_score(y_true=Y_train, y_pred=SVMtrain)
SVMtestscore = accuracy_score(y_true=Y_test, y_pred=SVMtest)

### XGBoost

In [65]:
# XGBoost classifier with default settings, fitted on the training rows.
xgbr = xgb.XGBClassifier()
xgbr.fit(X_train, Y_train)

# Training-partition predictions and accuracy.
xgbtrain = xgbr.predict(X_train)
xgbscoretrain = accuracy_score(Y_train, xgbtrain)

# Held-out-partition predictions and accuracy.
xgbtest = xgbr.predict(X_test)
xgbscoretest = accuracy_score(Y_test, xgbtest)

In [66]:
# One row per model, pairing its display name with its training accuracy.
accuracyscorestrain = pd.DataFrame(
    [
        ('Logistic Regression', LRScoretrain),
        ('Naive Bayes', NBScoretrain),
        ('Decision Tree', DTScoretrain),
        ('Random Forest', RFScoretrain),
        ('Support Vector Machine', SVMtrainscore),
        ('XGBoost', xgbscoretrain),
    ],
    columns=['Model', 'Accuracy_train'],
)

In [67]:
# One row per model, pairing its display name with its held-out accuracy.
accuracyscorestest = pd.DataFrame(
    dict(
        Model=[
            'Logistic Regression',
            'Naive Bayes',
            'Decision Tree',
            'Random Forest',
            'Support Vector Machine',
            'XGBoost',
        ],
        Accuracy_test=[
            LRScoretest,
            NBScoretest,
            DTScoretest,
            RFScoretest,
            SVMtestscore,
            xgbscoretest,
        ],
    )
)

In [68]:
# Join the train and test tables on their only shared column, the model name,
# and show the combined scores rounded to two decimals.
accuracyscores = pd.merge(accuracyscorestrain, accuracyscorestest, on='Model')
accuracyscores.round(decimals=2)

Unnamed: 0,Model,Accuracy_train,Accuracy_test
0,Logistic Regression,0.79,0.8
1,Naive Bayes,0.78,0.79
2,Decision Tree,1.0,0.79
3,Random Forest,1.0,0.87
4,Support Vector Machine,0.79,0.81
5,XGBoost,0.96,0.87
