In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
df = pd.read_csv('../Datasets/Churn_Modelling.csv')

In [6]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
df.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [8]:
X = df.iloc[:, 3:13]
y = df.iloc[:, 13]

In [12]:
geography = pd.get_dummies(X['Geography'], drop_first=True)
gender = pd.get_dummies(X['Gender'], drop_first=True)

In [13]:
X = pd.concat([X, geography, gender], axis=1)

In [14]:
X.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,France,Female,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,1,0


In [15]:
X = X.drop(columns=['Geography', 'Gender'], axis=1)

In [16]:
X.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,1,0


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [22]:
logistic_model = LogisticRegression()

logistic_model.fit(X_train, y_train)

LogisticRegression()

In [23]:
logistic_model.score(X_test, y_test)

0.811

In [24]:
y_pred = logistic_model.predict(X_test)

In [26]:
accuracy_score(y_test, y_pred)

0.811

In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [79]:
def train_models(models):
    results = {}
    models_list = []
    score_list = []
    for model in models:
        # Initialize the model
        classifier = models[model]
        # Fit the data
        classifier.fit(X_train, y_train)
        # Predict the score
        train_score = classifier.score(X_test, y_test)
        models_list.append(model)
        score_list.append(train_score)
        
    results['model'] = models_list
    results['score'] = score_list
    return results

In [86]:
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'GaussianNB': GaussianNB(),
    'RandomForestClassifier': RandomForestClassifier(),
    'XGBClassifier': xgb.XGBClassifier()
}

In [87]:
data = train_models(models)





In [89]:
score_df = pd.DataFrame(data)

In [90]:
score_df

Unnamed: 0,model,score
0,LogisticRegression,0.811
1,DecisionTreeClassifier,0.807
2,GaussianNB,0.8295
3,RandomForestClassifier,0.8685
4,XGBClassifier,0.8545


In [115]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [122]:
param_grid = {
    'n_estimators': [10, 50, 100, 150, 200, 250, 300],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 6, 8, 10, 20],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6]
}

In [123]:
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5)

In [124]:
# grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                         'min_samples_split': [2, 4, 6, 8, 10, 20],
                         'n_estimators': [10, 50, 100, 150, 200, 250, 300]})

In [125]:
grid_search.best_params_

{'criterion': 'gini',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 250}

In [126]:
rf_model = RandomForestClassifier(criterion='gini', min_samples_leaf=2, min_samples_split=10, n_estimators=250)
rf_model.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=2, min_samples_split=10,
                       n_estimators=250)

In [127]:
rf_model.score(X_test, y_test)

0.869