**Import necessary libraries**

In [424]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

**Load the dataset**

In [426]:
data = pd.read_csv("Customer-Churn-Records.csv")
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0,0,5,GOLD,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0,0,1,DIAMOND,300
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0,0,5,PLATINUM,771
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1,1,3,SILVER,564
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1,1,2,GOLD,339


**Data Inspection**

In [428]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RowNumber           10000 non-null  int64  
 1   CustomerId          10000 non-null  int64  
 2   Surname             10000 non-null  object 
 3   CreditScore         10000 non-null  int64  
 4   Geography           10000 non-null  object 
 5   Gender              10000 non-null  object 
 6   Age                 10000 non-null  int64  
 7   Tenure              10000 non-null  int64  
 8   Balance             10000 non-null  float64
 9   NumOfProducts       10000 non-null  int64  
 10  HasCrCard           10000 non-null  int64  
 11  IsActiveMember      10000 non-null  int64  
 12  EstimatedSalary     10000 non-null  float64
 13  Exited              10000 non-null  int64  
 14  Complain            10000 non-null  int64  
 15  Satisfaction Score  10000 non-null  int64  
 16  Card 

In [429]:
data.shape

(10000, 18)

In [430]:
data.dtypes

RowNumber               int64
CustomerId              int64
Surname                object
CreditScore             int64
Geography              object
Gender                 object
Age                     int64
Tenure                  int64
Balance               float64
NumOfProducts           int64
HasCrCard               int64
IsActiveMember          int64
EstimatedSalary       float64
Exited                  int64
Complain                int64
Satisfaction Score      int64
Card Type              object
Point Earned            int64
dtype: object

In [431]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Point Earned
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2038,0.2044,3.0138,606.5151
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402842,0.403283,1.405919,225.924839
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,0.0,1.0,119.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0,0.0,2.0,410.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0,0.0,3.0,605.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0,0.0,4.0,801.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0,1.0,5.0,1000.0


**Data cleaning**

In [433]:
data.isnull().sum()

RowNumber             0
CustomerId            0
Surname               0
CreditScore           0
Geography             0
Gender                0
Age                   0
Tenure                0
Balance               0
NumOfProducts         0
HasCrCard             0
IsActiveMember        0
EstimatedSalary       0
Exited                0
Complain              0
Satisfaction Score    0
Card Type             0
Point Earned          0
dtype: int64

In [434]:
data.duplicated().sum()

0

**Data Transformation**

In [436]:
data["Gender"].unique()

array(['Female', 'Male'], dtype=object)

In [437]:
data['Gender'].replace(['Male', 'Female'], [1,2], inplace=True)

In [438]:
data["Card Type"].unique()

array(['DIAMOND', 'GOLD', 'SILVER', 'PLATINUM'], dtype=object)

In [439]:
data['Card Type'].replace(['SILVER', 'GOLD', 'PLATINUM', 'DIAMOND'], [1,2,3,4], inplace=True)

In [440]:
data.drop(columns = ['CustomerId', 'Surname', 'Geography', 'RowNumber'], inplace = True)

In [441]:
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,619,2,42,2,0.0,1,1,1,101348.88,1,1,2,4,464
1,608,2,41,1,83807.86,1,0,1,112542.58,0,1,3,4,456
2,502,2,42,8,159660.8,3,1,0,113931.57,1,1,3,4,377
3,699,2,39,1,0.0,2,0,0,93826.63,0,0,5,2,350
4,850,2,43,2,125510.82,1,1,1,79084.1,0,0,5,2,425


In [442]:
data['Exited'].value_counts()

Exited
0    7962
1    2038
Name: count, dtype: int64

**The dataset is unbalaced**

**0 -> Exit**

**1 -> NotExit**

In [446]:
data.groupby('Exited').mean()

Unnamed: 0_level_0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,651.837855,1.427531,37.408063,5.032781,72742.750663,1.54421,0.707109,0.554635,99726.853141,0.001256,3.01796,2.491711,607.044084
1,645.414622,1.558881,44.835623,4.93474,91109.476006,1.475466,0.699215,0.360648,101509.908783,0.998037,2.997547,2.538763,604.448479


**Balance the data since it had higher number for Customers who exited than those who did not exit. Balanced data trains the model well ensuring accurate predictions**

In [448]:
Exit = data[data.Exited == 0]
NotExit = data[data.Exited == 1]

In [449]:
print(Exit.shape)
print(NotExit.shape)

(7962, 14)
(2038, 14)


In [450]:
Exit_sample = Exit.sample(n = 2038)

In [451]:
Exit_sample.shape

(2038, 14)

In [452]:
new_data = pd.concat([Exit_sample, NotExit], axis=0)

In [453]:
new_data.shape

(4076, 14)

In [454]:
new_data.to_csv("Customer churn.csv", index = False)

In [455]:
new_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
3287,693,1,37,1,0.00,2,1,1,82867.55,0,0,1,4,502
5284,745,1,51,3,99183.90,1,1,1,28922.25,0,0,5,1,270
3423,493,1,32,8,46161.18,1,1,1,79577.40,0,0,2,2,606
6484,719,1,44,2,0.00,2,1,0,196582.19,0,0,5,2,660
3584,506,1,28,8,53053.76,1,0,1,24577.34,0,0,3,2,340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,498,1,42,3,152039.70,1,1,1,53445.17,1,1,3,2,790
9982,655,2,46,7,137145.12,1,1,0,115146.40,1,1,4,2,591
9991,597,2,53,4,88381.21,1,1,0,69384.71,1,1,3,2,369
9997,709,2,36,7,0.00,1,0,1,42085.58,1,1,3,1,564


**Transforming the newly created dataset**

In [457]:
new_data["Gender"].unique()

array([1, 2], dtype=int64)

In [458]:
new_data['Gender'].replace(['Male', 'Female'], [1,2], inplace=True)

In [459]:
new_data["Card Type"].unique()

array([4, 1, 2, 3], dtype=int64)

In [460]:
new_data['Card Type'].replace(['SILVER', 'GOLD', 'PLATINUM', 'DIAMOND'], [1,2,3,4], inplace=True)

In [461]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4076 entries, 3287 to 9998
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CreditScore         4076 non-null   int64  
 1   Gender              4076 non-null   int64  
 2   Age                 4076 non-null   int64  
 3   Tenure              4076 non-null   int64  
 4   Balance             4076 non-null   float64
 5   NumOfProducts       4076 non-null   int64  
 6   HasCrCard           4076 non-null   int64  
 7   IsActiveMember      4076 non-null   int64  
 8   EstimatedSalary     4076 non-null   float64
 9   Exited              4076 non-null   int64  
 10  Complain            4076 non-null   int64  
 11  Satisfaction Score  4076 non-null   int64  
 12  Card Type           4076 non-null   int64  
 13  Point Earned        4076 non-null   int64  
dtypes: float64(2), int64(12)
memory usage: 477.7 KB


In [462]:
new_data.groupby('Exited').mean()

Unnamed: 0_level_0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
Exited,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,651.73209,1.436212,37.428361,5.05103,73751.675339,1.534347,0.706084,0.567713,100125.201389,0.001472,2.98577,2.460746,607.303238
1,645.414622,1.558881,44.835623,4.93474,91109.476006,1.475466,0.699215,0.360648,101509.908783,0.998037,2.997547,2.538763,604.448479


In [463]:
new_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
3287,693,1,37,1,0.00,2,1,1,82867.55,0,0,1,4,502
5284,745,1,51,3,99183.90,1,1,1,28922.25,0,0,5,1,270
3423,493,1,32,8,46161.18,1,1,1,79577.40,0,0,2,2,606
6484,719,1,44,2,0.00,2,1,0,196582.19,0,0,5,2,660
3584,506,1,28,8,53053.76,1,0,1,24577.34,0,0,3,2,340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,498,1,42,3,152039.70,1,1,1,53445.17,1,1,3,2,790
9982,655,2,46,7,137145.12,1,1,0,115146.40,1,1,4,2,591
9991,597,2,53,4,88381.21,1,1,0,69384.71,1,1,3,2,369
9997,709,2,36,7,0.00,1,0,1,42085.58,1,1,3,1,564


**Splliting Data**

In [465]:
x = new_data.drop(columns = ['Exited'], axis=1)
y = new_data['Exited']

In [466]:
x

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
3287,693,1,37,1,0.00,2,1,1,82867.55,0,1,4,502
5284,745,1,51,3,99183.90,1,1,1,28922.25,0,5,1,270
3423,493,1,32,8,46161.18,1,1,1,79577.40,0,2,2,606
6484,719,1,44,2,0.00,2,1,0,196582.19,0,5,2,660
3584,506,1,28,8,53053.76,1,0,1,24577.34,0,3,2,340
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9981,498,1,42,3,152039.70,1,1,1,53445.17,1,3,2,790
9982,655,2,46,7,137145.12,1,1,0,115146.40,1,4,2,591
9991,597,2,53,4,88381.21,1,1,0,69384.71,1,3,2,369
9997,709,2,36,7,0.00,1,0,1,42085.58,1,3,1,564


In [467]:
y

3287    0
5284    0
3423    0
6484    0
3584    0
       ..
9981    1
9982    1
9991    1
9997    1
9998    1
Name: Exited, Length: 4076, dtype: int64

In [468]:
X_train, X_test, Y_train, Y_test = train_test_split(x , y , test_size = 0.2, stratify=y, random_state=2)

In [469]:
print(x.shape, X_train.shape, X_test.shape)

(4076, 13) (3260, 13) (816, 13)


**Logistic Regression Model**

In [471]:
model = LogisticRegression()

In [472]:
model.fit(X_train, Y_train)

**Model Evaluation**

In [474]:
preditions = model.predict(X_test)
preditions

array([0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,

In [475]:
X_train.head(1)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
4548,635,2,58,1,0.0,1,1,1,58907.08,1,1,4,318


In [476]:
input_data_model = pd.DataFrame(
    [[530,2,38,4,12000,1,1,1,249467,1,1,3,420]],
    columns=['CreditScore','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Complain','Satisfaction Score','Card Type','Point Earned'])

In [477]:
input_data_model

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
0,530,2,38,4,12000,1,1,1,249467,1,1,3,420


In [478]:
predition = model.predict(input_data_model)
predition

array([1], dtype=int64)

**Accuracy on training data**

In [480]:
X_train_prediction = model.predict(X_train)
accuracy = accuracy_score(X_train_prediction, Y_train)
print("Accuracy on Training data : ", accuracy)

Accuracy on Training data :  0.7619631901840491


**Accuracy on test data**

In [482]:
X_test_prediction = model.predict(X_test)
test_accuracy = accuracy_score(X_test_prediction, Y_test)
print("Accuracy on Testing data : ", test_accuracy)

Accuracy on Testing data :  0.7450980392156863


**Decision Tree Model**

In [484]:
Random = RandomForestClassifier()

In [485]:
Random.fit(X_train, Y_train)

In [486]:
predits = Random.predict(X_test)
predits

array([1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,

In [487]:
X_train.head(1)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
4548,635,2,58,1,0.0,1,1,1,58907.08,1,1,4,318


In [488]:
input_data = pd.DataFrame(
    [[530,1,38,4,12000,1,1,1,249467,1,1,3,420]],
    columns=['CreditScore','Gender','Age','Tenure','Balance','NumOfProducts','HasCrCard','IsActiveMember','EstimatedSalary','Complain','Satisfaction Score','Card Type','Point Earned'])

In [489]:
input_data

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Complain,Satisfaction Score,Card Type,Point Earned
0,530,1,38,4,12000,1,1,1,249467,1,1,3,420


In [490]:
predit = Random.predict(input_data)
predit

array([1], dtype=int64)

**Accuracy on training data**

In [492]:
X_train_predict = Random.predict(X_train)
accuracy = accuracy_score(X_train_predict, Y_train)
print("Accuracy on Training data : ", accuracy)

Accuracy on Training data :  1.0


**Accuracy on test data**

In [494]:
X_test_predict = Random.predict(X_test)
test_accuracy = accuracy_score(X_test_predict, Y_test)
print("Accuracy on Testing data : ", test_accuracy)

Accuracy on Testing data :  0.9987745098039216


**CONCLUSION**

**Random Forest Classifier model is more accurate because it has 100% accuracy on training data and 99% accuracy on testing data, which is much better than Logistic Regression Model**

**Serialization(for easier deployment)**

In [498]:
import pickle
pickle_out = open("Random.pkl", "wb")
pickle.dump(Random,pickle_out)
pickle_out.close()