In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
churn = pd.read_csv('data/churn_ver02.csv')
churn.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,0,42,2,0.0,1,1,1,101348.88,1
1,608,0,41,1,83807.86,1,0,1,112542.58,0
2,502,0,42,8,159660.8,3,1,0,113931.57,1
3,699,0,39,1,0.0,2,0,0,93826.63,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0


In [4]:
churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Gender           10000 non-null  int64  
 2   Age              10000 non-null  int64  
 3   Tenure           10000 non-null  int64  
 4   Balance          10000 non-null  float64
 5   NumOfProducts    10000 non-null  int64  
 6   HasCrCard        10000 non-null  int64  
 7   IsActiveMember   10000 non-null  int64  
 8   EstimatedSalary  10000 non-null  float64
 9   Exited           10000 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 781.4 KB


In [6]:
x = churn.drop(columns='Exited', axis='columns')
y = churn['Exited']

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y)

### Logistic Regression

In [8]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(x_train)
x_train_scaled = ss.transform(x_train)
x_test_scaled = ss.transform(x_test)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score

lr = LogisticRegression()
lr.fit(x_train_scaled, y_train)
y_predicted = lr.predict(x_test_scaled)
print('MAE:', mean_absolute_error(y_test, y_predicted).round(4))
print('R2:', r2_score(y_test, y_predicted).round(4))

MAE: 0.1987
R2: -0.2249


### KNN

In [10]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(x_train)
x_train_scaled = ss.transform(x_train)
x_test_scaled = ss.transform(x_test)

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, r2_score

kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(x_train_scaled, y_train)
y_predicted = kn.predict(x_test_scaled)
print('MAE:', mean_absolute_error(y_test, y_predicted).round(4))
print('R2:', kn.score(x_test_scaled, y_test).round(4))

MAE: 0.1877
R2: 0.8123


### SGDClassifier

In [13]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(x_train)
x_train_scaled = ss.transform(x_train)
x_test_scaled = ss.transform(x_test)

In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_absolute_error, r2_score

sc = SGDClassifier(loss='log', max_iter=100, random_state=1234)
sc.fit(x_train_scaled, y_train)
y_predicted = sc.predict(x_test_scaled)
print('MAE:', mean_absolute_error(y_test, y_predicted).round(4))
print('R2:', sc.score(x_test_scaled, y_test).round(4))

MAE: 0.1977
R2: 0.8023


### Decision Tree

In [15]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit(x_train)
x_train_scaled = ss.transform(x_train)
x_test_scaled = ss.transform(x_test)

In [16]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=1234)
dt.fit(x_train_scaled, y_train)
y_predicted = dt.predict(x_test_scaled)
print('MAE:', mean_absolute_error(y_test, y_predicted).round(4))
print('R2:', dt.score(x_test_scaled, y_test).round(4))

MAE: 0.228
R2: 0.772


### Random Forest

In [17]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, random_state=1234)
scores = cross_validate(rf, x_train_scaled, y_train, return_train_score=True, n_jobs=-1)

In [18]:
print('R2:', scores['test_score'].mean().round(4))

R2: 0.8496
