In [2]:
import pandas as pd

In [3]:
import numpy as np

In [261]:
churn = pd.read_csv('data/churn.csv')
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Feature Engineering

In [262]:
### Feature engineering
# change gender to 1/0
churn.Gender = churn.Gender.map(lambda x: 1 if x == "Female" else 0)

# drop text columns
columns = ['Surname', 'Geography']
churn = churn.drop(columns, axis='columns')

# drop ID columns
columns = ['RowNumber', 'CustomerId']
churn = churn.drop(columns, axis='columns')

In [263]:
# drop those with 0.0 balance
churn = churn[churn.Balance > 0]

In [264]:
churn.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1,608,1,41,1,83807.86,1,0,1,112542.58,0
2,502,1,42,8,159660.8,3,1,0,113931.57,1
4,850,1,43,2,125510.82,1,1,1,79084.1,0
5,645,0,44,8,113755.78,2,1,0,149756.71,1
7,376,1,29,4,115046.74,4,1,0,119346.88,1


In [337]:
### SAVE TO A CLEAN CSV ###
churn.to_csv('clean_churn.csv', index=False)

In [302]:
from sklearn.neighbors import KNeighborsClassifier

In [303]:
classifier = KNeighborsClassifier(n_neighbors=3)

## With gender as one column, 0/1 without train_test_split

In [307]:
inputs = churn.drop(['Exited'], axis='columns')

In [308]:
output = churn.Exited

In [311]:
classifier.fit(inputs, output)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [312]:
classifier.score(inputs, output)

0.8171706094313019

## With gender as one column, 0/1 with train_test_split

In [269]:
from sklearn.model_selection import train_test_split


In [313]:
xtrain, xtest, ytrain, ytest = train_test_split(inputs, output, test_size=0.3)

In [314]:
model = KNeighborsClassifier(n_neighbors=10)
model.fit(xtrain,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [315]:
model.score(xtest, ytest)

0.7462140992167102

In [316]:
inputs.head(1)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
1,608,1,41,1,83807.86,1,0,1,112542.58


In [336]:
person = [[908.0, 1.0, 61.0, 20.0, 8381207.86, 100.0, 1.0, 0.0, 1121542.58]]
print(model.predict_proba(person))
model.predict(person)

[[0.4 0.6]]


array([1])

### Feature selection on KNN using permutation?

In [273]:
from sklearn.inspection import permutation_importance

permutation_score = permutation_importance(model, xtrain, ytrain, n_repeats=50)

np.vstack((inputs.columns, permutation_score.importances_mean)).T

array([['CreditScore', 1.3428827215755668e-05],
       ['Gender', 0.0],
       ['Age', 0.0],
       ['Tenure', 0.0],
       ['Balance', 0.009006266786034008],
       ['NumOfProducts', 0.0],
       ['HasCrCard', 0.0],
       ['IsActiveMember', 0.0],
       ['EstimatedSalary', 0.007967770814682175]], dtype=object)

In [None]:
# permutation shows that most important are Balance, EstimatedSalary, CreditScore
# Gender, Age, Tenure, NumOfProducts, HasCrCard, isActiveMember show 0 - consider removing for simplicity?

In [191]:
model = KNeighborsClassifier(n_neighbors=10)

In [193]:
small_x = inputs[['Balance', 'CreditScore', 'EstimatedSalary']]

In [194]:
xtr, xtt, ytr, ytt = train_test_split(small_x, output, test_size=0.3)

In [195]:
model.fit(xtr, ytr)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [196]:
model.score(xtt, ytt)

0.7441253263707572

In [197]:
small_x.head()

Unnamed: 0,Balance,CreditScore,EstimatedSalary
1,83807.86,608,112542.58
2,159660.8,502,113931.57
4,125510.82,850,79084.1
5,113755.78,645,149756.71
7,115046.74,376,119346.88


In [259]:
pavel = [[11723200, 15, 723476]]

In [260]:
print(model.predict(pavel))
model.predict_proba(pavel)

[1]


array([[0.4, 0.6]])

## Forcing only categorical columns!

In [275]:
categories = churn[['NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Gender']]

In [276]:
knn_cat = KNeighborsClassifier(n_neighbors=5)

In [279]:
cattrain, cattest, outtrain, outtest = train_test_split(categories, output, test_size=0.3)

In [280]:
knn_cat.fit(cattrain, outtrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [282]:
knn_cat.score(cattest,outtest)

0.7326370757180156

In [285]:
permutation_score = permutation_importance(knn_cat, cattrain, outtrain, n_repeats=50)

np.vstack((categories.columns, permutation_score.importances_mean)).T

array([['NumOfProducts', 0.056490599820948936],
       ['HasCrCard', 0.003393017009847781],
       ['IsActiveMember', 0.02487914055505816],
       ['Gender', -0.009413607878245333]], dtype=object)

In [291]:
cattest.head(1)

Unnamed: 0,NumOfProducts,HasCrCard,IsActiveMember,Gender
8705,1,1,1,1


In [301]:
connie = [[3,1,0,1]]
print(knn_cat.predict(connie))
knn_cat.predict_proba(connie)

[1]


array([[0.2, 0.8]])