In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# KNN

## Imports for K-NN

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

In [None]:
pip install dmba

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.2.4


In [None]:
from dmba import classificationSummary

Colab environment detected.


In [None]:
# Load the prepared dataset from the Excel workbook, using customerID as the row index.
df = pd.read_excel(
    io='/content/for_machine_learningdf.xlsx',
    index_col='customerID',
)

In [None]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5042 entries, 7590-VHVEG to 2775-SEFEE
Data columns (total 31 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   seniorCitizen                          5042 non-null   int64  
 1   partner                                5042 non-null   int64  
 2   dependents                             5042 non-null   int64  
 3   tenure                                 5042 non-null   int64  
 4   phoneService                           5042 non-null   int64  
 5   paperlessBilling                       5042 non-null   int64  
 6   monthlyCharges                         5042 non-null   float64
 7   totalCharges                           5042 non-null   float64
 8   gender_Male                            5042 non-null   int64  
 9   multipleLines_True                     5042 non-null   int64  
 10  multipleLines_No phone service         5042 non-null   int64  

# Splitting Data into Training and Validation Data

In [None]:
# Hold out 10% of the rows for validation; random_state pins the shuffle so the
# split is reproducible. errors='ignore' keeps this cell re-runnable even after
# 'partner' has already been removed from df.
trainData, validData = train_test_split(
    df.drop(columns='partner', errors='ignore'),
    test_size=0.1,
    random_state=0,
)

In [None]:
# Remove 'partner' from the working frame. errors='ignore' makes the cell
# idempotent: re-running it does not raise KeyError once the column is gone.
df = df.drop(columns='partner', errors='ignore')

In [None]:
# Frequency of each paperlessBilling value.
df['paperlessBilling'].value_counts()

Unnamed: 0_level_0,count
paperlessBilling,Unnamed: 1_level_1
1,2995
0,2047


In [None]:
# Frequency of each phoneService value.
df['phoneService'].value_counts()

Unnamed: 0_level_0,count
phoneService,Unnamed: 1_level_1
1,4553
0,489


In [None]:
# Columns treated as continuous; these are the ones passed to the scaler.
continuous = [
    'tenure',
    'monthlyCharges',
    'totalCharges',
]

# Standardizing the Continuous features of the training data

In [None]:
# Z-score scaler (centre to mean 0, scale to unit variance); defaults spelled out.
scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)

In [None]:
# Fit on the training split only, so validation rows do not influence the scaling parameters.
scaler.fit(trainData.loc[:, continuous])

In [None]:
# Every remaining column of df (i.e. not listed in `continuous`), in original order.
categorical = df.columns[~df.columns.isin(continuous)].tolist()

In [None]:
# Standardize the continuous columns with the scaler fitted on the training
# split, then re-attach the remaining columns. The z-columns are named after
# the features they actually hold, derived from `continuous` (the previous
# hard-coded labels called monthlyCharges/totalCharges
# 'zphoneService'/'zpaperlessBilling', which was misleading).
dfNorm = pd.concat([pd.DataFrame(scaler.transform(df[continuous]),
                                 columns=[f'z{col}' for col in continuous]),
                    # reset_index gives both frames the same positional index so
                    # concat lines the rows up; customerID is restored afterwards
                    df[categorical].reset_index()], axis=1).set_index('customerID')

In [None]:
# Let pandas display every column of dfNorm (plus one spare) without truncation.
pd.set_option('display.max_columns', dfNorm.shape[1] + 1)

In [None]:
# Preview the first five rows of the normalized frame.
dfNorm.head(n=5)

Unnamed: 0_level_0,ztenure,zphoneService,zpaperlessBilling,seniorCitizen,dependents,phoneService,paperlessBilling,gender_Male,multipleLines_True,multipleLines_No phone service,internetService_DSL,internetService_Fiber optic,onlineSecurity_True,onlineSecurity_No internet service,onlineBackup_True,onlineBackup_No internet service,deviceProtection_True,deviceProtection_No internet service,techSupport_True,techSupport_No internet service,streamingTV_True,streamingTV_No internet service,streamingMovies_True,streamingMovies_No internet service,contract_One year,contract_Two year,paymentMethod_Credit card (automatic),paymentMethod_Electronic check,paymentMethod_Mailed check,churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
7590-VHVEG,-1.291446,-1.169449,-1.002607,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5575-GNVDE,0.055517,-0.268591,-0.180983,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
3668-QPYBK,-1.250629,-0.371641,-0.968013,0,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
7795-CFOCW,0.504505,-0.755586,-0.202521,0,0,0,0,1,0,1,1,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
9237-HQITU,-1.250629,0.188486,-0.948794,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1


In [None]:
# Normalized rows belonging to the training split, selected by customerID label.
trainNorm = dfNorm.loc[trainData.index, :]

In [None]:
# Normalized rows belonging to the validation split, selected by customerID label.
validNorm = dfNorm.loc[validData.index, :]

# Separating the response from training and validation

In [None]:
# Training predictors: everything except the 'churn' response.
train_X = trainNorm.drop(columns='churn')

In [None]:
# Training response.
train_y = trainNorm['churn']

In [None]:
# Validation predictors: everything except the 'churn' response.
valid_X = validNorm.drop(columns='churn')

In [None]:
# Validation response.
valid_y = validNorm['churn']

## Determining the best value of K

In [None]:
# Fit one k-NN classifier per k in 1..14 and record its accuracy on the
# validation split.
results = []

for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_X, train_y)
    results.append(dict(k=k, accuracy=accuracy_score(valid_y, knn.predict(valid_X))))

In [None]:
# Tabulate validation accuracy per k.
pd.DataFrame(results, columns=['k', 'accuracy'])

Unnamed: 0,k,accuracy
0,1,0.740594
1,2,0.772277
2,3,0.766337
3,4,0.782178
4,5,0.794059
5,6,0.788119
6,7,0.794059
7,8,0.790099
8,9,0.790099
9,10,0.788119


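# Judged strictly on validation accuracy, the best K appears to be 7 (tied with K=5 at 0.794). However, accuracy is not the right criterion here: we want the K that gives a high true positive rate (recall) and a low false negative rate.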

## Best value of K if optimizing recall

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score

In [None]:
# For each k in 1..14, pick the neighbour weighting ('uniform' or 'distance')
# by 5-fold cross-validated recall on the training split, then record the
# recall of that refit model on the validation split.
results = []
params = {'weights': ['uniform', 'distance']}

for k in range(1, 15):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn_grid = GridSearchCV(knn, params, scoring='recall', cv=5)
    knn_grid.fit(train_X, train_y)

    # predict() uses the estimator refit with the best weighting found by the search
    results.append(dict(
        k=k,
        best_param=knn_grid.best_params_['weights'],
        recall_valid=recall_score(valid_y, knn_grid.predict(valid_X)),
    ))

In [None]:
# Tabulate the chosen weighting and validation recall per k.
pd.DataFrame(results, columns=['k', 'best_param', 'recall_valid'])

Unnamed: 0,k,best_param,recall_valid
0,1,uniform,0.510638
1,2,distance,0.503546
2,3,uniform,0.546099
3,4,distance,0.496454
4,5,uniform,0.595745
5,6,distance,0.553191
6,7,uniform,0.560284
7,8,distance,0.531915
8,9,uniform,0.574468
9,10,distance,0.546099


## The best K for optimizing recall on the validation data is 5, with uniform weights. However, this does not compare favorably with the classification accuracy of the best Logistic Regression model.