# KNN Classifier

## Imports

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, precision_recall_curve
from time import time

## Data
already processed

In [7]:
client_attrition = pd.read_csv('../data/preprocessed/client_attrition_train.csv', sep=";")
X = client_attrition.drop("account_status",axis=1)
y = client_attrition["account_status"]
print(X.shape)

(10127, 37)


## Grid search
Using Repeated Stratified KFold Cross Validation

In [36]:
# Grid search results to df
def gs_to_df(grid_search):
    df = pd.DataFrame()
    for param in param_grid.keys():
        df[param] = 0
    df["mean_train_score"] = df["std_train_score"] = df["mean_test_score"] = df["std_test_score"] = 0
    for i, params in enumerate(grid_search.cv_results_['params']):
        new_row = params | {"mean_train_score": grid_search.cv_results_['mean_train_score'][i],
                            "std_train_score": grid_search.cv_results_['std_train_score'][i],
                            "mean_test_score": grid_search.cv_results_['mean_test_score'][i],
                            "std_test_score": grid_search.cv_results_['std_test_score'][i]}
        df.loc[len(df)] = new_row

    return df
    

In [38]:
start_time = time()
# Grid 
param_grid = {'n_neighbors': [5, 15, 51, 101], 'p': [0.1, 1, 2, 10]}
# Cross Validation
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=2,random_state=12345)
# Grid Search
grid_search = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid, scoring="balanced_accuracy", 
                            n_jobs=12, cv=rskf, verbose=3, return_train_score=True)
grid_search.fit(X,y)
print("--- %s seconds ---" % (time() - start_time))

Fitting 20 folds for each of 16 candidates, totalling 320 fits
--- 1047.191642522812 seconds ---


 0.56415367 0.53327311        nan 0.56015719 0.51891522 0.50387198
        nan 0.54009216 0.50645024 0.50092119]
 0.58534523 0.55180499        nan 0.56533401 0.52009453 0.50444917
        nan 0.54186543 0.50734869 0.50092193]


In [41]:
df = gs_to_df(grid_search=grid_search)
df.sort_values(ascending=False,by="mean_test_score").to_csv('./results/knn_metrics_2_10_cv.csv', index=False,sep=';')
df.sort_values(ascending=False,by="mean_test_score")

Unnamed: 0,n_neighbors,p,mean_train_score,std_train_score,mean_test_score,std_test_score
1,5,1.0,0.734287,0.002418,0.652311,0.014095
2,5,2.0,0.697519,0.002877,0.618887,0.018903
5,15,1.0,0.634477,0.003365,0.608205,0.010889
3,5,10.0,0.660725,0.002953,0.586485,0.013072
6,15,2.0,0.585345,0.003551,0.564154,0.010849
9,51,1.0,0.565334,0.002804,0.560157,0.00875
13,101,1.0,0.541865,0.002515,0.540092,0.007288
7,15,10.0,0.551805,0.002488,0.533273,0.007217
10,51,2.0,0.520095,0.002142,0.518915,0.007993
14,101,2.0,0.507349,0.001461,0.50645,0.003988
