In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import random

In [9]:
data = pd.read_csv('full_data.csv')
col_list = data.columns.tolist()
features = col_list.copy()
features.remove('y')
num_cols = [col for col in features if data[col].dtype=='int64']
data_le = pd.read_csv('encoded_data.csv')

In [10]:
data_oh = pd.read_csv('OHencoded_data.csv')
col_list_oh = data_oh.columns.tolist()
features = col_list_oh.copy()
features.remove('y')

In [11]:
data_std = pd.read_csv('stand_scaled_data.csv')
data_mm = pd.read_csv('min_max_scaled_data.csv')

In [12]:
def CVKNN(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare KStratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data.index.values, data['y']):
        # Prepare KNN model 
        model = KNeighborsClassifier(*args, **kwargs)
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds 

In [13]:
def hparam_tuning():
    param_dict = {
        'n_neighbors': random.randrange(5, 101, 5),
        'p': random.randrange(1, 3)
    }
    
    test_list = []
    for i in range(20):
        n_neighbors = param_dict['n_neighbors']
        p = param_dict['p']
        train_results, test_results, preds = CVKNN(df=data_std, n_neighbors=n_neighbors, p=p, n_jobs=-1)
        print([n_neighbors, p, np.mean(test_results)])
        test_list.append([n_neighbors, p, np.mean(test_results)])
    
    return sorted(test_list, key = lambda x : x[-1], reverse=True)

In [7]:
hparam_tuning()

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.7590876046075206
ROC_AUC ON TEST SCORE 0.7551494565514569
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7630262395930102
ROC_AUC ON TEST SCORE 0.7544307543860316
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.7612005423155059
ROC_AUC ON TEST SCORE 0.7576807182346308
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.7624430500353699
ROC_AUC ON TEST SCORE 0.7584298438229053
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.7618194087926211
ROC_AUC ON TEST SCORE 0.7620149016652822
[85, 2, 0.7575411349320613]
FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.7590876046075206
ROC_AUC ON TEST SCORE 0.7551494565514569
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7630262395930102
ROC_AUC ON TEST SCORE 0.7544307543860316
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.7612005423155059
ROC_AUC ON TEST SCORE 0.7576807182346308
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.7624430500353699
ROC_AUC ON TEST SCORE 0.7584298438229053
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.7618194087926211
ROC_AUC ON TEST SCORE 0.762014901

[[85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613],
 [85, 2, 0.7575411349320613]]

In [9]:
CVKNN(df=data_std, n_neighbors=30, n_jobs=-1)

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.7694601004176923
ROC_AUC ON TEST SCORE 0.7477197113932929
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7712681416779226
ROC_AUC ON TEST SCORE 0.7549450381203106
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.7679950733917819
ROC_AUC ON TEST SCORE 0.7557930046716483
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.7709229546408058
ROC_AUC ON TEST SCORE 0.762600002176636
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.7690136132993953
ROC_AUC ON TEST SCORE 0.7646737663008017


([0.7694601004176923,
  0.7712681416779226,
  0.7679950733917819,
  0.7709229546408058,
  0.7690136132993953],
 [0.7477197113932929,
  0.7549450381203106,
  0.7557930046716483,
  0.762600002176636,
  0.7646737663008017],
 [array([0, 0, 1, ..., 0, 0, 1]),
  array([0, 1, 0, ..., 0, 0, 1]),
  array([0, 0, 0, ..., 0, 0, 0]),
  array([0, 1, 1, ..., 0, 1, 1]),
  array([0, 0, 0, ..., 1, 0, 0])])