In [10]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import random

In [12]:
data = pd.read_csv('Data/full_data.csv')
col_list = data.columns.tolist()
features = col_list.copy()
features.remove('y')
num_cols = [col for col in features if data[col].dtype=='int64']
data_le = pd.read_csv('Data/encoded_data.csv')

In [None]:
data_oh = pd.read_csv('Data/OHencoded_data.csv')
col_list_oh = data_oh.columns.tolist()
features = col_list_oh.copy()
features.remove('y')

In [13]:
data_std = pd.read_csv('Data/stand_scaled_data.csv')
data_mm = pd.read_csv('Data/min_max_scaled_data.csv')

In [14]:
def CVKNN(df, n_splits=5,  rando_state=2021, features=features, if_print=True, 
                  *args, **kwargs):
    # Prepare KStratifiedKFOLD
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rando_state)
    
    # Make copy of data
    data = df.copy()
    
    # Prepare empty lists
    train_results = []
    test_results = []
    preds = []
    indicies = []
    
    # Prepare int to count fold s
    fold_number = 1
       
    for train, test in kf.split(data.index.values, data['y']):
        # Prepare KNN model 
        model = KNeighborsClassifier(*args, **kwargs)
        model.fit(data.loc[train, features], data.loc[train, 'y'])
        
        # Make predictions
        train_preds = model.predict(data.loc[train, features])
        test_preds = model.predict(data.loc[test, features])
        preds.append(test_preds)
        
        indicies.append(df.iloc[test].index.tolist().copy())
        
        # Prepare ROC_AUC score
        train_roc = metrics.roc_auc_score(data.loc[train, 'y'], train_preds)
        test_roc = metrics.roc_auc_score(data.loc[test, 'y'], test_preds)
        
        # Add ROC_AUC to lis
        train_results.append(train_roc)
        test_results.append(test_roc)
        
        if if_print:
            print(f'FOLD NUMBER: {fold_number}')
            print(f'ROC_AUC ON TRAIN SCORE {train_roc}')
            print(f'ROC_AUC ON TEST SCORE {test_roc}')
                  
        fold_number += 1 
        
    return train_results, test_results, preds, indicies

In [15]:
def hparam_tuning():
    param_dict = {
        'n_neighbors': random.randrange(5, 101, 5),
        'p': random.randrange(1, 3)
    }
    
    test_list = []
    for i in range(20):
        n_neighbors = param_dict['n_neighbors']
        p = param_dict['p']
        train_results, test_results, preds = CVKNN(df=data_std, n_neighbors=n_neighbors, p=p, n_jobs=-1)
        print([n_neighbors, p, np.mean(test_results)])
        test_list.append([n_neighbors, p, np.mean(test_results)])
    
    return sorted(test_list, key = lambda x : x[-1], reverse=True)

In [None]:
hparam_tuning()

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.7688695441136144
ROC_AUC ON TEST SCORE 0.7562281861100454
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7703545093197344
ROC_AUC ON TEST SCORE 0.7567326273688277
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.7691408372355438
ROC_AUC ON TEST SCORE 0.7620564477748127
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.7713850634392562
ROC_AUC ON TEST SCORE 0.7667186813593059
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.769490046394746
ROC_AUC ON TEST SCORE 0.7666166731361128
[45, 2, 0.7616705231498209]
FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.7688695441136144
ROC_AUC ON TEST SCORE 0.7562281861100454
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7703545093197344
ROC_AUC ON TEST SCORE 0.7567326273688277
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.7691408372355438
ROC_AUC ON TEST SCORE 0.7620564477748127
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.7713850634392562
ROC_AUC ON TEST SCORE 0.7667186813593059
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.769490046394746
ROC_AUC ON TEST SCORE 0.76661667313

In [9]:
CVKNN(df=data_std, n_neighbors=30, n_jobs=-1)

FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.7694601004176923
ROC_AUC ON TEST SCORE 0.7477197113932929
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7712681416779226
ROC_AUC ON TEST SCORE 0.7549450381203106
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.7679950733917819
ROC_AUC ON TEST SCORE 0.7557930046716483
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.7709229546408058
ROC_AUC ON TEST SCORE 0.762600002176636
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.7690136132993953
ROC_AUC ON TEST SCORE 0.7646737663008017


([0.7694601004176923,
  0.7712681416779226,
  0.7679950733917819,
  0.7709229546408058,
  0.7690136132993953],
 [0.7477197113932929,
  0.7549450381203106,
  0.7557930046716483,
  0.762600002176636,
  0.7646737663008017],
 [array([0, 0, 1, ..., 0, 0, 1]),
  array([0, 1, 0, ..., 0, 0, 1]),
  array([0, 0, 0, ..., 0, 0, 0]),
  array([0, 1, 1, ..., 0, 1, 1]),
  array([0, 0, 0, ..., 1, 0, 0])])

In [20]:
import pickle
with open('mutual_info_oh.pkl', 'rb') as f:
    mutual_info = pickle.load(f)

In [33]:
mutual_info[0][1]

'martial_status_ Married-civ-spouse'

In [None]:
mutual_info

In [34]:
def test_features(n_neighbors=30, p=1):
    info_list = []
    features_list = []
    for i in range(5):
        features_list.append(mutual_info[i][1])
        train_results, test_results, preds, indicies = CVKNN(df=data_std, n_neighbors=n_neighbors, p=p, n_jobs=-1, features=features_list)
        info_list.append([features_list, np.mean(test_results)])
    return info_list

In [35]:
test = test_features()

['martial_status_ Married-civ-spouse']
FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.5
ROC_AUC ON TEST SCORE 0.5
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.5
ROC_AUC ON TEST SCORE 0.5
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.5
ROC_AUC ON TEST SCORE 0.5
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.5
ROC_AUC ON TEST SCORE 0.5
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.7584514503125728
ROC_AUC ON TEST SCORE 0.767895792337861
['martial_status_ Married-civ-spouse', 'education_num']
FOLD NUMBER: 1
ROC_AUC ON TRAIN SCORE 0.6841394226534706
ROC_AUC ON TEST SCORE 0.6762028166977655
FOLD NUMBER: 2
ROC_AUC ON TRAIN SCORE 0.7357700219133004
ROC_AUC ON TEST SCORE 0.738562928356333
FOLD NUMBER: 3
ROC_AUC ON TRAIN SCORE 0.691962356712054
ROC_AUC ON TEST SCORE 0.6889319263972233
FOLD NUMBER: 4
ROC_AUC ON TRAIN SCORE 0.6908346628554898
ROC_AUC ON TEST SCORE 0.6868665405945222
FOLD NUMBER: 5
ROC_AUC ON TRAIN SCORE 0.6877368042677121
ROC_AUC ON TEST SCORE 0.6992598055296577
['martial_status_ Married-civ-spouse', 'educatio