# SCRIPT TO TEST K NEAREST NEIGHBOURS MODELS
Configuration of cross validation<br>
Execution of cross validation<br>
Results of cross validation

In [None]:
#!/usr/bin/env python
# coding: utf-8

# Inject a CSS rule so that the notebook container spans the full browser width
from IPython.display import display, HTML
full_width_style = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_style)

In [None]:
# Python version and rest of packages needed

import sys
# print('Python: {}'.format(sys.version))
import scipy
# print('scipy: {}'.format(scipy.__version__))
import numpy as np
# print('numpy: {}'.format(np.__version__))
import matplotlib as mat
# print('matplotlib: {}'.format(mat.__version__))
import pandas as pd
# print('pandas: {}'.format(pd.__version__))
import sklearn as sk
# print('sklearn: {}'.format(sk.__version__))
import pyreadstat
# print('pyreadstat: {}'.format(pyreadstat.__version__))
import imblearn as im
# print('imblearn: {}'.format(im.__version__))
import joblib
# print('joblib: {}'.format(joblib.__version__))
import graphviz
# print('graphviz: {}'.format(graphviz.__version__))

In [None]:
# Import functions from packages

from collections import Counter
from numpy import mean
from numpy import std
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from time import process_time

In [None]:
# Read the comma-separated input file into a DataFrame and show it.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.

dataset_path = '../INPUT_dataset/BDsocioeconomic_dummy.csv'
df = pd.read_csv(dataset_path, delimiter=',', low_memory=False)
df

In [None]:
# List the column labels of the loaded dataset
# (as the last expression of a notebook cell the result is shown as the cell output)

df.columns

In [None]:
# Class balance of the whole dataset: percentage of rows in each class
# (1 = suicidal behaviour, 0 = non suicidal behaviour) and the raw counts per class.
class_labels = df['Class_suicidal_behaviour']
n_rows = df.shape[0]
suicidal_behaviour_rate = int((class_labels == 1).sum()) / n_rows * 100
non_suicidal_behaviour_rate = int((class_labels == 0).sum()) / n_rows * 100
n_non_yes_suicidal_behaviour = Counter(class_labels)
print('The rate of non suicidal behaviour (0) is: {:.2f}%'.format(non_suicidal_behaviour_rate))
print('The rate of suicidal behaviour (1) is: {:.2f}%'.format(suicidal_behaviour_rate))
print('Number of instances of each class:')
print(n_non_yes_suicidal_behaviour)

In [None]:
# Visualise one observation (the first row) without its class label.
# df.iloc[0] is a Series whose index entries are the column names, so the class
# has to be removed by label: Series.drop() ignores the `columns` keyword and
# would return the row unchanged.
# NOTE: a leftover statement from a previous version of this notebook
# (`train_targets = train['Clase_suicidio']`) was removed here; `train` is never
# defined, so it raised a NameError, and `train_targets` is not used afterwards.
first_instance = df.iloc[0].drop(labels=['Class_suicidal_behaviour'])
first_instance

In [None]:
# Work on an independent (deep) copy so the DataFrame loaded from disk stays untouched
dataset = df.copy(deep=True)

In [None]:
# Name of the column that holds the class (dependent variable)
class_column = 'Class_suicidal_behaviour'
# Observations: every independent variable, i.e. all columns except the class
dataset_values = dataset.drop(class_column, axis=1)
# Classes: a single-column DataFrame with the dependent variable
dataset_targets = dataset[[class_column]]

In [None]:
# configuration of the cross-validation process
# (these three values are passed to RepeatedStratifiedKFold below and are also
#  part of the name of the results file)
folds = 10          # number of splits (n_splits) in each repetition
repetitions = 3     # number of times the k-fold split is repeated (n_repeats)
seed = 1            # random_state shared by the splitter and the resamplers


In [None]:
# Splitter that generates the train/test index pairs of the cross-validation
cv = RepeatedStratifiedKFold(n_splits=folds, n_repeats=repetitions, random_state=seed)

# Hyper-parameters of the KNeighborsClassifier algorithm
param_n_neighbors = 5
param_weights = 'uniform'
this_model = KNeighborsClassifier(n_neighbors=param_n_neighbors, weights=param_weights)

# Importance of the classes: missing a suicidal behaviour costs more
# (False Negatives are worse than False Positives).
# KNeighborsClassifier cannot use these class weights; they are kept because
# they are part of the name of the results file.
param_class_weight_min = 0.2
param_class_weight_max = 0.8
class_weight = {0: param_class_weight_min, 1: param_class_weight_max}

# Verbosity of the per-iteration output (0=none, 1=low, 2=high)
debug_level = 2

# Storage for the intermediate results: one slot per experiment (fold x repetition)
total_experiments = folds * repetitions
all_conf_matrix = np.zeros((total_experiments, 2, 2))
(
    all_accuracy,
    all_sensitivity,
    all_specificity,
    all_PPV_precision,
    all_NPV,
    all_FNR,
    all_F1,
) = (np.zeros(total_experiments) for _ in range(7))

# Index of the slot that the next experiment will fill
current_experiment = 0

# Start timer (process_time counts CPU time of this process)
timer_start = process_time()

# Repeated stratified cross-validation.
# For each (train, test) index pair generated by `cv`:
#   1. the TRAINING fold is rebalanced (SMOTE oversampling + random undersampling),
#   2. the model is fitted on the rebalanced, min-max scaled training data,
#   3. the untouched test fold is predicted and the confusion matrix and the
#      derived statistics are stored in slot `current_experiment` of the all_* arrays.

# Rebalancing parameters. They do not change between folds, so they are set once.
# For imbalanced-learn samplers a float sampling_strategy is the target
# minority/majority ratio after resampling.
param_SMOTEsampling_strategy = 0.1
param_Undersampling_strategy = 0.2

# Positions of the columns that held INTEGER values in the original data.
# SMOTE creates synthetic rows by interpolation, so these columns are rounded
# back to whole numbers after the rebalancing.
integer_columns = [
    0,    # Sex_M0_F1
    2,    # Day_in_week
    3,    # Day_in_month
    4,    # Month
    5,    # Quarter
    6,    # Week_in_year
    7,    # Week_in_month
    8,    # Working_day
    9,    # Day1_Night2
    17,   # Num_requests_last_months
]

# train_ix stores the index of the experiences that will be used for train
# test_ix stores the index of the experiences that will be used for test
for train_ix, test_ix in cv.split(dataset_values, dataset_targets):

    # we construct the training sets by separating observations from classes
    # for this we use the chosen training indices train_ix
    train_X = dataset_values.iloc[train_ix]
    train_Y = dataset_targets.iloc[train_ix]

    # we construct the test sets by separating observations from classes
    # for this we use the indexes chosen for test_ix
    test_X = dataset_values.iloc[test_ix]
    test_Y = dataset_targets.iloc[test_ix]

    # some information
    if debug_level>=1:
        print('\n')
        print('Experiment %d out of %d' % (current_experiment+1,total_experiments))
        suicidal_behaviour_rate = train_Y.loc[train_Y['Class_suicidal_behaviour']==1].shape[0]/train_Y.shape[0]*100
        n_non_yes_suicidal_behaviour = Counter(train_Y['Class_suicidal_behaviour'])
        print('The rate of suicidal behaviour (1) BEFORE rebalancing is: {:.2f}%'.format(suicidal_behaviour_rate))
        print('Number of instances of each class')
        print(n_non_yes_suicidal_behaviour)

    # to do the rebalancing, it works better if everything is scaled between 0 and 1
    this_scaler = preprocessing.MinMaxScaler()
    this_scaler = this_scaler.fit(train_X)
    train_X = this_scaler.transform(train_X)

    # we rebalance the dataset (training fold only)
    oversampling_conf = SMOTE(sampling_strategy=param_SMOTEsampling_strategy,random_state=seed)
    undersamplinf_conf = RandomUnderSampler(sampling_strategy=param_Undersampling_strategy,random_state=seed)
    rebalance_steps = [('o', oversampling_conf), ('u', undersamplinf_conf)]
    rebalance_pipeline = Pipeline(steps=rebalance_steps)
    train_X, train_Y = rebalance_pipeline.fit_resample(train_X, train_Y)

    # we return the data to their original values so that the integer
    # columns can be rounded in their original units
    train_X = this_scaler.inverse_transform(train_X)

    # rounding columns that had INTEGER values can help produce a better model.
    # One vectorised call replaces a Python-level round() per element;
    # np.rint rounds halves to the nearest even value, like round().
    train_X[:, integer_columns] = np.rint(train_X[:, integer_columns])

    # some information
    if debug_level>=1:
        suicidal_behaviour_rate = train_Y.loc[train_Y['Class_suicidal_behaviour']==1].shape[0]/train_Y.shape[0]*100
        n_non_yes_suicidal_behaviour = Counter(train_Y['Class_suicidal_behaviour'])
        print('The rate of suicidal behaviour (1) AFTER rebalancing is: {:.2f}%'.format(suicidal_behaviour_rate))
        print('Number of instances of each class')
        print(n_non_yes_suicidal_behaviour)

    # We scale the data: the scaler is fitted on the rebalanced training fold
    # only and then applied to both folds, so nothing from the test fold is
    # used for fitting.
    this_scaler = preprocessing.MinMaxScaler()
    this_scaler = this_scaler.fit(train_X)
    train_X = this_scaler.transform(train_X)
    # train_X is a plain array at this point, so the scaler was fitted without
    # feature names; the test fold is passed as an array too, which avoids the
    # scikit-learn feature-name warning on every fold (values are unchanged).
    test_X = this_scaler.transform(test_X.to_numpy())

    # we train the model with the rebalanced datasets
    this_model.fit(train_X, np.ravel(train_Y))

    # we make the prediction about the test observations
    predictions = this_model.predict(test_X)

    # we calculate the confusion matrix over the test classes
    conf_matrix = confusion_matrix(np.ravel(test_Y), np.ravel(predictions), labels=this_model.classes_)
    tn, fp, fn, tp = conf_matrix.ravel()

    # We recode the confusion matrix to make it more user-friendly.
    #     1   0
    # 1  tp  fn
    # 0  fp  tn

    # we store the result of the confusion matrix in the global
    all_conf_matrix[current_experiment] = [[tp,fn],[fp,tn]]

    # we store the statistics in the global (all of them as percentages)
    # NOTE(review): the counts are NumPy integers, so a zero denominator (for
    # example no positive predictions in a fold) gives nan with a RuntimeWarning
    # instead of an exception, and that nan propagates to the summary means.
    all_accuracy[current_experiment] = (tp+tn)/(tp+tn+fp+fn)*100
    all_sensitivity[current_experiment] = tp/(tp+fn)*100
    all_specificity[current_experiment] = tn/(tn+fp)*100
    all_PPV_precision[current_experiment] = tp/(tp+fp)*100
    all_NPV[current_experiment] = tn/(tn+fn)*100
    all_FNR[current_experiment] = fn/(tp+fn)*100
    all_F1[current_experiment] = (2*tp)/(2*tp+fp+fn)*100

    # some aditional information for every iteration
    if debug_level>=2:
        print('Confusion matrix:')
        print(all_conf_matrix[current_experiment])
        print('Accuracy: {:.3f}%'.format(all_accuracy[current_experiment]))
        print('Sensitivity: {:.3f}%'.format(all_sensitivity[current_experiment]))
        print('Specificity: {:.3f}%'.format(all_specificity[current_experiment]))
        print('Positive Predictive Value (PPV or precision): {:.3f}%'.format(all_PPV_precision[current_experiment]))
        print('Negative Predictive Value (NPV): {:.3f}%'.format(all_NPV[current_experiment]))
        print('False Negative Rate (FNR): {:.3f}%'.format(all_FNR[current_experiment]))
        print('F1: {:.3f}%'.format(all_F1[current_experiment]))
        print('n: {}'.format(tp+fn+fp+tn))
        print('\n')

    # we update to continue storing data
    current_experiment = current_experiment + 1

# end for

# Stop the timer
timer_stop = process_time()


# Element-wise mean and standard deviation of the confusion matrices
# over all the experiments
mean_conf_matrix = all_conf_matrix.mean(axis=0)
desvest_conf_matrix = all_conf_matrix.std(axis=0)

print("\n===============================")
print("SUMMARY FOR %d EXPERIMENTS" % (total_experiments))
print("\nCONFUSION MATRIX MEAN VALUE (for experiments)")
print(mean_conf_matrix)
print("\nCONFUSION MATRIX STD VALUE (for experiments)")
print(desvest_conf_matrix)
print('')

# One line per statistic: mean +- standard deviation over the experiments
for template, values in (
    ('Accuracy: {:.3f}% +- {:.3f}%', all_accuracy),
    ('Sensitivity: {:.3f}% +- {:.3f}%', all_sensitivity),
    ('Specificity: {:.3f}% +- {:.3f}%', all_specificity),
    ('Positive Predictive Value (PPV or precision): {:.3f}% +- {:.3f}%', all_PPV_precision),
    ('Negative Predictive Value (NPV): {:.3f}% +- {:.3f}%', all_NPV),
    ('False Negative Rate (FNR): {:.3f}% +- {:.3f}%', all_FNR),
    ('F1: {:.3f}% +- {:.3f}%', all_F1),
):
    print(template.format(values.mean(), values.std()))
print('\n')

# Time measured between the two process_time() readings
if debug_level>=0:
    print("===============================")
    print("Elapsed time for execution (in seconds):", timer_stop-timer_start)

In [None]:
# Write the results of the experiments to a text file

# The file name encodes the whole configuration of the run
name = "".join([
    "KNN",
    "_fold", str(folds),
    "_rep", str(repetitions),
    "_seed", str(seed),
    "_weight", str(param_class_weight_min), "-", str(param_class_weight_max),
    "_nNeighbors", str(param_n_neighbors),
    "_weights", str(param_weights),
    "_SMOTE", str(param_SMOTEsampling_strategy),
    "_Under", str(param_Undersampling_strategy),
    ".txt",
])

with open(name, 'w') as f:
    # Header: the configuration itself
    f.write(name)
    f.write('\n')

    # Mean +- standard deviation of every statistic
    for template, values in (
        ('\nAccuracy: {:.3f}% +- {:.3f}%', all_accuracy),
        ('\nSensitivity: {:.3f}% +- {:.3f}%', all_sensitivity),
        ('\nSpecificity: {:.3f}% +- {:.3f}%', all_specificity),
        ('\nPositive Predictive Value (PPV or precision): {:.3f}% +- {:.3f}%', all_PPV_precision),
        ('\nNegative Predictive Value (NPV): {:.3f}% +- {:.3f}%', all_NPV),
        ('\nFalse Negative Rate (FNR): {:.3f}% +- {:.3f}%', all_FNR),
        ('\nF1: {:.3f}% +- {:.3f}%', all_F1),
    ):
        f.write(template.format(values.mean(), values.std()))
    f.write('\n')

    # Raw per-experiment values of every statistic, one array per line
    for label, values in (
        ('\nall_accuracy = ', all_accuracy),
        ('\nall_sensitivity = ', all_sensitivity),
        ('\nall_specificity = ', all_specificity),
        ('\nall_PPV_precision = ', all_PPV_precision),
        ('\nall_NPV = ', all_NPV),
        ('\nall_FNR = ', all_FNR),
        ('\nall_F1 = ', all_F1),
    ):
        f.write(label)
        f.write(np.array2string(values, precision=3, separator=',',max_line_width=1000,suppress_small=True))
    f.write('\n')

    # Time measured around the cross-validation loop
    f.write("\nElapsed time (seconds) = ")
    f.write(str(timer_stop-timer_start))