In [76]:
## Imports
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
import io
#nltk.download('punkt')
from collections import Counter
import phik
import math
import plotly.express as px


In [77]:
## Read in the data
df = pd.read_csv("encoded_data.csv")

In [78]:
## Distance metric functions

def euclidean(x, y):
    """Return the euclidean distance of two vectors"""
    return np.sqrt(sum((x-y)**2))


def cossim(x, y):
    """Return the cosine similarity distance metric of two vectors"""
    magx = np.sqrt(np.dot(x, x))
    magy = np.sqrt(np.dot(y, y))
    return np.dot(x, y) / (magx * magy)


def hamming(x, y):
    """Return the hamming distance metric of two vectors"""
    return np.logical_xor(x, y).sum()

In [79]:
## Functions to return evaluation metrics 

def confusion_matrix(y_true, y_pred):
    """ Generate a confusion matrix.
    y = actual outcomes (0, 1, 2, ...)
    y_pred = predicted outcomes (0, 1, 2, ...)
    return confusion matrix as a numpy array
    """
    
    # Find unique identifiers
    unique_classes = set(y_true) | set(y_pred)
    n_classes = len(unique_classes)
    
    # Create matrix (all zeros)
    matrix = np.zeros(shape=(n_classes, n_classes), dtype=int)
    
    # Pair up each actual outcome with the corresponding prediction
    actual_prediction = list(zip(y_true, y_pred))
    
    # For each pair, increment the correct position in the matrix
    for i,j in actual_prediction:
        matrix[i,j] += 1
        
    return matrix


def metrics(y_true, y_pred, places=4):
    """ Generate accuracy scores for classifier.
    Round each score to <places> decimal places """
    scores = {}
    C = confusion_matrix(y_true, y_pred)
    scores['accuracy'] = C.diagonal().sum() / C.sum()
    if C.shape == (2,2):
        TN, FP, FN, TP = C.ravel() 
        scores['sensitivity'] = TP / (TP + FN)
        scores['specificity'] = TN / (TN + FP)
        scores['precision'] = TP / (TP + FP) 
        scores['f1-score'] = 2*((scores['specificity'] * scores['precision'])/(scores['specificity'] + scores['precision']))
    else:
        pass 
    return scores  


In [80]:
## Functions to return k-Nearest Neighbor predictions 

def sim_matrix(A, f):
    """Compute similarity matrix 
    A: array of instance attributes 
    f: similarity / distance measure """
    m = A.shape[0]
    M = np.zeros(shape=(m,m))
    for i in range(m):
        for j in range(m):
            M[i,j] = f(A[i,], A[j,])
        
    return M

def knn(k, f, df):
    """Determine a list of predictions according to k-nearest neighbors
    k: the k-value desired for k-nearest neighbors
    f: distance metric
    df: pandas dataframe """
    #distances = []
    predictions = []
    A = np.array(df)
    M = sim_matrix(A, f)
    for i in range(len(df)):
        sims = list(zip(M[i], range(len(M))))
        sorted(sims, reverse=True)[1:k+1]
        nearest = [idx for sim, idx in sorted(sims, reverse=True)[1:k+1]]
        vote = Counter(df.iloc[nearest,:].status).most_common(1)[0][0]
        predictions.append(vote)
    return predictions

In [81]:
## Functions to graph accuracy as a function of k

def accuracy(y_true, y_pred, places=4):
    """ Generate accuracy scores for classifier.
    Round each score to <places> decimal places """
    scores = {}
    C = confusion_matrix(y_true, y_pred)
    scores['accuracy'] = C.diagonal().sum() / C.sum()
    return scores['accuracy']


def accuracy_vals(f, df):
    """Return a list of accuracy scores for the kNN function determined based on a range of k values
    f: distance metric
    df: pandas dataframe"""
    accuracy_list = []
    for k in range(1,40):
        prediction = knn(k,f,df)
        #prediction = binary_convert(prediction, "Placed", "Not Placed")
        accuracy_val = accuracy(outcome,prediction)
        accuracy_list.append(accuracy_val)
    return accuracy_list


def plot_accuracy(f, df):
    """Function to plot accuracy as a function of k
    f: distance metric 
    df: pandas dataframe"""
    k_values = list(range(1,40))
    fig = px.line(x=k_values, y=accuracy_vals(f, df))
    fig.update_layout(
        title_text='Accuracy as a function of k'
    )
    fig.show()


In [82]:
## Creating the train test split
def train_test(split_perc, df):
    """Creating the train test split
    split_perc: percentage for train test split
    df: pandas dataframe"""
    df.sample(frac=1)
    rows = df.shape[0]
    train_size = int(rows*split_perc)
    train = df[0:train_size]
    test = df[train_size:]
    return train,test

***
Creating training and testing data 

In [83]:
train, test = train_test(0.8, df)

In [84]:
train

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,degree_percentage,work_experience,emp_test_percentage,specialisation,mba_percent,status,hsc_subject_Arts,hsc_subject_Commerce,hsc_subject_Science,undergrad_degree_Comm&Mgmt,undergrad_degree_Others,undergrad_degree_Sci&Tech
0,1,67.00,0,91.00,0,58.00,0,55.00,0,58.80,1,0,1,0,0,0,1
1,1,79.33,1,78.33,0,77.48,1,86.50,1,66.28,1,0,0,1,0,0,1
2,1,65.00,1,68.00,1,64.00,0,75.00,1,57.80,1,1,0,0,1,0,0
3,1,56.00,1,52.00,1,52.00,0,66.00,0,59.43,0,0,0,1,0,0,1
4,1,85.80,1,73.60,1,73.30,0,96.80,1,55.50,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167,1,67.90,0,62.00,0,67.00,1,58.10,1,75.71,0,0,0,1,0,0,1
168,0,48.00,1,51.00,1,58.00,1,60.00,0,58.79,0,0,1,0,1,0,0
169,1,59.96,0,42.16,0,61.26,0,54.48,0,65.48,0,0,0,1,0,0,1
170,0,63.40,0,67.20,0,60.00,0,58.06,0,69.28,0,0,1,0,1,0,0


In [85]:
test

Unnamed: 0,gender,ssc_percentage,ssc_board,hsc_percentage,hsc_board,degree_percentage,work_experience,emp_test_percentage,specialisation,mba_percent,status,hsc_subject_Arts,hsc_subject_Commerce,hsc_subject_Science,undergrad_degree_Comm&Mgmt,undergrad_degree_Others,undergrad_degree_Sci&Tech
172,1,73.0,0,58.0,0,56.0,0,84.0,0,52.64,1,0,1,0,1,0,0
173,0,52.0,0,52.0,0,55.0,0,67.0,0,59.32,0,0,0,1,0,0,1
174,1,73.24,0,50.83,0,64.27,1,64.0,1,66.23,1,0,0,1,0,0,1
175,1,63.0,0,62.0,0,65.0,0,87.5,0,60.69,0,0,0,1,0,0,1
176,0,59.0,1,60.0,0,56.0,0,55.0,0,57.9,1,0,1,0,1,0,0
177,0,73.0,1,97.0,0,79.0,1,89.0,1,70.81,1,0,1,0,1,0,0
178,1,68.0,0,56.0,0,68.0,0,73.0,0,68.07,1,0,0,1,0,0,1
179,0,77.8,1,64.0,1,64.2,0,75.5,0,72.14,0,0,0,1,0,0,1
180,1,65.0,1,71.5,0,62.8,1,57.0,1,56.6,1,0,1,0,1,0,0
181,1,62.0,1,60.33,0,64.21,0,63.0,0,60.02,0,0,0,1,0,0,1


In [86]:
X_train = train.drop("status", axis=1)
y_train = train["status"]
X_test = test.drop("status", axis=1)
y_test = test["status"]

***
<b> Hyperparameter Tuning </b>

In [87]:
## Find k that gives the highest accuracy with Euclidean metric on training data
plot_accuracy(euclidean, train)

k=35 gives the highest accuracy of 0.3372093 with the Euclidean metric on the training data

In [88]:
## Find k that gives the highest accuracy with cossim metric on training data
plot_accuracy(cossim, train)

k=11 gives the highest accuracy of 0.8372093 with the Cossine Similarity metric on the training data

In [89]:
## Find k that gives the highest accuracy with hamming metric on training data
plot_accuracy(hamming, train)

k=39 gives the highest accuracy of 0.5116279 with the Hamming metric on the training data

<b> RESULTS OF TUNING </b>
<br>
Out of the three distance metrics and the range of k values from 1-39, 
k=11 gives the highest accuracy of 0.8372093 with the Cossine Similarity metric on the training data. 


***
<b> Results of Training vs. Results of Testing Using Best Tuning Results </b>

In [90]:
predictions_test = knn(11, cossim, test)
predictions_train = knn(11, cossim, train)

In [91]:
metrics(predictions_train, y_train)

{'accuracy': 0.8372093023255814,
 'sensitivity': 0.8251748251748252,
 'specificity': 0.896551724137931,
 'precision': 0.9752066115702479,
 'f1-score': 0.9342265529841657}

In [92]:
metrics(predictions_test, y_test)

{'accuracy': 0.7209302325581395,
 'sensitivity': 0.7027027027027027,
 'specificity': 0.8333333333333334,
 'precision': 0.9629629629629629,
 'f1-score': 0.8934707903780069}