In [1]:
import pandas as pd
import numpy as np
from random import sample

In [2]:
dataframe = pd.read_csv('16P.csv', encoding='cp1252')

In [3]:
dataframe.drop("Response Id", inplace=True, axis=1)

In [4]:
personality_to_encode = {
"ESTJ" : 0,
"ENTJ" : 1,
"ESFJ" : 2,
"ENFJ" : 3,
"ISTJ" : 4,
"ISFJ" : 5,
"INTJ" : 6,
"INFJ" : 7,
"ESTP" : 8,
"ESFP" : 9,
"ENTP" : 10,
"ENFP" : 11,
"ISTP" : 12,
"ISFP" : 13,
"INTP" : 14,
"INFP" : 15
}

dataframe.Personality = [personality_to_encode[item] for item in dataframe.Personality]

dataframe.head()

Unnamed: 0,You regularly make new friends.,You spend a lot of your free time exploring various random topics that pique your interest,Seeing other people cry can easily make you feel like you want to cry too,You often make a backup plan for a backup plan.,"You usually stay calm, even under a lot of pressure","At social events, you rarely try to introduce yourself to new people and mostly talk to the ones you already know",You prefer to completely finish one project before starting another.,You are very sentimental.,You like to use organizing tools like schedules and lists.,Even a small mistake can cause you to doubt your overall abilities and knowledge.,...,You believe that pondering abstract philosophical questions is a waste of time.,"You feel more drawn to places with busy, bustling atmospheres than quiet, intimate places.",You know at first glance how someone is feeling.,You often feel overwhelmed.,You complete things methodically without skipping over any steps.,You are very intrigued by things labeled as controversial.,You would pass along a good opportunity if you thought someone else needed it more.,You struggle with deadlines.,You feel confident that things will work out for you.,Personality
0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,-1,0,0,0,0,0,11
1,0,0,-2,-3,-1,2,-2,0,3,0,...,0,-2,0,2,0,-1,-1,-1,3,13
2,0,0,2,0,-1,2,0,0,1,0,...,0,2,0,2,-1,0,1,2,1,7
3,0,-1,3,-1,0,0,-2,0,-2,0,...,0,0,-1,-1,0,1,0,-2,-1,12
4,0,0,-1,0,2,-1,-2,0,1,0,...,0,1,0,2,0,1,-1,2,-1,3


In [5]:
df_array = dataframe.to_numpy()

In [6]:
def cross_validation_split(array):
    n = len(array)
    indices = sample(range(n), n)
    folds = np.array_split(array[indices], 5)
    
    return folds

In [7]:
def normalize(array):
    for i in range(array.shape[1]-1):
        column = array[:,i]
        min = float(column.min())
        max = float(column.max())
        
        if max - min != 0:
            array[:, i] = (column - min) / (max - min)
        else:
            array[:, i] = column
    
    return array

In [8]:
def euclidean_distance(train, test):
    train = np.array(train)
    test = np.array(test)
    
    distance = np.sqrt(-2*np.dot(test, train.T) + np.sum(train**2, axis=1) + np.transpose([np.sum(test**2, axis=1)]))
    
    return distance

In [9]:
def get_nearest_neighbors(distances, k_neighbors):
    neighbors = np.argsort(distances, axis=1)[:, :k_neighbors]
    
    return neighbors

In [10]:
def predict_classification(train, nearest_neighbors, distances, weighted=False):
    predictions = np.empty(len(nearest_neighbors))
    
    for i in range(len(nearest_neighbors)):
        if weighted:
            distances_i = distances[i][nearest_neighbors[i]]
            distances_i = np.where(distances_i==0, 1e-8, distances_i)
            weights = 1 / (distances_i)
            classes = train[nearest_neighbors[i], -1]
            class_weights = {}
            
            for c, w in zip(classes, weights):
                if c in class_weights:
                    class_weights[c] += w
                else:
                    class_weights[c] = w
            
            max_class = max(class_weights, key=class_weights.get)
            predictions[i] = max_class
        
        else:
            classes = train[nearest_neighbors[i], -1]
            class_counts = {}
            
            for c in classes:
                if c in class_counts:
                    class_counts[c] += 1
                else:
                    class_counts[c] = 1
            
            max_class = max(class_counts, key=class_counts.get)
            predictions[i] = max_class
    
    return predictions

In [11]:
def kNN(train, distances, k_neighbors):
    train_ar = np.array(train)

    nearest_neighbors = get_nearest_neighbors(distances, k_neighbors)
    predictions, predictions_weighted = predict_classification(train_ar, nearest_neighbors, distances), predict_classification(train_ar, nearest_neighbors, distances, weighted=True)
    
    return predictions, predictions_weighted

In [12]:
def accuracy(actual, predicted):
    correct = float(0)
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1

    return correct / len(actual) * 100

In [13]:
def precision_recall(actual, predicted, target):
    true_positive = float(0)
    false_positive = float(0)
    false_negative = float(0)
    
    for i in range(len(actual)):
        if actual[i] == target and predicted[i] == target:
            true_positive += 1
        elif predicted[i] == target:
            false_positive += 1
        elif actual[i] == target:
            false_negative += 1
      
    if true_positive == 0:
        return float(0)
  
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
  
    return precision, recall

In [14]:
target_value_array = list(set(df_array[:,-1]))
table_accuracy = np.zeros(shape=(5,20))
table_precision = np.zeros(shape=(16,20))
table_recall = np.zeros(shape=(16,20))

def classification():
    df_array_normalized = normalize(np.copy(df_array))
    df_array_unnormalized = np.copy(df_array)
    df_array_normalized_cvs = cross_validation_split(np.copy(df_array_normalized))
    df_array_unnormalized_cvs = cross_validation_split(np.copy(df_array_unnormalized))

    for i in range(5):
        normalized_test = np.copy(df_array_normalized_cvs[i])
        normalized_train = list()
        unnormalized_test = np.copy(df_array_unnormalized_cvs[i])
        unnormalized_train = list()
        
        for j in range(5):
            if i != j:
                normalized_train.extend(df_array_normalized_cvs[j][:])
                unnormalized_train.extend(df_array_unnormalized_cvs[j][:])
        
        distances_u  = euclidean_distance(unnormalized_train, unnormalized_test)
        distances_n = euclidean_distance(normalized_train, normalized_test)
        
        for n in range(5):
             # Unnormalized
            predictions_unnormalized, predictions_weighted_unnormalized = kNN(unnormalized_train, distances_u, 2*n+1)
            unnormalized_accuracy = accuracy(unnormalized_test[:,-1],predictions_unnormalized)
            unnormalized_accuracy_weighted = accuracy(unnormalized_test[:,-1],predictions_weighted_unnormalized)

            # Table
            table_accuracy[i][4*n] = (unnormalized_accuracy)
            table_accuracy[i][4*n+1] = (unnormalized_accuracy_weighted)
        
            print("Unnormalized Accuracy for fold {}: {} with {} neighbors".format(str(i+1),str(unnormalized_accuracy),str(2*n+1)))
            print("Unnormalized Weighted Accuracy for fold {}: {} with {} neighbors".format(str(i+1),str(unnormalized_accuracy_weighted),str(2*n+1)))
            for k in target_value_array:
                unnormalized_precision, unnormalized_recall = precision_recall(unnormalized_test[:,-1],predictions_unnormalized,k)
                table_precision[target_value_array.index(k)][4*n] += unnormalized_precision
                table_recall[target_value_array.index(k)][4*n] += unnormalized_recall
                print("Unnormalized Precision and Recall values for 'Encode {}' target on fold {}: {} , {} with {} neighbors".format(str(k),str(i+1),str(unnormalized_precision),str(unnormalized_recall),str(2*n+1)))
            
                unnormalized_precision_weighted, unnormalized_recall_weighted = precision_recall(unnormalized_test[:,-1],predictions_weighted_unnormalized,k)
                table_precision[target_value_array.index(k)][4*n+1] += unnormalized_precision_weighted
                table_recall[target_value_array.index(k)][4*n+1] += unnormalized_recall_weighted
                print("Unnormalized Weighted Precision and Recall values for 'Encode {}' target on fold {}: {} , {} with {} neighbors".format(str(k),str(i+1),str(unnormalized_precision_weighted),str(unnormalized_recall_weighted),str(2*n+1)))
            
            # Normalized
            predictions_normalized, predictions_weighted_normalized = kNN(normalized_train, distances_n, 2*n+1)
            normalized_accuracy = accuracy(normalized_test[:,-1],predictions_normalized)
            normalized_weighted_accuracy = accuracy(normalized_test[:,-1],predictions_weighted_normalized)
            
            # Table
            table_accuracy[i][4*n+2] = normalized_accuracy
            table_accuracy[i][4*n+3] = normalized_weighted_accuracy
            
            print("Normalized Accuracy for fold {}: {} with {} neighbors".format(str(i+1),str(normalized_accuracy),str(2*n+1)))
            print("Normalized Weighted Accuracy for fold {}: {} with {} neighbors".format(str(i+1),str(normalized_weighted_accuracy),str(2*n+1)))

            for l in target_value_array:
                normalized_precision, normalized_recall = precision_recall(normalized_test[:,-1],predictions_normalized,l)
                table_precision[target_value_array.index(l)][4*n+2] += normalized_precision
                table_recall[target_value_array.index(l)][4*n+2] += normalized_recall
                print("Normalized Precision and Recall values for 'Encode {}' target on fold {}: {} , {} with {} neighbors".format(str(l),str(i+1),str(normalized_precision),str(normalized_recall),str(n)))
            
                normalized_precision_weighted, normalized_recall_weighted = precision_recall(normalized_test[:,-1],predictions_weighted_normalized,l)
                table_precision[target_value_array.index(l)][4*n+3] += normalized_precision_weighted
                table_recall[target_value_array.index(l)][4*n+3] += normalized_recall_weighted
                print("Normalized Weighted Precision and Recall values for 'Encode {}' target on fold {}: {} , {} with {} neighbors".format(str(l),str(i+1),str(normalized_precision_weighted),str(normalized_recall_weighted),str(2*n+1)))


In [15]:
classification()

Unnormalized Accuracy for fold 1: 99.05833333333334 with 1 neighbors
Unnormalized Weighted Accuracy for fold 1: 99.05833333333334 with 1 neighbors
Unnormalized Precision and Recall values for 'Encode 0' target on fold 1: 0.9910485933503836 , 0.9897828863346104 with 1 neighbors
Unnormalized Weighted Precision and Recall values for 'Encode 0' target on fold 1: 0.9910485933503836 , 0.9897828863346104 with 1 neighbors
Unnormalized Precision and Recall values for 'Encode 1' target on fold 1: 0.9866666666666667 , 0.9906291834002677 with 1 neighbors
Unnormalized Weighted Precision and Recall values for 'Encode 1' target on fold 1: 0.9866666666666667 , 0.9906291834002677 with 1 neighbors
Unnormalized Precision and Recall values for 'Encode 2' target on fold 1: 0.985545335085414 , 0.9933774834437086 with 1 neighbors
Unnormalized Weighted Precision and Recall values for 'Encode 2' target on fold 1: 0.985545335085414 , 0.9933774834437086 with 1 neighbors
Unnormalized Precision and Recall values f

Error Analysis for Classification:

- When k neighbors are very low, such as 1, the accuracy is at its lowest. This means that there are not enough neighbors for correct prediction. In general, increasing the value of k also increases the accuracy, but if at some point the particular target class is dominant in the test data, it will overcount the true class and there will be a maximum number of neighbors. This leads to incorrect predictions and reduces accuracy.

- Weighted kNN gave almost the same results as kNN because nearest neighbors also have the highest weights and nearest neighbors are also the heaviest except for a few instances. Since the difference between weighted and unweighted kNNs is very small, they give the same result when it comes to accuracy (the wrong predictions are different for weighted and unweighted kNN, but they are still wrong, so the accuracy is the same).

- Normalization slightly reduced the accuracy for low values of k neighbors, but provided better accuracy for larger values of k neighbors.

In [16]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)

df_accuracy = pd.DataFrame(table_accuracy, index=['Fold1','Fold2','Fold3','Fold4','Fold5'], columns=range(1,21))    
df_accuracy.head(5)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Fold1,99.058,99.058,98.283,98.283,99.183,99.183,99.667,99.675,99.183,99.183,99.892,99.892,99.2,99.2,99.958,99.958,99.217,99.217,99.95,99.95
Fold2,98.917,98.917,98.017,98.017,99.033,99.033,99.575,99.583,99.017,99.017,99.825,99.833,98.992,98.992,99.95,99.958,99.0,99.0,99.992,99.992
Fold3,98.983,98.983,98.25,98.25,99.092,99.092,99.65,99.65,99.092,99.1,99.867,99.867,99.108,99.108,99.942,99.942,99.125,99.125,99.958,99.958
Fold4,98.958,98.958,98.2,98.2,99.125,99.125,99.533,99.542,99.058,99.058,99.833,99.825,99.075,99.075,99.942,99.942,99.067,99.058,99.958,99.958
Fold5,98.967,98.967,98.175,98.175,99.167,99.167,99.567,99.575,99.15,99.15,99.867,99.867,99.108,99.117,99.925,99.925,99.142,99.142,99.983,99.983


* This table shows the accuracy of each folds for the number k. Every 4 columns contain a different number (1,3,5,7,9) of k neighbors; unnormalized KNN, unnormalized weighted KNN, normalized KNN, and normalized weighted KNN, respectively.

In [17]:
table_precision /=5.
df_precision = pd.DataFrame(table_precision, index=target_value_array, columns=range(1,21))
df_precision.head(16)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,0.993,0.993,0.986,0.986,0.994,0.994,0.995,0.995,0.994,0.994,0.997,0.997,0.994,0.994,0.999,0.999,0.995,0.995,1.0,1.0
1,0.988,0.988,0.978,0.978,0.991,0.991,0.997,0.997,0.992,0.992,0.999,0.999,0.992,0.992,1.0,1.0,0.992,0.992,1.0,1.0
2,0.99,0.99,0.98,0.98,0.991,0.991,0.996,0.996,0.99,0.99,0.998,0.998,0.989,0.99,0.999,0.999,0.991,0.99,1.0,1.0
3,0.992,0.992,0.977,0.977,0.993,0.993,0.995,0.995,0.993,0.993,0.998,0.998,0.992,0.992,0.999,0.999,0.992,0.992,0.999,0.999
4,0.991,0.991,0.979,0.979,0.993,0.993,0.994,0.994,0.992,0.992,0.999,0.999,0.992,0.992,0.999,0.999,0.991,0.99,0.999,0.999
5,0.987,0.987,0.976,0.976,0.99,0.99,0.996,0.996,0.99,0.99,0.999,0.999,0.99,0.99,0.999,0.999,0.989,0.989,1.0,1.0
6,0.987,0.987,0.977,0.977,0.988,0.988,0.993,0.993,0.988,0.988,0.999,0.999,0.988,0.988,1.0,1.0,0.989,0.989,1.0,1.0
7,0.992,0.992,0.983,0.983,0.992,0.992,0.997,0.997,0.992,0.992,0.999,0.999,0.992,0.992,1.0,1.0,0.992,0.992,1.0,1.0
8,0.989,0.989,0.982,0.982,0.99,0.99,0.996,0.996,0.989,0.989,0.999,0.999,0.99,0.99,0.999,0.999,0.99,0.99,0.999,0.999
9,0.99,0.99,0.989,0.989,0.991,0.991,0.997,0.997,0.99,0.99,0.999,0.999,0.99,0.99,0.999,0.999,0.991,0.99,1.0,1.0


* This table shows the average of 5-fold precisions for each target class.

In [18]:
table_recall /= 5.
df_recall = pd.DataFrame(table_recall, index=target_value_array, columns=range(1, 21))
df_recall.head(16)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,0.991,0.991,0.996,0.996,0.992,0.992,1.0,1.0,0.991,0.991,1.0,1.0,0.991,0.991,1.0,1.0,0.991,0.991,1.0,1.0
1,0.992,0.992,0.975,0.975,0.993,0.993,0.993,0.993,0.993,0.993,0.996,0.996,0.992,0.992,0.999,0.999,0.992,0.992,1.0,1.0
2,0.988,0.988,0.968,0.968,0.991,0.991,0.994,0.994,0.991,0.991,0.998,0.998,0.991,0.991,0.999,0.999,0.991,0.991,0.999,0.999
3,0.99,0.99,0.976,0.976,0.992,0.992,0.993,0.994,0.991,0.991,0.998,0.998,0.991,0.991,0.999,0.999,0.991,0.991,0.999,0.999
4,0.99,0.99,0.979,0.979,0.991,0.991,0.996,0.996,0.992,0.992,0.998,0.998,0.992,0.992,1.0,1.0,0.992,0.992,1.0,1.0
5,0.991,0.991,0.982,0.982,0.992,0.992,0.996,0.996,0.992,0.992,1.0,1.0,0.992,0.992,1.0,1.0,0.992,0.992,1.0,1.0
6,0.99,0.99,0.981,0.981,0.992,0.992,0.997,0.997,0.991,0.991,0.999,0.999,0.992,0.992,0.999,0.999,0.992,0.992,1.0,1.0
7,0.989,0.989,0.975,0.975,0.99,0.99,0.993,0.993,0.99,0.99,0.997,0.997,0.99,0.99,0.999,0.999,0.99,0.99,0.999,0.999
8,0.99,0.99,0.981,0.981,0.992,0.992,0.996,0.996,0.992,0.992,0.998,0.998,0.992,0.992,0.999,0.999,0.992,0.992,1.0,1.0
9,0.986,0.986,0.989,0.989,0.988,0.988,0.998,0.998,0.988,0.988,0.999,0.999,0.988,0.988,0.999,0.999,0.988,0.988,0.999,0.999


* This table shows the average of 5-fold recalls for each target class.