In [13]:
import numpy as np
import pandas as pd
import operator

In [15]:
def knn_classifier(x, D, K, labels, measure):

    if measure == 0:
        # euclidean distances from the other points
        dists = np.sqrt(((D - x)**2).sum(axis=1))
    elif measure == 1:
        # first find the vector norm for each instance in D as wel as the norm for vector x
        D_norm = np.array([np.linalg.norm(D[i]) for i in range(len(D))])
        x_norm = np.linalg.norm(x)
        # Compute Cosine: divide the dot product o x and each instance in D by the product of the two norms
        sims = np.dot(D,x)/(D_norm * x_norm)
        # The distance measure will be the inverse of Cosine similarity
        dists = 1 - sims
    idx = np.argsort(dists) # sorting
    
    classCount={}
    for i in range(K):
        voteIlabel = labels[idx[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 # add to the count of the label or retun 1 for first occu
        sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0], idx[:K]

In [4]:
def classification_accurcay(target, x, D, K, lablels, measure):
    num_traget = len(target)
    errorCount = 0.0
    for i in range(num_target):
    
        classifierResult, neighbors = knn_classifier(x[i,:], D, K, labels, measure)
        if (classifierResult != target[i]): errorCount += 1.0
    
    error_rate = errorCount/float(num_target)
    
    return error_rate

In [43]:
training = np.genfromtxt('trainMatrixModified.txt', delimiter='\t', dtype = float)
training_transposed = training.T

labels = np.genfromtxt('trainClasses.txt', delimiter='\t', usecols = (1), dtype = int)


testing = np.genfromtxt('testMatrixModified.txt', delimiter='\t', dtype = float)
testing_transposed = testing.T

test_labels = np.genfromtxt('testClasses.txt', delimiter='\t', usecols = (1), dtype = int)

In [42]:
table = np.zeros((20,3), dtype=float)
for i in range(0,20):
    error_rate_euc = classification_accuracy(test_labels, testing_transposed, training_transposed, i+1, labels, 0)
    error_rate_cos = classification_accuracy(test_labels, testing_transposed, training_transposed, i+1, labels, 1)
    table[i] = [i+1, error_rate_euc, error_rate_cos]
    
print("Table")
print(" K  Euclidian  Cosine")
for row in table:
   print("%2.0f   %.2f    %.2f" % (row[0], row[1], row[2]) )

Table
 K  Euclidian  Cosine
 1   0.22    0.01
 2   0.22    0.01
 3   0.19    0.03
 4   0.19    0.01
 5   0.18    0.03
 6   0.17    0.01
 7   0.23    0.02
 8   0.20    0.02
 9   0.25    0.03
10   0.15    0.01
11   0.20    0.02
12   0.15    0.03
13   0.23    0.02
14   0.18    0.02
15   0.21    0.01
16   0.20    0.02
17   0.24    0.03
18   0.21    0.03
19   0.26    0.03
20   0.23    0.03


In [48]:
combined = np.concatenate((training, testing), axis=1)
array = pd.DataFrame([(combined!=0).sum(1)]).T
NDocs = combined.shape[1]
NMatrix=np.ones(np.shape(combined), dtype=float)*NDocs
IDF = np.log2(np.divide(NMatrix, np.array(array)))
TD_tfidf = combined * IDF

In [50]:
from sklearn.model_selection import train_test_split

transposed_tfidf = TD_tfidf.T

train_tfidf, test_tfidf = train_test_split(transposed_tfidf, test_size=0.20, random_state=42)


In [51]:
table1 = np.zeros((20,3), dtype=float)
for i in range(0,20):
    error_rate_euc = classification_accuracy(test_labels, test_tfidf, train_tfidf, i+1, labels, 0)
    error_rate_cos = classification_accuracy(test_labels, test_tfidf, train_tfidf, i+1, labels, 1)
    table1[i] = [i+1, error_rate_euc, error_rate_cos]
    
print("Table for TF-IDF Matrix:")
print(" K  Euclidian  Cosine")
for row in table1:
   print("%2.0f   %.2f    %.2f" % (row[0], row[1], row[2]) )

Table for TF-IDF Matrix:
 K  Euclidian  Cosine
 1   0.46    0.48
 2   0.46    0.48
 3   0.45    0.47
 4   0.44    0.51
 5   0.47    0.53
 6   0.48    0.48
 7   0.51    0.46
 8   0.51    0.48
 9   0.51    0.51
10   0.53    0.50
11   0.49    0.55
12   0.45    0.53
13   0.48    0.52
14   0.50    0.51
15   0.52    0.55
16   0.52    0.55
17   0.55    0.54
18   0.55    0.55
19   0.54    0.51
20   0.52    0.52


## The error rates when using the TF-IDF matrix are much higher than when not using it for both the euclidian and cosine distance measures. Therefore I would go with the non-TFIDF matrix for this problem as they are giving much lower error rates, also specifically I would go with the cosine similarity method as that is giving the lowest error rates.