# K-Nearest Neighbors (K-NN)

## Importing the libraries

In [16]:
import numpy as np
import pandas as pd

## Importing the dataset

In [17]:
dataset = pd.read_csv('data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

def conv(phrase):
    t = 0
    multiplier = 10 ** (len(phrase) - 1)
    notes = ['','S','r','R','g','G','M','m','P','d','D','n','N']
    for n in phrase:
        t += notes.index(n) * multiplier
        multiplier /= 10
    return t
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(X[list(y).index(61), :])
for v in range(len(X[:, 0])):
    X[v, :][0] = conv(X[v, :][0])

print(X[:20, :])


['SrgMD']
[[12470.0]
 [12490.0]
 [12589.0]
 [12590.0]
 [12691.0]
 [12789.0]
 [12792.0]
 [13196.0]
 [13298.0]
 [13308.0]
 [13309.0]
 [13421.0]
 [13431.0]
 [13432.0]
 [13463.0]
 [13513.0]
 [13518.0]
 [13563.0]
 [13587.0]
 [13592.0]]


## Splitting the dataset into the Training set and Test set

In [18]:
X_train, X_test = X, X
y_train, y_test = y, y

In [19]:
print(X_train)

[[12470.0]
 [12490.0]
 [12589.0]
 ...
 [131215.0]
 [131222.0]
 [131281.0]]


In [20]:
print(y_train)

[61 69 15 ... 31 31 58]


In [21]:
print(X_test)

[[12470.0]
 [12490.0]
 [12589.0]
 ...
 [131215.0]
 [131222.0]
 [131281.0]]


In [22]:
print(y_test)

[61 69 15 ... 31 31 58]


## Feature Scaling

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train.reshape(-1, 1))
X_test = sc.transform(X_test.reshape(-1, 1))

In [24]:
print(X_train)

[[-1.73265813]
 [-1.73207245]
 [-1.72917331]
 ...
 [ 1.74470228]
 [ 1.74490727]
 [ 1.74663504]]


In [25]:
print(X_test)

[[-1.73265813]
 [-1.73207245]
 [-1.72917331]
 ...
 [ 1.74470228]
 [ 1.74490727]
 [ 1.74663504]]


## Training the K-NN model on the Training set

In [26]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 9, metric = 'minkowski', p = 2)
knn.fit(X_train.reshape(-1, 1), y_train.reshape(-1, 1))

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Predicting a new result

In [32]:
def conv(phrase):
    t = 0
    multiplier = 10 ** (len(phrase) - 1)
    notes = ['','S','r','R','g','G','M','m','P','d','D','n','N']
    for n in phrase:
        t += notes.index(n) * multiplier
        multiplier /= 10
    return t
phrase = input('Please enter a phrase: ')
print(rfc.predict(sc.transform([[conv('gMP')]])))


[61]


## Predicting the Test set results

In [28]:
y_predknn = knn.predict(X_test)
print(np.concatenate((y_predknn.reshape(len(y_predknn),1), y_test.reshape(len(y_test),1)),1))

y_predrfc = rfc.predict(X_test)
print(np.concatenate((y_predrfc.reshape(len(y_predrfc),1), y_test.reshape(len(y_test),1)),1))

[[74 61]
 [74 69]
 [74 15]
 ...
 [17 31]
 [17 31]
 [17 58]]
[[61 61]
 [69 69]
 [15 15]
 ...
 [31 31]
 [31 31]
 [58 58]]


## Making the Confusion Matrix

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_predknn)
print(cm)
accuracy_score(y_test, y_predknn)



[[3 0 0 ... 0 0 0]
 [1 2 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


0.382031905961377

In [30]:
t, c = 0, 0
print(len(y_test), len(y_predknn))
for y in y_test:
    t += 1
    yp = y_predrfc[list(y_test).index(y)]
    if y == yp:
        c += 1
print(t, c, c/t)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_predrfc)
print(cm)
for r in cm:
    for c in r:
        print(c, end=' ')
    print('\n')
    
accuracy_score(y_test, y_predrfc)

1191 1191
1191 1076 0.9034424853064652
[[12  0  0 ...  0  0  0]
 [ 0  7  0 ...  0  0  0]
 [ 0  0  3 ...  0  0  0]
 ...
 [ 0  0  0 ...  3  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  7]]
12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 18 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

0.9403862300587741