## Supervised Learning
### K-Nearest Neighbor

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model, preprocessing

In [2]:
data = pd.read_csv("car.data")
print(data.head())

  buying  maint door persons lug_boot safety  class
0  vhigh  vhigh    2       2    small    low  unacc
1  vhigh  vhigh    2       2    small    med  unacc
2  vhigh  vhigh    2       2    small   high  unacc
3  vhigh  vhigh    2       2      med    low  unacc
4  vhigh  vhigh    2       2      med    med  unacc


In [5]:
# Need to convert data from non-numerical to numerical data

# Use the sklearn preprocessing module
le = preprocessing.LabelEncoder()

# attributes to quantify
buying = le.fit_transform(list(data["buying"]))
maint = le.fit_transform(list(data["maint"]))
door = le.fit_transform(list(data["door"]))
persons = le.fit_transform(list(data["persons"]))
lug_boot = le.fit_transform(list(data["lug_boot"]))
safety = le.fit_transform(list(data["safety"]))
cls = le.fit_transform(list(data["class"]))

# Print the result for buying
print(buying)

[3 3 3 ... 1 1 1]


In [6]:
# Label
predict = "class"

# Setup our attributes and label datasets
x = list(zip(buying, maint, door, persons, lug_boot, safety)) # zip() puts items into a tuple
y = list(cls)

x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.1)

In [7]:
# View the data
print(x_train, y_test)

[(3, 2, 3, 2, 1, 0), (2, 1, 1, 1, 1, 1), (0, 3, 1, 2, 0, 1), (2, 1, 3, 0, 0, 1), (0, 1, 3, 1, 2, 1), (3, 3, 2, 2, 0, 1), (3, 2, 1, 1, 0, 2), (1, 0, 1, 0, 1, 0), (0, 3, 2, 2, 2, 2), (3, 2, 2, 0, 1, 2), (2, 3, 2, 0, 1, 0), (1, 0, 1, 0, 0, 2), (3, 3, 2, 1, 1, 0), (3, 0, 1, 2, 1, 0), (2, 1, 0, 0, 2, 0), (1, 1, 1, 0, 1, 2), (1, 0, 2, 1, 2, 1), (1, 3, 1, 0, 0, 1), (0, 2, 3, 1, 1, 1), (2, 2, 1, 0, 0, 2), (2, 2, 1, 1, 2, 0), (1, 2, 2, 2, 1, 0), (2, 0, 3, 0, 1, 1), (0, 0, 2, 2, 1, 2), (3, 0, 0, 1, 0, 2), (0, 3, 2, 1, 0, 1), (2, 3, 3, 1, 1, 2), (3, 3, 3, 1, 1, 2), (2, 1, 0, 2, 1, 2), (3, 3, 0, 0, 1, 2), (3, 2, 3, 2, 0, 1), (2, 2, 3, 1, 0, 1), (0, 1, 3, 0, 0, 0), (1, 0, 1, 0, 2, 2), (2, 2, 1, 0, 2, 1), (2, 3, 2, 1, 0, 2), (2, 2, 1, 0, 2, 0), (1, 3, 3, 0, 0, 0), (1, 3, 3, 1, 2, 1), (0, 1, 2, 0, 2, 2), (1, 1, 2, 1, 0, 2), (0, 2, 2, 2, 1, 0), (3, 2, 0, 2, 0, 2), (0, 0, 3, 0, 2, 1), (3, 1, 2, 0, 0, 1), (0, 0, 3, 2, 2, 2), (3, 1, 0, 1, 1, 0), (2, 0, 2, 0, 2, 0), (1, 1, 2, 1, 1, 0), (1, 2, 1, 2, 1, 0),

## K-NN Algorithm Explanation

 <table>
  <tr>
    <td>
      <img src="KNN_graph1.png" width="200" />
    </td>
        <td>
      <img src="KNN_graph3.png" width="245" />
    </td>
    <td>
      <img src="KNN_graph2.png" width="245" />
    </td>
  </tr>
</table>

- Group items by similar characteristics
- K - the hyperparameter, the number of voting members
- K (Number of Neighbors): The 'K' hyperparameter defines the number of nearest neighbors to consider when making predictions for a new data point. For example, if k=3, the algorithm will consider the three nearest neighbors to the query point and make a prediction based on the majority class among those three neighbors (for classification tasks). The choice of 'k' can significantly impact the algorithm's performance.

Choosing the right value of 'k' is crucial because:

- A small 'K' (e.g., k=1) can lead to noise sensitivity and overfitting. The algorithm might be highly influenced by outliers or noise in the data, leading to poor generalization to new data points.
- A large 'K' (e.g., k=20) can lead to oversmoothing and underfitting. The algorithm may lose the ability to capture local patterns in the data, resulting in a more biased model.

In [10]:
# Choosing the hyperparameter
model = KNeighborsClassifier(n_neighbors=9)

In [11]:
# Fit the model
model.fit(x_train, y_train)
acc = model.score(x_test, y_test)
print(acc)

0.953757225433526


In [14]:
predicted = model.predict(x_test)

names = ["unacc", "acc", "good", "vgood"]

for x in range(len(predicted)):
    print("Predicted: ", names[predicted[x]], "Data: ", x_test[x], "Actual: ", names[y_test[x]])
    n = model.kneighbors([x_test[x]], 9, True)
    print("N: ", n)

Predicted:  good Data:  (3, 3, 3, 2, 0, 2) Actual:  good
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.41421356,
        1.41421356, 1.41421356, 1.41421356, 1.41421356]]), array([[ 641,  284,  395,  518, 1270, 1003,  503, 1408,  314]],
      dtype=int64))
Predicted:  good Data:  (2, 1, 1, 0, 2, 0) Actual:  good
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.41421356]]), array([[1350,  248, 1448,   36,   14,  719,  204,  399, 1323]],
      dtype=int64))
Predicted:  unacc Data:  (2, 0, 1, 2, 1, 0) Actual:  unacc
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.41421356]]), array([[  13, 1300, 1319,  288, 1018, 1343,  676,  692,  734]],
      dtype=int64))
Predicted:  good Data:  (0, 0, 1, 2, 1, 1) Actual:  good
N:  (array([[1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.41421356, 1.414