# k-Nearest Neighbors

In [1]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

### 1. What does the '$k$' represent in "$k$-Nearest Neighbors"?

In [None]:
"""
k is the number of nearest training points (neighbors) whose labels are used to predict a new point: a majority vote for classification.
"""

### 2. How do the variance and bias of my model change as I adjust $k$? What would happen if I set $k$ to $n$, the size of my dataset?

In [None]:
"""
In general, as k increases, model bias increases and model variance decreases.
As k decreases, model bias decreases and model variance increases.

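If I were to set k to n (the number of training points), then with the default
uniform weighting every point would vote on every prediction, so the model would
be totally biased in favor of the most populous class in the training data and
this would become the prediction for every data point.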
"""

## $k$-Nearest Neighbors in Scikit-Learn

In this section, you will fit a classification model to the wine dataset. The data are the results of a chemical analysis of wines grown in the same region of Italy by three different cultivators. Thirteen measurements were taken of the constituents found in each of the three types of wine.

In [2]:
# Load the wine dataset that ships with scikit-learn and print its bundled
# description so the features and classes are documented in the notebook.
wine = load_wine()
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [3]:
# List the feature names; they are used as the column labels for wine.data below.
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [4]:
# Wrap the feature matrix in a DataFrame with named columns and preview the
# first rows.
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [5]:
# Peek at the first five entries of the target (the class label of each wine).
wine.target[:5]

array([0, 0, 0, 0, 0])

### 3. Perform a train-test split with `random_state=6`, scale, and then fit a $k$-Nearest Neighbors Classifier to the training data with $k$ = 7.

In [6]:

# Feature matrix and class labels.
X, y = wine.data, wine.target

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=6)

# Learn the scaling from the training data only, then apply that same scaling
# to both splits so nothing about the test set leaks into the model.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

# k-NN with k = 7. fit() returns the estimator itself, so ending the cell with
# `knn` displays the same repr as before.
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train_sc, y_train)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

### Confusion Matrix

In [7]:
# Rows are the true classes and columns are the predicted classes, so the
# off-diagonal entries count the misclassified test wines.
confusion_matrix(y_test, knn.predict(X_test_sc))

array([[15,  0,  0],
       [ 1, 16,  0],
       [ 0,  0, 13]])

### 4. How accurate is the model?  What is the precision of the model in classifying wines from *Class 0*?  What is the recall of the model in classifying wines from *Class 1*?

In [8]:

# Three equivalent ways to compute the accuracy of `knn` on the test set.
knn_pred = knn.predict(X_test_sc)
knn_cm = confusion_matrix(y_test, knn_pred)

# 1. Use the classifier's own score() method; or
print(knn.score(X_test_sc, y_test))
# 2. use accuracy_score() from sklearn.metrics (imported at the top of the
#    notebook); or
print(accuracy_score(y_test, knn_pred))
# 3. work it out by hand: correct predictions sit on the main diagonal of the
#    confusion matrix, so sum the diagonal and divide by the number of test
#    wines. Computed from the matrix itself so it cannot go stale if the split
#    or the model changes.
print(knn_cm.trace() / knn_cm.sum())

0.9777777777777777
0.9777777777777777
0.9777777777777777


In [9]:

# Two equivalent ways to compute the precision of `knn` for Class 0.
knn_pred = knn.predict(X_test_sc)
knn_cm = confusion_matrix(y_test, knn_pred)

# 1. Use precision_score() from sklearn.metrics (imported at the top of the
#    notebook). average=None returns one score per class, so [0] is Class 0; or
print(precision_score(y_test, knn_pred, average=None)[0])
# 2. work it out by hand: column 0 of the confusion matrix holds every wine
#    predicted as Class 0, so divide the true positives by the column total
#    (true positives + false positives).
print(knn_cm[0, 0] / knn_cm[:, 0].sum())

0.9375
0.9375


In [10]:
# Two equivalent ways to compute the recall of `knn` for Class 1.
knn_pred = knn.predict(X_test_sc)
knn_cm = confusion_matrix(y_test, knn_pred)

# 1. Use recall_score() from sklearn.metrics (imported at the top of the
#    notebook). average=None returns one score per class, so [1] is Class 1; or
print(recall_score(y_test, knn_pred, average=None)[1])
# 2. work it out by hand: row 1 of the confusion matrix holds every wine that
#    truly belongs to Class 1, so divide the true positives by the row total
#    (true positives + false negatives).
print(knn_cm[1, 1] / knn_cm[1, :].sum())

0.9411764705882353
0.9411764705882353


In [None]:
"""
This model has about:
- 98% accuracy
- 94% precision in classifying wines from Class 0
- 94% recall in classifying wines from Class 1
"""

### Now try a model with $k$ = 5 and a Manhattan distance metric. (You can use the same train-test split.)

In [11]:

# Second model: k = 5 with Manhattan (L1) distance, trained on the same scaled
# training split as the first model.
knn2 = KNeighborsClassifier(metric='manhattan', n_neighbors=5).fit(
    X_train_sc, y_train
)

# Confusion matrix on the test split (rows: true class, columns: predicted).
confusion_matrix(y_true=y_test, y_pred=knn2.predict(X_test_sc))

array([[15,  0,  0],
       [ 0, 16,  1],
       [ 0,  0, 13]])

### 5. How accurate is the new model? What is the precision of the model in classifying wines from *Class 0*?  What is the recall of the model in classifying wines from *Class 1*?  Which model is better? (We may or may not have enough information to make this determination)

In [12]:
# Accuracy of `knn2` on the test set, computed three equivalent ways.
knn2_pred = knn2.predict(X_test_sc)
knn2_cm = confusion_matrix(y_test, knn2_pred)

print(knn2.score(X_test_sc, y_test))
print(accuracy_score(y_test, knn2_pred))
# By hand: diagonal (correct predictions) over all test wines, read from the
# confusion matrix rather than typed in from the output above.
print(knn2_cm.trace() / knn2_cm.sum())

0.9777777777777777
0.9777777777777777
0.9777777777777777


In [13]:
# Precision of `knn2` for Class 0, via scikit-learn and by hand.
knn2_pred = knn2.predict(X_test_sc)
knn2_cm = confusion_matrix(y_test, knn2_pred)

# average=None returns one score per class, so [0] selects Class 0.
print(precision_score(y_test, knn2_pred, average=None)[0])
# By hand: true positives over everything predicted as Class 0 (column 0).
print(knn2_cm[0, 0] / knn2_cm[:, 0].sum())

1.0
1.0


In [14]:
# Recall of `knn2` for Class 1, via scikit-learn and by hand.
knn2_pred = knn2.predict(X_test_sc)
knn2_cm = confusion_matrix(y_test, knn2_pred)

# average=None returns one score per class, so [1] selects Class 1.
print(recall_score(y_test, knn2_pred, average=None)[1])
# By hand: true positives over every wine that truly is Class 1 (row 1).
print(knn2_cm[1, 1] / knn2_cm[1, :].sum())

0.9411764705882353
0.9411764705882353


In [None]:
"""
The new model has:
- 98% accuracy
- 100% precision in classifying wines from Class 0
- 94% recall in classifying wines from Class 1

In comparison to the previous model, the new model has the same accuracy
and recall in classifying wines from Class 1, and better precision in
classifying wines from Class 0.  In general this means we can assume the
second model is better, if these are the metrics that matter to us.

A stronger answer would also note that this is a very small test set (45
wines) and that each model makes exactly one mistake: the first model labels
one Class 1 wine as Class 0, and the second labels one Class 1 wine as Class 2.
A difference of a single wine is not enough evidence to prefer either model, so
either the first or the second might be better if more data becomes available.
"""