# K-Nearest Neighbors (KNN) Classification

# 1. Importing and preparing the data

### 1.1 Import Libraries 

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


### 1.2 Load dataset

In [None]:
# Iris measurements from the UCI repository. The file is read with
# header=None, so the column names are supplied explicitly.
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]
dataset = pd.read_csv(url, names=col_names, header=None)

### 1.3 Dataset Analysis

In [None]:
# Preview the first five rows of the loaded table
dataset.head(n=5)

In [None]:
# Distinct class names found in the 'species' column
dataset['species'].unique()

In [None]:
# Number of rows per species (checks how balanced the classes are)
dataset.groupby(by=['species']).size()

### 1.4 Dividing data into features and labels

In [None]:
# Names of the four measurement columns used as model inputs
features_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Target vector (species names) and feature matrix as NumPy arrays
y = dataset['species'].values
X = dataset[features_columns].values

# Show the first five feature rows
X[:5, :]

In [None]:
# Display the label array (still the raw species strings at this point)
y

### 1.5 Label encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from species names to integer codes, then apply it.
# The fitted encoder is kept in `le` so codes can be mapped back to names.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Display the encoded labels
y

### 1.6 Data Visualization

In [None]:
# 3D scatter plot of the first three features. Points are coloured by the
# encoded species and sized by the fourth feature (petal width).
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older Matplotlib

fig = plt.figure(1, figsize=(12, 8))
# Create the axes through the figure: constructing Axes3D(fig, ...) directly
# no longer attaches the axes to the figure on recent Matplotlib releases,
# which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d', elev=48, azim=134)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=X[:, 3] * 50)

# LabelEncoder numbers the classes in sorted order, so for the Iris species
# names 0 = setosa, 1 = versicolor, 2 = virginica. Each name is drawn at the
# centroid of its own class.
for name, label in [('Setosa', 0), ('Versicolour', 1), ('Virginica', 2)]:
    ax.text3D(X[y == label, 0].mean(),
              X[y == label, 1].mean(),
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'), size=25)

ax.set_title("3D visualization", fontsize=40)
# Tick labels are hidden on all three axes; only the axis titles are shown.
# `ax.xaxis` replaces the removed `ax.w_xaxis` alias (same for y and z).
ax.set_xlabel("Sepal Length [cm]", fontsize=25)
ax.xaxis.set_ticklabels([])
ax.set_ylabel("Sepal Width [cm]", fontsize=25)
ax.yaxis.set_ticklabels([])
ax.set_zlabel("Petal Length [cm]", fontsize=25)
ax.zaxis.set_ticklabels([])

plt.show()

### 1.7 Splitting dataset into training set, validation set and test set

In [None]:
from sklearn.model_selection import train_test_split

# First split: 60% of the samples for training, 40% held out.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.4, random_state=10
)

# Second split: the held-out part is halved into validation and test sets
# (20% of the full data each).
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=10
)

In [None]:
# Report the number of samples in the full data and in each split
for caption, subset in (
    ("Total Size: ", X),
    ("Training Size: ", X_train),
    ("Validation Size: ", X_val),
    ("Test Size: ", X_test),
):
    print(caption, len(subset))

# 2. Using KNN for classification

### 2.1 Model Training and prediction

In [None]:
# Fit a KNN classifier on the training set and predict the test set
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Baseline model with k = 3 neighbours, trained on the training split only
classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted (encoded) classes for the held-out test samples
y_pred = classifier.predict(X_test)

### 2.2 Model Evaluation

In [None]:

# Share of correctly classified test samples, expressed in percent
accuracy = 100 * accuracy_score(y_test, y_pred)
rounded_accuracy = round(accuracy, 2)
print('Accuracy of our model is equal ' + str(rounded_accuracy) + ' %.')

### 2.3 Using validation set for parameter tuning:

In [None]:
# Candidate neighbour counts: K = 1 .. 9
k_list = [*range(1, 10)]
# Validation accuracy (in percent) for each K, in the same order as k_list
val_scores = []

for k in k_list:
    # Train on the training split; score on the validation split
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred_val = knn.predict(X_val)
    accuracy = 100 * accuracy_score(y_val, y_pred_val)
    val_scores += [accuracy]

In [None]:
# Plot validation accuracy (in percent) against the number of neighbours K.
# Only one figure is created: a bare plt.figure() before the sized one would
# leave an extra empty figure that plt.show() also renders.
plt.figure(figsize=(10, 5))
plt.title('The optimal number of neighbors', fontsize=20, fontweight='bold')
plt.xlabel('Number of Neighbors K', fontsize=15)
plt.ylabel('Accuracy', fontsize=15)

plt.plot(k_list, val_scores)

plt.show()

# Task 
Build and evaluate a KNN model using the new optimal K value.