In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import random
from pprint import pprint
from sklearn.datasets import load_iris

In [2]:
# Read the iris CSV, discard the 'Id' column and expose the 'species'
# column under the name 'label'. Built as a single chain so re-running the
# cell always starts from the file on disk.
df = (
    pd.read_csv('iris.csv')
    .drop(columns='Id')
    .rename(columns={'species': 'label'})
)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Feature matrix: every column except 'label'.
x = df.drop(columns='label')
# Target vector: the last column of the frame.
y = df.iloc[:, -1]

x.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Implementation    

In [4]:
def calculate_euclidean_distance(d1,d2):
    """Euclidean distance between two rows, leaving out the final position.

    Only positions ``0 .. len(d1) - 2`` are compared, so a trailing element
    (for example a class label stored at the end of the row) never enters
    the sum.

    Parameters
    ----------
    d1, d2 : indexable sequences
        Rows to compare. ``d2`` must be indexable at every compared
        position of ``d1``.

    Returns
    -------
    float
        Square root of the summed squared differences (``0.0`` when no
        position is compared).
    """
    n_compared = len(d1) - 1
    squared_gaps = ((d1[pos] - d2[pos]) ** 2 for pos in range(n_compared))
    # Start the sum at 0.0 so an empty comparison still yields a float.
    return np.sqrt(sum(squared_gaps, 0.0))

In [5]:
# Sanity check on the first two rows of the full frame. Each row still ends
# with its label, which the distance helper leaves out of the sum.
d1, d2 = df.iloc[0].values, df.iloc[1].values

calculate_euclidean_distance(d1, d2)

0.5385164807134502

In [6]:
def determine_type_of_feature(df, unique_value_threshold=15):
    """Classify every non-label column of ``df`` as categorical or continuous.

    A column is reported as ``'categorical'`` when any of its distinct values
    is a string, or when it has at most ``unique_value_threshold`` distinct
    values. Every other column is reported as ``'continuous'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. A column named ``"label"`` is skipped.
    unique_value_threshold : int, default 15
        Largest number of distinct values a non-text column may have and
        still be treated as categorical.

    Returns
    -------
    list of str
        One entry per non-label column, in column order.
    """
    feature_type = []
    for column in df.columns:
        if column == "label":
            continue

        unique_values = df[column].unique()
        # Look at every distinct value rather than only the first one: a
        # leading missing value must not hide a text column, and an empty
        # column must not raise when indexed.
        has_text = any(isinstance(value, str) for value in unique_values)

        if has_text or len(unique_values) <= unique_value_threshold:
            feature_type.append('categorical')
        else:
            feature_type.append('continuous')

    return feature_type

In [7]:
# Classify every non-label column of the iris frame as categorical or continuous.
determine_type_of_feature(df)

['continuous', 'continuous', 'continuous', 'continuous']

In [8]:
def get_nearest_neighbors(train_df, y_train,test_df, k_value = 3):
    """Return the training rows closest to a single query point.

    Parameters
    ----------
    train_df : 2-D array-like
        Training feature rows (features only, no label column).
    y_train : 1-D array-like
        Label of each training row, in the same order as ``train_df``.
    test_df : 1-D array-like
        Feature values of the query point.
    k_value : int, default 3
        Number of neighbours to return.

    Returns
    -------
    list of numpy.ndarray
        At most ``k_value`` rows ordered from nearest to farthest. Each row
        is the training features with the row's label appended as the last
        element (``np.append`` may coerce the features to the label's dtype).
    """
    rows = np.asarray(train_df)
    labels = np.asarray(y_train)
    if rows.size == 0:
        return []

    # Measure the distance over every feature column. These rows carry no
    # trailing label, so a helper that skips the final position would
    # silently drop the last feature from the comparison.
    gaps = rows.astype(float) - np.asarray(test_df, dtype=float)
    distances = np.sqrt((gaps ** 2).sum(axis=1))

    # A stable sort keeps equally distant rows in training order.
    nearest = np.argsort(distances, kind="stable")[:max(k_value, 0)]
    return [np.append(rows[i], labels[i]) for i in nearest]

In [9]:
def vote_function(train_df, y_train, test_df, k_value):
    """Predict the class of one query point by majority vote of its neighbours.

    Parameters
    ----------
    train_df, y_train, test_df, k_value
        Passed unchanged to ``get_nearest_neighbors``.

    Returns
    -------
    The label (last element of a neighbour row) that occurs most often among
    the returned neighbours. When several labels share the highest count, the
    one that appears first in the neighbour list wins.
    """
    neighbors = get_nearest_neighbors(train_df, y_train, test_df, k_value)

    # Tally in neighbour order. Dicts preserve insertion order and max()
    # returns the first maximal key, so ties are resolved the same way on
    # every run instead of depending on set iteration order.
    votes = {}
    for row in neighbors:
        label = row[-1]
        votes[label] = votes.get(label, 0) + 1

    return max(votes, key=votes.get)

In [10]:
# kNN Algorithm
def k_nearest_neighbors(train_df, y_train, test, num_neighbors):
    """Predict a class for every row of ``test``.

    Each row is handed to ``vote_function`` together with the training
    features, the training labels and ``num_neighbors``.

    Returns
    -------
    list
        One prediction per row of ``test``, in the same order.
    """
    return [
        vote_function(train_df, y_train, query_row, num_neighbors)
        for query_row in test
    ]

### Split the data into train and test sets

In [11]:
def train_test_split(df,y,test_size):
    """Randomly split features and target into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature rows.
    y : pandas.Series
        Target values, selected with the same index labels as ``df``.
    test_size : float or int
        A float is read as a fraction of ``len(df)`` and rounded to a row
        count; any other value is used directly as the number of test rows.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. Test rows are drawn with
        ``random.sample``, so seed the ``random`` module for a reproducible
        split.
    """
    n_test = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # Draw the held-out index labels without replacement.
    held_out = random.sample(df.index.tolist(), n_test)

    x_test, x_train = df.loc[held_out], df.drop(held_out)
    y_test, y_train = y.loc[held_out], y.drop(held_out)

    return x_train, x_test, y_train, y_test

In [27]:
# Seed the stdlib RNG so the sampled test rows are the same on every run.
random.seed(0)
x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(x, y, 10)

# Only the last expression of a cell is displayed, so check the feature
# shapes with assertions instead of a bare expression that shows nothing.
assert x_train_df.shape[0] == y_train_df.shape[0]
assert x_test_df.shape[0] == y_test_df.shape[0] == 10
y_train_df.shape

(140,)

In [28]:
# Predict the held-out rows from the training rows, using 3 neighbours.
# The helpers iterate over plain arrays, hence the conversion from pandas.
y_pred = k_nearest_neighbors(
    x_train_df.values,
    y_train_df.values,
    x_test_df.values,
    num_neighbors=3,
)
y_pred

['Iris-versicolor',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor']

### Accuracy

In [29]:
## Method 1
# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    """Percentage of positions where ``actual`` and ``predicted`` agree.

    Parameters
    ----------
    actual : sequence
        Ground-truth labels.
    predicted : sequence
        Predicted labels, one per entry of ``actual``.

    Returns
    -------
    float
        Accuracy in the range 0-100. Empty input yields ``0.0`` rather than
        a division by zero.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    total = len(actual)
    if total != len(predicted):
        raise ValueError("actual and predicted must have the same length")
    if total == 0:
        return 0.0

    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return correct / total * 100.0

In [30]:
## Method 2
# Calculate accuracy percentage
def accuracy_metric2(actual, predicted):
    """Vectorised accuracy: percentage of matching positions.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels with the same shape as ``actual``.

    Returns
    -------
    float
        Accuracy in the range 0-100. Empty input yields ``0.0``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert both sides so that two plain lists are compared element-wise
    # instead of collapsing to a single bool.
    actual_arr = np.asarray(actual)
    predicted_arr = np.asarray(predicted)

    if actual_arr.shape != predicted_arr.shape:
        raise ValueError("actual and predicted must have the same shape")
    if actual_arr.size == 0:
        return 0.0

    return np.mean(actual_arr == predicted_arr) * 100.0

In [31]:
# Ground truth goes first to match the (actual, predicted) signature.
accuracy_metric(y_test_df.values, y_pred)

90.0

In [32]:
# Ground truth goes first to match the (actual, predicted) signature.
accuracy_metric2(y_test_df.values, y_pred)

90.0