In [58]:
import numpy as np
import pandas as pd

In [59]:
# To find distance b/w two nodes, one can use Euclidean distance (other methods can also be used instead of this)
def get_euclidean_distance(p1, p2):
    # Convert to numpy arrays
    p1 = np.array(p1)
    p2 = np.array(p2)

    distance = 0
    # Iterate over each dimension of vector
    for i in range(len(p1)-1):
        distance += (p2[i] - p1[i]) ** 2
    # Take the square root
    return np.sqrt(distance)

In [60]:
def predict(k, training_set, testing_value):
    distances = []
    for i in range(len(training_set)):
        # find the distance
        # -1 to ignore target variable
        dist = get_euclidean_distance(training_set[i][:-1], testing_value)
        distances.append((training_set[i], dist))
    # Sort list of tuples by distance
    distances.sort(key=lambda x: x[1])

    # List to store K nearest neighbors
    neighbors = []
    for i in range(k):
        neighbors.append(distances[i][0])

    # Determine the class of test instance
    classes = {}
    for i in range(len(neighbors)):
        # [-1] will give the target
        response  = neighbors[i][-1]
        if response in classes:
            classes[response] += 1
        else:
            classes[response] = 1
    
    # Sort the classes by frequency in descending order
    sorted_classes = sorted(classes.items(), key=lambda x: x[1], reverse=True)
    return sorted_classes[0][0]
        

In [61]:
def train_test_split(dataset, test_size=0.25):
    n_test = int(len(dataset) * test_size)
    # Getting sample test
    test_set = dataset.sample(n_test, random_state=2)
    train_set = []
    for idx in dataset.index:
        if idx in test_set.index:
            continue
        train_set.append(dataset.iloc[idx])
    train_set = pd.DataFrame(train_set).astype(float).values.tolist()
    test_set = test_set.astype(float).values.tolist()
    
    return train_set, test_set
            
    
def get_accuracy(y_true, y_pred):
    n_correct = 0
    # Iterate over both arrays at same time
    for act, pred in zip(y_true, y_pred):
        # If they are same at same position, increment 1
        if act == pred:
            n_correct += 1
    # Accuracy is total_correct / total
    accuracy = n_correct /len(y_true)
    return accuracy

In [62]:
# We can use same dataset i.e Iris Dataset
from sklearn.datasets import load_iris
iris_dataset = load_iris()
# Get target variables
y_iris = iris_dataset.target
# Create dataframe by using columns and data mentioned in iris_dataset
iris_dataset = pd.DataFrame(iris_dataset.data, columns =iris_dataset.feature_names)
# Add target variables to the dataframe 
iris_dataset = pd.concat([iris_dataset, pd.Series(y_iris)], axis=1)
# target variable in above dataframe has header '0', rename it to something else
iris_dataset.rename(columns={0:'class'}, inplace=True)

iris_dataset

# len(train_set), len(test_set)
# iris_dataset

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [63]:
# Splitting the training and testing dataset using manually created train_test_split function
train_set, test_set = train_test_split(iris_dataset)

X_train = np.array(train_set)[:, :-1]  # :, -1 will give last element of each row
y_train = np.array(train_set)[:, -1]
X_test = np.array(test_set)[:, :-1]
y_test = np.array(test_set)[:, -1]

In [64]:
def get_test_data_accuracy(k):
    preds = []
    for row in test_set:
        # Get X variable(leaving y_label)
        # predictors_only = row[:-1]

        prediction = predict(k, train_set, row)
        preds.append(prediction)

    # Now we need to evaluate accuracy, which we can get by comparing predictions with actual result
    curr_accuracy = get_accuracy(y_test, preds)
    return curr_accuracy

In [65]:
# We don't know what value of K we should be considering 
# We have to verify it manually. All it boils down to is surrounding the code from above in a loop.
k_evaluations = []
# Looping and skipping one element everytime, as we want our K to be odd to get a mode value afterwards.
for k in range(1, 22, 2):
    curr_accuracy = get_test_data_accuracy(k)
    k_evaluations.append((k, curr_accuracy))

k_evaluations

[(1, 0.9459459459459459),
 (3, 0.9459459459459459),
 (5, 0.918918918918919),
 (7, 0.918918918918919),
 (9, 0.918918918918919),
 (11, 0.918918918918919),
 (13, 0.918918918918919),
 (15, 0.9459459459459459),
 (17, 0.9459459459459459),
 (19, 0.9459459459459459),
 (21, 0.9459459459459459)]

In [66]:
# So for this random state we can consider 3 as value of k
# Now we can also make prediction for some unseen data
random_single_test = [5,15,7,11]
k_value = 3 
y_predict = predict(k_value, train_set, random_single_test)
print("Prediction class: ", y_predict)

# Now from iris dataset set we know
classes = {0:'setosa',1:'versicolor',2:'virginica'}
# Therefore using this dict we can get the class name
print("Prediction class name: ", classes[y_predict])

Prediction class:  2.0
Prediction class name:  virginica
