# 1st Part: kNN algorithm from Scratch

In [None]:
import csv
from random import randrange

def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty records.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record; each entry is the list
        of string fields produced by ``csv.reader``.
    """
    # newline='' hands newline handling to the csv module, as its
    # documentation recommends, so line breaks embedded in quoted fields
    # are kept exactly as written instead of being translated.
    with open(filename, 'r', newline='') as file:
        # csv.reader yields an empty list for a blank line; drop those.
        return [row for row in csv.reader(file) if row]

def convert_str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Every distinct value found at index ``column`` is assigned a code from
    ``0`` to ``n_distinct - 1`` and each row is rewritten with its code.

    Codes are assigned in sorted order of the distinct values, so the
    mapping is the same on every run (iterating a ``set`` of strings, as
    this function used to, gives an order that can change between
    interpreter runs). Values that cannot be ordered against each other
    fall back to first-appearance order.

    Args:
        dataset: List of mutable rows (lists); modified in place.
        column: Index of the column to encode.

    Returns:
        dict mapping each original value to the integer that replaced it.
    """
    distinct = {row[column] for row in dataset}
    try:
        ordered = sorted(distinct)
    except TypeError:
        # Mixed, mutually unorderable values: stay deterministic by using
        # the order in which values first appear in the dataset.
        ordered = list(dict.fromkeys(row[column] for row in dataset))

    lookup = {value: code for code, value in enumerate(ordered)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [None]:
# Evaluating Euclidean Distance
from math import sqrt

def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two rows of numbers.

    Values are paired positionally with ``zip``, so when the rows differ in
    length the trailing values of the longer row are ignored.
    """
    squared_diffs = [(left - right) ** 2 for left, right in zip(row1, row2)]
    return sum(squared_diffs) ** 0.5

In [None]:
def predict_classification(train, test_row, num_neighbors):
    """Predict a class for ``test_row`` by majority vote of its nearest rows.

    The distance to every training row is computed with
    ``euclidean_distance`` against the training row without its last
    element; the last element of each training row is used as its class
    label. The labels of the ``num_neighbors`` closest rows are then
    counted and the most frequent one is returned.

    Args:
        train: List of training rows, class label in the last position.
        test_row: Row to classify.
        num_neighbors: Number of nearest training rows that vote. When it
            exceeds ``len(train)``, every training row votes.

    Returns:
        The most common label among the selected neighbours.

    Raises:
        ValueError: If there is nothing to vote on (``train`` is empty or
            ``num_neighbors`` is less than 1).
    """
    scored = [
        (euclidean_distance(test_row, train_row[:-1]), train_row)
        for train_row in train
    ]
    # list.sort is stable, so rows at equal distance keep their train order.
    scored.sort(key=lambda pair: pair[0])

    # Slice instead of indexing range(num_neighbors): a k larger than the
    # training set must not raise IndexError. max(..., 0) keeps a negative
    # k from being interpreted as "all but the last |k| rows".
    nearest = scored[:max(num_neighbors, 0)]
    output_values = [train_row[-1] for _, train_row in nearest]
    if not output_values:
        raise ValueError(
            "predict_classification needs at least one training row "
            "and num_neighbors >= 1"
        )
    return max(set(output_values), key=output_values.count)

In [None]:
def k_nearest_neighbors(train, test, num_neighbors):
    """Classify every row of ``test`` with kNN against ``train``.

    Returns a list holding, in the order of ``test``, the result of
    ``predict_classification(train, test_row, num_neighbors)`` for each row.
    """
    predictions = []
    for test_row in test:
        label = predict_classification(train, test_row, num_neighbors)
        predictions.append(label)
    return predictions

In [None]:
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` folds.

    Rows are drawn without replacement using ``randrange``. Every row ends
    up in exactly one fold: when ``len(dataset)`` is not a multiple of
    ``n_folds``, the first ``len(dataset) % n_folds`` folds receive one
    extra row, instead of the leftover rows being dropped.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        n_folds: Number of folds to produce.

    Returns:
        A list of ``n_folds`` lists of rows (the row objects themselves,
        not copies).
    """
    remaining = list(dataset)
    base_size, extra = divmod(len(remaining), n_folds)

    dataset_split = []
    for fold_index in range(n_folds):
        # Spread the remainder over the leading folds so no row is lost.
        target_size = base_size + (1 if fold_index < extra else 0)
        fold = [
            remaining.pop(randrange(len(remaining)))
            for _ in range(target_size)
        ]
        dataset_split.append(fold)
    return dataset_split

In [None]:
# Calculating accuracy from the confusion matrix
# Accuracy = correct predictions / all predictions = (TP + TN) / (TP + TN + FP + FN)

def accuracy_metric(actual, predicted):
    """Return the percentage of predictions that equal the actual label.

    Labels are paired positionally with ``zip``; the number of matches is
    divided by ``len(actual)`` and scaled to the range 0-100.
    """
    hits = 0
    for truth, guess in zip(actual, predicted):
        if truth == guess:
            hits += 1
    return hits / len(actual) * 100.0

In [None]:
def algorithm_evaluation(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn
    is held out as the test set while the rows of all other folds form the
    training set. Test rows are copies with their last element (the class
    label) set to ``None`` so the algorithm cannot read it.

    Args:
        dataset: List of rows with the class label in the last position.
        algorithm: Callable invoked as
            ``algorithm(train_set, test_set, *args)`` that returns one
            prediction per test row.
        n_folds: Number of cross-validation folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one ``accuracy_metric`` score per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out_index, fold in enumerate(folds):
        # Exclude the held-out fold by position. list.remove() would match
        # by value instead: it compares row contents and can drop an
        # earlier fold that happens to hold equal rows.
        other_folds = [
            other for index, other in enumerate(folds)
            if index != held_out_index
        ]
        # Flatten in one pass; sum(list_of_lists, []) re-copies the growing
        # result for every fold.
        train_set = [row for other in other_folds for row in other]

        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None  # hide the label from the algorithm
            test_set.append(row_copy)

        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# 2nd Part: Calculating the accuracies of both methods and performing a paired t-test

### This T-Test is performed between kNN_Scratch_Accuracy and Sklearn_Accuracy

In [None]:
from scipy.stats import ttest_rel

def select_dataset():
    """Prompt on the console until the user picks one of the datasets.

    Returns:
        A ``(csv_filename, display_name)`` tuple for the chosen dataset.
    """
    # Menu answer -> (file to load, name used in printed reports).
    options = {
        '1': ('hayes-roth.csv', 'Hayes-Roth'),
        '2': ('car-evaluation.csv', 'Car Evaluation'),
        '3': ('breast-cancer.csv', 'Breast-Cancer'),
    }
    menu_lines = (
        "Please select a dataset you want to use:",
        "1. Hayes-Roth",
        "2. Car-Evaluation",
        "3. Breast-Cancer",
    )

    while True:  # repeat the menu until a valid answer is given
        for line in menu_lines:
            print(line)

        choice = input("Enter a number from 1, 2 or 3: ")
        selection = options.get(choice)
        if selection is not None:
            return selection
        print("Invalid selection. Please try again & select a number from 1, 2, or 3.")

if __name__ == "__main__":
    # Script entry point: load the chosen dataset, score kNN with
    # scikit-learn and with the from-scratch implementation over the same
    # number of folds, then compare the per-fold accuracies with a paired
    # t-test.
    filename, dataset_name = select_dataset()
    if filename:
        try:
            dataset = load_csv(filename)
            print(f"Loaded dataset from {filename} with {len(dataset)} rows and {len(dataset[0])} columns.")
            print("-----------------------------------------------------------------------------------------------------------------")

            # Label-encode every column (features and class alike) in place
            # so all values become integers.
            for i in range(len(dataset[0])):
                convert_str_column_to_int(dataset, i)

            n_folds = 10
            num_neighbors = 6

            # ---------------- scikit-learn kNN ----------------
            # Last column is the class label, everything before it a feature.
            X = [row[:-1] for row in dataset]
            y = [row[-1] for row in dataset]

            # Convert features to floating values
            X = [[float(value) for value in row] for row in X]

            # NOTE(review): the scaler is fitted on the whole dataset before
            # cross-validation, and only this sklearn pipeline sees
            # standardized features; the scratch model below runs on the
            # raw encoded values. Confirm this asymmetry is intended.
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            # Define and train the kNN classifier using k-fold cross-validation.
            # n_splits follows n_folds so both models always use the same
            # number of folds (it used to be a separate hard-coded 10).
            knn = KNeighborsClassifier(num_neighbors)
            kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

            sk_learn = cross_val_score(knn, X_scaled, y, cv=kfold, scoring='accuracy')

            # Converting sk_learn values into percentage (%) values
            sk_learn_accuracy = [score * 100 for score in sk_learn]
            sk_learn_mean_accuracy = sum(sk_learn_accuracy)/float(len(sk_learn_accuracy))

            # Output of Sklearn Accuracy
            print(f"Sk-learn Accuracy List for {dataset_name} dataset:", sk_learn_accuracy)
            print(f"Sk-learn Mean Accuracy for {dataset_name} dataset:", sk_learn_mean_accuracy)
            print("-----------------------------------------------------------------------------------------------------------------")

            # ---------------- from-scratch kNN ----------------
            # NOTE(review): these folds come from cross_validation_split's
            # own random draw, not from the KFold object above, so fold i
            # here and fold i of sklearn do not hold the same rows.
            scratch_accuracy = algorithm_evaluation(dataset, k_nearest_neighbors, n_folds, num_neighbors)
            scratch_mean_accuracy = sum(scratch_accuracy)/float(len(scratch_accuracy))

            # Output of Scratch Accuracy
            print(f'Scratch Accuracy List for {dataset_name} dataset:', scratch_accuracy)
            print(f'Scratch Mean Accuracy for {dataset_name} dataset:', scratch_mean_accuracy)
            print("-----------------------------------------------------------------------------------------------------------------")

            # Paired t-test on the two per-fold accuracy lists.
            t_statistic, p_value = ttest_rel(sk_learn_accuracy, scratch_accuracy)

            # Output the results of the t-test
            print("Paired t-test between sklearn and scratch model accuracies:")
            print(f"T-statistic: {t_statistic}")
            print(f"P-value: {p_value}")

            alpha = 0.05  # significance level of 5%
            if p_value < alpha:
                print(f"Reject null hypothesis for {dataset_name} dataset.")
                print("There is a significant difference between the two models accuracies.")
            else:
                # A non-significant result does not prove the null
                # hypothesis; it only fails to reject it.
                print(f"Fail to reject null hypothesis for {dataset_name} dataset.")
                print("There is no significant difference between the two models accuracies.")
        except Exception as e:
            # Top-level boundary of the script: report the problem (missing
            # file, empty dataset, ...) instead of showing a traceback.
            print(f"An error occurred: {e}")