In [1]:
import random
from heapq import nsmallest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
class KNNClassifier:
    """
    K-Nearest Neighbors Classifier
    """

    def __init__(self, k=3, dis_metric='euclidean'):
        """
        Initialize KNNClassifier with specified parameters.
        """
        self.k = k
        self.dis_metric = dis_metric
        self.train_data = []  # Training data
        self.train_labels = []  # Training labels

    def train(self, train_data, train_labels):
        """
        Train the KNN classifier with the given training data and labels.
        """
        self.train_data = train_data
        self.train_labels = train_labels

    def euclidean_dist(self, v1, v2):
        """
        Calculate Euclidean distance between two vectors.
        """
        if len(v1) != len(v2):
            return float('inf')  # return a large distance so this pair is not considered close
        return sum((a - b) ** 2 for a, b in zip(v1, v2)) ** 0.5

    def neighbors_get(self, test_row):
        """
        Get k nearest neighbors for the given test row.
        """
        dist_metrics = {
            'euclidean': self.euclidean_dist,
        }

        if self.dis_metric not in dist_metrics:
            raise ValueError("Invalid distance metric")

        calculate_distance = dist_metrics[self.dis_metric]

        dists = [(train_row, calculate_distance(test_row, train_row), label)
                 for train_row, label in zip(self.train_data, self.train_labels)]

        return nsmallest(self.k, dists, key=lambda x: x[1])

    def predict(self, test_data):
        """
        Predict labels for the given test data.
        """
        preds = []

        for test_case in test_data:
            neighbors = self.neighbors_get(test_case)
            output = [row[-1] for row in neighbors]
            prediction = max(set(output), key=output.count)
            preds.append(prediction)
        return preds


In [3]:
class kFoldCV:
    """
    K-Fold Cross Validation
    """

    def __init__(self, classifier):
        """
        Initialize kFoldCV with the given classifier.
        """
        self.classifier = classifier

    def cross_val_split(self, dataset, no_folds):
        """
        Split dataset into k folds for cross-validation.
        """
        data_split = []
        data_copy = list(dataset)
        fold_size = len(dataset) // no_folds
        for _ in range(no_folds):
            fold = []
            while len(fold) < fold_size:
                index = random.randrange(len(data_copy))
                fold.append(data_copy.pop(index))
            data_split.append(fold)
        return data_split

    def k_fold_eval(self, dataset, no_folds, *args):
        """
        Perform k-fold cross-validation and evaluate the classifier.
        """
        folds = self.cross_val_split(dataset, no_folds)
        scores = []
        for fold in folds:
            train_set = [row for i, row in enumerate(dataset) if i not in fold]
            test_set = [list(row) for row in fold]
            train_labels = [row[-1] for row in train_set]
            train_set = [row[:-1] for row in train_set]
            self.classifier.train(train_set, train_labels)
            actual = [row[-1] for row in test_set]
            test_set = [row[:-1] for row in test_set]
            predicted = self.classifier.predict(test_set)
            accuracy = self.calculate_accuracy(actual, predicted)
            scores.append(accuracy)
        self.print_metrics(actual, predicted)
        return scores

    @staticmethod
    def calculate_accuracy(actual, predicted):
        """
        Calculate the accuracy of predictions.
        """
        if len(actual) != len(predicted):
            raise ValueError("Lengths of actual and predicted lists must be equal.")
        correct = sum(1 for a, p in zip(actual, predicted) if a == p)
        return (correct / len(actual)) * 100.0

    @staticmethod
    def print_metrics(actual, predicted):
        """
        Print evaluation metrics.
        """
        if actual is None or predicted is None:
            raise ValueError("Both 'actual' and 'predicted' must be valid lists.")
        accuracy = kFoldCV.calculate_accuracy(actual, predicted)
        print(f"Accuracy: {accuracy:.2f}%")
        return accuracy


def read_data(file_name):
    """
    Read data from the given file.
    """
    data = []
    labels = []

    with open(file_name, "r") as file:
        lines = file.readlines()
    for line in lines:
        split_line = line.strip().split(',')
        data.append(split_line[:-1])
        labels.append(split_line[-1])
    return data, labels


In [4]:
# Hayes Roth data

train_file = "/Users/hemanthsukumar/Desktop/ML/hayes+roth/hayes-roth.data" #DATA SET PATH
test_file = "/Users/hemanthsukumar/Desktop/ML/hayes+roth/hayes-roth.test" #DATA SET PATH


# Data Preprocessing
def preprocess_data(file_path):
    data, labels = read_data(file_path)
    features = [[int(item) for item in row[1:]] for row in data]
    labels = [int(label) for label in labels]
    return features, labels

train_features, train_labels = preprocess_data(train_file)
test_features, test_labels = preprocess_data(test_file)

# KNN using custom implementation
knn_custom = KNNClassifier()
knn_custom.train(train_features, train_labels)
euc_predictions = knn_custom.predict(test_features)

# Calculate accuracy
correct_predictions = sum(1 for true_label, pred_label in zip(test_labels, euc_predictions) if true_label == pred_label)
total_predictions = len(test_labels)
accuracy_custom = (correct_predictions / total_predictions) * 100

# Print accuracy
print(f"\nAccuracy of KNN model using custom implementation: {accuracy_custom:.2f}%")

# K Fold
classifier = KNNClassifier()
kfold_cv = kFoldCV(classifier)
kfold_accuracy = kfold_cv.k_fold_eval(train_features, 10, 3, 'euclidean')
print("Accuracy using K-Fold cross-validation:")
for fold, acc in enumerate(kfold_accuracy, 1):
    print(f"Fold {fold}: {acc:.2f}%")


# Sklearn
X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.3)

# Calculate min and max for each feature
min_values = [min(feature) for feature in zip(*X_train)]
max_values = [max(feature) for feature in zip(*X_train)]

# Scale the features
X_train_scaled = [[(x - min_val) / (max_val - min_val) for x, min_val, max_val in zip(xi, min_values, max_values)] for xi in X_train]
X_test_scaled = [[(x - min_val) / (max_val - min_val) for x, min_val, max_val in zip(xi, min_values, max_values)] for xi in X_test]

k = 5  # Number of neighbors
knn_sklearn = KNeighborsClassifier(n_neighbors=k)
knn_sklearn.fit(X_train_scaled, y_train)
y_pred_sklearn = knn_sklearn.predict(X_test_scaled)
accuracy_sklearn = sum(1 for true_label, pred_label in zip(y_test, y_pred_sklearn) if true_label == pred_label) / len(y_test)
print(f"\nAccuracy of KNN model using sklearn: {accuracy_sklearn * 100:.2f}%")

# Kfold using Sklearn
X = X_train_scaled + X_test_scaled
Y = y_train + y_test

# Define 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
kfold_accuracy_sklearn = cross_val_score(knn_sklearn, X, Y, cv=kfold, scoring='accuracy')

print("\nAccuracy using K-Fold cross-validation with sklearn:")
for fold, acc in enumerate(kfold_accuracy_sklearn, 1):
    print(f"Fold {fold}: {acc * 100:.2f}%")



Accuracy of KNN model using custom implementation: 50.00%
Accuracy: 76.92%
Accuracy using K-Fold cross-validation:
Fold 1: 53.85%
Fold 2: 61.54%
Fold 3: 53.85%
Fold 4: 38.46%
Fold 5: 61.54%
Fold 6: 30.77%
Fold 7: 53.85%
Fold 8: 53.85%
Fold 9: 46.15%
Fold 10: 76.92%

Accuracy of KNN model using sklearn: 30.00%

Accuracy using K-Fold cross-validation with sklearn:
Fold 1: 50.00%
Fold 2: 57.14%
Fold 3: 38.46%
Fold 4: 38.46%
Fold 5: 53.85%
Fold 6: 61.54%
Fold 7: 53.85%
Fold 8: 23.08%
Fold 9: 30.77%
Fold 10: 38.46%


In [5]:
def perform_t_test(data1, data2, alpha=0.05):
    """
    Perform a paired t-test on two sets of data.

    Parameters:
        data1 (array-like): First set of data.
        data2 (array-like): Second set of data.
        alpha (float): Significance level for the test.

    Returns:
        t_statistic (float): The calculated t-statistic.
        p_value (float): The calculated p-value.
        reject_null (bool): Whether to reject the null hypothesis.
    """
    t_statistic, p_value = stats.ttest_rel(data1, data2)
    reject_null = p_value < alpha
    return t_statistic, p_value, reject_null

# Perform K-Fold cross-validation for custom KNN classifier
classifier = KNNClassifier()
kfold_cv = kFoldCV(classifier)
kfold_accuracy = kfold_cv.k_fold_eval(train_features, 10, 3, 'euclidean')

# Perform K-Fold cross-validation for sklearn KNN classifier
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
kfold_accuracy_sklearn = cross_val_score(knn_sklearn, X, Y, cv=kfold, scoring='accuracy')

# Define nonSKL and withSKL
nonSKL = kfold_accuracy
withSKL = kfold_accuracy_sklearn

# Perform t-test
t_statistic, p_value, reject_null = perform_t_test(nonSKL, withSKL)

# Print results
print(f"T Value: {t_statistic:.2f}")
print(f"P Value: {p_value:.2f}")
if reject_null:
    print("\nReject the null hypothesis. There is a significant difference between the two sets of data.")
else:
    print("\nFailed to reject the null hypothesis. There is no significant difference between the two sets of data.")


Accuracy: 61.54%
T Value: 11.05
P Value: 0.00

Reject the null hypothesis. There is a significant difference between the two sets of data.


In [6]:
# Instantiate the KNNClassifier object
knn = KNNClassifier()

# Data Preprocessing for car data
car_file = "/Users/hemanthsukumar/Desktop/ML/car+evaluation/car.data"  # Data Set Path
train_data, train_labels = read_data(car_file)

from sklearn import preprocessing

def encode_data(train_data, train_labels):
    if train_data is not None and train_labels is not None:
        le = preprocessing.LabelEncoder()

        # Apply label encoding to features
        car_features = [list(le.fit_transform(feature)) for feature in zip(*train_data)]

        # Transpose the features back to original shape
        car_features = list(map(list, zip(*car_features)))

        # Encode the labels separately
        car_labels = le.fit_transform(train_labels)

        return car_features, car_labels


car_features, car_labels = encode_data(train_data,train_labels)

x_train, x_test, y_train, y_test = train_test_split(car_features, car_labels, test_size=0.2)
knn.train(x_train, y_train)
car_euc_pred = knn.predict(x_test)
accuracy_custom_car = kFoldCV.calculate_accuracy(y_test, car_euc_pred)
print(f"Accuracy of KNN model using custom implementation for car data: {accuracy_custom_car:.2f}%")

# K-fold for car data
kfold_cv_car = kFoldCV(knn)
kfold_accuracy_car = kfold_cv_car.k_fold_eval(car_features, 10)
print("\nAccuracy using K-Fold cross-validation for car data:")
for fold, acc in enumerate(kfold_accuracy_car, 1):
    print(f"Fold {fold}: {acc:.2f}%")

# Sklearn for car data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
k = 3  # Number of neighbors
knn_classifier_car = KNeighborsClassifier(n_neighbors=k)
knn_classifier_car.fit(x_train_scaled, y_train)
y_pred_sklearn_car = knn_classifier_car.predict(x_test_scaled)
accuracy_sklearn_car = accuracy_score(y_test, y_pred_sklearn_car)
print(f"\nAccuracy of KNN model using SKlearn for car data: {accuracy_sklearn_car * 100:.2f}%")

def stack_lists(*args):
    return [item for sublist in args for item in sublist]

def k_fold_cross_val(knn_classifier_car, x_train_scaled, x_test_scaled, y_train, y_test):
    X = stack_lists(x_train_scaled, x_test_scaled)
    Y = stack_lists(y_train, y_test)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    kfold_accuracy_sklearn_car = cross_val_score(knn_classifier_car, X, Y, cv=kfold, scoring='accuracy')
    print("\nAccuracy using K-Fold cross-validation with sklearn for car data:")
    for fold, acc in enumerate(kfold_accuracy_sklearn_car, 1):
        print(f"Fold {fold}: {acc * 100:.2f}%")
    return kfold_accuracy_sklearn_car

kfold_accuracy_sklearn_car = k_fold_cross_val(knn_classifier_car, x_train_scaled, x_test_scaled, y_train, y_test)


Accuracy of KNN model using custom implementation for car data: 84.39%
Accuracy: 37.21%

Accuracy using K-Fold cross-validation for car data:
Fold 1: 30.23%
Fold 2: 34.30%
Fold 3: 36.63%
Fold 4: 33.72%
Fold 5: 37.21%
Fold 6: 33.14%
Fold 7: 28.49%
Fold 8: 30.23%
Fold 9: 31.40%
Fold 10: 37.21%

Accuracy of KNN model using SKlearn for car data: 89.60%

Accuracy using K-Fold cross-validation with sklearn for car data:
Fold 1: 93.64%
Fold 2: 91.91%
Fold 3: 92.49%
Fold 4: 90.75%
Fold 5: 90.17%
Fold 6: 90.75%
Fold 7: 92.49%
Fold 8: 91.33%
Fold 9: 91.86%
Fold 10: 94.77%


In [7]:
sklcarfile = kfold_accuracy_sklearn_car

nonSKL_scores = kfold_cv_car.k_fold_eval(car_features, 10)  # This should return a list of accuracy scores.
withSKL = sklcarfile  # This is already an array of accuracies from previous code

# Now perform the paired t-test
t_statistic, p_value = stats.ttest_rel(nonSKL_scores, withSKL)
print(f'T Value, P Value: {t_statistic:.2f}, {p_value:.2f}')

alpha = 0.05
if p_value < alpha:
    print("\nReject the null hypothesis. There is a significant difference between the two sets of data.")
else:
    print("\nFailed to reject the null hypothesis. There is no significant difference between the two sets of data.")


Accuracy: 29.07%
T Value, P Value: 18.12, 0.00

Reject the null hypothesis. There is a significant difference between the two sets of data.


In [8]:
# Read and preprocess the cancer dataset
cancerFile = "/Users/hemanthsukumar/Desktop/ML/breast+cancer/breast-cancer.data"  # Data Set Path 
cancerData, cancerLabels = read_data(cancerFile)
from sklearn import preprocessing

def filter_data(data, labels):
    filtered_data = []
    filtered_labels = []
    for d, l in zip(data, labels):
        if '?' not in d:
            filtered_data.append(d)
            filtered_labels.append(l)
    return filtered_data, filtered_labels

def encode_data(data):
    le = preprocessing.LabelEncoder()
    return [le.fit_transform(sublist) for sublist in zip(*data)]

if cancerData is not None and cancerLabels is not None:
    filtered_data, filtered_labels = filter_data(cancerData, cancerLabels)
    encoded_data = encode_data(filtered_data)
    cancerFeatures = [list(item) for item in zip(*encoded_data)]
    cancerLabels = filtered_labels


# Split the dataset into training and testing sets
cxTrain, cxTest, cyTrain, cyTest = train_test_split(cancerFeatures, cancerLabels, test_size=0.2)

# Train and test the custom KNN classifier
knn_custom = KNNClassifier(k=3)
knn_custom.train(cxTrain, cyTrain)
predictions_custom = knn_custom.predict(cxTest)
accuracy_custom = accuracy_score(cyTest, predictions_custom) * 100
print(f"Accuracy of KNN model using custom implementation: {accuracy_custom:.2f}%")

# Perform K-Fold Cross-Validation for Custom KNN
kfold_cv_custom = kFoldCV(knn_custom)
combined_data = [list(a) + [b] for a, b in zip(cxTrain + cxTest, cyTrain + cyTest)]  # Combine for kFoldCV
kfold_accuracy_custom = kfold_cv_custom.k_fold_eval(combined_data, 10)
print("\nAccuracy using K-Fold cross-validation:")
for fold, acc in enumerate(kfold_accuracy_custom, 1):
    print(f"Fold {fold}: {acc:.2f}%")

# Train and test using SKlearn's KNN classifier
scaler = StandardScaler()
cxTrain_scaled = scaler.fit_transform(cxTrain)
cxTest_scaled = scaler.transform(cxTest)
knn_sklearn = KNeighborsClassifier(n_neighbors=3)
knn_sklearn.fit(cxTrain_scaled, cyTrain)
predictions_sklearn = knn_sklearn.predict(cxTest_scaled)
accuracy_sklearn = accuracy_score(cyTest, predictions_sklearn) * 100
print(f"\nAccuracy of KNN model using SKlearn: {accuracy_sklearn:.2f}%")

# K-Fold Cross-Validation with Sklearn's KNN

def stack_lists(*args):
    return [item for sublist in args for item in sublist]

def k_fold_cross_val(knn_classifier_BC, x_train_scaled, x_test_scaled, y_train, y_test):
    X = stack_lists(x_train_scaled, x_test_scaled)
    Y = stack_lists(y_train, y_test)
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    kfold_accuracy_sklearn_BC = cross_val_score(knn_classifier_BC, X, Y, cv=kfold, scoring='accuracy')
    print("\nAccuracy using K-Fold cross-validation with sklearn for car data:")
    for fold, acc in enumerate(kfold_accuracy_sklearn_BC, 1):
        print(f"Fold {fold}: {acc * 100:.2f}%")

k_fold_cross_val(knn_classifier_car,cxTrain, cxTest, cyTrain, cyTest)



Accuracy of KNN model using custom implementation: 75.00%
Accuracy: 88.89%

Accuracy using K-Fold cross-validation:
Fold 1: 81.48%
Fold 2: 85.19%
Fold 3: 81.48%
Fold 4: 92.59%
Fold 5: 88.89%
Fold 6: 88.89%
Fold 7: 88.89%
Fold 8: 74.07%
Fold 9: 88.89%
Fold 10: 88.89%

Accuracy of KNN model using SKlearn: 76.79%

Accuracy using K-Fold cross-validation with sklearn for car data:
Fold 1: 71.43%
Fold 2: 71.43%
Fold 3: 82.14%
Fold 4: 75.00%
Fold 5: 67.86%
Fold 6: 78.57%
Fold 7: 75.00%
Fold 8: 66.67%
Fold 9: 66.67%
Fold 10: 62.96%


In [9]:

# Import necessary library for statistical tests
from scipy import stats

sklcancerfile = kfold_accuracy_sklearn

# Assuming `kfold_cv_cancer.k_fold_eval` correctly returns a list of accuracy scores for a custom implementation
nonSKL_scores = kfold_cv_custom.k_fold_eval(cancerFeatures, 10)  # Ensure `cancer_features` is defined and contains the correct data

# Assuming `sklcancerfile` holds the accuracy scores from Sklearn's K-fold cross-validation
withSKL = sklcancerfile  # Ensure this variable is an array of accuracies

# Perform the paired t-test
t_statistic, p_value = stats.ttest_rel(nonSKL_scores, withSKL)
print(f'T Value: {t_statistic:.2f}')
print(f'P Value: {p_value:.2f}')

# Significance level
alpha = 0.05

# Decision rule based on p-value
if p_value < alpha:
    print("\nReject the null hypothesis. There is a significant difference between the two sets of data.")
else:
    print("\nFailed to reject the null hypothesis. There is no significant difference between the two sets of data.")


Accuracy: 51.85%
T Value: 18.20
P Value: 0.00

Reject the null hypothesis. There is a significant difference between the two sets of data.
