In [15]:
import numpy as np
import pandas as pd
from collections import Counter
from math import sqrt
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Load the dataset
file_path = "/content/drive/MyDrive/breast-cancer.data"
names = ['Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']
dataset = pd.read_csv(file_path, names=names)

# Drop Missing Values
dataset.dropna(inplace=True)

# Convert categorical features to numerical
for column in dataset.columns:
    if dataset[column].dtype == 'object':
        dataset[column] = dataset[column].astype('category').cat.codes

In [18]:
# Split the dataset
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Normalize the data
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

In [19]:
# Define the custom kNN classifier
class KNN:
  def __init__(self, k=10, distance='euclidean', weighted=False):
        self.k = k
        self.distance = distance
        self.weighted = weighted

  def fit(self, X, y):
        self.X_train = X
        self.y_train = y

  def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return y_pred

  def _predict(self, x):
        # Compute distances
        distances = [self._distance(x_train, x) for x_train in self.X_train]
        k_indices = np.argsort(distances)[:self.k]  # Get the indices of k-nearest samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        if self.weighted:
            # Weighted Majority vote
            weights = [1 / d if d != 0 else 1 for d in np.sort(distances)[:self.k]]
            weighted_vote = Counter()
            for label, weight in zip(k_nearest_labels, weights):
                weighted_vote[label] += weight
            return weighted_vote.most_common(1)[0][0]

        # Majority vote
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

  def _distance(self, x1, x2):
        if self.distance == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        return sqrt(np.sum((x1 - x2) ** 2))

In [20]:
# Define k-fold cross-val
def custom_kfold_crossval(X, y, k=10):
    folds = np.array_split(X, k)
    fold_labels = np.array_split(y, k)
    accuracy_custom = []

    for k in range(1, 11):
        fold_accu = []
        for i in range(10):  # Assuming 10-fold CV
            X_train = np.concatenate([fold for j, fold in enumerate(folds) if j != i])
            y_train = np.concatenate([fold for j, fold in enumerate(fold_labels) if j != i])
            X_test = folds[i]
            y_test = fold_labels[i]

            classifier = KNN(k=k, distance='euclidean', weighted=True)
            classifier.fit(X_train, y_train)
            preds = classifier.predict(X_test)
            accuracy = sum(preds == y_test) / len(y_test) * 100
            fold_accu.append(accuracy)

        accuracy_custom.append(fold_accu)

    return accuracy_custom

In [21]:
# Define k-fold cross-validation function for scikit-learn kNN
def sklearn_kfold_crossval(X, y, k=10):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    accuracy_sklearn = []

    for k in range(1, 11):
        fold_accuracy_custom = []
        fold_accuracy_sklearn = []

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Scikit-learn kNN
            sklearn_classifier = KNeighborsClassifier(n_neighbors=k, weights='distance', metric='euclidean')
            sklearn_classifier.fit(X_train, y_train)
            sklearn_preds = sklearn_classifier.predict(X_test)
            sklearn_accuracy = (sklearn_preds == y_test).mean() * 100
            fold_accuracy_sklearn.append(sklearn_accuracy)

        accuracy_sklearn.append(fold_accuracy_sklearn)

    return accuracy_sklearn

In [22]:
# Perform k-fold cross-validation for both custom and scikit-learn kNN
custom_accuracies, sklearn_accuracies = custom_kfold_crossval(X, y, k=10), sklearn_kfold_crossval(X, y, k=10)

In [23]:
# Calculate mean accuracies for both custom and scikit-learn kNN
custom_mean_accu = np.mean(custom_accuracies, axis=1)
sklearn_mean_accu = np.mean(sklearn_accuracies, axis=1)

In [24]:
# Print mean accuracies
print("Custom KNN mean", custom_mean_accu)
print("Scikit-learn kNN Mean Accuracies:", sklearn_mean_accu)

Custom KNN mean [67.99261084 67.98029557 73.18965517 74.24876847 73.14039409 74.24876847
 74.22413793 73.1773399  73.87931034 73.87931034]
Scikit-learn kNN Mean Accuracies: [73.00492611 73.00492611 74.39655172 76.1453202  76.1453202  75.44334975
 76.51477833 75.80049261 76.50246305 75.80049261]


In [25]:
print("For k=10, Custom Mean Accuracy:", custom_mean_accu[9])
print("For k=10, Scikit-learn Mean Accuracy", sklearn_mean_accu[9])
t_stat, p_value = stats.ttest_rel(custom_accuracies[9], sklearn_accuracies[9])
print(f"For k={10}: t-statistic={t_stat:.2f}, p-value={p_value:.4f}")

For k=10, Custom Mean Accuracy: 73.87931034482759
For k=10, Scikit-learn Mean Accuracy 75.80049261083744
For k=10: t-statistic=-0.31, p-value=0.7633


In [26]:
significance_value = 0.05
if p_value < significance_value:
    print("For k-value=10, Statistically Significant")
else:
    print("For k-value=10, Not Statistically Significant")

For k-value=10, Not Statistically Significant
