In [None]:
from scipy import stats
import numpy as np
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
from matplotlib import pyplot as plt
from scipy.io.arff import loadarff

# Load the ARFF file; loadarff returns a pair whose first element holds the
# records, which is the only part used below.
raw_data = loadarff('breast.w.arff')
df_data = pd.DataFrame(raw_data[0])  # converting data to a pandas DataFrame
df_data = df_data.dropna()  # all rows with Na values are dropped
# Encode the byte-string class labels as integers: malignant -> 1, benign -> 0.
# The column is reassigned instead of being edited through a chained
# `df_data['Class'].replace(..., inplace=True)`: that form mutates a temporary
# Series and, under pandas Copy-on-Write, leaves df_data untouched.
# `.astype(int)` guarantees a numeric label column (not an object column) and
# fails loudly if a label other than the two mapped ones is present.
df_data['Class'] = df_data['Class'].map({b'malignant': 1, b'benign': 0}).astype(int)
df_data

### Class-conditional distributions per variable using a 3x3 plot grid

In [None]:
# ---------------------------- Drawing the plots -------------------------------------
# One panel per feature column, with the histograms of the two classes overlaid.
fig = plt.figure(figsize=(15, 10))

for idx, variable in enumerate(df_data.drop(columns='Class')):
    # A 3x3 plot grid is created and subplots are placed
    # NOTE(review): the fixed 3x3 grid only has room for nine feature columns.
    sub = fig.add_subplot(3, 3, idx+1)
    sub.set_xlabel(variable)
    # Both classes share one set of bin edges computed from the whole column.
    # Letting each hist() call pick its own edges from its own subset makes the
    # bars of the two classes cover different intervals, so their heights
    # cannot be compared within a panel.
    bin_edges = np.histogram_bin_edges(df_data[variable], bins=10)
    # data is separated by class and plots are overlaid
    sub.hist(df_data[variable].loc[df_data['Class'] == 0],
             bins=bin_edges, color="c")
    sub.hist(df_data[variable].loc[df_data['Class'] == 1],
             bins=bin_edges, color="firebrick", alpha=0.5)

# Class 0 (drawn first) is benign, class 1 is malignant.
fig.legend(labels=["Benign", "Malignant"], loc="upper right", fontsize=15)
plt.savefig('plots.png')

### KNN cross-validation: finding the best K value

In [None]:
# Label vector (the 'Class' column) and feature matrix (every other column).
target = df_data['Class'].values
data = df_data.drop(columns=['Class']).values

# Score the 3-, 5- and 7-nearest-neighbour classifiers with a shuffled 10-fold CV.
for k in (3, 5, 7):
    # The splitter is rebuilt with the fixed seed 10 for every k, so each
    # candidate is evaluated on the same folds.
    kf = KFold(n_splits=10, shuffle=True, random_state=10)
    knn = KNeighborsClassifier(n_neighbors=k, weights="uniform", p=2)
    accuracies = []
    for fit_rows, eval_rows in kf.split(data):
        knn.fit(data[fit_rows], target[fit_rows])  # train on nine folds
        fold_accuracy = knn.score(data[eval_rows], target[eval_rows])  # test on the tenth
        accuracies.append(fold_accuracy)
    mean_accuracy = np.mean(accuracies)  # average accuracy over the ten folds
    print("Accuracy with K = " + str(k), str(mean_accuracy))


### Hypothesis Test

In [None]:
# classifiers
# NOTE(review): n_neighbors=3 is hard-coded; presumably the best K found by the
# cross-validation cell above -- confirm against its output.
knn = KNeighborsClassifier(n_neighbors=3, weights="uniform", p=2)
naive_bayes = MultinomialNB()
knn_acc, bayes_acc = [], []  # accuracies for each set and for each classifier

# The splitter is created here instead of reusing a `kf` left over from an
# earlier cell's loop, so this cell does not depend on hidden kernel state.
# Same settings as the K-selection cell: shuffled 10-fold CV with seed 10.
kf = KFold(n_splits=10, shuffle=True, random_state=10)

# Both classifiers are trained and scored on identical folds, which is what
# makes the two accuracy lists paired samples.
for train_subset, test_subset in kf.split(data):
    X_train, X_test = data[train_subset], data[test_subset]
    Y_train, Y_test = target[train_subset], target[test_subset]
    knn.fit(X_train, Y_train)  # train
    naive_bayes.fit(X_train, Y_train)
    knn_acc.append(knn.score(X_test, Y_test))  # test and store accuracy
    bayes_acc.append(naive_bayes.score(X_test, Y_test))

# Paired t-test on the per-fold accuracies. The alternative hypothesis is
# one-sided ("kNN is superior"), so the test must be one-sided too: with the
# default two-sided test a significant result in favour of Naive Bayes would
# also give p <= alpha and wrongly report kNN as superior.
alpha = 0.05  # significance level
t_value, pvalue = stats.ttest_rel(knn_acc, bayes_acc, alternative='greater')
if pvalue <= alpha:
    print('The alternative hypothesis : "𝑘NN is statistically superior to Naïve Bayes" is confirmed')
else:
    # A non-significant result does not prove the null hypothesis; it only
    # means the data give no grounds to reject it.
    print('The null hypothesis : "𝑘NN is statistically equal to Naïve Bayes" is not rejected')