In [1]:
from google.colab import drive
drive.mount ('/content/gdrive')

Mounted at /content/gdrive


In [44]:
import pandas as pd
import numpy as np
import math

def calculate_mean_for_class(dataframe, class_feature, class_value):
  mean = np.array([])
  features = dataframe.keys()
  features = features.drop(class_feature)
  for key in features:
    mean = np.append(mean, [dataframe[key][df[class_feature] == class_value].sum()/len(df[key][df[class_feature] == class_value])])
  return mean

def calculate_covariance_matrix_for_class(dataframe, class_mean, class_feature, class_value):
  features = dataframe.keys()
  features = features.drop(class_feature)
  dimension = len(features)
  covariance = np.zeros(shape=(dimension, dimension))
  for i in range(0, dimension):
    for j in range(0, dimension):
      buffer = (df[features[i]][df[class_feature] == class_value] - class_mean[i])*(df[features[j]][df[class_feature] == class_value] - class_mean[j])
      covariance[i][j] = buffer.sum()/len(buffer)

  return covariance

def calculate_pdf_for_test_set(x, mean, covariance):
  deter_covariance = np.linalg.det(covariance)
  covariance_inv = np.linalg.inv(covariance)
  dimension = len(mean)
  pdf_x = (1 / ( (2*math.pi)**(dimension/2) * (deter_covariance**(1/2)))) * np.exp((-1/2)*((x.to_numpy()-mean).dot(covariance_inv)).dot((x.to_numpy()-mean).T))
  return pdf_x.diagonal()

def calculate_pdf_parzen_for_test_set(x_train, x_test):
  x_train_array = x_train.to_numpy()
  x_test_array = x_test.to_numpy()
  N = x_train_array.shape[0]
  h = math.sqrt(N) # square root of number of samples for class
  dimension = x_train_array.shape[1]
  marginal_pdfs = []
  for i in range(0, dimension):
    buffer = np.zeros(x_test_array.shape[0])
    for xi in x_train_array[:][i]:
      buffer = buffer + (1 / (math.sqrt(2*math.pi)*h)) * np.exp((-1 / (2*(h**2)))*(x_test_array[:,i] - xi)**2)
    buffer = buffer / N
    marginal_pdfs.append(buffer)

  pdf_parzen = np.ones(marginal_pdfs[0].shape)
  for marg_pdf in marginal_pdfs:
    pdf_parzen = pdf_parzen * marg_pdf
  return pdf_parzen

def classify_and_calculate_epoch_accuracy(prop_class0, prop_class1, test_set, epoch):
  classification = []
  for i in range(0, len(prop_class0)):
    if prop_class0[i] > prop_class1[i]:
      classification.append(0)
    else:
      classification.append(1)
  correct_classifications = 0
  for i in range(0, len(classification)):
    if classification[i] == test_set['Y'][epoch*len(classification)+i]:
      correct_classifications += 1
  accurracy_percentage = correct_classifications * 100 / len(classification)
  return accurracy_percentage

def calculate_accuracy_for_validation_epoch(train_set, test_set, prop_non_diabetes, prop_diabetes, epoch):
  mean_class_non_diabetes = calculate_mean_for_class(train_set, 'Y', 0)
  covariance_class_non_diabetes = calculate_covariance_matrix_for_class(train_set, mean_class_non_diabetes, 'Y', 0)
  mean_class_diabetes = calculate_mean_for_class(train_set, 'Y', 1)
  covariance_class_diabetes = calculate_covariance_matrix_for_class(train_set, mean_class_diabetes, 'Y', 1)
  pdfs_test_set_class_non_diabetes = calculate_pdf_for_test_set(test_set.drop('Y', axis=1), mean_class_non_diabetes, covariance_class_non_diabetes)
  prop_class_non_diabetes_test_set = pdfs_test_set_class_non_diabetes*prop_non_diabetes
  pdfs_test_set_class_diabetes = calculate_pdf_for_test_set(test_set.drop('Y', axis=1), mean_class_diabetes, covariance_class_diabetes)
  prop_class_diabetes_test_set = pdfs_test_set_class_diabetes*prop_diabetes
  return classify_and_calculate_epoch_accuracy(prop_class_non_diabetes_test_set, prop_class_diabetes_test_set, test_set, epoch)

def calculate_accuracy_parzen_for_validation_epoch(train_set, test_set, prop_non_diabetes, prop_diabetes, epoch):
  pdfs_parzen_test_set_class_non_diabetes = calculate_pdf_parzen_for_test_set(train_set[train_set['Y'] == 0].drop('Y', axis=1), test_set.drop('Y', axis=1))
  prop_class_non_diabetes_test_set = pdfs_parzen_test_set_class_non_diabetes*prop_non_diabetes
  pdfs_parzen_test_set_class_diabetes = calculate_pdf_parzen_for_test_set(train_set[train_set['Y'] == 1].drop('Y', axis=1), test_set.drop('Y', axis=1))
  prop_class_diabetes_test_set = pdfs_parzen_test_set_class_diabetes*prop_diabetes
  return classify_and_calculate_epoch_accuracy(prop_class_non_diabetes_test_set, prop_class_diabetes_test_set, test_set, epoch)


df = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/pima-indians-diabetes.data',delimiter=',', encoding='ISO-8859–1', names=['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'Y'])
k = math.ceil(df.index.shape[0] / 8) # 8-Fold cross validation
class_non_diabetes_prop = df[df['Y'] == 0].shape[0] / df.index.shape[0]
class_diabete_prop = df[df['Y'] == 1].shape[0] / df.index.shape[0]

##### Pdfs are gaussian, with non-diagonal covariance matrices
accurracy_percentages = []
for i in range(0, 8):
  accurracy_percentages.append(calculate_accuracy_for_validation_epoch(df.drop(df.index[i*k:(i+1)*k]), df[i*k:(i+1)*k], class_non_diabetes_prop, class_diabete_prop, i))

print(accurracy_percentages)
average_accuracy = sum(accurracy_percentages) / len(accurracy_percentages)
print(round(average_accuracy, 2))

#### Components of the feature vectors are mutually statistically independent (the usual naïve Bayes approach). Marginal pdfs 1-d Parzen windows, gaussian kernels
accurracy_percentages_parzen = []
for i in range(0, 8):
  accurracy_percentages_parzen.append(calculate_accuracy_parzen_for_validation_epoch(df.drop(df.index[i*k:(i+1)*k]), df[i*k:(i+1)*k], class_non_diabetes_prop, class_diabete_prop, i))

print(accurracy_percentages_parzen)
average_accuracy_parzen = sum(accurracy_percentages_parzen) / len(accurracy_percentages_parzen)
print(round(average_accuracy_parzen, 2))

[72.91666666666667, 75.0, 72.91666666666667, 72.91666666666667, 76.04166666666667, 78.125, 76.04166666666667, 73.95833333333333]
74.74
[36.458333333333336, 43.75, 55.208333333333336, 52.083333333333336, 51.041666666666664, 47.916666666666664, 53.125, 48.958333333333336]
48.57
