In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import math

In [14]:
def load_file(path, names):
    """Read a header-less, comma-separated file into a DataFrame.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file. Plain strings are accepted as well as
        ``pathlib.Path`` objects.
    names : list of str
        Column labels to assign, in file order.

    Returns
    -------
    pandas.DataFrame
        The parsed file with ``names`` as its columns.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not point to an existing regular file.
    """
    # Coerce up front so a plain string works too; ``is_file`` only exists
    # on Path objects.
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(str(path))

    return pd.read_csv(path, sep=",", names=names, header=None)


def load_dfs():
    """Load the three CSV files that live in ``<cwd>/data``.

    Every file is read through ``load_file`` with the same nine column
    labels, in this order: the full diabetes file, ``train.csv``, then
    ``test.csv``.

    Returns
    -------
    tuple
        ``(diabetes_data, train_data, test_data)`` as DataFrames.

    Raises
    ------
    FileNotFoundError
        Propagated from ``load_file`` for the first file that is missing.
    """
    column_names = ["PregnanciesNumber", "GlucosePlasma", "BloodPressureDiastolic", "SkinThicknessTriceps",
                    "Insulin2Hour", "BMI", "DiabetesPedigreeFunction", "Age", "OutcomeClass"]
    data_dir = Path.cwd() / "data"
    file_names = ("pima-indians-diabetes.data.csv", "train.csv", "test.csv")

    # Files are read in the order listed, which is also the return order.
    frames = [load_file(data_dir / file_name, column_names) for file_name in file_names]
    return tuple(frames)


def mean_and_std(data):
    """Return ``(data.mean(), data.std())``.

    ``data`` may be any object exposing ``mean()`` and ``std()``; the
    results are passed through untouched.
    """
    average = data.mean()
    spread = data.std()
    return average, spread


def norm_dist(data, mean, std):
    """Evaluate the normal probability density at ``data``.

    Computes ``exp(-(data - mean)**2 / (2 * std**2)) / sqrt(2 * pi * std**2)``
    element-wise. The arguments are combined with ordinary arithmetic, so
    scalars, arrays and DataFrames follow their usual broadcasting rules.

    Parameters
    ----------
    data : scalar, array or DataFrame
        Points at which the density is evaluated.
    mean, std : scalar or array
        Mean and standard deviation of the distribution.

    Returns
    -------
    Density values, in whatever container the arithmetic on ``data`` yields.
    """
    var = std ** 2
    exponent = -(data - mean) ** 2 / (2 * var)
    scale = (2 * math.pi * var) ** 0.5
    return np.exp(exponent) / scale


def class_probs(data):
    """Estimate the prior probability of each outcome class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``OutcomeClass`` column holding 0 / 1 labels.

    Returns
    -------
    tuple
        ``(p_false, p_true)``: the fraction of non-missing labels equal
        to 0 and to 1 respectively.
    """
    # Work on the frame that was passed in. Reading the notebook-level
    # ``train_data`` here would ignore the argument and fail outright when
    # that global does not exist.
    outcome = data['OutcomeClass']
    n_total = outcome.count()  # counts non-missing labels only

    p_false = (outcome == 0).sum() / n_total
    p_true = (outcome == 1).sum() / n_total

    return p_false, p_true


def mean_std_by_class(data):
    """Per-class column means and standard deviations.

    Groups ``data`` by ``OutcomeClass``, obtains the group-wise mean and
    standard deviation through ``mean_and_std``, and picks out the rows for
    labels 0 and 1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing an ``OutcomeClass`` column.

    Returns
    -------
    tuple
        ``(false_mean, false_std, true_mean, true_std)``; each element is
        the first matching row of the grouped result as a NumPy array.

    Raises
    ------
    IndexError
        If either label has no group, since there is then no first row.
    """
    grouped = data.groupby('OutcomeClass')
    mean, std = mean_and_std(grouped)

    # Both results are indexed by class label; build each selector once.
    is_false = std.index == 0.0
    is_true = std.index == 1.0

    false_mean, true_mean = mean[is_false].values[0], mean[is_true].values[0]
    false_std, true_std = std[is_false].values[0], std[is_true].values[0]
    return false_mean, false_std, true_mean, true_std
    

In [15]:
diabetes_data, train_data, test_data = load_dfs()

# Class priors and per-class Gaussian parameters come from the training frame.
p_false, p_true = class_probs(train_data)
false_mean, false_std, true_mean, true_std = mean_std_by_class(train_data)

# Score on the feature columns only; the label column is held back for checking.
test_data_no_outcome = test_data.drop(columns='OutcomeClass')

# Per-feature densities under each class ...
false_density = norm_dist(test_data_no_outcome, false_mean, false_std)
true_density = norm_dist(test_data_no_outcome, true_mean, true_std)

# ... multiplied across features and weighted by the class prior.
false_norm = false_density.prod(axis=1) * p_false
true_norm = true_density.prod(axis=1) * p_true

# The two Series are unnamed, so concat labels them 0 (false) and 1 (true).
norm = pd.concat([false_norm, true_norm], axis=1)
norm['diabetes_predicted'] = np.where(norm[1] > norm[0], 1.0, 0.0)

merged = pd.concat([norm['diabetes_predicted'], test_data['OutcomeClass']], axis=1)
merged['accurate'] = merged['diabetes_predicted'] == merged['OutcomeClass']


In [16]:
# Share of rows where the prediction equals the recorded outcome.
accuracy = merged['accurate'].mean()
accuracy

0.7480314960629921