# Bayes Model Calculations

In [204]:
import numpy as np
import pandas as pd
import itertools
from scipy import stats

### Loading Data

In [205]:
df = pd.read_csv('train.csv')
df.describe()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,BMI,DiabetesPedigreeFunction,Age,Outcome
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,388.925081,1.625407,1.019544,0.956026,1.781759,0.363192,0.311075,0.32899
std,221.25832,1.054659,0.719984,0.434658,1.038222,0.481312,0.553183,0.470229
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,203.25,1.0,1.0,1.0,1.0,0.0,0.0,0.0
50%,394.5,2.0,1.0,1.0,2.0,0.0,0.0,0.0
75%,579.75,3.0,1.0,1.0,3.0,1.0,1.0,1.0
max,767.0,3.0,3.0,2.0,3.0,1.0,2.0,1.0


In [206]:
#Removing redundant indices column
df.drop('Unnamed: 0', axis=1, inplace=True)

### Functions for creating discrete probability distributions 

In [207]:
def possible_values_per_feature(df_, features):
    possible_values = []
    for feature in features:
        possible_values.append(df_.dropna(subset=[feature]).index.values)
    return possible_values

def value_probabilities_per_feature(df_, features, observations):
    value_probabilities = []
    for feature in features:
        value_probabilities.append(df_[feature]/observations)
    return value_probabilities

def discrete_dist_per_feature(possible_values, value_probabilities, features):
    dist = []
    for feature, x, p in itertools.izip(features, possible_values, value_probabilities):
        dist.append(stats.rv_discrete(name=feature, values=(x, p)))
    return dist

### Priors

In [208]:
total_observations = len(df)
counts_df = df.apply(pd.value_counts)

In [209]:
column_list = counts_df.columns.values.tolist()
column_list.remove('Outcome')

In [210]:
diabetes_prior = 0.66  #uniform, from research paper on Pimas

In [211]:
possible_values = possible_values_per_feature(counts_df, column_list)
value_probabilities = value_probabilities_per_feature(counts_df, column_list, total_observations)
priors = discrete_dist_per_feature(possible_values, value_probabilities, column_list)
#use xk for possible values, pk for probabilities

### Likelihoods

In [212]:
pd.options.mode.chained_assignment = None  # default='warn'

subsetting the DF to diabetes positive and diabetes negative patients:

In [213]:
diabetes_df = df.loc[df.Outcome == 1]
diabetes_df.drop('Outcome', axis=1, inplace=True)
counts_diabetes_df= diabetes_df.apply(pd.value_counts)

non_diabetes_df = df.loc[df.Outcome == 0]
non_diabetes_df.drop('Outcome', axis=1, inplace=True)
counts_non_diabetes_df= non_diabetes_df.apply(pd.value_counts)

In [214]:
diabetes_observations = len(diabetes_df)
non_diabetes_observations = total_observations - diabetes_observations

Calculating likelihoods (From the class definition)

In [215]:
diabetes_value_prob = value_probabilities_per_feature(counts_diabetes_df, column_list, diabetes_observations)
diabetes_likelihoods = discrete_dist_per_feature(possible_values, diabetes_value_prob, column_list)
non_diabetes_value_prob = value_probabilities_per_feature(counts_non_diabetes_df, column_list, non_diabetes_observations)
non_diabetes_likelihoods = discrete_dist_per_feature(possible_values, non_diabetes_value_prob, column_list)

# Testing


### Functions for testing

Returns a list of P(e|Diabetes), I assume independence between variables.

In [216]:
def cond_prob_sample(likelihoods0, likelihoods1, sample):
    sample_prob = []
    for e, l0, l1 in itertools.izip(sample, likelihoods0, likelihoods1):
        sample_prob.append([l0.pmf(e), l1.pmf(e)])
    return sample_prob

marginal = [P(e1|D=1)...P(e6|D=1)P(D=1) + P(e1|D=0)...P(e6|D=0)P(D=0)]

In [217]:
def marg_prob_sample(sample_probability, diabetes_prior):
    d0 = 1
    d1 = 1
    for l0, l1 in sample_probability:
        d0 *= l0
        d1 *= l1
    return d1*diabetes_prior + d0*(1-diabetes_prior)

posterior = P(D|e1,...,e6) = [P(e1|D=1)...P(e6|D=1)P(D=1)] / marginal

In [218]:
def posterior(sample, diabetes_prior, likelihoods0, likelihoods1):
    cond_prob = cond_prob_sample(likelihoods0, likelihoods1, sample)
    PE = marg_prob_sample(cond_prob, diabetes_prior)
    posterior = 1
    for cond_prob0, cond_prob1 in cond_prob:
        posterior *= cond_prob1
    return (posterior*diabetes_prior)/PE

### Loading test data

In [219]:
df_test = pd.read_csv('test.csv')

In [220]:
df_test.describe()
test_size = len(df_test)

In [221]:
df_test.drop('Unnamed: 0', axis=1, inplace=True)

In [222]:
test_outcomes = df_test['Outcome'].as_matrix
test_outcomes = test_outcomes > 0
test_samples = df_test.drop('Outcome', axis=1).as_matrix()

### Predicting

In [226]:
#create two functions
def prediction_check(samples, prior, likelihood0, likelihood1, threshold_):
    diabetes_probability = []
    for sample in samples:
        diabetes_probability.append(posterior(sample, prior, likelihood0, likelihood1))
    threshold = [threshold_]*test_size
    return (np.array(diabetes_probability) > threshold)

In [227]:
threshold = 0.5
diabetes_indicator = prediction_check(test_samples, diabetes_prior, diabetes_likelihoods, non_diabetes_likelihoods,threshold)
correct_predictions = sum(diabetes_indicator == test_outcomes)
accuracy = float(correct_predictions)/test_size
print accuracy

0.655844155844


# Information Gain

Information gain will be calculated using KL-Divergence:
<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/b1c44a164308ced602825bacc5122ca7d4715c78">

Where P(i) is the original prior and Q(i) is the posterior calculated for each evidence in each sample, without updates.

In [228]:
column_to_id = {}
i = 0
for column in column_list:
    column_to_id[i] = column
    i+=1
print column_to_id

{0: 'Pregnancies', 1: 'Glucose', 2: 'BloodPressure', 3: 'BMI', 4: 'DiabetesPedigreeFunction', 5: 'Age'}


In [232]:
num_of_features = len(column_list)
posterior_matrix = np.zeros((test_size,num_of_features))
for i, sample in test_size, test_samples:
    for (j, evidence) in itertools.izip(num_of_features, sample):
        posterior_matrix[i][j] = posterior(evidence, diabetes_prior, likelihoods0[j], likelihoods1[j])
        

TypeError: izip argument #1 must support iteration