### Fitting Naive Bayes to the Training set

posterior = prior * likelihood / evidence, i.e. $P(y \mid x) = \frac{P(y)\,P(x \mid y)}{P(x)}$
The "gaussian" and "naive" come from two assumptions present in this likelihood:
1. We assume the features are independent of each other given the class. This is obviously not true, and is the "naive" assumption.
2. We assume that the values of each feature (e.g. meanfreq of women) are normally distributed (Gaussian distribution).

In [156]:
from sklearn.naive_bayes import GaussianNB
import pandas as pd 
import numpy as np  
import random
import math
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split  
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

In [157]:
# Restore variables previously persisted with IPython's `%store` magic.
# NOTE(review): nothing in this notebook creates them, so they must have been
# stored by another notebook/session first — confirm which one.

# Full feature set.
%store -r train_x
%store -r test_x
%store -r train_y
%store -r test_y

# Two-feature variant of the same split (presumably a feature-selected subset — verify).
%store -r train_x_two_features
%store -r test_x_two_features
%store -r train_y_two_features
%store -r test_y_two_features

# PCA-transformed features (judging by the name — verify against the producer).
%store -r pca_train_x
%store -r pca_test_x

In [3]:
def run_naive_bayes(train_x, test_x, train_y, test_y):
    """Fit scikit-learn's GaussianNB and print training and held-out accuracy.

    Parameters
    ----------
    train_x, train_y : features and labels used to fit the classifier.
    test_x, test_y   : held-out features and labels used only for scoring.

    Prints two lines and returns None:
    * the mean 5-fold cross-validated accuracy on the training data, and
    * the accuracy on the test data of the model fitted on the training data.
    """
    clf = GaussianNB()

    # cross_val_score fits its own clones of the estimator on each fold, so
    # this estimate is independent of the fit() call below.
    scores = cross_val_score(clf, train_x, train_y, cv=5)
    print("In-sample (5-fold CV) accuracy for Gaussian Naive Bayes: %.10f" % scores.mean())

    # Out-of-sample accuracy must come from a model that has never seen the
    # test data: fit on the training split, then score on the test split.
    clf.fit(train_x, train_y)
    print("Out-of-sample accuracy for Gaussian Naive Bayes: %.10f" % clf.score(test_x, test_y))

In [128]:
# START: OWN CODE
def get_training_set(train_x,train_y):
    """Combine a feature matrix and encoded labels into one labelled DataFrame.

    Parameters
    ----------
    train_x : 2-D array-like of shape (n_samples, n_features), numeric.
    train_y : array-like of n_samples labels encoded as 0 (female) / 1 (male).

    Returns
    -------
    pandas.DataFrame with one float column per feature followed by a string
    ``label`` column holding "female" / "male".  When there are exactly 20
    features the voice-dataset column names are used; otherwise the columns
    are named ``feature_0 .. feature_{n-1}``.

    Raises
    ------
    ValueError if ``train_x`` is not 2-D, the lengths differ, or a label is
    not 0 or 1.
    """
    # female -> 0
    # male -> 1
    label_names = {0: "female", 1: "male"}
    voice_columns = ["meanfreq","sd","median","Q25","Q75","IQR","skew","kurt","sp.ent","sfm","mode","centroid","meanfun","minfun","maxfun","meandom","mindom","maxdom","dfrange","modindx"]

    features = np.asarray(train_x, dtype=float)
    if features.ndim != 2:
        raise ValueError("train_x must be 2-dimensional, got %d dimension(s)" % features.ndim)
    labels = np.asarray(train_y).ravel()
    if len(labels) != len(features):
        raise ValueError("train_x has %d rows but train_y has %d labels" % (len(features), len(labels)))

    n_features = features.shape[1]
    if n_features == len(voice_columns):
        columns = voice_columns
    else:
        # Reduced feature sets (e.g. two features, PCA components) get generic names.
        columns = ["feature_%d" % i for i in range(n_features)]

    try:
        decoded = [label_names[label] for label in labels]
    except KeyError as exc:
        raise ValueError("unexpected label %r; expected 0 (female) or 1 (male)" % (exc.args[0],))

    # Build the frame from the numeric array directly so the feature columns
    # stay float; stacking them with the string labels in a single array would
    # turn every value into text.
    train_df = pd.DataFrame(features, columns=columns)
    train_df["label"] = decoded
    return train_df

In [159]:
# For each class (0 = female, 1 = male) and each of the first n_cls features,
# we save the mean and the variance of that feature within the class.
def getSummary(df,n_cls):
    """Summarise every feature per class as ``[mean, variance]``.

    Parameters
    ----------
    df    : DataFrame with feature columns plus a ``label`` column containing
            "female" / "male".
    n_cls : number of leading feature columns to summarise (despite the name,
            this is a feature count, not a class count).

    Returns
    -------
    dict mapping class index (0 = female, 1 = male) to a ``defaultdict(list)``
    that maps feature position ``j`` to ``[mean, variance]``.  A class that
    does not occur in ``df`` is simply absent from the result.

    Side effect
    -----------
    The feature columns of ``df`` are converted to numeric in place; values
    that cannot be parsed become NaN.

    Raises
    ------
    ValueError if ``n_cls`` exceeds the number of feature columns or ``df``
    contains a label other than "female" / "male".
    """
    cols = df.columns.drop('label')
    if n_cls > len(cols):
        raise ValueError("n_cls=%d but df only has %d feature columns" % (n_cls, len(cols)))

    # Kept in place on purpose: the caller's frame is numeric afterwards.
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

    grouped = df.groupby('label')[list(cols)]
    data_means = grouped.mean()
    data_variance = grouped.var()

    # Fixed name -> index mapping, so the class index never depends on which
    # labels happen to be present in this particular frame.
    classnames = ['female', 'male']
    unknown = sorted(set(data_means.index) - set(classnames))
    if unknown:
        raise ValueError("unexpected label(s) in df: %r" % (unknown,))

    summary = {}
    for cls, name in enumerate(classnames):
        if name not in data_means.index:
            continue
        summary[cls] = defaultdict(list)
        for j in range(n_cls):
            feature = cols[j]
            summary[cls][j] += [data_means.loc[name, feature],
                                data_variance.loc[name, feature]]
    return summary

In [137]:
# Probability density of a single term of the likelihood, p(x | y).
# Each feature is summarised per class by its mean and variance; the density of
# a new x value is then given by the Gaussian Probability Density Function (PDF).

def p_x_given_y(x, mean_y, variance_y):
    """Gaussian density of ``x`` under N(mean_y, variance_y).

    Parameters
    ----------
    x          : value(s) to evaluate; scalar or NumPy array.
    mean_y     : mean of the feature within the class.
    variance_y : variance (not standard deviation) of the feature within the
                 class; expected to be positive.

    Returns
    -------
    The density ``exp(-(x - mean)^2 / (2 var)) / sqrt(2 pi var)``.
    """
    # Float literals keep every division a true division, so integer
    # arguments cannot trigger floor division on Python 2.
    p = 1/(np.sqrt(2.0*np.pi*variance_y)) * np.exp((-(x-mean_y)**2)/(2.0*variance_y))
    return p

In [138]:
def calculateClassProbabilities(summaries, inputVector, P_male, P_female):
    """Compute the unnormalised posterior score of each class for one sample.

    Parameters
    ----------
    summaries   : dict mapping class value to a mapping
                  ``feature index -> [mean, variance]``.
    inputVector : feature values of the sample, indexable by feature index.
    P_male      : prior probability used for every class other than 0.
    P_female    : prior probability used for class 0.

    Returns
    -------
    dict mapping class value to ``prior * prod_i p(x_i | class)``.  The scores
    are not divided by the evidence, so they do not sum to 1.
    """
    # p(x|y) (e.g. p(meanfreq | female))
    probabilities = {}
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for classValue, classSummaries in summaries.items():
        # Start from the class prior: class 0 is female, anything else male.
        if (classValue == 0):
            probabilities[classValue] = P_female
        else:
            probabilities[classValue] = P_male
        # Naive assumption: multiply the per-feature densities together.
        for i in range(len(classSummaries)):
            mean, var = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= p_x_given_y(x, mean, var)
    return probabilities

In [166]:
# Display the per-class summary; `summary` is assigned in the getSummary cell
# further down, so that cell must have been run first.
summary

{0: defaultdict(list,
             {0: [0.31465303397540956, 1.0209976576640936],
              1: [-0.45515747947775154, 1.223148575289184],
              2: [0.26173163529602117, 0.79880542215518768],
              3: [0.48467831718375221, 1.0567444461852398],
              4: [-0.070203081853782212, 0.97313430843572135],
              5: [-0.59093896270977997, 1.0202825585418609],
              6: [-0.031265555388518634, 0.55342392913697802],
              7: [-0.084121023078157803, 0.55264445852470023],
              8: [-0.47129996721092715, 1.1023382930981496],
              9: [-0.33729341871361473, 1.0407789589360446],
              10: [0.15868574287748005, 0.77238136969743543],
              11: [0.31465303397540956, 1.0209976576640936],
              12: [0.81105329101189871, 0.33001097451970712],
              13: [0.11817629747682706, 1.2956267577102256],
              14: [0.16636824260461208, 0.52322910409631351],
              15: [0.18979820305273506, 1.200110969598738

In [139]:
# Make a Prediction
import operator
def predict(summaries, inputVector, P_male, P_female):
    """Return the class value with the highest score for one sample.

    Parameters are passed straight through to ``calculateClassProbabilities``;
    the result is the key of the largest entry in the dict it returns.  If
    several classes tie, the first one in the dict's iteration order wins.
    """
    probabilities = calculateClassProbabilities(summaries, inputVector, P_male, P_female)
    # Only the arg-max is needed, so there is no reason to sort every entry.
    # This also avoids dict.iteritems, which does not exist on Python 3.
    return max(probabilities, key=probabilities.get)

In [140]:
from sklearn.metrics import classification_report
def run_my_gaussian_naive_bayes(train_x, test_x, train_y, test_y):
    """Train the hand-written Gaussian Naive Bayes and report test accuracy.

    Parameters
    ----------
    train_x, train_y : training features and 0/1 labels (0 = female, 1 = male).
    test_x, test_y   : held-out features and labels used for evaluation.

    Prints the fraction of correct predictions followed by scikit-learn's
    classification report, and returns None.

    Raises
    ------
    ValueError if the test set is empty or ``test_x`` and ``test_y`` differ in
    length.
    """
    df = get_training_set(train_x,train_y)
    # Number of males
    n_male = df['label'][df['label'] == 'male'].count()
    # Number of females
    n_female = df['label'][df['label'] == 'female'].count()
    # Total rows
    total_ppl = df['label'].count()
    # Class priors: class count divided by the total rows
    P_male = n_male/float(total_ppl)
    P_female = n_female/float(total_ppl)

    summary = getSummary(df, np.shape(train_x)[1])

    # Work on plain arrays so rows are addressed by position even when the
    # caller passes pandas objects whose index is not 0..n-1.
    test_rows = np.asarray(test_x)
    expected = np.asarray(test_y).ravel()
    if len(test_rows) != len(expected):
        raise ValueError("test_x has %d rows but test_y has %d labels" % (len(test_rows), len(expected)))
    if len(expected) == 0:
        raise ValueError("test set is empty; cannot compute an accuracy")

    results = [predict(summary, row, P_male, P_female) for row in test_rows]

    # Classification accuracy: correct predictions out of all predictions made.
    correct = sum(1 for predicted, actual in zip(results, expected) if predicted == actual)
    print('Correct rate is %s'  % (float(correct)/len(expected)))
    print(classification_report(expected,results))
# END: OWN CODE

In [142]:
# Test our code: at x == mean the exponential term is 1, so the result is
# 1 / sqrt(2 * pi * 4).
p_x_given_y(2, 2, 4)

0.19947114020071635

In [158]:
# Build the labelled training DataFrame (features + "label" column) and show it.
df = get_training_set(train_x,train_y)
df

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,-0.515170194553,0.0394788123909,-0.369132101157,0.402749826313,-1.39685481535,-1.22735871536,0.652712392911,0.0976512809743,-0.0973278686989,0.461946270319,...,-0.515170194553,0.229896856555,-0.271713383102,-1.84840116088,-1.02284535791,-0.0598951967248,-0.434770537024,-0.433801197862,-1.08465412057,female
1,-0.0437398263866,1.00748276956,0.175079081464,-0.193794805783,0.95658693657,0.747053200608,-0.437209835606,-0.246352646291,1.06507021258,1.2817891651,...,-0.0437398263866,-1.3146065981,-1.08422378918,-3.33074033134,-0.978579598989,-0.709815979575,-1.24421846003,-1.23191155367,1.11300045887,male
2,-2.02408638999,1.29572018693,-1.9244151262,-1.98316723759,-1.29507142329,1.5475307024,-0.457844532829,-0.243903539115,1.54599730943,1.84402299208,...,-2.02408638999,1.10336441901,-1.0647814118,0.109150566955,-0.941292013065,-0.462227109918,-1.17409369919,-1.1661558086,1.52016403145,female
3,0.890341097287,-1.25012575151,0.568471649076,0.872840964516,0.321705540047,-0.817687814472,-0.337324322877,-0.233921907187,-0.692708079943,-1.19513387451,...,0.890341097287,1.09851235709,0.729048457513,0.673235240199,0.111373769604,-0.462227109918,0.598747910865,0.607057450165,-0.607083466006,female
4,0.526588058671,0.230213229013,0.796577210092,-0.112217561594,1.09953942742,0.732728134965,-0.474900766727,-0.251094913149,0.404998180101,-0.111756820307,...,0.526588058671,-0.294291017649,0.52471304122,0.565369518983,1.15830132791,-0.462227109918,0.289760683402,0.298005448329,-0.130655163224,male
5,1.03797616307,-0.556727988687,0.907740884938,0.934533389771,0.974251343434,-0.529029013488,-0.320015221298,-0.227772048903,0.403250448442,-0.587326093844,...,1.03797616307,-0.806157184808,-0.998550344422,0.565369518983,-0.125853942556,-0.709815979575,0.346737051586,0.359377477062,-0.273000613207,male
6,-0.516467891466,0.130667809757,-0.34364790359,-0.662671183031,-0.646415562944,0.399565613411,-0.434160989947,-0.24100721069,0.926187044046,0.576048736736,...,-0.516467891466,-1.11638410248,-1.04530988544,-0.433633006741,-0.24708162981,-0.709815979575,0.414670413653,0.427325080303,-0.0142483545094,male
7,0.673616470732,-0.885695142378,0.571105136339,1.16173183116,-0.189978324672,-1.42837059515,0.0999835190842,-0.142626891321,-1.12008673814,-0.713512606239,...,0.673616470732,1.43011450232,-0.899556288231,-0.304297858321,-0.555186979565,-0.756238892635,-0.386833688791,-0.373525098213,0.110959682432,female
8,-0.145807596095,0.341402535924,-0.30532387982,-0.259323711939,0.424633677634,0.529103462406,-0.352848106136,-0.226412451791,0.809742722127,0.567617535235,...,-0.145807596095,-0.184615728418,0.61120158559,0.673235240199,0.0898347706405,-0.462227109918,0.00707024125456,0.0152557445219,-0.291289671061,male
9,-0.693700559049,0.137360676442,-1.27401098192,-0.621542166271,-0.227365731567,0.58321309931,-0.123958895658,-0.189268491639,0.293497222586,-0.0562416276854,...,-0.693700559049,-0.918297761636,-0.317130967197,0.138164491536,-0.761428858141,0.559076977418,-1.19765123603,-1.20780111381,3.57662069908,male


In [161]:
# Mean of every feature column within each label group.
df.groupby('label').mean()

Unnamed: 0_level_0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
female,0.314653,-0.455157,0.261732,0.484678,-0.070203,-0.590939,-0.031266,-0.084121,-0.4713,-0.337293,0.158686,0.314653,0.811053,0.118176,0.166368,0.189798,0.183518,0.189015,0.185805,-0.02168
male,-0.330399,0.477935,-0.27483,-0.508933,0.073716,0.620511,0.03283,0.088331,0.494885,0.354173,-0.166627,-0.330399,-0.851641,-0.12409,-0.174694,-0.199296,-0.192702,-0.198474,-0.195104,0.022765


In [162]:
# Variance of every feature column within each label group.
df.groupby('label').var()

Unnamed: 0_level_0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
female,1.020998,1.223149,0.798805,1.056744,0.973134,1.020283,0.553424,0.552644,1.102338,1.040779,0.772381,1.020998,0.330011,1.295627,0.523229,1.200111,1.299962,1.20263,1.20476,0.835003
male,0.766369,0.321059,1.065409,0.436022,1.01932,0.228061,1.468564,1.456243,0.415699,0.713796,1.186494,0.766369,0.288032,0.661205,1.442745,0.713981,0.614181,0.711975,0.712331,1.173976


In [160]:
# Per-class summary: class index -> feature index -> [mean, variance],
# computed over all train_x.shape[1] feature columns.
summary = getSummary(df,train_x.shape[1])
summary

{0: defaultdict(list,
             {0: [0.31465303397540956, 1.0209976576640936],
              1: [-0.45515747947775154, 1.223148575289184],
              2: [0.26173163529602117, 0.79880542215518768],
              3: [0.48467831718375221, 1.0567444461852398],
              4: [-0.070203081853782212, 0.97313430843572135],
              5: [-0.59093896270977997, 1.0202825585418609],
              6: [-0.031265555388518634, 0.55342392913697802],
              7: [-0.084121023078157803, 0.55264445852470023],
              8: [-0.47129996721092715, 1.1023382930981496],
              9: [-0.33729341871361473, 1.0407789589360446],
              10: [0.15868574287748005, 0.77238136969743543],
              11: [0.31465303397540956, 1.0209976576640936],
              12: [0.81105329101189871, 0.33001097451970712],
              13: [0.11817629747682706, 1.2956267577102256],
              14: [0.16636824260461208, 0.52322910409631351],
              15: [0.18979820305273506, 1.200110969598738

In [154]:
# Unnormalised class scores for test sample 10, using equal priors of 0.5.
calculateClassProbabilities(summary, test_x[10], 0.5, 0.5)

{0: 3.1929906288506781e-12, 1: 2.0838061933531316e-26}

In [155]:
# Pick the class with the greater score for the same test sample.
predict(summary, test_x[10], 0.5, 0.5)

0

In [163]:
# Train the hand-written classifier on the training split and report its
# accuracy and classification report on the test split.
run_my_gaussian_naive_bayes(train_x, test_x, train_y, test_y)

Correct rate is 0.900252525253
             precision    recall  f1-score   support

          0       0.89      0.90      0.89       367
          1       0.91      0.90      0.91       425

avg / total       0.90      0.90      0.90       792

