## QUESTION 2 : IMPLEMENT A NAIVE BAYES CLASSIFIER

### <font color = "blue"> Import Required Modules</font>

In [6]:
import numpy as np
import math
from tabulate import tabulate
import pprint
import operator
import ipdb
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import pandas as pd
eps = np.finfo(float).eps
from numpy import log2 as log

### <font color = "blue"> Utility to load and preprocess the dataset</font>

In [7]:
def load_preprocess_data(path):
    """Read the loan CSV at ``path`` and return it as a DataFrame.

    The file is read with a fixed list of 14 column names. The ``ID``
    column is discarded, ``Loan_accept`` is moved to the last position,
    and the first row of the file is dropped.
    """
    csv_columns = ['ID', 'Age', 'Experience', 'Income', 'ZIP', 'Family_size',
                   'Spending', 'Education', 'Mortgage', 'Loan_accept',
                   'Securities_account', 'CD', 'Internet_banking', 'Credit_card']
    target = 'Loan_accept'

    frame = pd.read_csv(path, names=csv_columns)

    # Keep every column except the identifier and the target, in file
    # order, then put the target at the end.
    features = [name for name in csv_columns if name not in ('ID', target)]
    frame = frame[features + [target]]

    # Discard the first row that was read from the file.
    return frame.drop(frame.index[0])

### <font color = "blue">Utility to split the data into training and testing sets using an 80:20 split</font>

In [8]:
def split_data(dataset):
    """Split ``dataset`` into a training part and a validation part (80:20).

    The split is positional and deterministic: the first ``int(0.8 * n)``
    rows form the training set and the remaining rows the validation set.
    No shuffling is performed.

    Returns the pair ``(train_data, val_data)``.
    """
    cut = int(.8 * len(dataset))

    # Slice pandas objects positionally. np.split on a DataFrame is routed
    # through DataFrame.swapaxes, which recent pandas versions deprecate.
    if hasattr(dataset, "iloc"):
        return dataset.iloc[:cut], dataset.iloc[cut:]

    # Non-pandas input (e.g. a NumPy array) keeps the original behaviour.
    train_data, val_data = np.split(dataset, [cut])
    return train_data, val_data

### <font color = "blue"> Utility to segregate attributes into categorical and numerical sets</font>

In [9]:
def get_attribute_list():
    """Return the feature column names grouped by kind.

    Returns a pair ``(categorical, numerical)`` of lists of column names.
    """
    numerical = [
        'Age',
        'Experience',
        'Income',
        'ZIP',
        'Family_size',
        'Spending',
        'Mortgage',
    ]
    categorical = [
        'Education',
        'Securities_account',
        'CD',
        'Internet_banking',
        'Credit_card',
    ]
    return categorical, numerical

In [10]:
def count(data, attribute, label, target):
    """Count the rows where ``attribute`` equals ``label`` and
    ``Loan_accept`` equals ``target``."""
    has_label = data[attribute] == label
    has_target = data['Loan_accept'] == target
    return data.loc[has_label & has_target].shape[0]


### <font color = "blue"> This function calculates the mean and standard deviation of a feature for each output label</font>

In [11]:
def numeric_probability(col, data):
    """Return the per-class mean and standard deviation of column ``col``.

    The rows of ``data`` are grouped by ``Loan_accept`` (0 and 1) and the
    result is the tuple ``(mean_0, std_0, mean_1, std_1)``.
    """
    stats = []
    for target in (0, 1):
        values = data.loc[data['Loan_accept'] == target, col]
        stats.append(values.mean())
        stats.append(values.std())
    return tuple(stats)

### <font color = "blue">This function creates a dictionary that stores the probabilities of the different combinations of feature values and output labels</font>

In [12]:
def get_probabilities(train_data):
    """Estimate the quantities the Naive Bayes classifier needs.

    Parameters:
        train_data: DataFrame containing ``Loan_accept`` and every column
            named by ``get_attribute_list``.

    Returns a 4-tuple:
        prob_zero, prob_one: fraction of training rows with
            ``Loan_accept`` equal to 0 and to 1 respectively.
        cat_probabilities: nested dict where
            ``cat_probabilities[target][attribute][label]`` is the fraction
            of rows of class ``target`` whose ``attribute`` equals
            ``label``. A label that never occurs in a class gets 0 (no
            smoothing is applied).
        numeric_prob_matrix: DataFrame with one row per numerical
            attribute, in the order returned by ``get_attribute_list``, and
            columns ``Mean_of_0``, ``StdDeviation_of_0``, ``Mean_of_1``,
            ``StdDeviation_of_1``.

    Raises:
        ValueError: if ``train_data`` has no rows of class 0 or of class 1.
    """
    categorical_attributes, numerical_attributes = get_attribute_list()

    zeroes = count(train_data, 'Loan_accept', 0, 0)
    ones = count(train_data, 'Loan_accept', 1, 1)
    if zeroes == 0 or ones == 0:
        # The per-class likelihoods below divide by these counts.
        raise ValueError(
            "train_data must contain at least one row of each Loan_accept class"
        )
    prob_zero = zeroes / len(train_data)
    prob_one = ones / len(train_data)

    # Build all rows first and create the frame once. DataFrame.append was
    # removed in pandas 2.0. numeric_probability returns its values in the
    # same order as these columns.
    columns = ['Mean_of_0', 'StdDeviation_of_0', 'Mean_of_1', 'StdDeviation_of_1']
    rows = [numeric_probability(col, train_data) for col in numerical_attributes]
    numeric_prob_matrix = pd.DataFrame(rows, columns=columns)

    cat_probabilities = {0.0: {}, 1.0: {}}
    for col in categorical_attributes:
        given_zero = cat_probabilities[0.0][col] = {}
        given_one = cat_probabilities[1.0][col] = {}

        for label in train_data[col].unique():
            given_zero[label] = count(train_data, col, label, 0.0) / zeroes
            given_one[label] = count(train_data, col, label, 1.0) / ones

    return prob_zero, prob_one, cat_probabilities, numeric_prob_matrix

### <font color = "blue">Returns the Gaussian probability density of a value for a given mean and standard deviation</font>

In [13]:
def Gaussian_probability(val, mean, stdev):
    """Evaluate at ``val`` the normal density with the given ``mean`` and
    standard deviation ``stdev``."""
    squared_distance = math.pow(val - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    kernel = math.exp(-(squared_distance / twice_variance))
    normaliser = 1 / (math.sqrt(2 * math.pi) * stdev)
    return normaliser * kernel

### <font color = "blue"> Function to predict the class with the maximum probability</font>

In [14]:
def _safe_log(probability):
    """Return ``log(probability)``, or ``-inf`` when the probability is 0."""
    # math.log(0) raises ValueError; a zero likelihood should rule the
    # class out rather than abort the whole prediction.
    if probability == 0:
        return -math.inf
    return math.log(probability)


def predict(train_data, val_data):
    """Classify every row of ``val_data`` with a Naive Bayes model.

    The model parameters are estimated from ``train_data`` through
    ``get_probabilities``. For each row the log prior and the log
    likelihood of every attribute are summed per class, and the class with
    the larger total is chosen. Ties go to class 0.

    Parameters:
        train_data: DataFrame used to estimate the probabilities.
        val_data: DataFrame to classify. Its last column is treated as the
            label and ignored; every other column is used as an attribute.

    Returns a list with one prediction (``0.0`` or ``1.0``) per row of
    ``val_data``, in row order.
    """
    categorical_attributes, numerical_attributes = get_attribute_list()
    prob_zero, prob_one, cat_probability, numeric_matrix = get_probabilities(train_data)

    # get_probabilities stores one row per numerical attribute, in the
    # order of numerical_attributes. Index the statistics by name once, so
    # the lookup does not depend on the column order of val_data.
    gaussian_params = {}
    for position, attribute in enumerate(numerical_attributes):
        stats = numeric_matrix.iloc[position]
        gaussian_params[attribute] = (
            stats['Mean_of_0'],
            stats['StdDeviation_of_0'],
            stats['Mean_of_1'],
            stats['StdDeviation_of_1'],
        )

    attr_list = list(val_data.columns.values)[:-1]
    predicted = []

    for _, row in val_data.iterrows():
        res_0 = 0.0
        res_1 = 0.0

        for attribute in attr_list:
            value = row[attribute]
            if attribute in categorical_attributes:
                if value not in cat_probability[0.0][attribute]:
                    # A label never seen in training carries no evidence
                    # for either class, so skip it.
                    continue
                prob_0 = cat_probability[0.0][attribute][value]
                prob_1 = cat_probability[1.0][attribute][value]
            else:
                mean_0, std_0, mean_1, std_1 = gaussian_params[attribute]
                prob_0 = Gaussian_probability(value, mean_0, std_0)
                prob_1 = Gaussian_probability(value, mean_1, std_1)

            res_0 += _safe_log(prob_0)
            res_1 += _safe_log(prob_1)

        res_0 += _safe_log(prob_zero)
        res_1 += _safe_log(prob_one)

        predicted.append(0.0 if res_0 >= res_1 else 1.0)

    return predicted

            
            

### <font color = "blue"> Function to compute the accuracy of the classifier</font>

In [15]:
def accuracy(train_data, val_data):
    """Return the classifier's accuracy on ``val_data`` as a percentage.

    Predictions come from ``predict(train_data, val_data)`` and are
    compared with the ``Loan_accept`` column of ``val_data``.

    Raises:
        ValueError: if there are no rows to evaluate.
    """
    predicted = predict(train_data, val_data)
    if not predicted:
        # Accuracy is undefined for an empty set. Fail with a clear message
        # instead of a bare ZeroDivisionError.
        raise ValueError("val_data is empty; accuracy is undefined")

    actual = val_data['Loan_accept']
    correct = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return (correct / len(predicted)) * 100

In [21]:
def main(path="../Input/LoanDataset/data.csv"):
    """Load the dataset at ``path``, split it 80:20, and print the accuracy
    of the Naive Bayes classifier on the validation part.

    ``path`` defaults to the location used by the notebook, so calling
    ``main()`` with no argument behaves as before.
    """
    dataset = load_preprocess_data(path)
    train_data, val_data = split_data(dataset)
    print("Model Accuracy = ",accuracy(train_data, val_data))

In [22]:
# Run the pipeline: load the data, split it, and print the validation accuracy.
main()

Model Accuracy =  91.55555555555556


## Observations:
### <font color = "blue">1. All the columns except the mortgage value follow a Gaussian distribution.</font>
### <font color = "blue">2. Initially I thought that the feature "ZIP" would not play a major role in prediction. However, using it when calculating the probabilities increased the accuracy of the model from 89% to 91.55%. This suggests that the customer's area also affects the chances of availing a loan.</font>