In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Handling missing data with replacing it with mode
def replace_missing_val(census_data):
    census_data['workclass'].replace(np.NaN, census_data['workclass'].mode()[0] , inplace=True )
    census_data['occupation'].replace(np.NaN, census_data['occupation'].mode()[0] , inplace=True )
    census_data['native-country'].replace(np.NaN, census_data['native-country'].mode()[0] , inplace=True )
    return census_data

In [3]:
# helper function for calculating probability using PDF
def apply_pdf(x, mean, std_dev):
#     return (math.erf((x-mean)/(std_dev * math.sqrt(2))))/2    
    expo = math.exp(-(math.pow(x-mean,2)/(2*math.pow(std_dev,2))))
    return (1 / (math.sqrt(2*math.pi) * std_dev)) * expo

In [4]:
# assume atributes follows Gaussian distribution
def NB_Gaussian_fit(training_set):
    # dictioanry to store weights
    weightdict = {}   
    continues_attributes = ['age','fnlwgt','education-num','capital-loss','capital-gain','hours-per-week']
    # separate dataframe according to class labels
    yes_df = training_set.loc[training_set['class'] == ' >50K']
    no_df = training_set.loc[training_set['class'] == ' <=50K']    
        
    training_set = training_set.drop(labels= 'class', axis=1)
    
    for label, content in training_set.iteritems(): 
        if label in continues_attributes:
            weightdict[label] = {}
            weightdict[label]['yes'] = {}
            weightdict[label]['no'] = {}
            weightdict[label]['yes']['std_dev'] = yes_df[label].std()
            weightdict[label]['yes']['mean'] = yes_df[label].mean()
            weightdict[label]['no']['std_dev'] = no_df[label].std()
            weightdict[label]['no']['mean'] = no_df[label].mean()
            
        else:
            if label not in weightdict.keys():
                weightdict[label]={}
                for val in content: # add all unique value of the attribute to dictionary key
                    if val not in weightdict[label].keys():
                        weightdict[label][val] = {} 
                        #store only count of records in both class for feature
                        weightdict[label][val]['yes'] = sum(y for y in yes_df[label] == val )
                        weightdict[label][val]['no'] = sum(n for n in no_df[label] == val)          
         
       
    weightdict['total_yes_label'] = len(yes_df)
    weightdict['total_no_label'] = len(no_df)  
#     print(weightdict)
    return weightdict

In [5]:
# helper function to predict labels

def NB_Gaussian_predict(test_set, weight_dict):
#     binned_test_set = binningDF(test_set)
    continues_attributes = ['age','fnlwgt','education-num','capital-loss','capital-gain','hours-per-week']
    predicted_label = []
    for index, content in test_set.iterrows():
        yes_prob = []
        no_prob = []
        total_yes_targets = weight_dict['total_yes_label']
        total_no_targets = weight_dict['total_no_label']
        for label in content.keys():
            if label in continues_attributes:
                yes_prob.append(apply_pdf(content[label],weight_dict[label]['yes']['mean'], weight_dict[label]['yes']['std_dev']))
                no_prob.append(apply_pdf(content[label],weight_dict[label]['no']['mean'], weight_dict[label]['no']['std_dev']))
            else:
                
                if content[label] in weight_dict[label].keys():
                    if(weight_dict[label][content[label]]['yes'] and weight_dict[label][content[label]]['no']):                        
                        yes_prob.append(weight_dict[label][content[label]]['yes'] /total_yes_targets)
                        no_prob.append(weight_dict[label][content[label]]['no'] /total_no_targets)
                    else:
                        #Laplacian correction for feature present in only one class
                        yes_prob.append((weight_dict[label][content[label]]['yes'] +1) /(total_yes_targets + len(weight_dict[label])))
                        no_prob.append((weight_dict[label][content[label]]['no']+1) /(total_no_targets + len(weight_dict[label])))              
                else:
                    #apply laplacian correction for feature not existing in training set
                    yes_prob.append(1/total_yes_targets + len(weight_dict[label]))
                    no_prob.append(1/total_no_targets + len(weight_dict[label]))
                   
        predicted_yes = np.prod(np.array(yes_prob)) * (total_yes_targets/(total_yes_targets+total_no_targets))
        predicted_no = np.prod(np.array(no_prob)) * (total_no_targets/(total_yes_targets+total_no_targets))
        
        if predicted_yes >= predicted_no:
            predicted_label.append(' >50K')
        else:
            predicted_label.append(' <=50K')
            
    return predicted_label       

In [6]:
def k_fold_cv_Gaussian(data, k):
    accuracy_list = []
    F1_score = []
    MCC_list = [] 
    split_ratio = k/100
    sub_data = []    
    # divide data into k sub-dataframes    
    for i in range(0, k):
        sub_data.append(data.sample(frac= 0.1) )      
                
    for i in range(0,k):
        test_split = sub_data[i]
        train_split = pd.DataFrame()
        for j in range(0, k):
            if (j != i):   
                train_split = pd.concat([train_split,sub_data[j]])
        
        
        test_data_label = test_split['class']
        test_split = test_split.drop(labels= 'class', axis=1)
        # train the model to calculate the weights
        weightdict = NB_Gaussian_fit(train_split)
        # predict the labels for test data
        predicted_val = NB_Gaussian_predict(test_split,weightdict)
        true_pos = len([i for i, j in zip(test_data_label,predicted_val) if i == j and j == ' >50K'])
        true_neg = len([i for i, j in zip(test_data_label,predicted_val) if i == j and j == ' <=50K'])
        false_pos = len([i for i, j in zip(test_data_label,predicted_val) if i != j and j == ' >50K'])
        false_neg = len([i for i, j in zip(test_data_label,predicted_val) if i != j and j == ' <=50K'])
        precision = true_pos/(true_pos+false_pos)
        recall = true_pos/(true_pos+false_neg)
                
        accuracy = len([i for i, j in zip(test_data_label,predicted_val) if i == j])/len(test_data_label)
        F1_score.append((2*precision*recall)/(precision+recall)) 
        #calculate matthews correlation coefficient 
        Mcc_denominator = math.sqrt((true_pos+false_pos) * (true_pos+false_neg)*(true_neg+false_neg)*(false_pos+true_neg))
        MCC = ((true_pos*true_neg) - (false_pos*false_neg))/Mcc_denominator
        MCC_list.append(MCC)
        
        accuracy_list.append(accuracy)
        
    return (accuracy_list,F1_score, MCC_list)      
        

In [7]:
def Naive_Bayesian_model_Gauusian(data, k, remove_missing_value):   
    
    if remove_missing_value:
        data.dropna(inplace=True)
        print("Removed missing values from dataset")
    else:
        data = replace_missing_val(data)
        print("Replaced missing values from dataset with mode of the attributes")
    
    accuracy_list, F1_score, MCC_list = k_fold_cv_Gaussian(data, k)
    print("avg accuracy of the model :", (sum(accuracy_list)/len(accuracy_list)))
    print("F1-measure is :", sum(F1_score)/len(F1_score))
    print("Matthews corelation coefficient :", sum(MCC_list)/len(MCC_list))

In [8]:
## load the data using pandas
header = ['age', 'workclass','fnlwgt', 'education','education-num','marital-status','occupation','relationship','race','sex',
          'capital-gain','capital-loss','hours-per-week','native-country','class']         
census_data = pd.read_table('adult.data',sep=',', header=None)
census_data.columns = header

In [9]:
print("shape of dataset ", census_data.shape)

shape of dataset  (32561, 15)


In [10]:
#get the number of missing values in attributes
census_data.replace(' ?', np.NaN, inplace=True)
print("number of records of missing values in attributes")    
census_data.isnull().sum()

number of records of missing values in attributes


age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
class                0
dtype: int64

In [11]:
# parameters for model are data_set, value of K for K fold cv and remove_missing_value is true or false 

# Model with removed missing values records
Naive_Bayesian_model_Gauusian(census_data, 10, True)

Removed missing values from dataset
avg accuracy of the model : 0.8233090185676393
F1-measure is : 0.5896543113110007
Matthews corelation coefficient : 0.4941266852477363


In [12]:
#Model with replaced missing values
Naive_Bayesian_model_Gauusian(census_data, 10, False)

Replaced missing values from dataset with mode of the attributes
avg accuracy of the model : 0.8272877984084881
F1-measure is : 0.592932800656593
Matthews corelation coefficient : 0.5002828207652984
