In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Handling missing data with replacing it with mode
def replace_missing_val(census_data):
    census_data['workclass'].replace(np.NaN, census_data['workclass'].mode()[0] , inplace=True )
    census_data['occupation'].replace(np.NaN, census_data['occupation'].mode()[0] , inplace=True )
    census_data['native-country'].replace(np.NaN, census_data['native-country'].mode()[0] , inplace=True )
    return census_data

In [3]:
# bin width for continuous attributes
bin_width_dict = {}
bin_width_dict['age'] = 5
bin_width_dict['fnlwgt'] = 23000
bin_width_dict['education-num'] = 2
bin_width_dict['capital-loss'] = 300
bin_width_dict['capital-gain'] = 6000
bin_width_dict['hours-per-week'] = 10



In [4]:
# helper functions to create bins
def binning(column ,binningwidth, min_range, max_range):
    if min_range <= column <= max_range: 
        return column - (column%binningwidth)
    elif (column < min_range):
        return (min_range -(min_range%binningwidth) - binningwidth)
    elif (colum > max_range):
        return max_range + (binningwidth+(max_range%binningwidth))

    
def binningDF(dataframe):
    dataframe.loc[:,'age'] = dataframe.age.apply(binning, args=(bin_width_dict['age'], dataframe['age'].min(), dataframe['age'].max(),))
    dataframe.loc[:,'fnlwgt'] = dataframe.fnlwgt.apply(binning, args=(bin_width_dict['fnlwgt'], dataframe['fnlwgt'].min(), dataframe['fnlwgt'].max(),))
    dataframe.loc[:,'education-num'] = dataframe['education-num'].apply(binning, args=(bin_width_dict['education-num'], dataframe['education-num'].min(), dataframe['education-num'].max(),))
    dataframe.loc[:,'capital-loss'] = dataframe['capital-loss'].apply(binning, args=(bin_width_dict['capital-loss'], dataframe['capital-loss'].min(), dataframe['capital-loss'].max(),))
    dataframe.loc[:,'capital-gain'] = dataframe['capital-gain'].apply(binning, args=(bin_width_dict['capital-gain'], dataframe['capital-gain'].min(), dataframe['capital-gain'].max(), ))
    dataframe.loc[:,'hours-per-week'] = dataframe['hours-per-week'].apply(binning, args=(bin_width_dict['hours-per-week'], dataframe['hours-per-week'].min(), dataframe['hours-per-week'].max(),)) 
    return dataframe    

In [5]:
# using equi-width binning 
def NaiveBayesian_fit(training_set): 
    #apply binning    
    binned_df = binningDF(training_set)  
    #dictinary to save weights
    weightdict ={}     
    # income >50K --> yes
    # income <=50K --> no
    yes_df = binned_df.loc[binned_df['class'] == ' >50K']
    no_df = binned_df.loc[binned_df['class'] == ' <=50K']
    
    totalYeslables = len(yes_df) # count of records in >50K class
    totalNolables = len(no_df) # count of records in <= 50K class
    binned_df = binned_df.drop(labels= 'class', axis=1)
    # iterate over each column to calculate feature weights
    for label, content in binned_df.iteritems(): 
        if label not in weightdict.keys():
            weightdict[label]={}
            for val in content: # add all unique value of the attribute to dictionary key
                if val not in weightdict[label].keys():
                    weightdict[label][val] = {} 
                    #store only count of records in both class for feature
                    weightdict[label][val]['yes'] = sum(y for y in yes_df[label] == val )
                    weightdict[label][val]['no'] = sum(n for n in no_df[label] == val)
        
    weightdict['total_yes_lable'] = totalYeslables
    weightdict['total_no_lable'] = totalNolables
    return weightdict
    

In [6]:
# Helper function to predict using equi-width binning
def NaiveBayesian_predict(test_dataset, weightdict):
    predicted_lable = []
    test_dataset = binningDF(test_dataset)
    
    total_yes_targets = weightdict['total_yes_lable']
    total_no_targets = weightdict['total_no_lable']
    # iterate rows of the data frame to calculate probabily of incoming records
    for index, content in test_dataset.iterrows():
        yes_prob = []
        no_prob = []
        for lable in content.keys():
            if content[lable] in weightdict[lable].keys():
                #calculate the probability using weights stored in dictionary
                if (weightdict[lable][content[lable]]['yes'] and weightdict[lable][content[lable]]['no']):
                    yes_prob.append(weightdict[lable][content[lable]]['yes'] /total_yes_targets)
                    no_prob.append(weightdict[lable][content[lable]]['no'] /total_no_targets)
                else:
                    #apply laplacian correction for feature present in only one of the class
                    yes_prob.append((weightdict[lable][content[lable]]['yes']+1) /(total_yes_targets+len(weightdict[lable])))
                    no_prob.append((weightdict[lable][content[lable]]['no'] +1)/(total_no_targets+len(weightdict[lable])))
                
            else:
                #apply laplacian correction for feature not present in training set
                yes_prob.append(1/total_yes_targets + len(weightdict[lable]))
                no_prob.append(1/total_no_targets + len(weightdict[lable]))
                
        predicted_yes = np.prod(np.array(yes_prob)) * (total_yes_targets/(total_yes_targets+total_no_targets))
        predicted_no = np.prod(np.array(no_prob)) * (total_no_targets/(total_yes_targets+total_no_targets))
        
        
        if predicted_yes >= predicted_no:
            predicted_lable.append(' >50K')
        else:
            predicted_lable.append(' <=50K')
                
    return predicted_lable           
        

In [7]:
# helper function to perform k-fold cross validation
def k_fold_cv(data, k):
    accuracy_list = []
    F1_score = []
    MCC_list = []   
   
    split_ratio = k/100
    sub_data = []    
    # divide data into k sub-dataframes    
    for i in range(0, k):
        sub_data.append(data.sample(frac= split_ratio))      
                
    for i in range(0,k): # perform k - fold
        test_split = sub_data[i] # assign test data in each k-iterations
        train_split = pd.DataFrame() 
        for j in range(0, k): # sub-dataframes other than test data are used to train the model
            if (j != i):   
                train_split = pd.concat([train_split,sub_data[j]])        
        
        test_data_label = test_split['class']
        test_split = test_split.drop(labels= 'class', axis=1)
        # train the model to calculate the weights
        weightdict = NaiveBayesian_fit(train_split)
        # predict the labels for test data
        predicted_val = NaiveBayesian_predict(test_split,weightdict)
        
        true_pos = len([i for i, j in zip(test_data_label,predicted_val) if i == j and j == ' >50K'])
        true_neg = len([i for i, j in zip(test_data_label,predicted_val) if i == j and j == ' <=50K'])
        false_pos = len([i for i, j in zip(test_data_label,predicted_val) if i != j and j == ' >50K'])
        false_neg = len([i for i, j in zip(test_data_label,predicted_val) if i != j and j == ' <=50K'])
        precision = true_pos/(true_pos+false_pos)
        recall = true_pos/(true_pos+false_neg)
                
        accuracy = len([i for i, j in zip(test_data_label,predicted_val) if i == j])/len(test_data_label)
        F1_score.append((2*precision*recall)/(precision+recall)) 
        #calculate matthews correlation coefficient 
        Mcc_denominator = math.sqrt((true_pos+false_pos) * (true_pos+false_neg)*(true_neg+false_neg)*(false_pos+true_neg))
        MCC = ((true_pos*true_neg) - (false_pos*false_neg))/Mcc_denominator
        MCC_list.append(MCC)
        
        accuracy_list.append(accuracy)
        
    return (accuracy_list, F1_score, MCC_list ) 

In [8]:
def Naive_Bayesian_model(data, k, remove_missing_value):    
    if remove_missing_value:
        data.dropna(inplace=True)
        print("Removed missing values from dataset")
    else:
        data = replace_missing_val(data)
        print("Replaced missing values from dataset with mode of the attributes")
    
    accuracy_list, F1_score, MCC_list = k_fold_cv(data, k)
    print("avg accuracy of the model :", sum(accuracy_list)/len(accuracy_list))
    print("F1-measure is :", sum(F1_score)/len(F1_score))
    print("Matthews corelation coefficient :", sum(MCC_list)/len(MCC_list))
    
        

In [9]:
## load the data using pandas
header = ['age', 'workclass','fnlwgt', 'education','education-num','marital-status','occupation','relationship','race','sex',
          'capital-gain','capital-loss','hours-per-week','native-country','class']         
census_data = pd.read_table('adult.data',sep=',', header=None)
census_data.columns = header

In [10]:
print("shape of dataset ", census_data.shape)

shape of dataset  (32561, 15)


In [11]:
#get the number of missing values
census_data.replace(' ?', np.NaN, inplace=True)    
print("details of missing values in attributes") 
census_data.isnull().sum()

details of missing values in attributes


age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
class                0
dtype: int64

In [12]:
# print("Dataset after replacing values \n")
# census_data.isnull().sum()

In [13]:
# parameters for model are data_set, value of K for K fold cv and remove_missing_value is true or false 

# Model with removed missing values records
Naive_Bayesian_model(census_data, 10, True)

Removed missing values from dataset
avg accuracy of the model : 0.823076923076923
F1-measure is : 0.6856098824682734
Matthews corelation coefficient : 0.5723776955982555


In [14]:
#Model with replaced missing values
Naive_Bayesian_model(census_data, 10, False)

Replaced missing values from dataset with mode of the attributes
avg accuracy of the model : 0.8242705570291777
F1-measure is : 0.6858895055205045
Matthews corelation coefficient : 0.5720853191506535
