# Naive Bayes on Abalone Dataset

In [524]:
# importing all necessary packages and functions

import pandas as pd
import numpy as np
import math as m
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [525]:
# Kernel option: display the value of every top-level expression in a cell,
# not only the last one. Later cells in this notebook place bare expressions
# (accuracy ratios, confusion-matrix frames) between print() calls and depend
# on this setting for those values to be shown.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [526]:
def df_split(dataframe, training_samples, random_state=None):
    """Randomly split ``dataframe`` into a training part and a validation part.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split.
    training_samples : int or float
        Forwarded to ``train_test_split`` as ``train_size`` (this notebook
        passes an absolute row count such as 2000).
    random_state : int or None, optional
        Forwarded to ``train_test_split``. Pass an integer to make the split
        reproducible; the default ``None`` keeps the original behaviour of a
        different random split on every call.

    Returns
    -------
    (df_training, df_validation) : tuple of pandas.DataFrame
        The two complementary parts produced by a single split.
    """
    df_training, df_validation = train_test_split(
        dataframe, train_size=training_samples, random_state=random_state)
    return df_training, df_validation

In [527]:
def _sex_likelihood(x, age_class):
    """Look up P(sex == x | age_class) in the global ``sex_prob`` table.

    ``sex_prob`` is read from the notebook namespace at call time; it is
    indexed by sex label ('F', 'M', 'I') with one column per age class
    (1, 2, 3). Any other value of ``x`` yields the string 'N/A'.
    """
    if x in ('F', 'M', 'I'):
        # Single-step label lookup (row, column) instead of chained indexing.
        return sex_prob.loc[x, age_class]
    # NOTE(review): kept for backward compatibility; a string in a numeric
    # column will break the later products if an unknown label ever appears.
    return 'N/A'


def convert_to_sex_prob_1(x):
    """Return the likelihood of sex ``x`` given age class 1 ('N/A' if unknown)."""
    return _sex_likelihood(x, 1)


def convert_to_sex_prob_2(x):
    """Return the likelihood of sex ``x`` given age class 2 ('N/A' if unknown)."""
    return _sex_likelihood(x, 2)


def convert_to_sex_prob_3(x):
    """Return the likelihood of sex ``x`` given age class 3 ('N/A' if unknown)."""
    return _sex_likelihood(x, 3)

In [528]:
# Load the tab-separated abalone data (the file has no header row) and name
# its nine columns: one categorical feature, seven numeric ones, and the label.
abalone_columns = [
    "sex",
    "length",
    "diameter",
    "height",
    "whole_weight",
    "shucked_weight",
    "viscera_weight",
    "shell_weight",
    "age_class",
]
df = pd.read_table("abalone_dataset.txt", sep="\t", header=None)
df.columns = abalone_columns

### Choose the number of training samples (cases studied: 100, 1000, 2000):

In [529]:
# only input before clicking Run All

training_samples = 2000

# Split exactly once. Calling df_split twice would draw two independent random
# splits, so the "test" rows could overlap the training rows (data leakage).
df_train, df_test = df_split(df, training_samples)

# Later cells add many columns to df_test; work on an independent copy rather
# than on a slice of df.
df_test = df_test.copy()

In [530]:
# Training rows of each age class.
df_train1 = df_train[df_train.age_class==1]
df_train2 = df_train[df_train.age_class==2]
df_train3 = df_train[df_train.age_class==3]

# Per-class mean and standard deviation of every numeric column, selected by
# label from describe() and transposed so that features are the rows.
stat_table_1 = df_train1.describe().loc[['mean', 'std']].transpose()
stat_table_2 = df_train2.describe().loc[['mean', 'std']].transpose()
stat_table_3 = df_train3.describe().loc[['mean', 'std']].transpose()

# Columns become class<k>_mean / class<k>_std.
stat_table_1 = stat_table_1.add_prefix('class1_')
stat_table_2 = stat_table_2.add_prefix('class2_')
stat_table_3 = stat_table_3.add_prefix('class3_')

# `join_axes` was removed from pd.concat in pandas 1.0; reindexing on the
# first table's index is the documented replacement.
frames = [stat_table_1, stat_table_2, stat_table_3]
stat_table = pd.concat(frames, axis=1).reindex(stat_table_1.index)

# Final layout: rows 'class<k>_mean' / 'class<k>_std', one column per feature.
stat_table = stat_table.transpose()

In [531]:
# Class priors: fraction of training rows that fall in each age class.
n_train = len(df_train)
class_counts = [len(df_train[df_train.age_class == label]) for label in (1, 2, 3)]
age_prob = pd.DataFrame(
    [count / n_train for count in class_counts],
    index=['1', '2', '3'],
    columns=['probability'],
)

# Likelihood of each sex within each age class (every column sums to 1).
sex_prob = pd.crosstab(df_train['sex'], df_train['age_class'], normalize='columns')

In [532]:
# Create one working column per (feature, class) pair. The prior columns start
# at 0; every other column starts as a copy of the raw feature and is replaced
# by a class-conditional likelihood in the following cells.
for class_id in (1, 2, 3):
    df_test['age_prob_{}'.format(class_id)] = 0

for feature in ('sex', 'length', 'diameter', 'height', 'whole_weight',
                'shucked_weight', 'viscera_weight', 'shell_weight'):
    for class_id in (1, 2, 3):
        df_test['{}_{}'.format(feature, class_id)] = df_test[feature]

In [533]:
# Broadcast each class prior (a scalar) over the whole column. The previous
# apply() returned a one-element Series per row, which produced a DataFrame and
# only worked through implicit coercion on assignment.
df_test['age_prob_1'] = age_prob.loc['1', 'probability']
df_test['age_prob_2'] = age_prob.loc['2', 'probability']
df_test['age_prob_3'] = age_prob.loc['3', 'probability']

In [534]:
# Replace the raw sex label in each per-class column by its likelihood
# under that class.
for column, to_likelihood in (('sex_1', convert_to_sex_prob_1),
                              ('sex_2', convert_to_sex_prob_2),
                              ('sex_3', convert_to_sex_prob_3)):
    df_test[column] = df_test[column].apply(to_likelihood)

In [535]:
# Gaussian likelihood of `length` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['length']
    std = stat_table.loc['class{}_std'.format(class_id)]['length']
    column = 'length_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [536]:
# Gaussian likelihood of `diameter` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['diameter']
    std = stat_table.loc['class{}_std'.format(class_id)]['diameter']
    column = 'diameter_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [537]:
# Gaussian likelihood of `height` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['height']
    std = stat_table.loc['class{}_std'.format(class_id)]['height']
    column = 'height_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [538]:
# Gaussian likelihood of `whole_weight` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['whole_weight']
    std = stat_table.loc['class{}_std'.format(class_id)]['whole_weight']
    column = 'whole_weight_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [539]:
# Gaussian likelihood of `shucked_weight` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['shucked_weight']
    std = stat_table.loc['class{}_std'.format(class_id)]['shucked_weight']
    column = 'shucked_weight_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [540]:
# Gaussian likelihood of `viscera_weight` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['viscera_weight']
    std = stat_table.loc['class{}_std'.format(class_id)]['viscera_weight']
    column = 'viscera_weight_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [541]:
# Gaussian likelihood of `shell_weight` under each class, using that class's
# training mean and standard deviation from stat_table.
for class_id in (1, 2, 3):
    mean = stat_table.loc['class{}_mean'.format(class_id)]['shell_weight']
    std = stat_table.loc['class{}_std'.format(class_id)]['shell_weight']
    column = 'shell_weight_{}'.format(class_id)
    df_test[column] = df_test[column].apply(
        lambda x: (1 / (m.sqrt(2*m.pi*m.pow(std, 2)))) * m.exp(-(m.pow(x-mean, 2)/(2*m.pow(std, 2)))))

In [542]:
# Unnormalised score for class 1: prior times the per-feature likelihoods.
# "vars_3" uses sex, length and diameter; "vars_8" extends that same product
# with the remaining five features, in the same left-to-right order.
df_test['class_prob_1_vars_3'] = (
    df_test['age_prob_1'] * df_test['sex_1'] * df_test['length_1'] * df_test['diameter_1']
)
df_test['class_prob_1_vars_8'] = (
    df_test['class_prob_1_vars_3']
    * df_test['height_1']
    * df_test['whole_weight_1']
    * df_test['shucked_weight_1']
    * df_test['viscera_weight_1']
    * df_test['shell_weight_1']
)

In [543]:
# Unnormalised score for class 2: prior times the per-feature likelihoods.
# "vars_3" uses sex, length and diameter; "vars_8" extends that same product
# with the remaining five features, in the same left-to-right order.
df_test['class_prob_2_vars_3'] = (
    df_test['age_prob_2'] * df_test['sex_2'] * df_test['length_2'] * df_test['diameter_2']
)
df_test['class_prob_2_vars_8'] = (
    df_test['class_prob_2_vars_3']
    * df_test['height_2']
    * df_test['whole_weight_2']
    * df_test['shucked_weight_2']
    * df_test['viscera_weight_2']
    * df_test['shell_weight_2']
)

In [544]:
# Unnormalised score for class 3: prior times the per-feature likelihoods.
# "vars_3" uses sex, length and diameter; "vars_8" extends that same product
# with the remaining five features, in the same left-to-right order.
df_test['class_prob_3_vars_3'] = (
    df_test['age_prob_3'] * df_test['sex_3'] * df_test['length_3'] * df_test['diameter_3']
)
df_test['class_prob_3_vars_8'] = (
    df_test['class_prob_3_vars_3']
    * df_test['height_3']
    * df_test['whole_weight_3']
    * df_test['shucked_weight_3']
    * df_test['viscera_weight_3']
    * df_test['shell_weight_3']
)

In [545]:
# Predict the class with the largest 3-variable score. np.select takes the
# first condition that holds, so ties go to the lower class number, and class 3
# is the fallback when neither class 1 nor class 2 wins.
score_1 = df_test['class_prob_1_vars_3']
score_2 = df_test['class_prob_2_vars_3']
score_3 = df_test['class_prob_3_vars_3']

conditions_3 = [
    (score_1 >= score_2) & (score_1 >= score_3),
    (score_2 >= score_1) & (score_2 >= score_3),
]

choices_3 = [1, 2]

df_test['vars_3_guess'] = np.select(conditions_3, choices_3, default=3)

In [546]:
# Predict the class with the largest 8-variable score. np.select takes the
# first condition that holds, so ties go to the lower class number, and class 3
# is the fallback when neither class 1 nor class 2 wins.
# The class-2 condition now also requires score_2 >= score_1, matching the
# 3-variable rule. For non-NaN scores the predictions are unchanged.
conditions_8 = [
    (df_test['class_prob_1_vars_8'] >= df_test['class_prob_2_vars_8']) & (df_test['class_prob_1_vars_8'] >= df_test['class_prob_3_vars_8']),
    (df_test['class_prob_2_vars_8'] >= df_test['class_prob_1_vars_8']) & (df_test['class_prob_2_vars_8'] >= df_test['class_prob_3_vars_8'])]

choices_8 = [1, 2]

df_test['vars_8_guess'] = np.select(conditions_8, choices_8, default=3)

In [550]:
# Accuracy = share of test rows whose guessed class equals the true class.
# The bare ratio expressions are displayed thanks to the
# ast_node_interactivity = "all" kernel option set near the top.
n_test = len(df_test)
print('Accuracy with {} samples and 3 variables'.format(training_samples))
len(df_test[df_test.age_class == df_test.vars_3_guess]) / n_test
print('Accuracy with {} samples and 8 variables'.format(training_samples))
len(df_test[df_test.age_class == df_test.vars_8_guess]) / n_test

Accuracy with 2000 samples and 3 variables


0.6187413872301332

Accuracy with 2000 samples and 8 variables


0.5668350941662839

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,age_class,age_prob_1,...,shell_weight_2,shell_weight_3,class_prob_1_vars_3,class_prob_1_vars_8,class_prob_2_vars_3,class_prob_2_vars_8,class_prob_3_vars_3,class_prob_3_vars_8,vars_3_guess,vars_8_guess
3477,I,0.4,0.3,0.11,0.2985,0.1375,0.071,0.075,1,0.199,...,0.969636,0.457362,2.676886,11259.94,0.165408,0.503853,0.002691,0.0003819262,1,1
655,M,0.295,0.215,0.075,0.129,0.05,0.0295,0.04,1,0.199,...,0.54305,0.269742,0.351057,603.5557,0.00125,3.6e-05,5e-06,2.189766e-09,1,1
1154,M,0.6,0.455,0.17,1.1915,0.696,0.2395,0.24,2,0.199,...,3.658779,2.26609,0.003173,8.789747e-16,4.596616,319.481449,3.053716,277.5962,2,2
3721,M,0.43,0.31,0.13,0.6485,0.2735,0.163,0.184,2,0.199,...,3.02318,1.551654,0.433279,2.113498,0.558911,97.226602,0.035004,0.8270937,2,2
2562,I,0.44,0.325,0.1,0.4165,0.185,0.0865,0.11,1,0.199,...,1.559827,0.725959,1.862892,5431.35,0.549531,3.984076,0.017495,0.005059849,1,1


In [548]:
def _confusion_frame(guess_column):
    """Build the confusion table for the predictions in ``df_test[guess_column]``.

    Rows are the guessed classes ('Guessed 1'..'Guessed 3', in a 'Matrix'
    column); the 'Actual Value k' columns hold the number of test rows whose
    true age_class is k and whose guess is the row's class.
    """
    table = {'Matrix': ['Guessed 1', 'Guessed 2', 'Guessed 3']}
    for actual in (1, 2, 3):
        table['Actual Value {}'.format(actual)] = [
            len(df_test[(df_test.age_class == actual) & (df_test[guess_column] == guessed)])
            for guessed in (1, 2, 3)]
    return pd.DataFrame.from_dict(table)


def _misclassified(confusion):
    """Return the number of test rows off the diagonal of ``confusion``."""
    # Column 0 is the 'Matrix' label column, so the diagonal cell of row i is
    # at column i + 1. Explicit (row, column) positions avoid integer lookups
    # on a string-labelled row Series.
    correct = sum(confusion.iloc[i, i + 1] for i in range(3))
    return len(df_test) - correct


print('Confusion matrix with {} samples and 3 variables'.format(training_samples))

cm3 = _confusion_frame('vars_3_guess')
cm3

print('Total misclassification errors: {}'.format(_misclassified(cm3)))

print('Confusion matrix with {} samples and 8 variables'.format(training_samples))

cm8 = _confusion_frame('vars_8_guess')
cm8

print('Total misclassification errors: {}'.format(_misclassified(cm8)))

Confusion matrix with 2000 samples and 3 variables


Unnamed: 0,Matrix,Actual Value 1,Actual Value 2,Actual Value 3
0,Guessed 1,350,136,18
1,Guessed 2,90,844,315
2,Guessed 3,1,270,153


Total misclassification errors: 830
Confusion matrix with 2000 samples and 8 variables


Unnamed: 0,Matrix,Actual Value 1,Actual Value 2,Actual Value 3
0,Guessed 1,383,247,34
1,Guessed 2,56,610,211
2,Guessed 3,2,393,241


Total misclassification errors: 943


## References
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_table.html<br>
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html<br>
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html<br>
http://www.inf.ed.ac.uk/teaching/courses/inf2b/learnnotes/inf2b-learn-note09-2up.pdf<br>
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.add_prefix.html<br>