In [1]:
import pandas as pd
import numpy as np
import random
import re
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
from tqdm import tqdm
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

## 2 outputs

In [2]:
columns = ['age', 'workclass','fnlwgt','education', 'educational-num','marital-status', 'occupation', \
           'relationship', 'race', 'sex', 'capital-gain','capital-loss', 'hours-per-week', 'native-country', 'income']
df = pd.read_csv('./data/adult/adult.data', names =columns, header=None)

In [3]:
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
for col in df.columns:
    df[col].replace('?', np.NaN, inplace=True)
    
df.dropna(inplace=True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   workclass        32561 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   educational-num  32561 non-null  int64 
 5   marital-status   32561 non-null  object
 6   occupation       32561 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   32561 non-null  object
 14  income           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
def convert_label(df, column):
    unique_value = df[column].unique()
    res = []
    res_dict = dict()
    for i, u in enumerate(unique_value):
        res_dict[i] = u
    for v in df[column]:
        idx = np.where(unique_value==v)[0][0]
        res.append(idx)
    return res, res_dict

In [7]:
categorical_column = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
res_dicts = dict()
for col in categorical_column+['income']:
    res, res_dict = convert_label(df, col)
    res_dicts[col] = res_dict
    df[col]=res

In [8]:
def estimate_target_prob(df, target, label):
    return len(df[df[target]==label])/len(df)

In [9]:
def category_prob(df, column, category, target, label, l=1):
    numerator = len(df[(df[column]==category)&(df[target]==label)])+l
    denominator = len(df[df[target]==label]) + l*len(df[column].unique())
    return numerator/denominator

In [10]:
def estimate_mean_std(df, column, target, label):
    temp = df[df[target]==label]
    mu = temp[column].mean()
    std = temp[column].std()
    return mu, std

In [11]:
class NaiveBayesClassifier:
    def __init__(self, unique_label, target, categorical_column=[]):
        self.__reference_dict = None
        self.__prioi_probs = None
        self.__categorical_column = categorical_column
        self.__unique_label = unique_label
        self.__target = target
        
    def train(self, train_df):
        reference_dict = dict()
        for col in train_df.columns[:-1]:
            temp_res = []
            for label in self.__unique_label:
                if col in self.__categorical_column:
                    unique_value = train_df[col].unique()
                    temp_res1 = []
                    for u in unique_value:
                        temp_res1.append(category_prob(train_df,col,u,self.__target,label,l=1))
                    temp_res.append(temp_res1)
                else:
                    temp_res.append(estimate_mean_std(train_df,col, self.__target,label))
                    
            reference_dict[col] = temp_res
        self.__reference_dict = reference_dict
        
        prior_probs=[]
        for label in self.__unique_label:
            prior_prob = estimate_target_prob(train_df, self.__target, label)
            prior_probs.append(prior_prob)
        self.__prior_probs = prior_probs
    
    def get_reference_dict(self):
        return self.__reference_dict
    
    def predict(self, new_data):
        object_value = [0]*len(self.__unique_label)
        for idx in new_data.index[:-1]:
            value = new_data[idx]
            reference_value = self.__reference_dict[idx]
            
            if idx in self.__categorical_column:
                for i,r in enumerate(reference_value):
                    value = int(value)
                    object_value[i] += np.log(r[value])
            else:
                for i,r in enumerate(reference_value):
                    object_value[i] += np.log(norm.pdf(value, r[0], r[1]))
                    
        
        for i in self.__unique_label:
            object_value[i] += np.log(self.__prior_probs[i])
        max_object_value = max(object_value)
        max_idx = object_value.index(max_object_value)
        return max_idx

In [12]:
def split_test_train_data(df, split_ratio):
    num_test_df = int(len(df)*split_ratio)
    idx_test_df = random.sample(df.index.tolist(), num_test_df)
    
    test_df = df.loc[idx_test_df].reset_index(drop=True)
    train_df = df.drop(idx_test_df).reset_index(drop=True)
    return train_df, test_df

In [13]:
random.seed(110)
split_ratio = 0.2

train_df, test_df = split_test_train_data(df, split_ratio)

for col in train_df.columns[:-1]:
    if col not in categorical_column:
        mean = train_df[col].mean()
        std = train_df[col].std()
        train_df[col] = train_df[col].map(lambda x: (x-mean)/std)
        test_df[col] = test_df[col].map(lambda x: (x-mean)/std)
        
target = 'income'
unique_label = df[target].unique()

NBC = NaiveBayesClassifier(unique_label, target, categorical_column)

In [14]:
NBC.train(train_df)

In [15]:
total_sum = 0
for i, new_data in tqdm(train_df.iterrows(), total= len(train_df)):
    if new_data[target] == NBC.predict(new_data):
        total_sum +=1
        
accuracy = total_sum/len(train_df)
print(accuracy)

100%|███████████████████████████████████████████████████████████████████████████| 26049/26049 [00:39<00:00, 661.56it/s]

0.8285538792276095





In [16]:
accuracy

0.8285538792276095

In [17]:
total_sum = 0
for i, new_data in tqdm(test_df.iterrows(), total= len(test_df)):
    if new_data[target] == NBC.predict(new_data):
        total_sum +=1
        
accuracy = total_sum/len(test_df)
print(accuracy)

100%|█████████████████████████████████████████████████████████████████████████████| 6512/6512 [00:09<00:00, 653.62it/s]

0.827088452088452





In [18]:
mode = train_df[target].mode()[0]
print(len(test_df[test_df[target]==mode])/len(test_df))

0.7627457002457002


## 3 outputs

In [19]:
columns = ['Id_number', 'Rl', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
df = pd.read_csv('./data/glass+identification/glass.data', names =columns, header=None)

In [20]:
df

Unnamed: 0,Id_number,Rl,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
209,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [21]:
df.drop('Id_number', axis=1, inplace=True)

res_dicts = dict()

for col in ['Type']:
    res, res_dict = convert_label(df,col)
    res_dicts[col]= res_dict
    df[col] = res

In [22]:
random.seed(110)
split_ratio = 0.2


train_df, test_df = train_test_split(df,test_size=0.2, random_state = 110)

for col in train_df.columns[:-1]:
    if col not in categorical_column:
        mean = train_df[col].mean()
        std = train_df[col].std()
        train_df[col] = train_df[col].map(lambda x: (x-mean)/std)
        test_df[col] = test_df[col].map(lambda x: (x-mean)/std)
        
target = 'Type'
unique_label = df[target].unique()

NBC_multiclass = NaiveBayesClassifier(unique_label, target)

In [23]:
NBC_multiclass.train(train_df)

In [24]:
total_sum = 0
for i, new_data in tqdm(train_df.iterrows(), total=len(train_df)):
    if new_data[target] == NBC_multiclass.predict(new_data):
        total_sum += 1
        
accuracy = total_sum/len(train_df)
print(accuracy)

100%|███████████████████████████████████████████████████████████████████████████████| 171/171 [00:01<00:00, 168.62it/s]

0.5964912280701754





In [25]:
total_sum = 0
for i, new_data in tqdm(test_df.iterrows(), total=len(test_df)):
    if new_data[target] == NBC_multiclass.predict(new_data):
        total_sum += 1
        
accuracy = total_sum/len(test_df)
print(accuracy)

100%|█████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 162.26it/s]

0.4186046511627907





In [26]:
mode = train_df[target].mode()[0]
print(len(test_df[test_df[target]==mode])/len(test_df))

0.3488372093023256


In [27]:
train_df.columns[:-1]

Index(['Rl', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'], dtype='object')

In [28]:
nb = GaussianNB()
nb.fit(train_df[train_df.columns[:-1]], train_df['Type'])

In [29]:
nb.score(train_df[train_df.columns[:-1]], train_df['Type'])

0.5906432748538012

In [30]:
nb.score(test_df[test_df.columns[:-1]], test_df['Type'])

0.4418604651162791