In [1]:
import pandas as pd
import numpy as np
import random
import re
import warnings
warnings.filterwarnings('ignore')

from collections import defaultdict
from tqdm import tqdm
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

### 출력변수가 하나인 경우

In [2]:
columns = ['age', 'workclass','fnlwgt','education', 'educational-num','marital-status', 'occupation', \
           'relationship', 'race', 'sex', 'capital-gain','capital-loss', 'hours-per-week', 'native-country', 'income']
df = pd.read_csv('./data/adult/adult.data', names =columns, header=None)

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
for col in df.columns:
    df[col].replace('?',np.NaN, inplace=True)
    
df.dropna(inplace=True)

In [5]:
df.info

<bound method DataFrame.info of        age          workclass  fnlwgt    education  educational-num  \
0       39          State-gov   77516    Bachelors               13   
1       50   Self-emp-not-inc   83311    Bachelors               13   
2       38            Private  215646      HS-grad                9   
3       53            Private  234721         11th                7   
4       28            Private  338409    Bachelors               13   
...    ...                ...     ...          ...              ...   
32556   27            Private  257302   Assoc-acdm               12   
32557   40            Private  154374      HS-grad                9   
32558   58            Private  151910      HS-grad                9   
32559   22            Private  201490      HS-grad                9   
32560   52       Self-emp-inc  287927      HS-grad                9   

            marital-status          occupation    relationship    race  \
0            Never-married        Adm-cle

In [6]:
#범주형 자료를 숫자로
def convert_label(df, column):
    unique_value = df[column].unique()
    res = []
    res_dict = dict()
    
    for i,u in enumerate(unique_value):
        res_dict[i] = u
    
    for v in df[column]:
        idx = np.where(unique_value==v)[0][0]
        res.append(idx)
    
    return res, res_dict

In [7]:
categorical_column = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

res_dicts = dict()

for col in categorical_column+['income']:
    res, res_dict = convert_label(df, col)
    res_dicts[col] = res_dict
    df[col] = res

In [8]:
#target 확률
def estimate_target_prob(df, target, label):
    return len(df[df[target]==label])/len(df)

In [9]:
#범주별 확률
def category_prob(df, column, category, target, label, l=1):
    numerator = len(df[(df[column]==category)&(df[target]==label)]) + l
    denominator = len(df[df[target]==label])+l*len(df[column].unique())
    return numerator/denominator

In [10]:
#mean, std 추정
def estimate_mean_std(df, column, target, label):
    temp = df[df[target]==label]
    mu = temp[column].mean()
    std = temp[column].std()
    return mu, std

In [11]:
class NaiveBayesClassifier:
    def __init__(self,unique_label,target,categorical_column=[]):
        self.__reference_dict = None
        self.__prior_probs = None
        self.__categorical_column = categorical_column
        self.__unique_label = unique_label #출력변수 label
        self.__target = target #출력변수
        
    def train(self, train_df):
        reference_dict = dict()
        
        for col in train_df.columns[:-1]:
            temp_res = []
            
            for label in self.__unique_label:
                if col in self.__categorical_column:
                    unique_value = train_df[col].unique()
                    temp_res1 = []
                    
                    for u in unique_value:
                        temp_res1.append(category_prob(train_df, col, u, self.__target, label, l=1))
                    temp_res.append(temp_res1)
                else:
                    temp_res.append(estimate_mean_std(train_df, col, self.__target,label))
                    
            reference_dict[col] = temp_res
        self.__reference_dict = reference_dict
        
        prior_probs = []
        for label in self.__unique_label:
            prior_prob = estimate_target_prob(train_df, self.__target, label)
            prior_probs.append(prior_prob)
        self.__prior_probs = prior_probs
        
    def get_reference_dict(self):
        return self.__reference_dict
    
    def predict(self, new_data):
        object_value = [0]*len(self.__unique_label)
        
        for idx in new_data.index[:-1]:
            value = new_data[idx]
            reference_value = self.__reference_dict[idx]
            
            if idx in self.__categorical_column:
                for i, r in enumerate(reference_value):
                    value =int(value)
                    object_value[i] += np.log(r[value])
                    
            else:
                for i,r in enumerate(reference_value):
                    object_value[i] += np.log(norm.pdf(value, r[0], r[1]))
                    
        for i in self.__unique_label:
            object_value[i] += np.log(self.__prior_probs[i])
            
        max_object_value = max(object_value)
        max_idx = object_value.index(max_object_value)
        
        return max_idx

In [12]:
def split_test_train_data(df, split_ratio):
    num_test_df = int(len(df)*split_ratio)
    idx_test_df = random.sample(df.index.tolist(), num_test_df)
    
    test_df = df.loc[idx_test_df].reset_index(drop=True)
    train_df = df.drop(idx_test_df).reset_index(drop=True)
    
    return train_df, test_df

In [13]:
#학습
random.seed(110)
split_ratio = 0.2

train_df, test_df = split_test_train_data(df,split_ratio)

for col in train_df.columns[:-1]:
    if col not in categorical_column:
        mean = train_df[col].mean()
        std = train_df[col].std()
        train_df[col] = train_df[col].map(lambda x: (x-mean)/std)
        test_df[col] = test_df[col].map(lambda x: (x-mean)/std)
        
target = 'income'
unique_label = df[target].unique()

NBC = NaiveBayesClassifier(unique_label, target, categorical_column)

In [14]:
NBC.train(train_df)

In [15]:
#train accuracy
total_sum = 0

for i, new_data in tqdm(train_df.iterrows(), total=len(train_df)):
    if new_data[target] == NBC.predict(new_data):
        total_sum += 1
        
accuracy = total_sum/len(train_df)

100%|███████████████████████████████████████████████████████████████████████████| 26049/26049 [00:37<00:00, 689.95it/s]


In [16]:
accuracy

0.8285538792276095

In [17]:
#test accuracy
total_sum = 0

for i, new_data in tqdm(test_df.iterrows(), total=len(test_df)):
    if new_data[target] == NBC.predict(new_data):
        total_sum += 1
        
accuracy = total_sum /len(test_df)

100%|█████████████████████████████████████████████████████████████████████████████| 6512/6512 [00:09<00:00, 689.24it/s]


In [18]:
accuracy

0.827088452088452

In [19]:
mode = train_df[target].mode()[0]
print(len(test_df[test_df[target]==mode])/len(test_df))

0.7627457002457002


### 출력변수가 3개 이상의 클래스인 경우

In [20]:
columns = ['Id_number', 'Rl', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type']
df = pd.read_csv('./data/glass+identification/glass.data', names =columns, header=None)

In [21]:
df.head()

Unnamed: 0,Id_number,Rl,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [22]:
df.drop('Id_number', axis=1, inplace=True)

res_dicts = dict()

for col in ['Type']:
    res, res_dict = convert_label(df,col)
    res_dicts[col] = res_dict
    df[col] = res

In [23]:
random.seed(110)
split_ratio = 0.2


train_df, test_df = train_test_split(df,test_size=0.2, random_state = 110)

for col in train_df.columns[:-1]:
    if col not in categorical_column:
        mean = train_df[col].mean()
        std = train_df[col].std()
        train_df[col] = train_df[col].map(lambda x: (x-mean)/std)
        test_df[col] = test_df[col].map(lambda x: (x-mean)/std)
        
target = 'Type'
unique_label = df[target].unique()

NBC_multiclass = NaiveBayesClassifier(unique_label, target)

In [24]:
NBC_multiclass.train(train_df)

In [25]:
#train accuracy
total_sum = 0

for i, new_data in tqdm(train_df.iterrows(), total = len(train_df)):
    if new_data[target] == NBC_multiclass.predict(new_data):
        total_sum += 1
        
accuracy = total_sum / len(train_df)

100%|███████████████████████████████████████████████████████████████████████████████| 171/171 [00:00<00:00, 174.48it/s]


In [26]:
accuracy

0.5964912280701754

In [27]:
#test accuracy
total_sum = 0

for i, new_data in tqdm(test_df.iterrows(), total = len(test_df)):
    if new_data[target] == NBC_multiclass.predict(new_data):
        total_sum += 1
        
accuracy = total_sum / len(test_df)

100%|█████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 169.29it/s]


In [28]:
accuracy

0.4186046511627907

In [29]:
mode = train_df[target].mode()[0]
print(len(test_df[test_df[target]==mode])/len(test_df))

0.3488372093023256


In [30]:
nb = GaussianNB()
nb.fit(train_df[train_df.columns[:-1]], train_df['Type'])

GaussianNB()

In [31]:
nb.score(train_df[train_df.columns[:-1]], train_df['Type'])

0.5906432748538012

In [32]:
nb.score(test_df[test_df.columns[:-1]], test_df['Type'])

0.4418604651162791

### 다항 나이브 베이즈 분류기

In [33]:
columns = ['v1','v2','no']
df = pd.read_csv('./data/spam/spam.csv',names =columns, header=None, encoding = "ISO-8859-1")

In [34]:
df.head()

Unnamed: 0,v1,v2,no
0,ham,"Go until jurong point, crazy.. Available only ...",
1,ham,Ok lar... Joking wif u oni...,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,
3,ham,U dun say so early hor... U c already then say...,
4,ham,"Nah I don't think he goes to usf, he lives aro...",


In [35]:
df.dropna(inplace=True, axis =1)

res_dicts = dict()

for col in ['v1']:
    res, res_dict = convert_label(df, col)
    res_dicts[col] = res_dict
    df[col] = res
    
df = df[['v2', 'v1']]

In [36]:
def tokenize(message):
    message = message.lower()
    all_words = re.findall("[a-z0-9']+", message)
    return list(set(all_words))

In [37]:
def count_words(df, unique_label, column, target):
    counts = defaultdict(lambda : [0]*len(unique_label))
    
    for _,row in df.iterrows():
        for word in tokenize(row[column]):
            counts[word][row[target]] += 1
    return counts

In [38]:
def count_row_words(new_data):
    counts = defaultdict(lambda:0)
    
    for word in tokenize(new_data):
        counts[word] += 1
    return counts

In [39]:
class MultinomialNaiveBayesClassifier:
    def __init__(self,unique_label, target, column):
        self.__reference_counts = None
        self.__prior_probs = None
        self.__target = target
        self.__unique_label = unique_label
        self.__column = column
        
    def train(self, train_df):
        reference_counts = count_words(train_df, self.__unique_label, self.__column, self.__target)
        self.__reference_counts = reference_counts
        
        prior_probs = []
        
        for label in self.__unique_label:
            prior_prob = estimate_target_prob(train_df, self.__target, label)
            prior_probs.append(prior_prob)
        self.__prior_probs = prior_probs
    
    def get_reference_counts(self):
        return self.__reference_counts
    
    def predict(self, new_data):
        new_data = new_data[column]
        new_data = count_row_words(new_data)
        object_value = []
        
        for label in self.__unique_label:
            temp_counts = {k:v[label] for k,v in self.__reference_counts.items()}
            cum_word_count = 0
            numerators = []
            train_word_counts = []
            new_counts = []
            
            for word, count in new_data.items():
                if word not in temp_counts.keys():
                    train_word_count = 0
                else:
                    train_word_count = temp_counts[word]
                    
                new_counts.append(count)
                numerators.append(train_word_count + l)
                cum_word_count += train_word_count
                
            denominator = cum_word_count + l*len(self.__reference_counts.keys())
            probs = np.array(numerators)/denominator
            
            log_sum = 0
            
            for i,p in enumerate(probs):
                log_sum += new_counts[i]*np.log(p)
                
            object_value.append(log_sum)
            
        for i in self.__unique_label:
            object_value[i] += np.log(self.__prior_probs[i])
        max_object_value = max(object_value)
        max_idx = object_value.index(max_object_value)
        
        return max_idx

In [40]:
random.seed(110)
split_ratio = 0.2
l=1

train_df, test_df = train_test_split(df,test_size=0.2, random_state = 110)

In [41]:
target = 'v1'
unique_label = train_df[target].unique()
column = 'v2'
MNBC = MultinomialNaiveBayesClassifier(unique_label, target, column)

In [42]:
MNBC.train(train_df)

In [43]:
#train accuracy
total_sum = 0

for i, new_data in tqdm(train_df.iterrows(), total = len(train_df)):
    if new_data[target] == MNBC.predict(new_data):
        total_sum += 1
        
accuracy = total_sum/len(train_df)

100%|█████████████████████████████████████████████████████████████████████████████| 4455/4455 [00:06<00:00, 638.53it/s]


In [44]:
accuracy

0.970594837261504

In [45]:
#test accuracy
total_sum = 0

for i, new_data in tqdm(test_df.iterrows(), total = len(test_df)):
    if new_data[target] == MNBC.predict(new_data):
        total_sum += 1
        
accuracy = total_sum/len(test_df)

100%|█████████████████████████████████████████████████████████████████████████████| 1114/1114 [00:01<00:00, 639.85it/s]


In [46]:
accuracy

0.9542190305206463

In [47]:
mode = train_df[target].mode()[0]
print(len(test_df[test_df[target]==mode])/len(test_df))

0.8653500897666068


In [48]:
X_tr = train_df['v2']
y_tr = train_df['v1']

X_te = test_df['v2']
y_te = test_df['v1']

cv = CountVectorizer()
cv.fit(X_tr)

X_tr = cv.transform(X_tr)
X_te = cv.transform(X_te)

MNBC1 = MultinomialNB()
MNBC1.fit(X_tr, y_tr)

MultinomialNB()

In [49]:
MNBC1.score(X_tr,y_tr)

0.9930415263748598

In [50]:
MNBC1.score(X_te,y_te)

0.9874326750448833

In [51]:
reference_counts = MNBC.get_reference_counts()
len(sorted(list(reference_counts.keys())))

7892

In [52]:
len(cv.get_feature_names())

7727