# Naive Bayes Exercise

In [12]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('ggplot')

plt.rcParams['figure.figsize'] = (12.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'

# Fix the seed of the random number 
# generator so that your results will match ours
np.random.seed(1)

%load_ext autoreload
%autoreload 2

## Iris dataset

The iris dataset  has 4 features with continuous values. As such, we will be using a Gaussian distribution to describe our data. 

How many parameters do we need to estimate for NB classification (posterior)?


In [2]:
import pandas as pd

# loads csv file into a pandas dataframe
iris = pd.read_csv('iris.csv')

# pd.head() displays the first 5 elements
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
iris.groupby('species').describe()

Unnamed: 0_level_0,petal_length,petal_length,petal_length,petal_length,petal_length,petal_length,petal_length,petal_length,petal_width,petal_width,...,sepal_length,sepal_length,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width,sepal_width
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
setosa,50.0,1.464,0.173511,1.0,1.4,1.5,1.575,1.9,50.0,0.244,...,5.2,5.8,50.0,3.418,0.381024,2.3,3.125,3.4,3.675,4.4
versicolor,50.0,4.26,0.469911,3.0,4.0,4.35,4.6,5.1,50.0,1.326,...,6.3,7.0,50.0,2.77,0.313798,2.0,2.525,2.8,3.0,3.4
virginica,50.0,5.552,0.551895,4.5,5.1,5.55,5.875,6.9,50.0,2.026,...,6.9,7.9,50.0,2.974,0.322497,2.2,2.8,3.0,3.175,3.8


In [1]:
# convert species feature from type string to integer
iris["species"] = pd.Categorical(iris["species"]).codes
iris.head()

NameError: name 'pd' is not defined

#### Data preparation

In [73]:
X = iris.values[:,:-1] # get everything before the species type as data X
y = iris.values[:,-1].astype(int) # get the last column (species) as label y (from float64 to int)

In [75]:
num_items = X.shape[0]
randIdx = np.arange(num_items)
# randomly shuffly the indices
np.random.shuffle(randIdx)

test_percentage_split = 0.5

num_test = np.ceil(num_items * test_percentage_split).astype(int)
X_test = X[randIdx[0:num_test]]
y_test = y[randIdx[0:num_test]]

X_train = X[randIdx[num_test:]]
y_train = y[randIdx[num_test:]]

## Naive Bayes Classifier (Gaussian)

Open naive_bayes.py and implement the TODOs for Gaussian distribution

In [122]:
from naive_bayes import NaiveBayes

In [123]:
gaussian_nb = NaiveBayes(distribution="gaussian")
gaussian_nb.train(X_train, y_train)

#### Calculate for the accuracy of the classifier

In [6]:
np.sum(y_test == predictions) / num_test

NameError: name 'np' is not defined

## Spam/Not-Spam dataset

The spam/not-spam dataset are data taken from spam and non-spam emails. Based on the words used, we want to classify whether the email is spam or not spam.

In this dataset, we just look at how frequent a chosen 2,500 words appear per document.


How many parameters do we need to estimate for NB classification (posterior)?



##### Data preparation

In [13]:
import os
train_spam_dir = "lingspam/spam-train/"
train_nonspam_dir = "lingspam/nonspam-train/"
test_spam_dir = "lingspam/spam-test/"
test_nonspam_dir = "lingspam/nonspam-test/"

global_word_dict = {}
train_doc_word_list = []
test_doc_word_list = []
for folder in [train_spam_dir, train_nonspam_dir, test_spam_dir, test_nonspam_dir]:
    for file in os.listdir(folder):
        tmp_doc_word = {}
        with open(folder + file,"r") as f:
            words = f.read().split(" ")

            for word in words:
                word = word.strip()
                if len(word) > 1:
                    if word in global_word_dict:
                        global_word_dict[word] += 1
                    else:
                        global_word_dict[word] = 1
                        
                    if word in tmp_doc_word:
                        tmp_doc_word[word] += 1
                    else:
                        tmp_doc_word[word] = 1
                
                
            if folder == train_spam_dir:
                train_doc_word_list.append({'words': tmp_doc_word, 'label': 1})
            elif folder == train_nonspam_dir:
                train_doc_word_list.append({'words': tmp_doc_word, 'label': 0})
            elif folder == test_spam_dir:
                test_doc_word_list.append({'words': tmp_doc_word, 'label': 1})
            elif folder == test_nonspam_dir:
                test_doc_word_list.append({'words': tmp_doc_word, 'label': 0})

In [14]:
sorted_words = []
sorted_counts = []
for key in sorted(global_word_dict, key=global_word_dict.get, reverse=True):
    sorted_words.append(key)
    sorted_counts.append(global_word_dict[key])
vocab_size = 2500
vocabulary = sorted_words[0:vocab_size]
vocab_counts = sorted_counts[0:vocab_size]

In [25]:
vocabulary[:10]

['email',
 'order',
 'address',
 'language',
 'report',
 'mail',
 'our',
 'university',
 'send',
 'program']

In [15]:
num_train = len(train_doc_word_list)
X_train = np.zeros((num_train, vocab_size))
y_train = np.zeros(num_train)
for i in range(num_train):  
    y_train[i] = train_doc_word_list[i]['label']
    
    for j in range(len(vocabulary)):
        word = vocabulary[j]
        if word in train_doc_word_list[i]['words']:
            X_train[i,j] = train_doc_word_list[i]['words'][word]
        else:
            X_train[i,j] = 0
            
num_test = len(test_doc_word_list)
X_test = np.zeros((num_test, vocab_size))
y_test = np.zeros(num_test)
for i in range(num_test):  
    y_test[i] = test_doc_word_list[i]['label']
    
    for j in range(len(vocabulary)):
        word = vocabulary[j]
        if word in test_doc_word_list[i]['words']:
            X_test[i,j] = test_doc_word_list[i]['words'][word]
        else:
            X_test[i,j] = 0

## Naive Bayes Classifier (Multinomial)

Open naive_bayes.py and implement the TODOs for multinomial distribution

In [14]:
multinomial_nb = NaiveBayes(distribution="multinomial")
multinomial_nb.train(X_train, y_train)

In [16]:
pred = multinomial_nb.predict(X_test)

In [17]:
np.sum(y_test == pred) / num_test

0.98076923076923073