In [16]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [17]:
class NaiveBayesClassifier():
    '''
    Gaussian Naive Bayes classifier.

    Bayes' theorem:
        P(y|X) = P(X|y) * P(y) / P(X)

    Every feature is assumed to be normally distributed within each class and
    independent of the other features given the class. P(X) is the same for
    every class, so prediction only compares log P(y) + sum_j log P(x_j|y).

    Attributes set by fit():
        classes       sorted unique target labels
        count         number of classes
        feature_nums  number of feature columns
        rows          number of training rows
        mean, var     arrays of shape (n_classes, n_features)
        prior         array of shape (n_classes,)
    '''

    # Lower bound applied to variances when evaluating densities, so that a
    # feature that is constant within a class does not cause division by zero.
    _VAR_FLOOR = 1e-9

    def calc_prior(self, features, target):
        '''
        Estimate the prior probability P(y) of every class as its relative
        frequency in the training data.

        Returns the priors as a NumPy array (ordered by sorted class label)
        and stores them on self.prior.
        '''
        # len(features) instead of self.rows: the method then works even when
        # it is called directly, before fit() has set self.rows.
        class_counts = features.groupby(target).size()
        self.prior = (class_counts / len(features)).to_numpy()

        return self.prior

    def calc_statistics(self, features, target):
        '''
        Compute the per-class mean and (population) variance of every feature
        column and store them as NumPy arrays on self.mean / self.var.

        Returns the tuple (mean, var).
        '''
        # Use the groupby reductions directly: they always reduce column-wise,
        # whereas applying np.mean / np.var to each group depends on how the
        # installed pandas version interprets axis=None.
        grouped = features.groupby(target)
        self.mean = grouped.mean().to_numpy()
        self.var = grouped.var(ddof=0).to_numpy()

        return self.mean, self.var

    def _class_params(self, class_idx):
        '''Return (mean, floored variance) for the class at index class_idx.'''
        mean = self.mean[class_idx]
        var = np.maximum(self.var[class_idx], self._VAR_FLOOR)
        return mean, var

    def gaussian_density(self, class_idx, x):
        '''
        Evaluate the normal probability density of x under the statistics of
        class class_idx, element-wise per feature:

            p(x) = 1 / sqrt(2*pi*σ²) * exp(-(x-μ)² / (2*σ²))

        where μ is the class mean and σ² the class variance of the feature.
        '''
        mean, var = self._class_params(class_idx)
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

    def _log_gaussian_density(self, class_idx, x):
        '''
        Log of gaussian_density, computed directly in log space so that very
        small densities do not underflow to 0 before the log is taken.
        '''
        mean, var = self._class_params(class_idx)
        return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

    def calc_posterior(self, x):
        '''
        Return the class label with the highest (unnormalised) log posterior
        for a single sample x (1-D array of feature values).
        '''
        # Sum of logs instead of product of probabilities for numerical stability.
        log_posteriors = [
            np.log(self.prior[i]) + np.sum(self._log_gaussian_density(i, x))
            for i in range(self.count)
        ]
        return self.classes[np.argmax(log_posteriors)]

    def fit(self, features, target):
        '''
        Learn class labels, per-class feature statistics and class priors from
        the training data. Returns self so calls can be chained.
        '''
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]

        self.calc_statistics(features, target)
        self.calc_prior(features, target)
        return self

    def predict(self, features):
        '''Return a list with the predicted class label for every row of features.'''
        return [self.calc_posterior(row) for row in np.asarray(features)]

    def accuracy(self, y_test, y_pred):
        '''
        Fraction of positions where y_pred equals y_test.

        Both arguments are converted to arrays first, so plain lists are
        compared element-wise as well. Raises ValueError on a length mismatch.
        '''
        y_true = np.asarray(y_test)
        y_hat = np.asarray(y_pred)
        if y_true.shape != y_hat.shape:
            raise ValueError("y_test and y_pred must have the same length")
        return np.mean(y_true == y_hat)

    def visualize(self, y_true, y_pred, target):
        '''
        Draw side-by-side count plots of the true and the predicted labels.
        target is used as the column / axis name.
        '''
        # Build the frames from raw values so the result does not depend on
        # the name or index of a Series passed in.
        tr = pd.DataFrame({target: np.asarray(y_true)})
        pr = pd.DataFrame({target: np.asarray(y_pred)})
        # Same category order and colours in both panels.
        label_order = sorted(set(tr[target]) | set(pr[target]))

        fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))

        panels = ((ax[0], tr, "True values"), (ax[1], pr, "Predicted values"))
        for axis, frame, title in panels:
            sns.countplot(x=target, data=frame, ax=axis, palette='viridis', alpha=0.7,
                          hue=target, dodge=False, order=label_order, hue_order=label_order)
            axis.tick_params(labelsize=12)
            axis.set_title(title, fontsize=18)

        fig.suptitle('True vs Predicted Comparison', fontsize=20)
        plt.show()


In [18]:

# Prepare the Iris data for training.

# Load the Iris dataset; expected shape is (150, 5).
df = pd.read_csv("iris.csv")
# Shuffle the rows reproducibly and renumber the index.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
print(df.shape)
# Every column but the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold-out split by position: the first 100 shuffled rows train the model,
# the remaining rows are the test set (100/50, not 70/30).
X_train, y_train = X.iloc[:100], y.iloc[:100]
X_test, y_test = X.iloc[100:], y.iloc[100:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(150, 5)
(100, 4) (100,)
(50, 4) (50,)


In [19]:
# Display the shuffled Iris frame (pandas truncates the middle rows).
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.8,4.0,1.2,0.2,Setosa
1,5.1,2.5,3.0,1.1,Versicolor
2,6.6,3.0,4.4,1.4,Versicolor
3,5.4,3.9,1.3,0.4,Setosa
4,7.9,3.8,6.4,2.0,Virginica
...,...,...,...,...,...
145,6.3,2.8,5.1,1.5,Virginica
146,6.4,3.1,5.5,1.8,Virginica
147,6.3,2.5,4.9,1.5,Versicolor
148,6.7,3.1,5.6,2.4,Virginica


In [20]:
# (rows, columns) of the training features.
X_train.shape

(100, 4)

In [21]:
# Build and fit the classifier here so that `x` exists before the cells below
# inspect it. Note the lowercase `x`: the fitted attributes live on the
# classifier, not on the feature DataFrame `X`.
x = NaiveBayesClassifier()
x.fit(X_train, y_train)
x.classes, x.feature_nums, x.rows, x.count

In [None]:
# Estimate the class priors P(y) from the training split; the result is also stored on x.prior.
x.calc_prior(X_train, y_train)

In [None]:
# Prior probabilities as stored by calc_prior (NumPy array).
x.prior

In [None]:

# Compute the mean and variance of the feature columns grouped by class; stored on x.mean / x.var.
x.calc_statistics(X_train, y_train)

In [None]:
# Means stored by calc_statistics.
x.mean

In [None]:
# Variances stored by calc_statistics.
x.var

In [None]:
# Inspect the training features.
X_train

In [None]:
# train the model
x = NaiveBayesClassifier()


# fit() records the class labels and stores the per-class statistics and priors on `x`.
x.fit(X_train, y_train)

In [None]:
# Predict a class label for every row of the held-out features (returns a list).
predictions = x.predict(X_test)

In [None]:
# Share of held-out rows whose predicted label equals the true label.
x.accuracy(y_test, predictions)

In [None]:
# Class balance of the test labels, as proportions.
y_test.value_counts(normalize=True)

In [None]:
# Side-by-side count plots of the true and the predicted labels.
x.visualize(y_test, predictions, 'variety')

In [None]:
# Spam classification with Naive Bayes (Spambase dataset)

In [None]:
# NOTE(review): no header=None is passed, so the first line of the file is used
# as column names. The label column being addressed as '1' in the cells below
# suggests that line is a data record rather than a real header — confirm
# against the file, since that record would then be missing from the data.
emails = pd.read_csv("data/spambase.data")

In [None]:
# Peek at the first two rows to check how the columns were parsed.
emails.head(2)

In [None]:
# Value counts of the column named '1', which the next cell turns into the 'spam' label.
emails['1'].value_counts()

In [None]:
# Replace the column named '1' by a trailing 'spam' column holding readable
# labels: 1 -> "spam", 0 -> "not_spam".
emails = emails.drop(columns=['1']).assign(
    spam=emails['1'].map({1: "spam", 0: "not_spam"})
)

In [None]:
# Class counts after relabelling.
emails['spam'].value_counts()

In [None]:
# Shape of the frame, followed by its first rows.
print(emails.shape)
emails.head()

In [None]:
# Features are all columns but the last; the target is the last column.
# Note: this rebinds X and y, which held the Iris features/target before.
X,y = emails.iloc[:, :-1], emails.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

# Shuffled 70/30 split with a fixed seed. Note: this rebinds X_train, X_test,
# y_train and y_test, replacing the Iris split created earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

In [None]:
# Separate classifier instance for the spam data.
model = NaiveBayesClassifier()

In [None]:
# Learn per-class statistics and priors from the spam training split.
model.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the spam test split.
preds = model.predict(X_test)

In [None]:
# Inspect the training labels.
y_train

In [None]:
# Share of test rows whose predicted label equals the true label.
model.accuracy(y_test, preds)

In [None]:
# Side-by-side count plots of the true and the predicted spam labels.
model.visualize(y_test, preds, 'spam')

In [None]:
# compare to sklearn Naive Bayes Classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


In [None]:
# scikit-learn's Gaussian Naive Bayes, used as the reference implementation.
clf = GaussianNB()

In [None]:
# iris dataset
clf.fit(X_train, y_train)

In [None]:
preds = clf.predict(X_test)

In [None]:
clf.score(X_test, y_test)

In [None]:
tr = pd.DataFrame(data=y_test, columns=['variety'])
pr = pd.DataFrame(data=preds, columns=['variety'])


fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))

sns.countplot(x='variety', data=tr, ax=ax[0], palette='viridis', alpha=0.7)
sns.countplot(x='variety', data=pr, ax=ax[1], palette='viridis', alpha=0.7)


fig.suptitle('True vs Predicted Comparison', fontsize=20)

ax[0].tick_params(labelsize=12)
ax[1].tick_params(labelsize=12)
ax[0].set_title("True values", fontsize=18)
ax[1].set_title("Predicted values", fontsize=18)
plt.show()

In [None]:
# emails dataset: a second, independent GaussianNB instance
clf1 = GaussianNB()

In [None]:
# Fit on the current training split (the spam data at this point in the notebook).
clf1.fit(X_train, y_train)

In [None]:
# Predicted labels for the current test split.
preds1 = clf1.predict(X_test)

In [None]:
# predicted better for the emails classification
clf1.score(X_test, y_test)

In [None]:
# Build both frames from raw values so the column is always called 'spam',
# independent of the name or index of y_test.
test_df = pd.DataFrame({'spam': np.asarray(y_test)})
pred_df = pd.DataFrame({'spam': np.asarray(preds1)})
# Same category order and colours in both panels.
label_order = sorted(set(test_df['spam']) | set(pred_df['spam']))

fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(15,6))

# `hue` is set alongside `palette`, matching NaiveBayesClassifier.visualize.
sns.countplot(x='spam', data=test_df, ax=ax[0], palette='pastel', alpha=0.7, hue='spam', dodge=False, order=label_order, hue_order=label_order)
sns.countplot(x='spam', data=pred_df, ax=ax[1], palette='pastel', alpha=0.7, hue='spam', dodge=False, order=label_order, hue_order=label_order)


fig.suptitle('True vs Predicted Comparison', fontsize=20)

ax[0].tick_params(labelsize=12)
ax[1].tick_params(labelsize=12)
ax[0].set_title("True values", fontsize=18)
ax[1].set_title("Predicted values", fontsize=18)
plt.show()