# GAUSSIAN NAIVE BAYES

## IMPORTING LIBRARIES

In [None]:
import pickle

import numpy as np
import pandas as pd

#scikit-learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

## IMPORTING DATASET

In [None]:
# Load the three CSV files used by the rest of the notebook. The paths are
# relative, so they resolve against the current working directory.
# data_train and data_validation are later split on their 'price_range'
# column; data_test is later stripped of its 'id' column before prediction.
data_train = pd.read_csv("Tubes2_AI/data/data_train.csv")
data_validation = pd.read_csv("Tubes2_AI/data/data_validation.csv")
data_test= pd.read_csv("Tubes2_AI/data/test.csv")

## MODEL

In [None]:
# Separate the target column from the features for both splits.
# The target is taken as a Series (single brackets) so that it becomes a 1-D
# label array instead of an (n, 1) column when converted with `.values`.
y = data_train['price_range']
X = data_train.drop('price_range', axis = 1)
y_true_test = data_validation['price_range']
X_test = data_validation.drop('price_range', axis = 1)

# Min-max scale every feature using statistics of the TRAINING set only.
# Scaling the validation set with its own min/max would map the same raw value
# to different scaled values in the two splits, so the per-class means and
# variances learned on X would not line up with X_test.
X_min = X.min()
# A constant column has range 0; divide by 1 instead so it scales to 0, not NaN.
X_range = (X.max() - X_min).replace(0, 1)
X = (X - X_min) / X_range
# Arithmetic aligns on column names; this assumes both CSVs share the same
# feature columns.
X_test = (X_test - X_min) / X_range

class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class that maximises
    ``log P(class) + sum_j log N(x_j | mean_cj, var_cj + eps)``.

    Attributes set by ``fit``:
        labels: flattened 1-D array of the training labels.
        unique_labels: sorted array of the distinct labels.
        params: for each class, a list of ``(mean, variance)`` tuples, one per
            feature (population variance, ``ddof=0``).
        log_priors: log of each class's relative frequency in ``labels``.
    """

    def __init__(self, eps: float = 1e-4) -> None:
        """Create an unfitted model.

        Args:
            eps: value added to every variance so that constant features
                (variance 0) do not cause a division by zero.
        """
        self.eps = eps

    def fit(self, feats: np.ndarray, labels: np.ndarray) -> None:
        """Estimate class priors and per-class feature means/variances.

        Args:
            feats: 2-D array-like of shape (n_samples, n_features); a pandas
                DataFrame or a NumPy array are both accepted.
            labels: array-like with one label per sample; an (n, 1) column is
                flattened to 1-D.

        Raises:
            ValueError: if ``feats`` is not 2-D, is empty, or its row count
                differs from the number of labels.
        """
        feats = np.asarray(feats, dtype=float)
        labels = np.asarray(labels).ravel()
        if feats.ndim != 2:
            raise ValueError("feats must be 2-D (n_samples, n_features)")
        if feats.shape[0] == 0:
            raise ValueError("cannot fit on an empty data set")
        if feats.shape[0] != labels.shape[0]:
            raise ValueError("feats and labels must have the same number of rows")

        self.labels = labels
        self.unique_labels, counts = np.unique(labels, return_counts=True)
        # The priors depend only on the training labels, so compute them once
        # here rather than once per predicted sample.
        self.log_priors = np.log(counts / labels.size)

        # store the gaussian distribution parameters
        self.params = []
        means, variances = [], []
        for label in self.unique_labels:
            label_feats = feats[labels == label]
            label_means = label_feats.mean(axis=0)
            label_vars = label_feats.var(axis=0)
            means.append(label_means)
            variances.append(label_vars)
            self.params.append(list(zip(label_means, label_vars)))
        # Dense (n_classes, n_features) copies used for vectorised prediction.
        self._means = np.array(means)
        self._vars = np.array(variances)

    def _log_likelihood(self, data, mean, var):
        """Return log N(data | mean, var + eps), element-wise.

        Working in log space avoids the underflow to 0 (and hence ``log(0)``)
        that happens when the density is evaluated far from the mean.
        """
        smoothed = var + self.eps
        return -0.5 * (np.log(2 * np.pi * smoothed) + (data - mean) ** 2 / smoothed)

    # calculate the likelihood
    def likelihood(self, data: float, mean: float, var: float) -> float:
        """Return the Gaussian density of ``data`` given ``mean`` and ``var``.

        ``eps`` is added to the variance itself, so the exponent and the
        normalising coefficient describe the same distribution.
        """
        return np.exp(self._log_likelihood(data, mean, var))

    def predict(self, feats: np.ndarray) -> np.ndarray:
        """Predict the most probable label for every row of ``feats``.

        Args:
            feats: 2-D array-like of shape (n_samples, n_features), with the
                features in the same order as the data passed to ``fit``.

        Returns:
            1-D float array of length n_samples holding the predicted labels.

        Raises:
            ValueError: if the number of features differs from the fitted data.
        """
        feats = np.asarray(feats, dtype=float)
        num_samples, num_feats = feats.shape
        if num_feats != self._means.shape[1]:
            raise ValueError(
                f"expected {self._means.shape[1]} features, got {num_feats}"
            )

        # log posterior (up to a constant) of every sample under every class
        posteriors = np.empty((num_samples, len(self.unique_labels)))
        for label_idx, (means, variances) in enumerate(zip(self._means, self._vars)):
            log_likelihood = self._log_likelihood(feats, means, variances).sum(axis=1)
            posteriors[:, label_idx] = self.log_priors[label_idx] + log_likelihood

        # store the label with the largest posterior probability
        return self.unique_labels[np.argmax(posteriors, axis=1)].astype(float)

# Cast the training target and the validation features to float NumPy arrays.
y = y.to_numpy().astype(float)
X_test = X_test.to_numpy().astype(float)

# Train the hand-written classifier and predict the validation split.
gnb = GaussianNaiveBayes()
gnb.fit(X, y)
y_pred_test = gnb.predict(X_test)

# Report each metric right after it is computed (macro-averaged over classes
# for precision and recall).
accuracy_score_gnb = accuracy_score(y_true_test, y_pred_test)
print(f'Accuracy Score of Gaussian Naive Bayes Algorithm : {accuracy_score_gnb}')

precision_score_gnb = precision_score(y_true_test, y_pred_test, average='macro')
print(f'Precision Score of Gaussian Naive Bayes Algorithm : {precision_score_gnb}')

recall_score_gnb = recall_score(y_true_test, y_pred_test, average='macro')
print(f'Recall Score of Gaussian Naive Bayes Algorithm : {recall_score_gnb}')

Accuracy Score of Gaussian Naive Bayes Algorithm : 0.78
Precision Score of Gaussian Naive Bayes Algorithm : 0.7804663979309369
Recall Score of Gaussian Naive Bayes Algorithm : 0.7788856605345197


## SUBMISSION TO KAGGLE

In [None]:
# Retrain on train + validation before predicting the Kaggle test set.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
training_set = pd.concat([data_train, data_validation], ignore_index=True)
y = training_set['price_range']
X = training_set.drop('price_range', axis = 1)
data_test= pd.read_csv("Tubes2_AI/data/test.csv")

# Per-feature min-max statistics come from the training data only.
X_min = X.min()
# A constant column has range 0; divide by 1 instead so it scales to 0, not NaN.
X_range = (X.max() - X_min).replace(0, 1)
X = (X - X_min) / X_range

# Scale the test features with the SAME per-column statistics. Calling
# .min()/.max() on the already-converted NumPy array would return one global
# scalar for the whole matrix rather than one value per feature.
# The subtraction is positional: it assumes test.csv (minus 'id') lists the
# features in the same order as the training data.
X_test = data_test.drop('id', axis = 1)
X_test = (X_test.values.astype(float) - X_min.values) / X_range.values

gnb = GaussianNaiveBayes()
gnb.fit(X, y)
y_pred_test = gnb.predict(X_test)

submission = pd.concat([data_test['id'], pd.DataFrame(y_pred_test, columns = ['price_range'])], axis = 1)
submission.to_csv('out3.csv', index = False)
submission

Unnamed: 0,id,price_range
0,0,0.0
1,1,3.0
2,2,3.0
3,3,2.0
4,4,0.0
...,...,...
1995,1995,1.0
1996,1996,1.0
1997,1997,3.0
1998,1998,0.0


## EXPORT MODEL

In [None]:
# Serialise the fitted classifier. The model built in this notebook is bound
# to `gnb`; the name `nb` is never defined in the cells above.
# NOTE: unpickling requires the GaussianNaiveBayes class to be defined in the
# loading session as well.
with open('model-naive-bayes-2.pkl', 'wb') as file:
    pickle.dump(gnb, file)

NameError: name 'pickle' is not defined

## LOAD MODEL

In [None]:
# Restore the classifier that was written by the export cell.
# NOTE(review): pickle executes arbitrary code on load - only open files
# produced by a trusted source.
with open('model-naive-bayes-2.pkl', mode='rb') as file:
    loaded_model = pickle.load(file)

## GAUSSIAN NAIVE BAYES USING SCIKIT-LEARN

In [None]:
# Baseline: scikit-learn's GaussianNB trained on the raw (unscaled) features.
X = data_train.drop(columns='price_range')
y = data_train['price_range']
X_test = data_validation.drop(columns='price_range')
y_true_test = data_validation['price_range']

# fit() returns the estimator itself, so construction and training can chain.
naiveBayesModel = GaussianNB().fit(X, y)
y_pred_test = naiveBayesModel.predict(X_test)

# Report each metric right after it is computed (macro-averaged over classes
# for precision and recall).
accuracy_score_2 = accuracy_score(y_true_test, y_pred_test)
print(f'Accuracy Score of Naive Bayes Algorithm : {accuracy_score_2}')

precision_score_2 = precision_score(y_true_test, y_pred_test, average='macro')
print(f'Precision Score of Naive Bayes Algorithm : {precision_score_2}')

recall_score_2 = recall_score(y_true_test, y_pred_test, average='macro')
print(f'Recall Score of Naive Bayes Algorithm : {recall_score_2}')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0cf4af83-b6f2-43d0-9e47-37804035f63d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>