# MULTINOMIAL NAIVE BAYES

## IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import pickle

#scikit-learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

## IMPORTING DATASET

In [2]:
# Load the train / validation / test splits, in that order, from the project data folder.
data_train, data_validation, data_test = map(
    pd.read_csv,
    [
        "Tubes2_AI/data/data_train.csv",
        "Tubes2_AI/data/data_validation.csv",
        "Tubes2_AI/data/test.csv",
    ],
)

## MODEL

In [3]:
class Category:
    """One value (or bin) of a feature plus its per-class conditional probabilities.

    Parameters
    ----------
    _attribute_name : column name of the feature this category belongs to.
    _category_value : the value as it appears in ``_data_train``. For binned
        columns this is a label of the form ``"<lower> to <upper>"``; for
        columns listed in ``_non_numerical_columns`` it is the raw value.
    _data_train : DataFrame holding the feature columns and a
        ``'price_range'`` column with the class label (0-3).
    _non_numerical_columns : names of the columns that were NOT binned.

    Attributes
    ----------
    lower_bound, upper_bound : interval covered by the category. Parsed from
        the label for binned columns; both equal to the raw value otherwise.
    prob_category_value_0 .. prob_category_value_3 : P(category | class k),
        filled in by ``update_category`` (0 until then).
    """

    def __init__(self, _attribute_name, _category_value, _data_train, _non_numerical_columns):
        self.attribute_name = _attribute_name
        self.category_value = _category_value
        self.data_train = _data_train
        self.prob_category_value_0 = 0
        self.prob_category_value_1 = 0
        self.prob_category_value_2 = 0
        self.prob_category_value_3 = 0
        # Keep a copy of the list that was passed in (it used to be dropped).
        self.non_numerical_columns = list(_non_numerical_columns)

        if self.attribute_name not in _non_numerical_columns:
            # Binned column: the label encodes the interval as "<lower> to <upper>".
            bounds = self.category_value.split(" to ")
            self.lower_bound = float(bounds[0])
            self.upper_bound = float(bounds[1])
        else:
            # Categorical column: the category is a single point.
            self.lower_bound = self.category_value
            self.upper_bound = self.category_value

    def num_of_category_value(self):
        """Return how many training rows have this category value."""
        matches = self.data_train[self.attribute_name] == self.category_value
        return len(self.data_train[matches])

    def num_of_x_category_value(self, x):
        """Return how many training rows have this category value AND class label ``x``."""
        matches = (self.data_train[self.attribute_name] == self.category_value) & (
            self.data_train['price_range'] == x
        )
        return len(self.data_train[matches])

    def _conditional_probability(self, x, total_count):
        """Return count(category, class x) / total_count, or 0.0 when the class is empty."""
        # A class with no training rows cannot contain this category; returning
        # 0.0 avoids the ZeroDivisionError the plain division would raise.
        if not total_count:
            return 0.0
        return self.num_of_x_category_value(x) / total_count

    def update_category(self, total_count_0, total_count_1, total_count_2, total_count_3):
        """Compute P(category | class k) for k = 0..3.

        Each ``total_count_k`` is the number of training rows whose class
        label is ``k``. No smoothing is applied, so a category never seen
        together with a class gets probability 0 for that class.
        """
        self.prob_category_value_0 = self._conditional_probability(0, total_count_0)
        self.prob_category_value_1 = self._conditional_probability(1, total_count_1)
        self.prob_category_value_2 = self._conditional_probability(2, total_count_2)
        self.prob_category_value_3 = self._conditional_probability(3, total_count_3)


class Attribute:
    """A feature column together with the categories (values or bins) registered for it."""

    def __init__(self, _attribute_name):
        self.category_value_list = []
        self.attribute_name = _attribute_name

    def add_category(self, c):
        """Register category ``c`` after the ones already stored."""
        self.category_value_list += [c]

    def find_category(self, _category_value):
        """Return the first stored category whose bounds contain ``_category_value``.

        Categories are checked in insertion order and both bounds are
        inclusive. Returns ``None`` when no category matches.
        """
        candidates = (
            category
            for category in self.category_value_list
            if float(category.lower_bound) <= _category_value <= float(category.upper_bound)
        )
        return next(candidates, None)

class NaiveBayes:
    """Naive Bayes classifier over discretised features for class labels 0-3.

    ``fit`` bins every column with more than four distinct values into four
    equal-width intervals, builds one ``Attribute`` per column and one
    ``Category`` per distinct value/bin, and records how many training rows
    belong to each class. ``predict`` multiplies the per-class category
    probabilities of a row with the class priors and returns the label with
    the highest score.
    """

    # Class labels the model distinguishes; Category keeps one probability per label.
    CLASS_LABELS = (0, 1, 2, 3)

    def __init__(self):
        self.attribute_arr = []
        # Training rows per class label; filled by fit() and used as priors by
        # predict(), so prediction does not depend on any module-level variable.
        self.class_counts = {}

    def add_attribute(self, a):
        """Register attribute ``a`` with the model."""
        self.attribute_arr.append(a)

    @staticmethod
    def _non_numerical_columns(frame):
        """Return the columns treated as categorical: at most 4 distinct values, or named "id"."""
        return [
            col for col in frame.columns
            if len(frame[col].unique()) <= 4 or col == "id"
        ]

    def convert_df(self, data_train):
        """Return a copy of ``data_train`` with numeric columns replaced by bin labels.

        Every column that is not categorical (see ``_non_numerical_columns``)
        is split into four equal-width bins with ``np.histogram``; each value
        is replaced by the label ``"<lower edge> to <upper edge>"``. Both
        edges are inclusive, so a value lying exactly on an interior edge
        ends up with the label of the higher bin (the later assignment wins).
        The input frame is not modified.
        """
        non_numerical_columns = self._non_numerical_columns(data_train)
        dummy_df = data_train.copy()  # never modify the caller's DataFrame
        num_bins = 4

        for col in dummy_df.columns:
            if col in non_numerical_columns:
                continue
            _, bin_edges = np.histogram(data_train[col], bins=num_bins)
            # The column is about to hold strings; switch to object dtype first
            # so pandas does not have to upcast a numeric column on assignment.
            dummy_df[col] = dummy_df[col].astype(object)
            for lower, upper in zip(bin_edges[:-1], bin_edges[1:]):
                mask = (data_train[col] >= lower) & (data_train[col] <= upper)
                dummy_df.loc[mask, col] = str(lower) + " to " + str(upper)

        return dummy_df

    def find_attribute(self, _attribute_name):
        """Return the registered attribute named ``_attribute_name``, or ``None``."""
        for a in self.attribute_arr:
            if a.attribute_name == _attribute_name:
                return a
        return None

    def _add_category(self, attribute, value, training_frame, non_numerical_columns):
        """Create the Category for ``value``, compute its probabilities, attach it to ``attribute``."""
        category = Category(
            _attribute_name=attribute.attribute_name,
            _category_value=value,
            _data_train=training_frame,
            _non_numerical_columns=non_numerical_columns,
        )
        category.update_category(*(self.class_counts[label] for label in self.CLASS_LABELS))
        attribute.add_category(category)

    def fit(self, X, y):
        """Learn the per-category conditional probabilities from ``X`` and labels ``y``.

        ``X`` is a DataFrame of raw feature values. ``y`` is a Series named
        ``'price_range'`` with labels 0-3, sharing ``X``'s index (``Category``
        looks the label up under that column name). Calling ``fit`` again
        replaces the previous model.
        """
        self.class_counts = {label: len(y[y == label]) for label in self.CLASS_LABELS}
        # Start from a clean slate so a refit does not keep stale attributes
        # that find_attribute() would return first.
        self.attribute_arr = []

        # Same rule as convert_df, so both agree on which columns were binned.
        non_numerical_columns = self._non_numerical_columns(X)

        X = self.convert_df(X)
        # Built once and shared by every Category (it is only read, never mutated).
        training_frame = pd.concat([X, y], axis=1)

        for col in X.columns:
            new_attribute = Attribute(_attribute_name=col)
            self.add_attribute(new_attribute)
            is_binned = col not in non_numerical_columns
            min_val = np.inf
            max_val = -np.inf

            for val in X[col].unique():
                if is_binned:
                    bounds = val.split(" to ")
                    min_val = min(min_val, float(bounds[0]))
                    max_val = max(max_val, float(bounds[1]))
                self._add_category(new_attribute, val, training_frame, non_numerical_columns)

            if is_binned:
                # Catch-all bins for values outside the training range. Their
                # label never occurs in the training frame, so every class
                # probability is 0. Categorical columns get no such bins: their
                # bounds would be an unparseable "-999999 to inf" string.
                self._add_category(
                    new_attribute, str(-999999) + " to " + str(min_val),
                    training_frame, non_numerical_columns,
                )
                self._add_category(
                    new_attribute, str(max_val) + " to " + str(999999),
                    training_frame, non_numerical_columns,
                )

    def predict(self, X_test_disc):
        """Return the predicted class label (0-3) for every row of ``X_test_disc``.

        ``X_test_disc`` holds raw (numeric) feature values with the column
        names used in ``fit``. Ties go to the lowest label.

        Raises
        ------
        RuntimeError : if the model has not been fitted.
        ValueError : if a column is unknown to the model or a value matches
            none of the categories learned for its column.
        """
        class_counts = getattr(self, "class_counts", None)
        if not class_counts:
            raise RuntimeError("NaiveBayes.predict() called before fit()")

        counts = [class_counts[label] for label in self.CLASS_LABELS]
        total = sum(counts)
        priors = [count / total for count in counts]

        y_prediction = []
        for _, row in X_test_disc.iterrows():
            scores = [1 for _ in self.CLASS_LABELS]
            for name in row.index:
                attribute = self.find_attribute(name)
                if attribute is None:
                    raise ValueError(f"column {name!r} was not seen during fit()")
                category = attribute.find_category(float(row[name]))
                if category is None:
                    raise ValueError(
                        f"value {row[name]!r} of column {name!r} matches no category learned in fit()"
                    )
                for i, label in enumerate(self.CLASS_LABELS):
                    scores[i] *= getattr(category, f"prob_category_value_{label}")

            # Likelihoods first, prior last: same multiplication order as before.
            scores = [score * prior for score, prior in zip(scores, priors)]
            # index(max(...)) returns the first maximum, i.e. the lowest label on ties.
            y_prediction.append(self.CLASS_LABELS[scores.index(max(scores))])

        return y_prediction

# Usage: train the hand-written model on the training split, then score it on
# the validation split.
y = data_train['price_range']
y_true_test = data_validation['price_range']

# Disabled experiment: an extra engineered feature on both splits.
# data_train['processor_performance'] = data_train['n_cores'] * data_train['clock_speed']
# data_validation['processor_performance'] = data_validation['n_cores'] * data_validation['clock_speed']

nb = NaiveBayes()
nb.fit(data_train.drop(columns='price_range'), y)

X_test = data_validation.drop(columns='price_range')
y_pred = nb.predict(X_test)

# Precision and recall are macro-averaged over the classes.
accuracy_score_naive = accuracy_score(y_true_test, y_pred)
precision_score_naive = precision_score(y_true_test, y_pred, average='macro')
recall_score_naive = recall_score(y_true_test, y_pred, average='macro')

print(f"Accuracy score: {accuracy_score_naive}")
print(f"Precision score: {precision_score_naive}")
print(f"Recall score: {recall_score_naive}")



Accuracy score: 0.7666666666666667
Precision score: 0.7700290185190727
Recall score: 0.7656934306352077


## MULTINOMIAL NAIVE BAYES SCIKIT-LEARN

In [4]:
# Baseline: scikit-learn's MultinomialNB trained on the raw (un-binned) features.
X = data_train.drop(columns='price_range')
y = data_train['price_range']
MultinomialNaiveBayesModel = MultinomialNB()
MultinomialNaiveBayesModel.fit(X, y)

# Score the baseline on the validation split (macro-averaged precision / recall).
X_test = data_validation.drop(columns='price_range')
y_true_test = data_validation['price_range']
y_pred_test = MultinomialNaiveBayesModel.predict(X_test)

accuracy_score_3 = accuracy_score(y_true_test, y_pred_test)
precision_score_3 = precision_score(y_true_test, y_pred_test, average='macro')
recall_score_3 = recall_score(y_true_test, y_pred_test, average='macro')

print(f'Accuracy Score of Multinomial Naive Bayes Algorithm : {accuracy_score_3}')
print(f'Precision Score of Multinomial Naive Bayes Algorithm : {precision_score_3}')
print(f'Recall Score of Multinomial Naive Bayes Algorithm : {recall_score_3}')

Accuracy Score of Multinomial Naive Bayes Algorithm : 0.5316666666666666
Precision Score of Multinomial Naive Bayes Algorithm : 0.5194204225246274
Recall Score of Multinomial Naive Bayes Algorithm : 0.5323794448063915


## EXPORT MODEL

In [5]:
# Serialise the fitted hand-written model to disk.
with open('Tubes2_AI/model-naive-bayes-1.pkl', 'wb') as model_file:
    pickle.dump(nb, model_file)

## READ MODEL

In [6]:
# Restore the model from disk. NOTE: pickle.load executes code embedded in the
# file, so only load pickles from a trusted source.
with open('Tubes2_AI/model-naive-bayes-1.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

## SUBMISSION TO KAGGLE

In [7]:
# Refit on train + validation combined, predict the test set and write the
# Kaggle submission file.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True gives the combined frame a fresh index without duplicates.
training_set = pd.concat([data_train, data_validation], ignore_index=True)
y = training_set['price_range']
X = training_set.drop('price_range', axis = 1)
data_test= pd.read_csv("Tubes2_AI/data/test.csv")

# 'id' only identifies the row; it is dropped here so it is not used as a feature.
X_test = data_test.drop('id', axis = 1)

nb = NaiveBayes()
nb.fit(X, y)
y_test= nb.predict(X_test)

# One row per test id, in the same order as the test file.
submission = pd.DataFrame({'id': data_test['id'], 'price_range': y_test})

submission.to_csv('Tubes2_AI/out2.csv', index = False)
submission

Unnamed: 0,id,price_range
0,0,0
1,1,3
2,2,3
3,3,2
4,4,0
...,...,...
1995,1995,1
1996,1996,1
1997,1997,3
1998,1998,0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0cf4af83-b6f2-43d0-9e47-37804035f63d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>