#### Importing modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

#### Categorical Naive Bayes

In [2]:
class CategoricalNaiveBayes:
    """Naive Bayes classifier for categorical features.

    Attributes:
        probs: ``probs[column][value]`` is the relative frequency of ``value``
            in ``column`` over the training rows. The target column is
            included, so ``probs[target_column]`` holds the class priors.
        cond_probs: ``cond_probs[column]["<value>-<target>"]`` is the fraction
            of training rows of class ``target`` whose ``column`` equals
            ``value``.
        targets: distinct target values seen during ``fit``.
        columns: the ``column_names`` passed to ``fit`` (target name last).
    """

    # Constructor
    def __init__(self):
        self._reset()

    def _reset(self):
        """Drop everything learned so far."""
        self.probs = dict()
        self.cond_probs = dict()
        self.targets = list()
        self.columns = list()

    # Fit method
    def fit(self, x, y, column_names):
        """Estimate the probability tables from the training data.

        Args:
            x: 2-D array-like of feature values, one row per sample.
            y: 1-D array-like of target values, aligned with the rows of ``x``.
            column_names: names of the feature columns followed by the name of
                the target column (the last entry is taken as the target).
        """
        self._reset()
        # Preparing DataFrame
        dataset = pd.DataFrame(data=x, index=None, columns=column_names[:-1])
        target_column_name = column_names[-1]
        dataset[target_column_name] = y
        n_rows = len(dataset)

        # Preparing probabilities dictionary.
        # Boolean masks are used instead of DataFrame.query so that column
        # names which are not valid identifiers (spaces, dashes, ...) work.
        for column in dataset:
            self.probs[column] = dict()
            for value in dataset[column].unique():
                self.probs[column][value] = int((dataset[column] == value).sum()) / n_rows

        # One (target value, row mask, row count) triple per class, computed once.
        target_values = dataset[target_column_name].unique()
        class_info = []
        for target_value in target_values:
            class_mask = dataset[target_column_name] == target_value
            class_info.append((target_value, class_mask, int(class_mask.sum())))

        # Preparing conditional_probabilities dictionary
        for column in dataset.drop([target_column_name], axis=1):
            self.cond_probs[column] = dict()
            for value1 in dataset[column].unique():
                value_mask = dataset[column] == value1
                for value2, class_mask, class_count in class_info:
                    joint_count = int((value_mask & class_mask).sum())
                    self.cond_probs[column][f'{value1}-{value2}'] = joint_count / class_count

        self.targets = target_values
        self.columns = column_names

    # Predict method
    def predict(self, x):
        """Predict the most probable target for every row of ``x``.

        A feature value that never occurred in the training data carries no
        information about the class, so that feature is skipped for the row
        instead of raising ``KeyError``.

        Args:
            x: iterable of rows; each row lists the feature values in the same
                order as the feature columns given to ``fit``.

        Returns:
            list with one predicted target value per row.
        """
        # The last entry of self.columns is the target name, not a feature.
        feature_columns = self.columns[:-1]
        priors = self.probs[self.columns[-1]] if len(self.columns) else dict()

        predicts = list()
        for row in x:
            target_prob_dict = dict()
            for target in self.targets:
                row_cond_probs = []
                for column, value in zip(feature_columns, row):
                    cond_prob = self.cond_probs[column].get(f'{value}-{target}')
                    # fit() stores every (seen value, target) pair, so a
                    # missing key means the value was never seen in training.
                    if cond_prob is not None:
                        row_cond_probs.append(cond_prob)
                target_prob_dict[target] = np.prod(row_cond_probs) * priors[target]
            predicts.append(max(target_prob_dict, key=target_prob_dict.get))
        return predicts

#### Loading dataset

In [3]:
# Read the CSV and discard the "Unnamed: 0" column that comes with the file.
dataset = pd.read_csv("dataset/categ.csv").drop(columns=["Unnamed: 0"])

In [4]:
# Bare last expression: the notebook renders the loaded frame below.
dataset

Unnamed: 0,size,material,color,sleeves,demand
0,S,nylon,white,long,medium
1,XL,polyester,cream,short,high
2,S,silk,blue,short,medium
3,M,cotton,black,short,medium
4,XL,polyester,orange,long,medium
...,...,...,...,...,...
9995,M,nylon,black,long,medium
9996,L,cotton,white,long,high
9997,XXL,nylon,black,long,medium
9998,S,linen,white,long,high


#### Dividing dataset into data / target

In [5]:
# Split into feature matrix / target vector WITHOUT mutating `dataset`,
# so this cell can be re-run without a KeyError on the missing column.
target = dataset["demand"].to_numpy()
features = dataset.drop(columns=["demand"])
data = features.to_numpy()
# CategoricalNaiveBayes.fit() takes the LAST column name as the target name,
# so build the list explicitly instead of relying on the CSV column order.
columns = np.array([*features.columns, "demand"], dtype=object)

#### Evaluating Classifier's Average Efficiency

In [6]:
# 5-fold stratified cross-validation. shuffle=True needs a fixed random_state,
# otherwise every run produces different folds and a different accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
efficency = []  # accuracy of each fold
for train_index, test_index in cv.split(data, target):
    train_x, test_x = data[train_index], data[test_index]
    train_y, test_y = target[train_index], target[test_index]
    # A fresh classifier per fold; after the loop `nb` is the model of the
    # last fold, which the "categ_2.csv" test further down reuses.
    nb = CategoricalNaiveBayes()
    nb.fit(train_x, train_y, columns)
    pred_y = nb.predict(test_x)
    efficency.append(accuracy_score(test_y, pred_y))

In [7]:
# Mean accuracy over the cross-validation folds, as a percentage.
print(f'Average classification efficency ("categ.csv" dataset) = {np.average(efficency) * 100}%')

Average classification efficency ("categ.csv" dataset) = 81.42000000000002%


#### Testing classifier on "categ_2.csv" dataset

In [8]:
# Read the second CSV and discard the "Unnamed: 0" column that comes with the file.
dataset = pd.read_csv("dataset/categ_2.csv").drop(columns=["Unnamed: 0"])

In [9]:
# Bare last expression: the notebook renders the loaded frame below.
dataset

Unnamed: 0,size,material,color,sleeves,demand
0,L,cotton,green,long,medium
1,M,cotton,cream,short,high
2,XL,silk,red,short,low
3,M,nylon,red,short,high
4,L,linen,red,long,high
...,...,...,...,...,...
9995,M,polyester,white,short,medium
9996,M,linen,white,long,high
9997,XXL,cotton,cream,short,high
9998,L,nylon,cream,long,medium


In [10]:
# Split into feature matrix / target vector WITHOUT mutating `dataset`,
# so this cell can be re-run without a KeyError on the missing column.
target = dataset["demand"].to_numpy()
features = dataset.drop(columns=["demand"])
data = features.to_numpy()
# Target name goes last explicitly, independent of the CSV column order.
columns = np.array([*features.columns, "demand"], dtype=object)

In [11]:
# NOTE(review): `nb` is the classifier left over from the cross-validation
# loop, i.e. fitted on the training split of the last fold only; it is not
# re-fitted here. Confirm that this is the intended model.
target_pred = nb.predict(data)

In [12]:
# Accuracy of the predictions on the second dataset, as a percentage.
print(f'Average classification efficency ("categ2.csv" dataset) = {accuracy_score(target, target_pred) * 100}%')

Average classification efficency ("categ2.csv" dataset) = 80.63%
