In [10]:
import numpy as np

In [6]:
def fit(X_train, Y_train):
    result = {}
    result["total_data"] = len(Y_train)
    classes = set(Y_train)    
    for y in classes:
        result[y] = {}
        num_features = X_train.shape[1]
        indices = Y_train == y
        result[y]["total_count"] = indices.sum()
        X_current_class = X_train[indices]
        Y_current_class = Y_train[indices]
        for j in range(num_features):
            result[y][j] = {}
            current_feature_values = set(X_current_class[:, j])
            for current_value in current_feature_values:
                result[y][j][current_value] = (X_current_class[:, j] == current_value).sum()
    return result

In [12]:
def probability(dictionary, x, cur_class):
    out_prob = 0
    features = len(x)
    for j in range(features):
        count_Y_equals_ai = np.log(dictionary[cur_class]["total_count"] + len(set(dictionary[cur_class][j])))
        count_Xj_equals_xj_and_Y_equals_ai = np.log(dictionary[cur_class][j][x[j]] if x[j] in dictionary[cur_class][j] else 1)
        P_Xj_equals_xj_given_Y_equals_ai = count_Xj_equals_xj_and_Y_equals_ai - count_Y_equals_ai
        out_prob += P_Xj_equals_xj_given_Y_equals_ai
    return out_prob

def predict_single_point(dictionary, x):
    classes = dictionary.keys()
    max_prob_class = None
    max_prob = -np.inf
    first = True
    for y in classes:
        if y == "total_data":
            continue
        P_X_equals_x_by_Y_equals_ai = probability(dictionary, x, y)
        if first or P_X_equals_x_by_Y_equals_ai > max_prob :
            first = False
            max_prob = P_X_equals_x_by_Y_equals_ai
            max_prob_class = y
    
    return max_prob_class

def predict(dictionary, X_test):
    Y_pred = []
    for x in X_test:
        Y_pred.append(predict_single_point(dictionary, x))
    return Y_pred

In [2]:
def level(X):
    mean = X.mean()
    one_third_value = mean/2
    two_third_value = 1.5 * mean
    for i in range(len(X)):
        if X[i] < one_third_value:
            X[i] = 1
        elif X[i] < mean:
            X[i] = 2
        elif X[i] < two_third_value:
            X[i] = 3
        else:
            X[i] = 4
    return X

In [3]:
from sklearn import datasets
iris_ds = datasets.load_iris()

In [4]:
from sklearn.model_selection import train_test_split
X = iris_ds.data
Y = iris_ds.target

for i in range(X.shape[1]):
    X[:, i] = level(X[:, i])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [7]:
d = fit(X_train, Y_train)

In [8]:
for key in d:
    print(key)
    print(d[key])
    """
    if type(d[key]) == int:
        continue
    for key1 in d[key]:
        print(key1)
        for key2 in d[key][key1]:
            print(key2)
            print(d[key][key1][key2])
    """

total_data
112
0
{'total_count': 36, 0: {2.0: 36}, 1: {2.0: 6, 3.0: 30}, 2: {1.0: 35, 2.0: 1}, 3: {1.0: 35, 2.0: 1}}
1
{'total_count': 39, 0: {2.0: 21, 3.0: 18}, 1: {2.0: 32, 3.0: 7}, 2: {2.0: 6, 3.0: 33}, 3: {2.0: 8, 3.0: 31}}
2
{'total_count': 37, 0: {2.0: 4, 3.0: 33}, 1: {2.0: 24, 3.0: 13}, 2: {3.0: 21, 4.0: 16}, 3: {3.0: 5, 4.0: 32}}


In [13]:
Y_train_pred = predict(d,X_train)
Y_test_pred = predict(d, X_test)
print(Y_train_pred)
print(Y_test_pred)

[1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 0, 1, 0, 2, 1, 0, 0, 2, 2, 0, 2, 0, 2, 0, 1, 0, 1, 1, 0, 1, 2, 2, 0, 2, 1, 0, 0, 1, 1, 1, 2, 0, 1, 1, 1, 2, 0, 2, 0, 1, 0, 0, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 0, 1, 0, 0, 0, 2, 1, 1, 2, 0, 0, 0, 2, 2, 1, 1, 0, 2, 2, 2, 0, 1, 1, 2, 2, 2, 0, 0, 0, 2, 1, 1, 2, 1, 2, 0, 0, 2, 0, 2, 1, 1, 1, 1, 1, 2, 0, 2]
[2, 2, 2, 2, 0, 2, 1, 0, 2, 0, 0, 0, 2, 1, 2, 1, 1, 0, 0, 0, 1, 0, 2, 1, 2, 2, 1, 0, 0, 2, 0, 0, 0, 1, 1, 2, 1, 2]


In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_train, Y_train_pred)

array([[36,  0,  0],
       [ 0, 39,  0],
       [ 0,  4, 33]], dtype=int64)

In [17]:
confusion_matrix(Y_test, Y_test_pred)

array([[14,  0,  0],
       [ 0, 10,  1],
       [ 0,  0, 13]], dtype=int64)