In [2]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [3]:
# Load the iris dataset bundled with scikit-learn; `iris.data` supplies the
# features and `iris.target` the class labels used further down.
iris = datasets.load_iris()

In [4]:
# Wrap the raw feature matrix in a DataFrame with short column names
# (presumably sepal/petal length and width -- confirm against the dataset docs).
df = pd.DataFrame(iris.data, columns=["sl", "sw", 'pl', 'pw'])

In [5]:
def abc(k, *val):
    """Binary threshold flag: 0 when ``k`` is below ``val[0]``, otherwise 1.

    Only the first extra positional argument is consulted; it acts as the
    cut-off. Calling without any extra argument raises ``IndexError``.
    """
    threshold = val[0]
    return 0 if k < threshold else 1

In [None]:
# Demonstrates Series.apply with extra positional arguments: every value of
# `sl` is passed to abc as `k` and 5 as the threshold, so the result is 0 for
# values below 5 and 1 otherwise.
df.sl.apply(abc, args=(5,))

In [7]:
def label(val, *boundaries):
    """Return the index of the bin that ``val`` falls into.

    ``boundaries`` are the upper edges of the bins and are expected in
    ascending order. The result is the position of the first boundary that
    is strictly greater than ``val``, or ``len(boundaries)`` when ``val`` is
    not below any of them. With three boundaries this yields 0, 1, 2 or 3.

    Any number of boundaries is accepted; with none, every value lands in
    bin 0.
    """
    for bin_index, upper_edge in enumerate(boundaries):
        if val < upper_edge:
            return bin_index
    # Not below any edge. NaN also ends up here because its comparisons are
    # always False.
    return len(boundaries)

def toLabel(df, old_feature_name):
    """Discretise one column of ``df`` into the integer labels given by ``label``.

    The three cut points are the column mean and the midpoints between the
    mean and the column minimum / maximum. Returns the labelled Series;
    ``df`` itself is not modified.
    """
    column = df[old_feature_name]
    centre = column.mean()
    lower_cut = (column.min() + centre) / 2
    upper_cut = (column.max() + centre) / 2
    return column.apply(label, args=(lower_cut, centre, upper_cut))

In [8]:
# Add a discretised `<name>_labeled` column for each of the four measurements.
for feature in ('sl', 'sw', 'pl', 'pw'):
    df[feature + '_labeled'] = toLabel(df, feature)
df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,1,2,0,0
1,4.9,3.0,1.4,0.2,0,1,0,0
2,4.7,3.2,1.3,0.2,0,2,0,0
3,4.6,3.1,1.5,0.2,0,2,0,0
4,5.0,3.6,1.4,0.2,0,2,0,0


In [9]:
# Keep only the discretised columns. Rebinding `df` (rather than inplace=True)
# together with errors='ignore' makes this cell safe to re-run: a second run
# is a no-op instead of a KeyError for the columns that are already gone.
df = df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, errors = 'ignore')

In [10]:
# Distinct labels that actually occur in the discretised `sl` column.
set(df['sl_labeled'])

{0, 1, 2, 3}

In [11]:
# Append the class labels as the last column; the X/Y split below takes the
# final column as the target.
df["output"] = iris.target

In [12]:
# Show the last rows to check the labelled features next to the target.
df.tail()

Unnamed: 0,sl_labeled,sw_labeled,pl_labeled,pw_labeled,output
145,2,1,2,3,2
146,2,0,2,3,2
147,2,1,2,3,2
148,2,2,3,3,2
149,2,1,2,2,2


In [12]:
# Feature matrix = every column but the last; target vector = the last column.
X, Y = df.values[:, :-1], df.values[:, -1]

In [13]:
from sklearn.model_selection import train_test_split
# random_state=1 fixes the shuffle so the split is reproducible; no test_size
# is passed, so the library default proportion applies.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 1)

In [14]:
# Rebuild a DataFrame from the training split with the target as the last
# column, which is the layout fit() expects.
train_df = pd.DataFrame(x_train).assign(output=y_train)

In [15]:
def fit(data):
    """Count class and per-class feature-value frequencies for Naive Bayes.

    The last column of ``data`` is treated as the class label and every
    other column as a discrete feature.

    Returns a nested dict:
        counts["total_count"]       -> number of rows in ``data``
        counts[cls]["total_count"]  -> number of rows of class ``cls``
        counts[cls][i][value]       -> number of rows of class ``cls`` whose
                                       i-th feature equals ``value``

    Features are keyed by column position, not by name. Every value seen
    anywhere in a feature column gets an entry for every class, with a count
    of 0 when it never occurs together with that class.
    """
    target_name = data.columns[-1]
    feature_names = data.columns[0:-1]
    counts = {"total_count": len(data)}
    for cls in set(data[target_name]):
        class_rows = data[data[target_name] == cls]
        class_counts = {"total_count": len(class_rows)}
        for position, name in enumerate(feature_names):
            class_counts[position] = {
                value: len(class_rows[class_rows[name] == value])
                for value in set(data[name])
            }
        counts[cls] = class_counts
    return counts

In [16]:
def probability(dictionary, x, current_class):
    """Return the unnormalised log-posterior of ``current_class`` for ``x``.

    Computes ``sum_j log P(x[j] | class) + log P(class)`` from the count
    dictionary produced by ``fit``. Each conditional uses add-one (Laplace)
    smoothing:

        (count(x[j], class) + 1) / (count(class) + number of known values of feature j)

    Parameters:
        dictionary: nested counts as returned by ``fit``.
        x: sequence of feature values, indexed by feature position.
        current_class: key of the class to score.

    A feature value that has no entry in the counts (never seen while
    fitting) is treated as a zero count instead of raising ``KeyError``, so
    the smoothing still gives it a small non-zero probability.
    """
    class_counts = dictionary[current_class]
    class_total = class_counts['total_count']
    output = 0
    # Every key of the per-class dict except 'total_count' is a feature position.
    for j in range(len(class_counts) - 1):
        value_counts = class_counts[j]
        # .get(..., 0) handles values that were absent from the training data.
        count_class_and_feature = value_counts.get(x[j], 0) + 1
        smoothed_class_total = class_total + len(value_counts)
        p = np.log(count_class_and_feature) - np.log(smoothed_class_total)
        output = output + p
    # Class prior: share of training rows that belong to this class.
    class_prob = np.log(class_total) - np.log(dictionary['total_count'])
    output = output + class_prob
    return output

In [17]:
def predictSinglePoint(x, dictionary):
    """Return the class whose ``probability`` score is highest for ``x``.

    Iterates over the keys of ``dictionary`` in order, skipping the
    'total_count' bookkeeping entry. On ties the class encountered first
    wins. Returns -1 when the dictionary contains no class at all.
    """
    winner = -1
    winner_score = None  # None means no class has been scored yet
    for candidate in dictionary.keys():
        if (candidate == 'total_count'):
            continue
        score = probability(dictionary, x, candidate)
        if winner_score is None or score > winner_score:
            winner, winner_score = candidate, score
    return winner

In [18]:
def predict(dictionary, x_test):
    """Predict a class for every point in ``x_test``.

    Returns a list with one ``predictSinglePoint`` result per point, in the
    same order as ``x_test``.
    """
    return [predictSinglePoint(point, dictionary) for point in x_test]

In [19]:
# Train the hand-written Naive Bayes on the training frame and display the
# learned count table.
dictionary = fit(train_df)
dictionary

{'total_count': 112,
 0: {'total_count': 37,
  0: {0: 24, 1: 13, 2: 0, 3: 0},
  1: {0: 1, 1: 7, 2: 25, 3: 4},
  2: {0: 37, 1: 0, 2: 0, 3: 0},
  3: {0: 37, 1: 0, 2: 0, 3: 0}},
 1: {'total_count': 34,
  0: {0: 3, 1: 12, 2: 17, 3: 2},
  1: {0: 10, 1: 18, 2: 6, 3: 0},
  2: {0: 0, 1: 6, 2: 28, 3: 0},
  3: {0: 0, 1: 8, 2: 26, 3: 0}},
 2: {'total_count': 41,
  0: {0: 1, 1: 4, 2: 27, 3: 9},
  1: {0: 4, 1: 24, 2: 12, 3: 1},
  2: {0: 0, 1: 0, 2: 17, 3: 24},
  3: {0: 0, 1: 0, 2: 15, 3: 26}}}

In [20]:
# Classify every held-out point with the hand-written model.
y_pred = predict(dictionary, x_test)

In [21]:
# Convert the test labels to a plain list, the same container type that
# predict() returns for y_pred.
y_test = list(y_test)

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [23]:
# Confusion matrix for the hand-written model (arguments are y_true, y_pred).
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0, 16,  0],
       [ 0,  0,  9]], dtype=int64)

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
# Baseline: scikit-learn's MultinomialNB fitted on the same discretised features.
# NOTE(review): MultinomialNB models features as counts, whereas these columns
# hold bin labels 0-3. This may explain the confusion matrix below, where every
# class-1 test row is predicted as class 2 -- confirm whether a categorical
# model was intended.
mnb = MultinomialNB()
mnb.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Overwrites y_pred with the MultinomialNB predictions and evaluates them
# against the same test labels.
y_pred = mnb.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0,  0, 16],
       [ 0,  0,  9]], dtype=int64)

In [27]:
from sklearn.naive_bayes import GaussianNB

In [28]:
# Same X, Y and random_state as the earlier split, so this repeats that split;
# it also rebinds y_test, which had been converted to a list above.
# NOTE(review): X still holds the binned labels, not the raw measurements --
# confirm that GaussianNB is meant to be trained on these.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 1) 

In [29]:
# Fit scikit-learn's Gaussian Naive Bayes on the training split.
nn = GaussianNB()
nn.fit(x_train, y_train)

GaussianNB(priors=None)

In [30]:
# Overwrites y_pred with the GaussianNB predictions and shows their confusion matrix.
y_pred = nn.predict(x_test)
confusion_matrix(y_test, y_pred)

array([[13,  0,  0],
       [ 0, 16,  0],
       [ 0,  0,  9]], dtype=int64)

In [31]:
# A second GaussianNB instance fitted on the same training data as `nn` above.
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None)

In [32]:
# Evaluate the second GaussianNB: print its confusion matrix, then display the
# value returned by score() on the test split as the cell output.
y_pred = gnb.predict(x_test)
print(confusion_matrix(y_test, y_pred))
gnb.score(x_test,y_test)

[[13  0  0]
 [ 0 16  0]
 [ 0  0  9]]


1.0