In [5]:
import numpy as np 
import pandas as pd 	
import matplotlib.pyplot as plt 
import math
from sklearn.model_selection import train_test_split

df = pd.read_csv('Iris.csv')
X = df.drop([df.columns[-1]], axis = 1)
y = df[df.columns[-1]]

In [2]:
features = list(X.columns)
print(features)

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_size = X_train.shape[0]
num_feats = X_train.shape[1]
print(train_size, num_feats)

120 4


In [19]:
likelihoods = {}
class_priors = {}

In [38]:
for feature in features:
    likelihoods[feature] = {}
    for outcome in np.unique(y_train):
        likelihoods[feature].update({outcome:{}})
        class_priors.update({outcome: 0})

In [39]:
""" P(c) - Prior Class Probability """
for outcome in np.unique(y_train):
    outcome_count = sum(y_train == outcome)
    class_priors[outcome] = outcome_count / train_size
print(class_priors)

{'Iris-setosa': 0.3333333333333333, 'Iris-versicolor': 0.3416666666666667, 'Iris-virginica': 0.325}


In [40]:
""" P(x|c) - Likelihoods """
for feature in features:
  for outcome in np.unique(y_train):
    likelihoods[feature][outcome]['mean'] = X_train[feature][y_train[y_train == outcome].index.values.tolist()].mean()
    likelihoods[feature][outcome]['variance'] = X_train[feature][y_train[y_train == outcome].index.values.tolist()].var()
  print(likelihoods)
print(likelihoods)

{'SepalLengthCm': {'Iris-setosa': {'mean': 4.99, 'variance': 0.1270769230769231}, 'Iris-versicolor': {'mean': 5.919512195121952, 'variance': 0.29410975609756096}, 'Iris-virginica': {'mean': 6.533333333333333, 'variance': 0.4275438596491228}}, 'SepalWidthCm': {'Iris-setosa': {}, 'Iris-versicolor': {}, 'Iris-virginica': {}}, 'PetalLengthCm': {'Iris-setosa': {}, 'Iris-versicolor': {}, 'Iris-virginica': {}}, 'PetalWidthCm': {'Iris-setosa': {}, 'Iris-versicolor': {}, 'Iris-virginica': {}}}
{'SepalLengthCm': {'Iris-setosa': {'mean': 4.99, 'variance': 0.1270769230769231}, 'Iris-versicolor': {'mean': 5.919512195121952, 'variance': 0.29410975609756096}, 'Iris-virginica': {'mean': 6.533333333333333, 'variance': 0.4275438596491228}}, 'SepalWidthCm': {'Iris-setosa': {'mean': 3.439999999999999, 'variance': 0.1588717948717949}, 'Iris-versicolor': {'mean': 2.770731707317073, 'variance': 0.1026219512195122}, 'Iris-virginica': {'mean': 2.9666666666666663, 'variance': 0.10175438596491228}}, 'PetalLength

In [45]:
y_train[y_train == outcome].index.values.tolist()[:10]

[146, 142, 133, 137, 109, 105, 122, 123, 117, 113]

In [35]:
""" Calculates Posterior probability P(c|x) """

y_pred = []
test = np.array(X_test)

for query in test:
  probs_outcome = {}

  """
    Note: No Need to calculate evidence i.e P(x) since it is constant fot the given sample.
            Therfore, it does not affect classification and can be ignored
  """
  for outcome in np.unique(y_train):
    prior = class_priors[outcome]
    likelihood = 1
    evidence_temp = 1

    for feat, feat_val in zip(features, query):
      mean = likelihoods[feat][outcome]['mean']
      var = likelihoods[feat][outcome]['variance']
      likelihood *= (1/math.sqrt(2*math.pi*var)) * np.exp(-(feat_val - mean)**2 / (2*var))

    posterior_numerator = (likelihood * prior)
    probs_outcome[outcome] = posterior_numerator


  pred = max(probs_outcome, key = lambda x: probs_outcome[x])
  y_pred.append(pred)

print(probs_outcome)

{'Iris-setosa': 1.2071223550898493, 'Iris-versicolor': 3.4172819017927923e-15, 'Iris-virginica': 1.2585135394636042e-22}


In [36]:
np.sum(y_pred==y_test)/len(y_test)

1.0