<a href="https://colab.research.google.com/github/jackson-gregoire/MachineLearningLearning/blob/main/naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the three whitespace-delimited splits in one go. Every row is one sample;
# the class label sits in the final column (column 101 per the original note).
train, test, val = (
  np.loadtxt('/content/drive/MyDrive/data/training_data.txt'),
  np.loadtxt('/content/drive/MyDrive/data/testing_data.txt'),
  np.loadtxt('/content/drive/MyDrive/data/validation_data.txt'),
)

In [4]:
# Split each matrix into its feature columns (everything but the last column)
# and its class/target column (the last column), one split per line.
x_train, y_train = train[:, :-1], train[:, -1]
x_val, y_val = val[:, :-1], val[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

In [44]:
'''
  More modular fit and predict methods for training and testing.

  Note: an earlier version of fit() produced 0% training accuracy (and 100% with
  the decision inequality flipped). Two bugs caused that:
    1. each sample only accumulated the log-likelihood of its *true* class, so
       the other class kept the bare log-prior and always won;
    2. the per-feature "likelihoods" were p11 + p01, which equals P(C = 1) for
       every feature rather than P(x_i = 1 | C = 1).
  fit() below estimates the class-conditional Bernoulli parameters and scores
  both classes for every sample.
'''
def _bernoulli_params(x_class, alpha):
  """Return the smoothed estimate of P(x_i = 1 | class) for every feature."""
  ones = np.sum(x_class == 1, axis=0)
  # Two outcomes per feature (0 or 1), hence 2 * alpha in the denominator.
  return (ones + alpha) / (len(x_class) + 2 * alpha)


def fit(x_data, labels, alpha=1.0):
  """Fit a two-class Bernoulli naive Bayes model and score the training data.

  Args:
    x_data: array-like of shape (n_samples, n_features) holding binary (0/1)
      feature values.
    labels: array-like of length n_samples; the two classes are labelled 1
      and 2.
    alpha: additive (Laplace) smoothing pseudo-count. The default of 1 keeps
      every estimate strictly inside (0, 1) so the logs below stay finite.
      alpha = 0 gives the unsmoothed estimate and may yield -inf/nan scores.

  Returns:
    A tuple (p1, p2, predictions, prior1, prior2) where p1 / p2 are the
    per-feature estimates of P(x_i = 1 | C = 1) and P(x_i = 1 | C = 2),
    predictions is a list with the predicted class (1 or 2) for each training
    sample, and prior1 / prior2 are the empirical class priors.

  Raises:
    ValueError: if x_data is not 2-D, is empty, does not match labels in
      length, or alpha is negative.
  """
  x_data = np.asarray(x_data, dtype=float)
  labels = np.asarray(labels)

  if x_data.ndim != 2:
    raise ValueError("x_data must be 2-D (n_samples, n_features)")
  if len(x_data) == 0 or len(x_data) != len(labels):
    raise ValueError("x_data and labels must be non-empty and the same length")
  if alpha < 0:
    raise ValueError("alpha must be non-negative")

  in_c1 = labels == 1
  in_c2 = labels == 2

  # Estimate the priors as the fraction of samples in each class
  priors = np.array([np.mean(in_c1), np.mean(in_c2)])

  # Class-conditional likelihoods, estimated only from the rows of that class
  p1 = _bernoulli_params(x_data[in_c1], alpha)
  p2 = _bernoulli_params(x_data[in_c2], alpha)

  # A class that never occurs has prior 0; log(0) = -inf is the intended score.
  with np.errstate(divide="ignore"):
    log_priors = np.log(priors)

  # Log-posterior (up to a shared constant) of BOTH classes for every sample.
  # The true label must not influence which score gets accumulated.
  pc1 = log_priors[0] + x_data @ np.log(p1) + (1 - x_data) @ np.log(1 - p1)
  pc2 = log_priors[1] + x_data @ np.log(p2) + (1 - x_data) @ np.log(1 - p2)

  # Ties go to class 2, matching the strict inequality used by predict()
  predictions = np.where(pc1 > pc2, 1, 2).tolist()

  print(f"Training Accuracy: {np.mean(np.where(predictions == labels, 1, 0))*100}%")

  return p1, p2, predictions, priors[0], priors[1]

def predict(test_data, test_labels, p1, p2, prior1, prior2, verbose=True):
  """Classify samples with a fitted two-class Bernoulli naive Bayes model.

  Args:
    test_data: array-like of shape (n_samples, n_features) with binary (0/1)
      feature values.
    test_labels: array-like of length n_samples with the true classes (1 or
      2); used only for reporting.
    p1: per-feature Bernoulli parameter used for class 1 (the probability
      that the feature equals 1).
    p2: per-feature Bernoulli parameter used for class 2.
    prior1: prior probability of class 1.
    prior2: prior probability of class 2.
    verbose: when True (the default) print one "Prediction / Actual" line per
      sample.

  Returns:
    A list with the predicted class (1 or 2) for every sample. Ties are
    resolved in favour of class 2.
  """
  test_data = np.asarray(test_data, dtype=float)
  test_labels = np.asarray(test_labels)
  p1 = np.asarray(p1, dtype=float)
  p2 = np.asarray(p2, dtype=float)

  # Nothing to score; also avoids taking the mean of an empty array below.
  if len(test_data) == 0:
    return []

  # Log-posterior (up to a shared constant) of each class for every sample
  pc1 = np.log(prior1) + test_data @ np.log(p1) + (1 - test_data) @ np.log(1 - p1)
  pc2 = np.log(prior2) + test_data @ np.log(p2) + (1 - test_data) @ np.log(1 - p2)

  predictions = np.where(pc1 > pc2, 1, 2).tolist()

  if verbose:
    for guess, y in zip(predictions, test_labels):
      print(f"Prediction: {guess}", f", Actual: {y}")

  # This function scores held-out data, so the figure is not a training accuracy.
  print(f"Accuracy: {np.mean(np.where(predictions == test_labels, 1, 0))*100}%")

  return predictions

In [None]:
# Train on the training split; fit() prints the training accuracy and returns
# the per-class feature parameters, the training predictions and both priors.
p1, p2, predictions , prior1, prior2 = fit(x_train, y_train)

In [None]:
# Score the held-out test split with the parameters and priors returned by fit().
predict(x_test, y_test, p1,p2, prior1, prior2)

In [None]:

'''
  --Next--
  Finding optimal sigma from validation data using a given prior.

'''
# Our hyperparameter for validation will be sigma (used in the prior formulation)
sigma = np.arange(start = -5, stop = 6)
val_acc = []

# Per-feature fraction of zeros over ALL validation rows (class-independent).
feature_p = 1 - np.mean(x_val, axis = 0)

# The likelihood products do not depend on sigma, so compute them once for every
# validation sample instead of once per sigma. Working on whole arrays also
# avoids a module-level loop variable named `val`, which used to overwrite the
# validation array loaded at the top of the notebook.
eps = 10**(-10)  # keeps each factor strictly positive
lik1 = np.prod(feature_p**(1-x_val)*(1-feature_p)**x_val + eps, axis = 1)
lik2 = np.prod((1-feature_p)**(1-x_val)*(feature_p)**x_val + eps, axis = 1)

# Finding best sigma on validation set
for s in sigma:
  # Prior of class 1 is the logistic sigmoid of sigma; class 2 gets the rest.
  prior1 = 1/(1+np.e**(-s))
  priors = np.array([prior1, 1 - prior1])

  # Ties go to class 2 (strict inequality)
  predictions = np.where(priors[0]*lik1 > priors[1]*lik2, 1, 2)

  accuracy = np.mean(predictions == y_val)
  val_acc.append(accuracy)
  print(f"Accuracy for {s}: {accuracy*100}%")

print('-'*50)
print(f'Highest validation accuracy: {np.max(val_acc)*100}%')
# Report which sigma achieved it (first one on ties), not just the accuracy.
print(f'Best sigma: {sigma[np.argmax(val_acc)]}')

In [None]:
'''
  --Old--
  None of this is modular, nor takes into account/updates the priors/likelihoods as 
  new information is introduced.
'''

# Empirical class priors: fraction of training labels equal to 1 and to 2.
priors = np.array([np.mean(y_train == 1),
                   np.mean(y_train == 2)])

# p<x input><class>: likelihood of a feature taking value x given the class,
# estimated only from the rows of that class. Add-one (Laplace) smoothing keeps
# every estimate strictly inside (0, 1), so the logs below never hit log(0).
x_c1 = x_train[y_train == 1]
x_c2 = x_train[y_train == 2]

p11 = (np.sum(x_c1 == 1, axis = 0) + 1) / (len(x_c1) + 2)  # L(x = 1 | C = 1)
p01 = 1 - p11                                              # L(x = 0 | C = 1)
p12 = (np.sum(x_c2 == 1, axis = 0) + 1) / (len(x_c2) + 2)  # L(x = 1 | C = 2)
p02 = 1 - p12                                              # L(x = 0 | C = 2)

# The Bernoulli parameter of a class is P(x = 1 | C). Summing the x = 1 and
# x = 0 terms (as an earlier version did) gives the same constant for every
# feature, which is what produced the nan's / meaningless scores.
p1 = p11
p2 = p12

# Log-posterior (up to a shared constant) of BOTH classes for every sample; the
# true label is not allowed to decide which score gets accumulated.
pc1_all = np.log(priors[0]) + x_train @ np.log(p1) + (1 - x_train) @ np.log(1 - p1)
pc2_all = np.log(priors[1]) + x_train @ np.log(p2) + (1 - x_train) @ np.log(1 - p2)

# Pick the class with the larger posterior (ties go to class 2).
predictions = np.where(pc1_all > pc2_all, 1, 2).tolist()  # Will hold our class predictions

for guess, y in zip(predictions, y_train):
  print(f'Prediction: {guess}')
  print(f"Actual: {y}")
  print('-'*30)

print(f"Accuracy: {np.mean(np.where(predictions == y_train, 1, 0))*100}%")




In [None]:
'''
  --Old--
  Using numerical estimation of priors from the data. This one uses a weird
  formulation for Bernoulli; I'm going to rewrite it with the more widely
  published formulation.
'''
priors = np.array([np.mean(np.where(y_train == 1, 1, 0)), 
                   np.mean(np.where(y_train == 2, 1, 0))])

# Per-feature fraction of zeros over ALL training rows. It ignores the class
# label, so it is not a class-conditional likelihood.
feature_p = 1 - np.mean(x_train, axis = 0)

# Score every sample at once. Class 1 multiplies P(x_i = observed value) over
# the features; class 2 uses the complementary probabilities. Working on whole
# arrays also avoids a module-level loop variable named `val`, which used to
# overwrite the validation array loaded at the top of the notebook.
lik1 = np.prod(feature_p**(1-x_train)*(1-feature_p)**x_train, axis = 1)
lik2 = np.prod((1-feature_p)**(1-x_train)*(feature_p)**x_train, axis = 1)

# Ties go to class 2 (strict inequality)
predictions = np.where(priors[0]*lik1 > priors[1]*lik2, 1, 2)

accuracy = np.mean(predictions == y_train)
print(f"Accuracy: {accuracy*100}%")

Accuracy: 50.625%
