In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

# Gaussian Naive Bayes

## Train test split

In [2]:
def train_test_split(df, test_size):

  if isinstance(test_size, float):
    test_size = round(test_size*len(df))
  
  indices = df.index.tolist() # random.sample takes list, set, dictionary
  test_indices = random.sample(population=indices, k=test_size)

  test = df.loc[test_indices]
  train = df.drop(test_indices)

  return train, test

## Calculate Prior Probabilities

In [3]:
def calculate_prior_probabilities(df):

  prior_probabilities = df.groupby(by = 'target').apply(lambda x: len(x)/len(df))

  return np.log(prior_probabilities).values

In [4]:
# calculate_prior_probabilities(df)

# [Prior_probability(setosa), Prior_probability(versicolor), Prior_probability(virginica)]

## Find mean, variance

In [5]:
def return_statistics(df):

  mean = df.groupby(by='target').apply(lambda x: x.mean(axis=0))
  variance = df.groupby(by='target').apply(lambda x: x.var(axis=0))

  return (mean.values, variance.values)

In [6]:
# mean, variance = return_statistics(df)
# print(mean)
# print(variance)

#             s_l  s_w  p_l  p_w
# setosa
# versicolor
# virginica

## Find Gaussian Probability density

In [7]:
# P(x=12 | 'setosa')

def calculate_probability_density(mean, variance, x):

  probability_density = (1 / np.sqrt(2*np.pi*variance) ) * np.exp( (-(x - mean)**2)  / ( 2*variance ) )

  return probability_density

## Posterior Probabilities

In [8]:
def calculate_posterior_probabilities(df_row, mean, variance, n_unique_labels, n_cols):
  
  posterior_probabilities = []
  
  # calculate probabilities wrt each label to find max
  for i in range(n_unique_labels):
    posterior = 0

    # for each feature
    for j in range(n_cols):
      posterior += np.log(calculate_probability_density(mean[i][j], variance[i][j], df_row[j]))
    posterior_probabilities.append(posterior)
  
  return posterior_probabilities

In [9]:
# calculate_posterior_probabilities()

# [posterior_probability['setosa'], posterior_probability['versicolor'], posterior_probability['virginica']]

## Fit model on training dataset

In [10]:
def NBA_fit(df):

  n_cols = len(df.columns)-1
  unique_labels = df['target'].unique()
  n_unique_labels = len(unique_labels)

  mean, variance = return_statistics(df)
  prior_probabilities = calculate_prior_probabilities(df) # returns log

  return {
      'n_cols': n_cols,
      'unique_labels': unique_labels,
      'n_unique_labels': n_unique_labels,
      'mean': mean,
      'variance': variance,
      'prior_probabilities': prior_probabilities
  }
  

In [11]:
# nba = NBA_fit(df)

# Returns a dictonary containing statistical and other important info

## Predict

In [12]:
def predict(test_df, nba):

  predictions = []
  for i in range(len(test_df)):

    prior = nba['prior_probabilities']
    posterior = calculate_posterior_probabilities(test_df.iloc[i, :-1], nba['mean'], nba['variance'], nba['n_unique_labels'], nba['n_cols'])  # returns log
    probabilities = prior + posterior
    # one with max prob will be the output 
    mx_idx = np.argmax(probabilities)

    predictions.append(nba['unique_labels'][mx_idx])  # add log values

  return predictions

In [13]:
# predictions = predict(test_df, nba)

# returns label

## Load Dataset

In [15]:
names =  ["Sex", "Length", "Diameter", "Height", "Whole weight", 
           "Shucked weight", "Viscera weight","Shell weight", "target"]
df = pd.read_csv('abalone.data', header=None, names=names)

df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,target
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


No null values present

In [16]:
# train test split
train_df, test_df = train_test_split(df, 0.2)

# fit model
nba = NBA_fit(train_df)

# make predictions
predictions = predict(test_df, nba)

# accuracy
accuracy = len(test_df.loc[predictions == test_df['target']])/len(test_df) * 100
accuracy

  mean = df.groupby(by='target').apply(lambda x: x.mean(axis=0))
  variance = df.groupby(by='target').apply(lambda x: x.var(axis=0))


UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('<U1'), dtype('float64')) -> None