## - Name: Kendrick Kee
## - UOW ID: 7366814

In [12]:
#To import relevant libraries
import pandas as pd
import numpy as np
import random
import sys
from math import *

### Define a function to split dataset

In [13]:
# Split the data into training set and testing set
def train_test_split(data, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(data))
    
    data_index = data.index.tolist()
    test_index = random.sample(population=data_index, k=test_size)
    
    test_set = data.loc[test_index]
    train_set = data.drop(test_index)
    
    return train_set, test_set

### Define a function to get a summary "dictionary" of dataset as parameter

In [14]:
# Get relevant details about the feature of the dataset for each class label
# If a continuous feature
        ### a tuple of mean, standard deviation and length of dataset is returned
# If data is categorical and is one hot encoded
        ### a tuple of value(1 or 0) and its count with respect to the class (0,1,2)
        ### along with a count of the class is returned
def get_summary(dataset):
    
    #dictionary to stores details
    #of each label class
    summary = {}
    
    #loop over unique target values
    for i in dataset.iloc[:,-1].unique():
        
        #list of details of features for each class (i)
        a = []
        
        #loop for all the features except label column
        for j in range(len(dataset.columns)-1):
            
            #store the size of unique values in current feature
            size = len(dataset.iloc[:,j].unique())
            #if size is less than 5, then categorical feature
            if(size < 5):
                lst = list()
                #subset dataset for each class(i)
                #store as df
                df = dataset[dataset.iloc[:,-1]==i]
                
                #loop for unique values in categorical features
                for k in dataset.iloc[:,j].unique():
                    
                    #for each unique values, store the
                    #value and count of the value in feature
                    lst.append(k)
                    lst.append(len(df[df.iloc[:,j] == k]))
                
                #make a tuple out of the list
                a.append(tuple([lst[0], lst[1], lst[2], lst[3], len(df)]))
                
            #else continuous feature
            else:
                a.append((dataset[dataset.iloc[:,-1]==i].mean(axis=0)[j],dataset[dataset.iloc[:,-1]==i].std(axis=0)[j], len(dataset[dataset.iloc[:,-1]==i])))
                
        summary[i] = a
    return summary

### Define Gaussian Naive Bayes function

In [15]:
# Calculate probability for continuous fetaures
# Calculate the Gaussian probability distribution function for x
def calculate_probability_Gaussian(x, mean, stdev,total_rows):
    if stdev == 0 or isnan(stdev):
        return 1/total_rows#if stdev is 0, return the probability of "Add one count"
        
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    if exponent == 0:
        #number overflow occurs exponent == smallest possibe float by system
        exponent = sys.float_info.min
    return 1 / (sqrt(2 * pi) * stdev) * exponent

#### To prevent numerical underflow, when exponent underflows out of pythons float precision, exponent will become small possible float by system

#### To prevent the zero frequency/count problem, this function catches occurences of 0/null standard deviation and returns the probablity of 1/number of observations

### Define Navie Bayes function

In [16]:
# Calculate probability for categorical features
def calculate_probability(x, X1, count_1, X2, count_2, class_count,total_rows):
    if x == X1:
        #if zero frequency occurs, add 1 to count and return the probability
        if count_1/class_count == 0:
            return 1/total_rows
        
        return count_1/class_count
    else:
        #if zero frequency occurs, add 1 to count and return the probability
        if count_2/class_count == 0:
            return 1/total_rows
        
        return count_2/class_count
    

#### Similarly, to prevent 0 frequency error, returning "add one count" probabilty instead of 0

### Define functions for probability computation

In [17]:
# Calculate the probabilities of predicting each class for a given row
# for continuous features use Gaussian probability function
# for categorical feature calculate_probability function
def calculate_class_probabilities(summaries, row):
    
    #get the length of the dataset
    #sum up all the counts of each label class
    total_rows = sum([summaries[label][0][2] for label in summaries])
    
    #instantiate a dictionary to store probability 
    #of each label class for a given row
    probabilities = dict()
    
    #get the class value: class_value
    #get the summaries for each class: class_summaries
    for class_value, class_summaries in summaries.items():
        
        #get the probability of each label class e.g.
        #if class label 1 has a length of 12345
        #and length of dataset is 234567
        #then this probability is 12345/234567
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        
        #loop over all the class_summaries
        #i.e. summaries of each feature
        for i in range(len(class_summaries)):
            
            #since the summaries for categorical variable contains 5 values, 
            #while the ones for continuous variable contains 3 values
            #categorical fetaure
            if len(class_summaries[i]) > 3:
                X1, count_1, X2, count_2, class_count = class_summaries[i]
                probabilities[class_value] = \
                probabilities[class_value] * calculate_probability(row[i],X1, count_1, X2, count_2, class_count,total_rows)
            #continuous feature
            else:
                mean, stdev, _ = class_summaries[i]
                probabilities[class_value] = \
                probabilities[class_value] * calculate_probability_Gaussian(row[i], mean, stdev,total_rows)
                
    return probabilities

In [18]:
# Predict the class for a given row
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    
    #instantiate variable to store 
    #best label: best_label
    #best probability: best_prob
    best_label, best_prob = None, -1
    
    #get the best probability
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

# Driver function for Naive Bayes

In [19]:
# Naive Bayes Algorithm
# predict values for the test set
def naive_bayes(train, test):
    summary = get_summary(train)
    predictions = list()
    for row in test.values:
        output = predict(summary, row)
        predictions.append(output)
    return(predictions)

## Accuracy function to determine regression metrics

In [20]:
# Calculated as:
# check for equality of predicted value and labels in test_set
# calculates the sum of correct prediction
# divides the sum by length of test_set

    
def accuracy(predictions, data_set):
    y_test = list(data_set.iloc[:,-1])
    correct_count = 0
    sum_error = 0.0
    rsme_error = 0.0
    for i in range(len(y_test)):
        if predictions[i] == y_test[i]:
            correct_count += 1
        sum_error += abs(predictions[i] - y_test[i])
        prediction_error = abs(predictions[i] - y_test[i])
        rsme_error = (prediction_error**2)
    print(f'Number of exact matches in predictions: {correct_count}/{len(y_test)}')        
    print(f'MEAN SQUARED ERROR: {np.square(np.subtract(y_test,predictions)).mean()}')
    print(f'ROOT MEAN SQUARED ERROR: {sqrt(rsme_error/float(len(y_test)))}')
    print(f'MEAN ABSOLUTE ERROR: {sum_error/float(len(y_test))}')
    return (round(correct_count/len(data_set)*100,3))

#### This function computes common regression metrics such as MSE, MAE and RSME to allow for easy evaluation of the model.

## Loading the Dataset

In [21]:
names =  ["Sex", "Length", "Diameter", "Height", "Whole weight", 
           "Shucked weight", "Viscera weight","Shell weight", "Rings"]
df = pd.read_csv('abalone.data', header=None, names=names)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


Data Preprocessing

In [22]:
# Check for missing data and type of data
df.head().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             5 non-null      object 
 1   Length          5 non-null      float64
 2   Diameter        5 non-null      float64
 3   Height          5 non-null      float64
 4   Whole weight    5 non-null      float64
 5   Shucked weight  5 non-null      float64
 6   Viscera weight  5 non-null      float64
 7   Shell weight    5 non-null      float64
 8   Rings           5 non-null      int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 488.0+ bytes


In [23]:
encode = {"Sex": {"M":1,"F":2,"I":3}}
df = df.replace(encode)
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [24]:
# Get the statistical info of the numeric features
df.describe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,1.95547,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.827815,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,1.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,1.0,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,2.0,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,3.0,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,3.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [25]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   int64  
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 293.8 KB


In [27]:
# Split the dataset into training and testing
train_set, test_set = train_test_split(df, 0.3)
train_set.describe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,2924.0,2924.0,2924.0,2924.0,2924.0,2924.0,2924.0,2924.0,2924.0
mean,1.966826,0.524104,0.408057,0.139714,0.829171,0.359761,0.180738,0.238587,9.919289
std,0.823405,0.120497,0.099754,0.043084,0.490619,0.222053,0.109843,0.138107,3.200183
min,1.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,1.0,0.45,0.35,0.115,0.44,0.186,0.0935,0.13,8.0
50%,2.0,0.545,0.425,0.145,0.80275,0.339,0.171,0.235,9.0
75%,3.0,0.615,0.48,0.165,1.1515,0.5005,0.252,0.33,11.0
max,3.0,0.815,0.65,1.13,2.8255,1.488,0.6415,0.897,29.0


In [28]:
test_set.describe()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0,1253.0
mean,1.92897,0.523731,0.40747,0.139054,0.827741,0.35845,0.180258,0.2394,9.967279
std,0.837747,0.119193,0.098068,0.038747,0.490046,0.22184,0.109121,0.14178,3.280504
min,1.0,0.135,0.105,0.02,0.014,0.0055,0.0025,0.004,3.0
25%,1.0,0.45,0.345,0.11,0.4445,0.1865,0.092,0.132,8.0
50%,2.0,0.54,0.42,0.14,0.785,0.3295,0.17,0.228,9.0
75%,3.0,0.615,0.48,0.165,1.1595,0.5065,0.2565,0.325,11.0
max,3.0,0.78,0.63,0.25,2.7795,1.351,0.76,1.005,27.0


In [29]:
# Test the model on training set
train_pred = naive_bayes(train_set, train_set)
print('Accuracy of prediction for training set:', accuracy(train_pred, train_set))

Number of exact matches in predictions: 473/2924
MEAN SQUARED ERROR: 30.225376196990425
ROOT MEAN SQUARED ERROR: 0.1849316803638273
MEAN ABSOLUTE ERROR: 3.963406292749658
Accuracy of prediction for training set: 16.176


In [30]:
# Test the model on testing set
test_pred = naive_bayes(train_set, test_set)
print('Accuracy of prediction for testing set:', accuracy(test_pred, test_set))

Number of exact matches in predictions: 195/1253
MEAN SQUARED ERROR: 30.708699122106943
ROOT MEAN SQUARED ERROR: 0.14125195547029415
MEAN ABSOLUTE ERROR: 3.9936153232242617
Accuracy of prediction for testing set: 15.563


## Observations
The accuracy of the model is extremely poor at 17% as it is unable to predict the exact number of rings. This is expected as predicting the number of rings poses as more of a regression problem than a classification one due to the continous nature of "Rings" in this context

Therefore, we will observe its other metrics commonly used in regression to evaluate the model.

The MSE of both models are very close for both the training and testing dataset, this suggests that no overfitting occured. However, the MAE for both models is 3.6, this means that the model on average predicts the number of rings to be +- 3.6 which depending on the context and importance of accuracy of getting the exact ring count may deem this model to be good or bad.

The RMSE for the testing dataset appears to be 0 whist its training counterpart has a 0.2 RMSE. This could mean that the model used for the testing dataset is alot more accurate than its training counterpart based on that metric.