In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [13]:
# In the load method the parameter allow_pickle must 
# be set to True for unpacking the data
npzfile = np.load('mnist.npz', allow_pickle=True)
print(npzfile.files) # prints ['X_train', 'y_train', 'X_test', 'y_test']
# You can access train/test data and labels as follows

npzfile['train']  # outputs the X_train dataset, prints array([[0., 0., 0., ..., 0., 0., 0.], with shape (60000, 784) which contains 60000 images of 28x28 pixels.

['test', 'test_labels', 'train', 'train_labels']


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [14]:
def NBTrain(train_patterns, class_labels, num_features, num_classes, laplace_correction=0):
    # Count the total number of training instances
    num_patterns = np.shape(train_patterns)[0]
    # Count the total number of features in each instance
    num_attributes = np.shape(train_patterns)[1]
    # Initialize arrays for storing log probabilities of features and classes
    feature_log_probs = np.zeros((num_features, num_classes, num_attributes))
    class_log_probs = np.zeros(num_classes)
    # Loop over each training instance and feature
    for pattern_index in range(num_patterns):
        for attribute_index in range(num_attributes):
            # Count occurrences of each feature for each class
            feature_log_probs[train_patterns[pattern_index][attribute_index], class_labels[pattern_index], attribute_index] += 1
    # Loop over each class and feature
    for class_index in range(num_classes):
        for attribute_index in range(num_attributes):
            # Calculate log probabilities of each feature for each class, applying Laplace smoothing if specified
            feature_log_probs[:, class_index, attribute_index] = np.log((feature_log_probs[:, class_index, attribute_index] + laplace_correction) / (np.sum(feature_log_probs[:, class_index, attribute_index]) + num_features * laplace_correction))
    # Calculate log probabilities of each class
    class_log_probs = np.log(np.bincount(class_labels) / num_patterns)
    # Return the log probabilities of features and classes
    return feature_log_probs, class_log_probs

In [15]:
def NBClassify(test_patterns, feature_log_probs, class_log_probs):
    num_patterns = np.shape(test_patterns)[0]
    num_attributes = np.shape(test_patterns)[1]

    predicted_classes = np.zeros(num_patterns, dtype=int)

    for pattern_index in range(num_patterns):
        log_posterior_probs = np.zeros(class_log_probs.shape)
        for attribute_index in range(num_attributes):
            log_posterior_probs += feature_log_probs[test_patterns[pattern_index][attribute_index], :, attribute_index]
        log_posterior_probs += class_log_probs
        predicted_classes[pattern_index] = np.argmax(log_posterior_probs)

    return predicted_classes

In [16]:
import pandas as pd
from sklearn.metrics import accuracy_score

train_features = npzfile['train']
train_labels = npzfile['train_labels']

test_features = npzfile['test']
test_labels = npzfile['test_labels']
laplace_values = [0, 0.001, 0.01, 0.1, 0.2, 0.4, 1, 10]

results_data = []

feature_values_count = np.unique(train_features).size
class_count = np.unique(train_labels).size

for laplace_val in laplace_values:
    print(laplace_val)
    feature_probs, class_probs = NBTrain(train_features, train_labels, feature_values_count, class_count, laplace_val)
    
    predicted_train_labels = NBClassify(train_features, feature_probs, class_probs)
    predicted_test_labels = NBClassify(test_features, feature_probs, class_probs)
    
    training_error = 1 - accuracy_score(train_labels, predicted_train_labels)
    testing_error = 1 - accuracy_score(test_labels, predicted_test_labels)
    
    results_data.append({'Error Train': training_error, 'Error Test': testing_error,'LC': laplace_val})
    print(laplace_val)
results_table = pd.DataFrame(results_data)

0


IndexError: index 1 is out of bounds for axis 0 with size 1