In [1]:
import sys

# Put the parent directory on the import path (the later cells import
# `utils` and `model`, presumably located there).  The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from utils.read import read_data, read_df
from model.naive_bayes import Bayes_Classifier

In [3]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [4]:
# Paths of the landmark CSV files for the two splits.
train_landmarks = "../dataset/train_landmarks.csv"
test_landmarks = "../dataset/test_landmarks.csv"

# Load both splits with the project reader, train first and then test.
train, test = (read_df(csv_path) for csv_path in (train_landmarks, test_landmarks))

In [5]:
# Report how many observations each split holds, plus the combined total.
n_train, n_test = len(train), len(test)
print("Total observations: ", n_train + n_test)
print("Train observations: ", n_train)
print("Test observations: ", n_test)

Total observations:  34190
Train observations:  27352
Test observations:  6838


In [6]:
from collections import Counter
from itertools import chain

# Count how often each label occurs across train and test combined.
# A single Counter pass replaces the two row-by-row `.loc` loops: it is much
# faster and does not depend on the frames having a unique index.
# Counter keeps keys in first-seen order (train rows, then test rows), and the
# dict() conversion keeps the printed form a plain dict as before.
unique_class_count = dict(Counter(chain(train['label'], test['label'])))

In [7]:
# Show the number of distinct labels and the per-label counts.
n_classes = len(unique_class_count)
print("Unique classes: ", n_classes)
print("Class distribution: ", unique_class_count)

Unique classes:  26
Class distribution:  {'Y': 1315, 'F': 1315, 'W': 1315, 'D': 1315, 'V': 1315, 'Z': 1315, 'X': 1315, 'K': 1315, 'U': 1315, 'B': 1315, 'C': 1315, 'A': 1315, 'N': 1315, 'L': 1315, 'P': 1315, 'E': 1315, 'I': 1315, 'R': 1315, 'H': 1315, 'G': 1315, 'O': 1315, 'Q': 1315, 'J': 1315, 'T': 1315, 'M': 1315, 'S': 1315}


In [8]:
# Hyper-parameter grid; in this notebook it is only referenced by the
# (currently commented-out) sweep cell further down.
input_sizes = list(range(50, 301, 50))   # 50, 100, ..., 300
kernel_sizes = list(range(1, 12, 2))     # odd sizes 1 through 11
kernel_decay_methods = ["distance", "none"]
prob_type = ["individual"]  # "collective" (presumably the other mode; left out of the grid)

In [9]:
import pandas as pd

# Empty table for the experiment results; the column order below is the
# order used when a result row is recorded.
columns = [
    "input_size",
    "prob_type",
    "kernel_size",
    "decay_method",
    "accuracy",
]
results = pd.DataFrame(columns=columns)

In [10]:
# model = Bayes_Classifier("landmarks", 250, 250, "individual", 9, "distance")
# model.train(train)

# # Test the model
# df = model.classify(test)
# df["correct"] = df["label"] == df["predicted_label"]
# accuracy = df["correct"].sum() / len(df)

In [11]:
# f1 = f1_score(df["label"], df["predicted_label"], average="macro")
# precision = precision_score(df["label"], df["predicted_label"], average="macro")
# recall = recall_score(df["label"], df["predicted_label"], average="macro")
# print("Accuracy: ", accuracy)
# print("F1: ", f1)
# print("Precision: ", precision)
# print("Recall: ", recall)

In [12]:
# Read image data
train_dir = "../dataset/train_binary"
test_dir = "../dataset/test_binary"
test_features, test_labels = read_data(test_dir, flatten=1, grayscale=1, resize=(50, 50))

In [13]:
from utils.read import read_data
from utils.preprocessing import DimensionReduction
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the training split with the same reader options as the test split.
train_features, train_labels = read_data(train_dir, flatten=1, grayscale=1, resize=(50, 50))

# Standardise using statistics learned from the training split only.
# The scaled data goes into NEW names so that `train_features` and
# `test_features` stay raw.  Previously `test_features` (loaded in an earlier
# cell) was overwritten here, so re-running this cell scaled the test split
# twice and silently changed the results.
scaler = StandardScaler()
train_scaled = np.array(scaler.fit_transform(train_features))
test_scaled = scaler.transform(test_features)

# Fit the project's dimensionality reduction on the scaled training data and
# project both splits with it.
# NOTE(review): n_component=588 is a magic number - confirm how it was chosen.
pca = DimensionReduction(train_scaled, n_component=588)
train_projected = pca.pca_transform(train_scaled)
test_projected = pca.pca_transform(test_scaled)


In [14]:
# Train the project's Bayes classifier on the projected training features and
# classify the projected test features.
naive_bayes = Bayes_Classifier(feature_type='pixels')
naive_bayes.train((train_projected, train_labels))
preds = naive_bayes.classify((test_projected, test_labels))

# Collect ground truth and predictions side by side.
df = pd.DataFrame({"label": test_labels, "predicted_label": preds})
df["correct"] = df["label"] == df["predicted_label"]
accuracy = df["correct"].mean()

# Macro-averaged metrics.  Some classes may never be predicted, which makes
# their precision undefined; zero_division=0 scores those classes as 0
# explicitly (the same value the default yields) without emitting the
# UndefinedMetricWarning seen in the earlier output.
f1 = f1_score(df["label"], df["predicted_label"], average="macro", zero_division=0)
precision = precision_score(df["label"], df["predicted_label"], average="macro", zero_division=0)
recall = recall_score(df["label"], df["predicted_label"], average="macro", zero_division=0)

print("Accuracy: ", accuracy)
print("F1: ", f1)
print("Precision: ", precision)
print("Recall: ", recall)

Accuracy:  0.08686750511845569
F1:  0.03268152998699529
Precision:  0.02517286368885013
Recall:  0.08686750511845569


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# for input_size in input_sizes:
#     for prob in prob_type:
#         for kernel_size in kernel_sizes:
#             if kernel_size > 1:
#                 for dcm in kernel_decay_methods:
#                     # Train the model
#                     model = Bayes_Classifier("landmarks", input_size, input_size, prob, kernel_size, dcm)
#                     model.train(train)

#                     # Test the model
#                     df = model.classify(test)
#                     df["correct"] = df["label"] == df["predicted_label"]
#                     accuracy = df["correct"].sum() / len(df)

#                     results.loc[len(results)] = {
#                         "input_size": input_size,
#                         "prob_type": prob,
#                         "kernel_size": kernel_size,
#                         "decay_method": dcm,
#                         "accuracy": accuracy
#                     }
#                     print("input_size: {}, prob_type: {}, kernel_size: {}, decay_method: {}, accuracy: {}".format(input_size, prob, kernel_size, dcm, accuracy))
#             else:
#                 # Train the model
#                 model = Bayes_Classifier("landmarks", input_size, input_size, prob, kernel_size, "none")
#                 model.train(train)

#                 # Test the model
#                 df = model.classify(test)
#                 df["correct"] = df["label"] == df["predicted_label"]
#                 accuracy = df["correct"].sum() / len(df)

#                 results.loc[len(results)] = {
#                     "input_size": input_size,
#                     "prob_type": prob,
#                     "kernel_size": kernel_size,
#                     "decay_method": None,
#                     "accuracy": accuracy
#                 }
#                 print("input_size: {}, prob_type: {}, kernel_size: {}, decay_method: {}, accuracy: {}".format(input_size, prob, kernel_size, None, accuracy))
            

In [16]:
# results.head(len(results))

In [17]:
# print("Best accuracy: {}".format(results["accuracy"].max()))
# results[results["accuracy"] == results["accuracy"].max()]