# Setup

In [None]:
# setup to see the execution time in each cell

!pip install ipython-autotime
!pip install directory_structure
# !pip install wandb
%load_ext autotime

In [None]:
import pandas as pd
import os
import glob
import PIL
from PIL import Image
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
# import wandb

In [None]:
# Root directory that holds every dataset folder used by this notebook.
DRIVER_ROOT_DIR = "/kaggle/input/" # HC Directory
# DRIVER_ROOT_DIR = "/content/drive/MyDrive/DTSC 870/Code" #MT Directory

# Dataset paths are built with os.path.join so they stay valid whether or not
# DRIVER_ROOT_DIR ends with a path separator (the MT directory above does not).

# no augmentation
# DATASET_02_TRAIN = os.path.join(DRIVER_ROOT_DIR, "fer2013/02_FER/train")
# DATASET_02_TEST = os.path.join(DRIVER_ROOT_DIR, "fer2013/02_FER/test")

# augmentation v.1
# DATASET_02_TRAIN = os.path.join(DRIVER_ROOT_DIR, "fer2013-aug/Aug_train")
# DATASET_02_TEST = os.path.join(DRIVER_ROOT_DIR, "fer2013-aug-test/Aug_test")

# augmentation v.2 (active)
DATASET_02_TRAIN = os.path.join(DRIVER_ROOT_DIR, "fer2013-aug-train-2/Aug_train_2")
DATASET_02_TEST = os.path.join(DRIVER_ROOT_DIR, "fer2013-aug-test-2/Aug_test_2")
# DATASET_02_TEST = os.path.join(DRIVER_ROOT_DIR, "d/datasets/huihenrychen/fer2013-aug-test-2/Aug_test_2")

## Utility Functions

In [None]:
def get_data_df(dir):
    """Index the image files matched by a glob pattern.

    Adapted from:
    https://www.kaggle.com/namgalielei/simple-load-images-and-count-number-of-each-class

    Parameters
    ----------
    dir : str
        Glob pattern selecting the image files, e.g. ``"<root>/*/*.jpg"``.
        (The name shadows the ``dir`` builtin; it is kept so existing callers
        keep working.)

    Returns
    -------
    pandas.DataFrame
        One row per matched path with two columns: ``file`` (the file's base
        name) and ``class`` (the name of the directory that contains it).
        Rows are ordered by path.
    """
    # glob returns matches in arbitrary order; sorting makes the row order (and
    # anything derived from it, such as a label encoding) reproducible.
    paths = sorted(glob.glob(dir))

    # os.path handles the platform's separator and does not raise on a path
    # that has no parent directory component.
    return pd.DataFrame({
        "file": [os.path.basename(p) for p in paths],
        "class": [os.path.basename(os.path.dirname(p)) for p in paths],
    })

In [None]:
def generate_set(df, dir, classes_):
    """Load every indexed image as a flat pixel vector with its integer label.

    Parameters
    ----------
    df : pandas.DataFrame
        Index with ``class`` and ``file`` columns (as built by ``get_data_df``).
    dir : str
        Root directory of the split; images are read from
        ``<dir>/<class>/<file>``.
    classes_ : list
        Class names; the position of a name in this list is its label.

    Returns
    -------
    tuple(list, list)
        ``(pixels, class_)`` where ``pixels[k]`` is the flattened NumPy array of
        the k-th image and ``class_[k]`` is its index in ``classes_``.

    Raises
    ------
    ValueError
        If a row's class name is not present in ``classes_``.
    """
    pixels = []
    class_ = []

    # Iterate over the two columns directly instead of df.iloc[i][...] lookups.
    for label, file_name in zip(df["class"], df["file"]):
        path = dir + "/" + label + "/" + file_name

        # The context manager releases the file handle as soon as the pixels
        # are copied out. The array is built on the host directly: the previous
        # host -> GPU -> host round trip produced the same values.
        with Image.open(path) as img:
            pixels.append(np.array(img).flatten())

        # y_true encoding: position of the class name in classes_
        class_.append(classes_.index(label))

    return pixels, class_

## Generate the train and test sets

In [None]:
# Index every .jpg stored under <split root>/<class>/ for the train and test splits.
fer_df_train = get_data_df(DATASET_02_TRAIN+"/*/*.jpg")
fer_df_test = get_data_df(DATASET_02_TEST+"/*/*.jpg")

In [None]:
# Class names found in the training index. A name's position in this list is
# the integer label used everywhere below, so the list is sorted to keep that
# encoding stable regardless of the order in which files were discovered.
classes = sorted(fer_df_train["class"].unique().tolist())

In [None]:
# Load the flattened pixel vectors and integer labels for both splits.
x_train, y_train = generate_set(fer_df_train, DATASET_02_TRAIN, classes)
x_test, y_test = generate_set(fer_df_test, DATASET_02_TEST, classes)

### EDA

In [None]:
# Display 10 randomly drawn rows of the test index as a sanity check.
fer_df_test.sample(10)

In [None]:
# fer_df_train.shape

In [None]:
# Single-column frame holding the encoded training labels (used for EDA plots).
x_train_df = pd.DataFrame({"class": y_train})

In [None]:
# x_train_df["class"].hist()

In [None]:
# print(np.array(x_train).shape)
# print(np.array(x_test).shape)

# Feature Engineering

In [None]:
# from sklearn.decomposition import PCA
from cuml.decomposition import PCA
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
import cv2
from skimage.feature import local_binary_pattern
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from matplotlib.pyplot import figure

## LBP & PCA

In [None]:
def generate_set2(df, dir, p, r, classes_, method_="uniform"):
    """Build normalized LBP-histogram features and integer labels.

    Each image is read with OpenCV, converted to grayscale, turned into a
    local-binary-pattern map and summarised as a normalized histogram.

    Parameters
    ----------
    df : pandas.DataFrame
        Index with ``class`` and ``file`` columns.
    dir : str
        Root directory of the split; images are read from
        ``<dir>/<class>/<file>``.
    p : int
        Number of LBP sampling points, forwarded to ``local_binary_pattern``.
    r : int or float
        LBP radius, forwarded to ``local_binary_pattern``.
    classes_ : list
        Class names; the position of a name in this list is its label.
    method_ : str, optional
        LBP method forwarded to ``local_binary_pattern``.

    Returns
    -------
    tuple(list, list)
        ``(pixels, class_)`` where ``pixels[k]`` is a histogram with ``p + 2``
        bins that sums to ~1 and ``class_[k]`` is the index of the image's
        class in ``classes_``.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read an image.
    ValueError
        If a row's class name is not present in ``classes_``.
    """
    # NOTE(review): the fixed p + 2 bins presumably match the "uniform" method's
    # label range; confirm before using another method_.
    pixels = []
    class_ = []
    eps = 1e-7  # keeps the normalization safe if a histogram is all zeros

    # Iterate over the two columns directly instead of df.iloc[i][...] lookups.
    for label, file_name in zip(df["class"], df["file"]):
        path = dir + "/" + label + "/" + file_name

        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None; fail with the path
            # instead of an opaque cvtColor assertion error.
            raise FileNotFoundError("Could not read image: {}".format(path))

        # convert the read img into grayscale
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        lbp_img = local_binary_pattern(img_gray, p, r, method=method_)

        # histogram of LBP codes with integer bin edges 0 .. p + 2
        (hist, _) = np.histogram(
            lbp_img.ravel(),
            bins=np.arange(0, p + 3),
            range=(0, p + 2)
        )

        # normalize so images of different sizes are comparable
        hist = hist.astype("float")
        hist /= (hist.sum() + eps)
        pixels.append(hist)

        # y_true encoding: position of the class name in classes_
        class_.append(classes_.index(label))

    return pixels, class_

In [None]:
# Explained-variance targets and the number of PCA components recorded for each.
selected_exp_var = [0.7, 0.8, 0.9, 0.95, 0.97, 0.99, 1.0]

# component counts without image augmentation
selected_num_comp = [13, 32, 104, 256, 425, 904, 2304]

# component counts with image augmentation
selected_num_comp_aug = [14, 32, 108, 269, 446, 940, 2265]

# LBP settings to compare: p sampling points, radius r
pr_list = [{"p": p, "r": r} for p, r in [(8, 1), (16, 2), (24, 3)]]

In [None]:
# Build the LBP histogram features for the selected (p, r) setting.
pr_index = 2

# pr_list entries carry exactly the keys "p" and "r", so they unpack straight
# into generate_set2's p and r parameters.
x_train_fs, y_train_fs = generate_set2(
    fer_df_train, DATASET_02_TRAIN, classes_=classes, **pr_list[pr_index]
)
x_test_fs, y_test_fs = generate_set2(
    fer_df_test, DATASET_02_TEST, classes_=classes, **pr_list[pr_index]
)

In [None]:
# Fit PCA (default number of components) on the training LBP histograms, then
# project the test histograms with the same fitted transform.
pca = PCA()
x_train_pca = pca.fit_transform(np.array(x_train_fs))
x_test_pca = pca.transform(np.array(x_test_fs))

In [None]:
# Cumulative explained variance after each principal component.
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
# np.round replaces the np.round_ alias, which was removed in NumPy 2.0.
exp_var_cumul_round = np.round(exp_var_cumul, decimals=4)

# 1-based component counts matching exp_var_cumul_round
num_comp = range(1, exp_var_cumul_round.shape[0] + 1)

print(exp_var_cumul_round)

### PCA Visualization

In [None]:
# Axis labels keyed by column index (as a string): component number plus the
# percentage of variance that component explains.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

# Pairwise scatter plots of the first four principal components of the
# training projection; the diagonal panels are hidden.
fig = px.scatter_matrix(
    x_train_pca,
    # x_train_df,
    labels=labels,
    # dimensions=(0, 1),
    dimensions=range(4),
    # color=x_train_df['class'],
)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# 1-based component counts for the x axis
num_comp = range(1, exp_var_cumul_round.shape[0] + 1)

# per-component explained variance in percent (not used by the plot below)
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)

# Line plot: cumulative explained variance against the number of components.
fig = px.line(
    x=num_comp,
    y=exp_var_cumul_round,
    labels={"x": "# Components", "y": "Cumulative Explained Variance"},
    title = "# of components V.S. variance",
    # markers=True
)

fig.show()

In [None]:
# Hard-coded cumulative explained variance for LBP (24, 3) on the augmented
# data, paired position-by-position with the component counts below.
# NOTE(review): presumably copied from the printed PCA output above — confirm
# whenever the dataset changes.
# cumsum_var_243 = [0.8618, 0.9531, 0.9796, 0.9907, 1.0]
cumsum_var_243_aug = [0.8611, 0.9525, 0.9793, 0.9905, 1.0]

selectd_num_comp243 = [2, 7, 13, 18, 26]

In [None]:
# Scree plot: percentage of variance explained by each individual component.
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
# labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]

fig = px.bar(
    x=range(1,len(per_var)+1), 
    y=per_var, 
    title='Scree Plot of # of components and % of explained variance',
    labels={"x": "# Components", "y": "% of Explained Variance"},
)
fig.show()

# Model

## Setup

In [None]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from numpy import mean

from cuml.svm import SVC

In [None]:
def generate_PCA_set(comp, x_train, x_test):
    """Project both splits onto ``comp`` principal components.

    The PCA is fitted on ``x_train`` only and the same fitted transform is then
    applied to ``x_test``.

    Returns
    -------
    tuple
        ``(train_projection, test_projection)``.
    """
    reducer = PCA(n_components=comp)
    train_projection = reducer.fit_transform(x_train)
    return train_projection, reducer.transform(x_test)

In [None]:
def get_model():
    """Return a fresh, unfitted SVC using the one-vs-rest multiclass strategy."""
    svc_options = {"random_state": 1, "multiclass_strategy": "ovr"}
    return SVC(**svc_options)

def get_model2():
    """Return a fresh, unfitted SVC wrapped in scikit-learn's OneVsRestClassifier."""
    # Imported here so the function does not depend on an import cell that is
    # executed later in the notebook.
    from sklearn.multiclass import OneVsRestClassifier

    return OneVsRestClassifier(SVC(random_state=1, multiclass_strategy="ovr"))

In [None]:
def feature_scale(_x_train_, _x_test_):
    """Standardize both splits using statistics learned from the training split.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)``.
    """
    scaler = StandardScaler().fit(_x_train_)
    return scaler.transform(_x_train_), scaler.transform(_x_test_)


def feature_scale2(_x_train_):
    """Standardize a single data set with a scaler fitted on that same set."""
    return StandardScaler().fit_transform(_x_train_)

## Train

In [None]:
# Experiments for LBP (8, 1): one entry per PCA size in selectd_num_comp81.
# The cumsum_var_* lists are hard-coded cumulative explained variances paired
# position-by-position with selectd_num_comp81; the *_aug list is the one
# stored with each experiment.
cumsum_var_81 = [0.8341, 0.9512, 0.9801, 0.9946, 1.0]
cumsum_var_81_aug = [0.8344, 0.9512, 0.9801, 0.9949, 1.0]
selectd_num_comp81 = [2, 4, 7, 8, 10]
lbp_exp81 = list()
pr_index = 0

# LBP histograms are computed once for this (p, r) and reused for every PCA size.
x_train_, y_train_ = generate_set2(fer_df_train, DATASET_02_TRAIN, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)
x_test_, y_test_ = generate_set2(fer_df_test, DATASET_02_TEST, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)

for i in range(len(selectd_num_comp81)):
    x_train_pca, x_test_pca = generate_PCA_set(selectd_num_comp81[i], np.array(x_train_), np.array(x_test_))

    # Each entry bundles a fresh unfitted model with its projected data.
    lbp_exp81.append({
        "explained_variance_%": cumsum_var_81_aug[i],
        "p_r": (pr_list[pr_index]["p"], pr_list[pr_index]["r"]),
        "model": get_model(),
        "x_train": x_train_pca,
        "y_train": y_train_,
        "x_test": x_test_pca,
        "y_test": y_test_,
    })

In [None]:
# Experiments for LBP (16, 2): one entry per PCA size in selectd_num_comp162.
# The cumsum_var_* lists are hard-coded cumulative explained variances paired
# position-by-position with selectd_num_comp162; the *_aug list is the one
# stored with each experiment.
cumsum_var_162 = [0.8335, 0.9501, 0.9779, 0.9899, 1.0]
cumsum_var_162_aug = [0.8349, 0.9495, 0.9775, 0.9897, 1.0]

selectd_num_comp162 = [2, 6, 10, 13, 18]
lbp_exp162 = list()
pr_index = 1

# LBP histograms are computed once for this (p, r) and reused for every PCA size.
x_train_, y_train_ = generate_set2(fer_df_train, DATASET_02_TRAIN, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)
x_test_, y_test_ = generate_set2(fer_df_test, DATASET_02_TEST, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)

for i in range(len(selectd_num_comp162)):    
    x_train_pca, x_test_pca = generate_PCA_set(selectd_num_comp162[i], np.array(x_train_), np.array(x_test_))

    # Each entry bundles a fresh unfitted model with its projected data.
    lbp_exp162.append({
        "explained_variance_%": cumsum_var_162_aug[i],
        "p_r": (pr_list[pr_index]["p"], pr_list[pr_index]["r"]),
        "model": get_model(),
        "x_train": x_train_pca,
        "y_train": y_train_,
        "x_test": x_test_pca,
        "y_test": y_test_,
    })

In [None]:
# Experiments for LBP (24, 3): one entry per PCA size in selectd_num_comp243.
# The cumsum_var_* lists are hard-coded cumulative explained variances paired
# position-by-position with selectd_num_comp243; the *_aug list is the one
# stored with each experiment.
cumsum_var_243 = [0.8618, 0.9531, 0.9796, 0.9907, 1.0]
cumsum_var_243_aug = [0.8611, 0.9525, 0.9793, 0.9905, 1.0]

selectd_num_comp243 = [2, 7, 13, 18, 26]
lbp_exp243 = list()
pr_index = 2

# LBP histograms are computed once for this (p, r) and reused for every PCA size.
x_train_, y_train_ = generate_set2(fer_df_train, DATASET_02_TRAIN, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)
x_test_, y_test_ = generate_set2(fer_df_test, DATASET_02_TEST, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)

for i in range(len(selectd_num_comp243)):
    x_train_pca, x_test_pca = generate_PCA_set(selectd_num_comp243[i], np.array(x_train_), np.array(x_test_))

    # Each entry bundles a fresh unfitted model with its projected data.
    lbp_exp243.append({
        "explained_variance_%": cumsum_var_243_aug[i],
        "p_r": (pr_list[pr_index]["p"], pr_list[pr_index]["r"]),
        "model": get_model(),
        "x_train": x_train_pca,
        "y_train": y_train_,
        "x_test": x_test_pca,
        "y_test": y_test_,
    })
    

    
    
# x_train_, y_train_ = generate_set2(fer_df_train, DATASET_02_TRAIN, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)
# x_test_, y_test_ = generate_set2(fer_df_test, DATASET_02_TEST, pr_list[pr_index]["p"], pr_list[pr_index]["r"], classes)

# i = -1
# x_train_pca, x_test_pca = generate_PCA_set(selectd_num_comp243[i], x_train_, x_test_)

# lbp_exp_best = {
#     "explained_variance_%": cumsum_var_243_aug[i],
#     "p_r": (pr_list[pr_index]["p"], pr_list[pr_index]["r"]),
#     "model": get_model(),
#     "x_train": x_train_pca,
#     "y_train": y_train_,
#     "x_test": x_test_pca,
#     "y_test": y_test_,
# }

### Experiment Tracking

In [None]:
# wandb.init(
#   project="LBP_81_PCA_SVM",
#   notes="LBP with PCA on 81 for pr value",
# )

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
def svm_lbp_pca_training(arr_):
    """Fit and evaluate the model of every experiment in ``arr_``.

    Parameters
    ----------
    arr_ : list of dict
        Experiments, each providing ``"model"``, ``"x_train"``, ``"y_train"``,
        ``"x_test"`` and ``"y_test"``. Each model is fitted in place.

    Returns
    -------
    tuple(list, list)
        ``(unweighted, weighted)``: per-experiment balanced accuracy and plain
        accuracy on the test split, in the order of ``arr_``.
    """
    uw_pcaLBP_acc_result, w_pcaLBP_acc_result = list(), list()

    for experiment in arr_:
        # scaling (statistics come from the training split only)
        x_train_sc, x_test_sc = feature_scale(experiment["x_train"], experiment["x_test"])

        x_train_sc, x_test_sc = np.array(x_train_sc), np.array(x_test_sc)
        y_train = np.array(experiment["y_train"], dtype=np.float32)

        # train the model stored with the experiment
        model = experiment["model"]
        model.fit(x_train_sc, y_train)

        # predict the test set (the train-set prediction was never used and
        # only added run time, so it is no longer computed)
        y_testHat = model.predict(x_test_sc)

        # compute the unweighted accuracy
        uw_acc = balanced_accuracy_score(experiment["y_test"], y_testHat)
        uw_pcaLBP_acc_result.append(uw_acc)
        print("The unweighted accuracy: {}".format(uw_acc))

        # compute the weighted accuracy
        w_acc = accuracy_score(experiment["y_test"], y_testHat)
        print("The weighted accuracy: {}".format(w_acc))
        w_pcaLBP_acc_result.append(w_acc)

    return uw_pcaLBP_acc_result, w_pcaLBP_acc_result



def svm_lbp_pca_training2(arr_):
    """Fit and evaluate the model of a single experiment.

    Parameters
    ----------
    arr_ : dict
        One experiment providing ``"model"``, ``"x_train"``, ``"y_train"``,
        ``"x_test"`` and ``"y_test"``. The model is fitted in place.

    Returns
    -------
    None
        The unweighted and weighted test accuracies are printed only.
    """
    # Delegate to the list-based routine so the scaling, fitting and scoring
    # steps live in one place; it prints the same two accuracy lines.
    svm_lbp_pca_training([arr_])

In [None]:
# Fit and score every LBP (8, 1) experiment; one accuracy per PCA size.
uw_acc81, w_acc81 = svm_lbp_pca_training(lbp_exp81)

In [None]:
# Fit and score every LBP (16, 2) experiment; one accuracy per PCA size.
uw_acc162, w_acc162 = svm_lbp_pca_training(lbp_exp162)

In [None]:
# Fit and score every LBP (24, 3) experiment; one accuracy per PCA size.
uw_acc243, w_acc243 = svm_lbp_pca_training(lbp_exp243)

# svm_lbp_pca_training2(lbp_exp_best)

In [None]:
import plotly.graph_objects as go

# Test accuracy of the LBP (8, 1) experiments against explained variance.
# The x values are read from the experiments themselves so they always match
# the results being plotted.
exp_var_81 = [exp["explained_variance_%"] for exp in lbp_exp81]

fig = go.Figure()
# go.Scatter replaces the deprecated go.Line trace constructor.
fig.add_trace(go.Scatter(
    x=exp_var_81,
    y=uw_acc81,
    mode='markers+lines',
    name='LBP (8,1) Unweighted Test Accuracy',
    marker=dict(
        color='red',
        size=10
    ))
)

fig.add_trace(go.Scatter(
    x=exp_var_81,
    y=w_acc81,
    mode='markers+lines',
    name='LBP (8,1) Weighted Test Accuracy',
    marker=dict(
        color='green',
        size=10
    ))
)


fig.update_layout(
    title="Tuned parameter SVM with LBP (8,1) & PCA",
    xaxis_title="% of Explained Variance",
    yaxis_title="Accuracy (%)"
)

# wandb.log({"LBP_81_PCA_SVM": fig})

fig.show()

In [None]:
import plotly.graph_objects as go

# Test accuracy of the LBP (16, 2) experiments against explained variance.
# The x values are read from the (16, 2) experiments themselves; the plot
# previously reused the (8, 1) variance list.
exp_var_162 = [exp["explained_variance_%"] for exp in lbp_exp162]

fig = go.Figure()
# go.Scatter replaces the deprecated go.Line trace constructor.
fig.add_trace(go.Scatter(
    x=exp_var_162,
    y=uw_acc162,
    mode='markers+lines',
    name='LBP (16,2) Unweighted Test Accuracy',
    marker=dict(
        color='red',
        size=10
    ))
)

fig.add_trace(go.Scatter(
    x=exp_var_162,
    y=w_acc162,
    mode='markers+lines',
    name='LBP (16,2) Weighted Test Accuracy',
    marker=dict(
        color='green',
        size=10
    ))
)


fig.update_layout(
    title="Tuned parameter SVM with LBP (16,2) & PCA",
    xaxis_title="% of Explained Variance",
    yaxis_title="Accuracy (%)"
)

# wandb.log({"LBP_162_PCA_SVM": fig})

fig.show()

In [None]:
import plotly.graph_objects as go

# Test accuracy of the LBP (24, 3) experiments against explained variance.
# The x values are read from the (24, 3) experiments themselves; the plot
# previously reused the (8, 1) variance list.
exp_var_243 = [exp["explained_variance_%"] for exp in lbp_exp243]

fig = go.Figure()
# go.Scatter replaces the deprecated go.Line trace constructor.
fig.add_trace(go.Scatter(
    x=exp_var_243,
    y=uw_acc243,
    mode='markers+lines',
    name='LBP (24,3) Unweighted Test Accuracy',
    marker=dict(
        color='red',
        size=10
    ))
)

fig.add_trace(go.Scatter(
    x=exp_var_243,
    y=w_acc243,
    mode='markers+lines',
    name='LBP (24,3) Weighted Test Accuracy',
    marker=dict(
        color='green',
        size=10
    ))
)


fig.update_layout(
    title="Tuned parameter SVM with LBP (24,3) & PCA",
    xaxis_title="% of Explained Variance",
    yaxis_title="Accuracy (%)"
)

# wandb.log({"LBP_243_PCA_SVM": fig})

fig.show()

# Analysis

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from cuml.metrics import confusion_matrix

import plotly.express as px

from sklearn.metrics import roc_curve, auc
from matplotlib.pyplot import figure
from sklearn.preprocessing import LabelBinarizer

In [None]:
# `lbp_exp_best` only exists in commented-out code, so referencing it raises a
# NameError on a fresh kernel. The experiment it described (LBP (24, 3) with
# the largest PCA size) is the last entry of lbp_exp243, and its model has
# already been fitted by svm_lbp_pca_training(lbp_exp243).
best_item = lbp_exp243[-1]

# Re-create the scaling used during training (statistics from the train split).
x_train_sc_, x_test_sc_ = feature_scale(best_item["x_train"], best_item["x_test"])

x_train_sc_, x_test_sc_ = np.array(x_train_sc_), np.array(x_test_sc_)
y_train_ = np.array(best_item["y_train"], dtype=np.float32)

# Integer predictions and ground truth for the analysis cells below.
y_pred_ = best_item["model"].predict(x_test_sc_)
y_pred_ = np.array(y_pred_, dtype=np.int32)
y_test_ = np.array(best_item["y_test"], dtype=np.int32)

In [None]:
# Confusion matrix of the selected experiment, copied to a NumPy array on the
# host for plotting; confM is a second name for the same array.
conf_matrix = cp.asnumpy(confusion_matrix(y_test_, y_pred_))
confM = conf_matrix

# Heat map with the cell counts written into the cells.
t = "confusion matrix for SVM on LBP (24, 3) with PCA (" + str(best_item["explained_variance_%"]) + " %) var"
fig = px.imshow(conf_matrix, text_auto=True, title=t)
fig.show()

In [None]:
# Per-class accuracy in percent: the diagonal count of each confusion-matrix
# row divided by that row's total.
per_class_acc = [
    (conf_matrix[k][k] / np.array(confM[k]).sum()) * 100
    for k in range(len(classes))
]

for k, class_name in enumerate(classes):
    print("[{}: {} - {}]".format(k, class_name, per_class_acc[k]), end=", ")

In [None]:
y_pred2 = np.array(y_pred_).astype(int)

# Fit the binarizer once on the full set of label indices and reuse it for both
# arrays. Fitting it separately on the predictions yields fewer (or shifted)
# columns whenever the model never predicts some class, which misaligns the
# per-class columns used for the ROC curves.
lb = LabelBinarizer()
lb.fit(np.arange(len(classes)))

end_y_pred2 = lb.transform(y_pred2)
end_y_test = lb.transform(y_test_)

In [None]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(len(classes)):
    fpr[i], tpr[i], _ = roc_curve(end_y_test[:,i], end_y_pred2[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(end_y_test.ravel(), end_y_pred2.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plot ROC curve
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]))
for i in range(len(classes)):
    plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                   ''.format(i, roc_auc[i]))

# figure(figsize=(8, 6), dpi=80)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(bbox_to_anchor=(1.04,1), borderaxespad=0)
plt.show()

In [None]:
print("Classification report for SVM on LBP (24, 3) with PCA (" + str(best_item["explained_variance_%"]) + " %) var")
# Score against y_test_, the ground truth that belongs to the predictions in
# y_pred2, rather than y_test, which comes from a separate loading step.
# Passing labels explicitly keeps target_names aligned with the class indices
# even if a class is missing from both arrays.
print(classification_report(
    y_test_,
    y_pred2,
    labels=list(range(len(classes))),
    target_names=classes,
))