# Classifier Tests Jupyter Notebook


### Class: CPSC 322, Spring 2021


### Submitted By: Hailey Mueller and Chloe Crawford

In [1]:
import importlib

# Import myutils.py from mysklearn folder
import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

# Import mypytable.py from mysklearn folder
import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# Import myclassifiers.py from mysklearn folder
import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyRandomForestClassifier, MyDecisionTreeClassifier, MyNaiveBayesClassifier

# Import myevaluation.py from mysklearn folder
import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

## Random Forest Classifier

In [2]:
"""
Programmer: Chloe Crawford
Class: CPSC 322-01, Spring 2021
Assignment: Final Project
Date Last Updated: 5/05/21

Description: This python script calculates the accuracy, error rate, and confusion matrix of the 
    Random Forest Classifier using train test split.
"""

import numpy as np
from tabulate import tabulate 

# Get data from csv file
table = MyPyTable().load_from_file("input_files/winequality-red.csv")
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Split data into train and test cases
X_train, X_test, y_train, y_test = myutils.train_test_split(x_cols, y_col, .33)

# Use Random Forest to classify
testcase = MyRandomForestClassifier()
testcase.fit(X_train, y_train, X_test, y_test)
y_predicted = testcase.predict(X_test)

numCorrectPredictions = 0
numWrongPredictions = 0
for i in range(len(y_test)):
    values = [y_predicted[i], y_test[i]] #predicted/actual
    if(values[0]==values[1]):
        numCorrectPredictions = numCorrectPredictions+1
    else:
        numWrongPredictions = numWrongPredictions+1

accuracy = np.round((numCorrectPredictions)/(numCorrectPredictions+numWrongPredictions),3)
error_rate = np.round((numWrongPredictions)/(numCorrectPredictions+numWrongPredictions),3)

print("-----------------------------------------------------------")
print("Accuracy and Error Rate")
print("-----------------------------------------------------------")
print("Random Forest: accuracy = {}, error rate = {}".format(accuracy,error_rate))

# Confusion Matrix
y_true = []
y_pred = []
for i,fold in enumerate(X_test):
    values = [y_predicted[i], y_test[i]] #predicted/actual
    y_pred.append(values[0])
    y_true.append(values[1])

labels = [3,4,5,6,7,8]
lr_test = myevaluation.confusion_matrix(y_true, y_pred, labels)
labels.insert(0,"Quality")
labels.append("Total")
labels.append("Recognition (%)")

for i,row in enumerate(lr_test):
    row.insert(0,labels[i+1])
    rowSum = sum(row[1:])
    row.append(rowSum)
    numCorrect = row[i+1]
    if(rowSum==0):
        row.append(0)
    else:
        row.append(round((numCorrect/rowSum)*100,2))

print("\n-----------------------------------------------------------")
print("Confusion Matrix")
print("-----------------------------------------------------------")
print("Random Forest (Train Test Split Validation Results):")
print(tabulate(lr_test, headers=labels,tablefmt="rst"))

-----------------------------------------------------------
Accuracy and Error Rate
-----------------------------------------------------------
Random Forest: accuracy = 0.375, error rate = 0.625

-----------------------------------------------------------
Confusion Matrix
-----------------------------------------------------------
Random Forest (Train Test Split Validation Results):
  Quality    3    4    5    6    7    8    Total    Recognition (%)
        3    0    0    5    0    0    0        5                  0
        4    0    0   21    0    0    0       21                  0
        5    0    0  198    0    0    0      198                100
        6    0    0  245    0    0    0      245                  0
        7    0    0   52    0    0    0       52                  0
        8    0    0    7    0    0    0        7                  0


## Naive Bayes Classifier

In [3]:
"""
Programmer: Hailey Mueller
Class: CPSC 322-01, Spring 2021
Assignment: Final Project
Date Last Updated: 5/05/21

Description: This python script calculates the accuracy, error rate, and confusion matrix of the 
    Naive-Bayes Classifier using Stratified K-Fold Cross Validation.
"""

importlib.reload(mysklearn.mypytable)
importlib.reload(mysklearn.myevaluation)
import numpy as np
from tabulate import tabulate

# Get data from csv file
table = MyPyTable().load_from_file(os.path.join("input_files","winequality-red.csv"))
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Use Naive Bayes to classify
testcase = MyNaiveBayesClassifier()

#Returns x INDEXES
X_train, X_test = myevaluation.stratified_kfold_cross_validation(x_cols,y_col,n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(X_train, X_test, x_cols,y_col)

predicted_values = []
for i,fold in enumerate(X_train):
    train,test = myutils.normalize_values(X_train[i],X_test[i])
    testcase.fit(train,y_train[i])
    predicted_values.append(testcase.predict(test))

numCorrectPredictions = 0
numWrongPredictions = 0
for i,fold in enumerate(X_test):
    for index in range(len(fold)):
        values = [predicted_values[i][index], y_test[i][index]] #predicted/actual
        if(values[0]==values[1]):
            numCorrectPredictions = numCorrectPredictions+1
        else:
            numWrongPredictions = numWrongPredictions+1

accuracy = np.round((numCorrectPredictions)/(numCorrectPredictions+numWrongPredictions),3)
error_rate = np.round((numWrongPredictions)/(numCorrectPredictions+numWrongPredictions),3)

print("-----------------------------------------------------------")
print("Accuracy and Error Rate")
print("-----------------------------------------------------------")
print("Naive Bayes: accuracy = {}, error rate = {}".format(accuracy,error_rate))

# Confusion Matrix
y_true = []
y_pred = []
for i,fold in enumerate(X_test):
    for index in range(len(fold)):
        values = [predicted_values[i][index], y_test[i][index]] #predicted/actual
        y_pred.append(values[0])
        y_true.append(values[1])

labels = [3,4,5,6,7,8]
lr_test = myevaluation.confusion_matrix(y_true, y_pred, labels)
labels.insert(0,"Quality")
labels.append("Total")
labels.append("Recognition (%)")

for i,row in enumerate(lr_test):
    row.insert(0,labels[i+1])
    rowSum = sum(row[1:])
    row.append(rowSum)
    numCorrect = row[i+1]
    if(rowSum==0):
        row.append(0)
    else:
        row.append(round((numCorrect/rowSum)*100,2))

print("\n-----------------------------------------------------------")
print("Confusion Matrix")
print("-----------------------------------------------------------")
print("Naive-Bayes (Stratified 10-Fold Cross Validation Results):")
print(tabulate(lr_test, headers=labels,tablefmt="rst"))

-----------------------------------------------------------
Accuracy and Error Rate
-----------------------------------------------------------
Naive Bayes: accuracy = 0.6, error rate = 0.4

-----------------------------------------------------------
Confusion Matrix
-----------------------------------------------------------
Naive-Bayes (Stratified 10-Fold Cross Validation Results):
  Quality    3    4    5    6    7    8    Total    Recognition (%)
        3    0    0    9    1    0    0       10               0
        4    0    0   41   12    0    0       53               0
        5    0    1  553  118    9    0      681              81.2
        6    0    0  294  332   12    0      638              52.04
        7    0    0   79   47   73    0      199              36.68
        8    0    0   10    2    4    2       18              11.11


## Decision Tree Classifier

In [4]:
"""
Programmer: Hailey Mueller
Class: CPSC 322-01, Spring 2021
Assignment: Final Project
Date Last Updated: 5/05/21

Description: This python script calculates the accuracy, error rate, and confusion matrix of the 
    Decision Tree Classifier using Stratified K-Fold Cross Validation.
"""

importlib.reload(mysklearn.mypytable)
importlib.reload(mysklearn.myevaluation)
import numpy as np

# Get data from csv file
table = MyPyTable().load_from_file(os.path.join("input_files","winequality-red.csv"))
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Use Decision Tree to classify
testcase = MyDecisionTreeClassifier()

#Returns x INDEXES
X_train, X_test = myevaluation.stratified_kfold_cross_validation(x_cols,y_col,n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(X_train, X_test, x_cols,y_col)

predicted_values = []
for i,fold in enumerate(X_train):
    train = myutils.categorize_dataset(X_train[i])
    test = myutils.categorize_dataset(X_test[i])
    testcase.fit(train,y_train[i])
    predicted_values.append(testcase.predict(test))

#testcase.print_decision_rules(table.column_names[:-1],"quality")

numCorrectPredictions = 0
numWrongPredictions = 0
for i,fold in enumerate(X_test):
    for index in range(len(fold)):
        values = [predicted_values[i][index], y_test[i][index]] #predicted/actual
        if(values[0]==values[1]):
            numCorrectPredictions = numCorrectPredictions+1
        else:
            numWrongPredictions = numWrongPredictions+1

accuracy = np.round((numCorrectPredictions)/(numCorrectPredictions+numWrongPredictions),3)
error_rate = np.round((numWrongPredictions)/(numCorrectPredictions+numWrongPredictions),3)

print("-----------------------------------------------------------")
print("Accuracy and Error Rate")
print("-----------------------------------------------------------")
print("Decision Tree: accuracy = {}, error rate = {}".format(accuracy,error_rate))

# Confusion Matrix
y_true = []
y_pred = []
for i,fold in enumerate(X_test):
    for index in range(len(fold)):
        values = [predicted_values[i][index], y_test[i][index]] #predicted/actual
        y_pred.append(values[0])
        y_true.append(values[1])

labels = [3,4,5,6,7,8]
lr_test = myevaluation.confusion_matrix(y_true, y_pred, labels)
labels.insert(0,"Quality")
labels.append("Total")
labels.append("Recognition (%)")

for i,row in enumerate(lr_test):
    row.insert(0,labels[i+1])
    rowSum = sum(row[1:])
    row.append(rowSum)
    numCorrect = row[i+1]
    if(rowSum==0):
        row.append(0)
    else:
        row.append(round((numCorrect/rowSum)*100,2))

print("\n-----------------------------------------------------------")
print("Confusion Matrix")
print("-----------------------------------------------------------")
print("Decision Tree (Stratified 10-Fold Cross Validation Results):")
print(tabulate(lr_test, headers=labels,tablefmt="rst"))

-----------------------------------------------------------
Accuracy and Error Rate
-----------------------------------------------------------
Decision Tree: accuracy = 0.579, error rate = 0.421

-----------------------------------------------------------
Confusion Matrix
-----------------------------------------------------------
Decision Tree (Stratified 10-Fold Cross Validation Results):
  Quality    3    4    5    6    7    8    Total    Recognition (%)
        3    0    0    6    4    0    0       10               0
        4    2    0   26   23    2    0       53               0
        5    3    2  477  187   11    1      681              70.04
        6    2    1  190  393   52    0      638              61.6
        7    0    0   22  121   56    0      199              28.14
        8    0    0    4    6    8    0       18               0
