Initializations

In [89]:
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

TRAIN_RATIO = 0.6
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.2 
QUALITY_THRESHOLD = 5

DATA_PATH = 'data/winequality-white.csv'



TASK 1 \\
Reads in data (expects the winequality-white.csv file in data subdirectory) 

In [90]:
X = pd.read_csv(DATA_PATH, header=0, delimiter=";")
y = np.array([1 if quality > QUALITY_THRESHOLD else 0 for quality in X['quality'] ])
X.drop('quality', axis=1, inplace=True)

TASK 2: \\
Calculates the ratio of ones and zeros

In [91]:
ratioOnes = sum(y) / len(y)
print(f"The Dataset is {ratioOnes * 100: .2f}% oness and {(1 - ratioOnes) * 100: .2f}% Zeros")

The Dataset is  66.52% oness and  33.48% Zeros


TASK 3: \\
Splits the data 60/20/20 for test/validate train, and reports the results \\
Data is saved in the data as appropriatley named csv files

In [92]:
X_temp, X_test, y_temp, y_test = train_test_split(X, y, train_size=TRAIN_RATIO + VALIDATION_RATIO, stratify=y)
X_train, X_validate, y_train, y_validate = train_test_split(X_temp, y_temp, train_size=TRAIN_RATIO / (TRAIN_RATIO + VALIDATION_RATIO), stratify=y_temp)
y_train, y_validate, y_test = pd.DataFrame(y_train), pd.DataFrame(y_validate), pd.DataFrame(y_test)
print(f"The size of the training set is {len(X_train)}")
print(f"The size of the validation set is {len(X_validate)}")
print(f"The size of the testing set is {len(X_test)}")
print(f"The ratio is approximatley {len(X_train) / len(X) * 100:.0f}% Train, {len(X_validate) / len(X) * 100:.0f}% Validate, {len(X_test) / len(X)*100:.0f}% Test")
X_train.to_csv("data/trainX.csv")
X_validate.to_csv("data/validateX.csv")
X_test.to_csv("data/testX.csv")
y_train.to_csv("data/trainy.csv")
y_validate.to_csv("data/validatey.csv")
y_test.to_csv("data/testy.csv")

The size of the training set is 2938
The size of the validation set is 980
The size of the testing set is 980
The ratio is approximatley 60% Train, 20% Validate, 20% Test


TASK 4: \\
Data is read from the data subdirectory, using naming conventions as seeen above \\
Features are then standardized, and a single example is printed

In [93]:

X_train = pd.read_csv("data/trainX.csv", index_col=0)
X_validate = pd.read_csv("data/validateX.csv", index_col=0)
X_test = pd.read_csv("data/testX.csv", index_col=0)
y_train = pd.read_csv("data/trainy.csv", index_col=0)['0'].values.tolist()
y_validate = pd.read_csv("data/validatey.csv", index_col=0)['0'].values.tolist()
y_test = pd.read_csv("data/testy.csv", index_col=0)['0'].values.tolist()
scalar = StandardScaler().fit(X_train)
standardized_X_train = scalar.transform(X_train) 
scalar = StandardScaler().fit(X_validate)
standardized_X_validate = scalar.transform(X_validate) 
scalar = StandardScaler().fit(X_test)
standardized_X_test = scalar.transform(X_test)


TASK 5: \\
A variaty of models are trained on the standardized data, and a report is generated for each based on validation data. \\

In [94]:
class ModelTemplate():
    def __init__(self, baseName, generator, variants):
        self.baseName = baseName
        self.generator = generator
        self.variants = variants

def createLikeModels(modelData, reportData, savedModels):
    for variant in modelData.variants:
        model = modelData.generator(variant)
        model.fit(standardized_X_train, y_train)
        prediction = model.predict(standardized_X_validate)
        trueP = 0
        falseP = 0
        trueN = 0
        falseN = 0

        for i in range(len(y_validate)):
            if y_validate[i]:
                if prediction[i]:
                    trueP += 1
                else:
                    falseN += 1
            else:
                if prediction[i]:
                    falseP += 1
                else:
                    trueN += 1

        accuracy = (trueP + trueN) / len(prediction)
        recall = trueP / (trueP + falseN)
        precision = trueP / (trueP + falseP)
        f1 = 2 * recall * precision / (recall + precision)

        reportData["model"].append(f"{modelData.baseName}_{variant}")
        reportData["accuracy"].append(accuracy)
        reportData["precision"].append(precision)
        reportData["recall"].append(recall)
        reportData["f1"].append(f1)

        savedModels[f"{modelData.baseName}_{variant}"] = model
    return

data = {
    "model": [],
    "accuracy": [],    
    "precision": [],
    "recall": [],
    "f1": []
}

trainedModels = {}

modelTemplates = []
modelTemplates.append(ModelTemplate("kNN", lambda x: neighbors.KNeighborsClassifier(n_neighbors=x), [1,3,5]))
modelTemplates.append(ModelTemplate("SVM", lambda x: SVC(kernel=x), ["rbf", "linear", "poly"]))
modelTemplates.append(ModelTemplate("TREE", lambda x: DecisionTreeClassifier(criterion=x), ["gini", "entropy"]))
modelTemplates.append(ModelTemplate("LOG_REG", lambda x: LogisticRegression(penalty=x, solver="liblinear"), ["l1", "l2"]))

for template in modelTemplates:
    createLikeModels(template, data, trainedModels)


results = pd.DataFrame(data, columns=["model", "accuracy", "precision", "recall", "f1"])
print(results)

          model  accuracy  precision    recall        f1
0         kNN_1  0.779592   0.831307  0.838957  0.835115
1         kNN_3  0.756122   0.800582  0.843558  0.821509
2         kNN_5  0.750000   0.791130  0.848160  0.818653
3       SVM_rbf  0.771429   0.801408  0.872699  0.835536
4    SVM_linear  0.755102   0.777628  0.884969  0.827834
5      SVM_poly  0.732653   0.735507  0.934049  0.822973
6     TREE_gini  0.713265   0.796800  0.763804  0.779953
7  TREE_entropy  0.707143   0.778626  0.782209  0.780413
8    LOG_REG_l1  0.753061   0.780822  0.874233  0.824891
9    LOG_REG_l2  0.754082   0.781122  0.875767  0.825741


TASK 6/7: \\
The best model is selected using the highest f1 score, and a report is generated based on testing data. \\
In this case, the best model ended up being SVM_rbf

In [95]:
def reportTraining(model):
    prediction = model.predict(standardized_X_test)
    trueP = 0
    falseP = 0
    trueN = 0
    falseN = 0

    for i in range(len(y_test)):
        if y_test[i]:
            if prediction[i]:
                trueP += 1
            else:
                falseN += 1
        else:
            if prediction[i]:
                falseP += 1
            else:
                trueN += 1

    accuracy = (trueP + trueN) / len(prediction)
    recall = trueP / (trueP + falseN)
    precision = trueP / (trueP + falseP)
    f1 = 2 * recall * precision / (recall + precision)
    print("Confusion Matrix:")
    print("--------------------------------")
    print(f"True Positives: {trueP}", end='')
    print(f"    False Positives: {falseP}")
    print(f"True Negatives: {trueN}", end='')
    print(f"    False Negatives: {falseN}")
    print("--------------------------------")
    print()
    print("Statistics:")
    print("--------------------------------")
    print(f"Accuracy: {accuracy: .4f}")
    print(f"Precision: {precision: .4f}")
    print(f"Recall: {recall: .4f}")
    print(f"F1: {f1: .4f}")



bestModel = results['model'].values[results['f1'].idxmax()]
print(f"The Best Model is {bestModel}")
print("------------------------------")
model = trainedModels[bestModel]
reportTraining(model)


The Best Model is SVM_rbf
------------------------------
Confusion Matrix:
--------------------------------
True Positives: 559    False Positives: 146
True Negatives: 182    False Negatives: 93
--------------------------------

Statistics:
--------------------------------
Accuracy:  0.7561
Precision:  0.7929
Recall:  0.8574
F1:  0.8239
