In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree, neighbors, linear_model, svm, neural_network, metrics

# Problem 2a

## Generating Dataset

In [2]:
X = np.genfromtxt('winequality-red.csv', delimiter=';')

In [3]:
train_X = X[1:1001, 0:-1]
train_Y = np.array([X[1:1001, -1]>=6]).astype(int).squeeze()
test_X = X[1001::, 0:-1]
test_Y = np.array([X[1001::, -1]>=6]).astype(int).squeeze()

## Decision Tree

In [4]:
metrics = ["gini", "entropy"]
max_depth = [3, 5, 8]
for i in metrics:
    for j in max_depth:
        clf = tree.DecisionTreeClassifier(criterion=i, max_depth=int(j)).fit(train_X, train_Y)
        train_preds = clf.predict(train_X)
        test_preds = clf.predict(test_X)
        train_acc = clf.score(train_X, train_Y)*100
        test_acc = clf.score(test_X, test_Y)*100
        print("Criterion: {}, Max Depth: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, j, train_acc, test_acc))

Criterion: gini, Max Depth: 3, Training Accuracy: 73.8, Test Accuracy: 71.61936560934892.

Criterion: gini, Max Depth: 5, Training Accuracy: 79.4, Test Accuracy: 72.95492487479132.

Criterion: gini, Max Depth: 8, Training Accuracy: 89.1, Test Accuracy: 67.9465776293823.

Criterion: entropy, Max Depth: 3, Training Accuracy: 73.4, Test Accuracy: 71.78631051752922.

Criterion: entropy, Max Depth: 5, Training Accuracy: 77.7, Test Accuracy: 73.78964941569282.

Criterion: entropy, Max Depth: 8, Training Accuracy: 86.9, Test Accuracy: 69.44908180300501.



## K-Nearest Neighbors

In [5]:
n = [8, 12, 25]
for i in n:
    clf = neighbors.KNeighborsClassifier(n_neighbors=i).fit(train_X, train_Y)
    train_preds = clf.predict(train_X)
    test_preds = clf.predict(test_X)
    train_acc = clf.score(train_X, train_Y)*100
    test_acc = clf.score(test_X, test_Y)*100
    print("N-neighbors: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, train_acc, test_acc))

N-neighbors: 8, Training Accuracy: 75.5, Test Accuracy: 58.931552587646074.

N-neighbors: 12, Training Accuracy: 74.1, Test Accuracy: 59.59933222036727.

N-neighbors: 25, Training Accuracy: 71.8, Test Accuracy: 62.604340567612695.



## Logistic Regression

In [6]:
train_X_wbias = np.hstack((train_X, np.ones_like(train_Y)[:, np.newaxis]))
test_X_wbias = np.hstack((test_X, np.ones_like(test_Y)[:, np.newaxis]))
reg_method = ["l1", "l2", "elasticnet"]
for i in reg_method:
    if i == "l1":
        clf = linear_model.LogisticRegression(penalty=i, solver="saga", max_iter=100000).fit(train_X_wbias, train_Y)
    elif i == "elasticnet":
        clf = linear_model.LogisticRegression(penalty=i, solver="saga", l1_ratio=1, max_iter=100000).fit(train_X_wbias, train_Y)
    else:
        clf = linear_model.LogisticRegression(penalty=i, max_iter=100000).fit(train_X_wbias, train_Y)
    train_preds = clf.predict(train_X_wbias)
    test_preds = clf.predict(test_X_wbias)
    train_acc = clf.score(train_X_wbias, train_Y)*100
    test_acc = clf.score(test_X_wbias, test_Y)*100
    print("Penalty: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, train_acc, test_acc))

Penalty: l1, Training Accuracy: 72.8, Test Accuracy: 75.45909849749583.

Penalty: l2, Training Accuracy: 73.7, Test Accuracy: 76.62771285475793.

Penalty: elasticnet, Training Accuracy: 72.8, Test Accuracy: 75.45909849749583.



## Support Vector Machine

In [7]:
kernels = ["linear", "rbf"]
Cs = [0.01, 10, 1000]
for i in kernels:
    for j in Cs:
        clf = svm.SVC(C=j, kernel=i).fit(train_X, train_Y)
        train_preds = clf.predict(train_X)
        test_preds = clf.predict(test_X)
        train_acc = clf.score(train_X, train_Y)*100
        test_acc = clf.score(test_X, test_Y)*100
        print("Kernel: {}, C: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, j, train_acc, test_acc))
clf = svm.SVC(kernel="poly").fit(train_X, train_Y)
print("Kernel: poly, Training Accuracy: {}, Test Accuracy: {}.\n".format(train_acc, test_acc))

Kernel: linear, C: 0.01, Training Accuracy: 72.0, Test Accuracy: 72.45409015025042.

Kernel: linear, C: 10, Training Accuracy: 73.3, Test Accuracy: 76.62771285475793.

Kernel: linear, C: 1000, Training Accuracy: 73.2, Test Accuracy: 75.79298831385643.

Kernel: rbf, C: 0.01, Training Accuracy: 62.3, Test Accuracy: 60.60100166944908.

Kernel: rbf, C: 10, Training Accuracy: 71.5, Test Accuracy: 71.28547579298832.

Kernel: rbf, C: 1000, Training Accuracy: 77.5, Test Accuracy: 74.95826377295492.

Kernel: poly, Training Accuracy: 77.5, Test Accuracy: 74.95826377295492.



## Neural Network

In [8]:
act_fun = ["logistic", "tanh", "relu"]
for i in act_fun:
    clf = neural_network.MLPClassifier(hidden_layer_sizes=(10, 20, 10), activation=i, max_iter=100000).fit(train_X_wbias, train_Y)
    train_preds = clf.predict(train_X_wbias)
    test_preds = clf.predict(test_X_wbias)
    train_acc = clf.score(train_X_wbias, train_Y)*100
    test_acc = clf.score(test_X_wbias, test_Y)*100
    print("Set 1: Activation Function: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, train_acc, test_acc))
for i in act_fun:
    clf = neural_network.MLPClassifier(hidden_layer_sizes=(20, 50, 10), activation=i, max_iter=100000).fit(train_X_wbias, train_Y)
    train_preds = clf.predict(train_X_wbias)
    test_preds = clf.predict(test_X_wbias)
    train_acc = clf.score(train_X_wbias, train_Y)*100
    test_acc = clf.score(test_X_wbias, test_Y)*100
    print("Set 2: Activation Function: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, train_acc, test_acc))
for i in act_fun:
    clf = neural_network.MLPClassifier(hidden_layer_sizes=(30, 70, 10), activation=i, max_iter=100000).fit(train_X_wbias, train_Y)
    train_preds = clf.predict(train_X_wbias)
    test_preds = clf.predict(test_X_wbias)
    train_acc = clf.score(train_X_wbias, train_Y)*100
    test_acc = clf.score(test_X_wbias, test_Y)*100
    print("Set 3: Activation Function: {}, Training Accuracy: {}, Test Accuracy: {}.\n".format(i, train_acc, test_acc))

Set 1: Activation Function: logistic, Training Accuracy: 75.0, Test Accuracy: 75.79298831385643.

Set 1: Activation Function: tanh, Training Accuracy: 75.8, Test Accuracy: 73.12186978297161.

Set 1: Activation Function: relu, Training Accuracy: 74.4, Test Accuracy: 73.62270450751252.

Set 2: Activation Function: logistic, Training Accuracy: 74.7, Test Accuracy: 76.12687813021702.

Set 2: Activation Function: tanh, Training Accuracy: 76.6, Test Accuracy: 71.28547579298832.

Set 2: Activation Function: relu, Training Accuracy: 76.4, Test Accuracy: 72.78797996661102.

Set 3: Activation Function: logistic, Training Accuracy: 74.7, Test Accuracy: 75.79298831385643.

Set 3: Activation Function: tanh, Training Accuracy: 76.4, Test Accuracy: 72.45409015025042.

Set 3: Activation Function: relu, Training Accuracy: 75.4, Test Accuracy: 77.46243739565944.



# Problem 2b

A confusion matrix is a matrix that takes a prediction $i$ and try to categorize it with to a ground truth $j$, if $i$ and $j$ does not match, it is a false positive or negative, otherwise, it is a true positive and negative.

A precision takes components from confusion matrix (true positives and false positives) and put it in a ratio such that it represents a performance/efficacy value for the prediction algorithm.  

A recall matrix components acts as a precision, but on the true positive and false negative from the confusion matrix. It is a measurement of the ability of the prediction algorithm to output positive values.

A F1 matrix is the weighted average between the recall and precision.

## Best parameters from each category

### Decision Tree: Criterion = entropy, Max Depth = 5, Training Accuracy: 77.7, Test Accuracy: 73.45575959933221.

In [12]:
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5).fit(train_X, train_Y)
test_preds = clf.predict(test_X)
confusion = metrics.confusion_matrix(test_Y, test_preds)
precision = metrics.precision_score(test_Y, test_preds)
recall = metrics.recall_score(test_Y, test_preds)
f1 = metrics.f1_score(test_Y, test_preds)
print("Confusion Matrix =\n", confusion)
print("Precision = ", precision)
print("Recall = ", recall)
print("F1 = ", f1)

Confusion Matrix =
 [[145  94]
 [ 65 295]]
Precision =  0.7583547557840618
Recall =  0.8194444444444444
F1 =  0.787716955941255


### KNN: N-neighbors = 25, Training Accuracy: 71.8, Test Accuracy: 62.604340567612695.

In [13]:
clf = neighbors.KNeighborsClassifier(n_neighbors=25).fit(train_X, train_Y)
test_preds = clf.predict(test_X)
confusion = metrics.confusion_matrix(test_Y, test_preds)
precision = metrics.precision_score(test_Y, test_preds)
recall = metrics.recall_score(test_Y, test_preds)
f1 = metrics.f1_score(test_Y, test_preds)
print("Confusion Matrix =\n", confusion)
print("Precision = ", precision)
print("Recall = ", recall)
print("F1 = ", f1)

Confusion Matrix =
 [[133 106]
 [118 242]]
Precision =  0.6954022988505747
Recall =  0.6722222222222223
F1 =  0.6836158192090396


### Logistic Regression: Penalty = l2, Training Accuracy: 73.7, Test Accuracy: 76.62771285475793.

In [14]:
clf = linear_model.LogisticRegression(max_iter=100000).fit(train_X_wbias, train_Y)
test_preds = clf.predict(test_X_wbias)
confusion = metrics.confusion_matrix(test_Y, test_preds)
precision = metrics.precision_score(test_Y, test_preds)
recall = metrics.recall_score(test_Y, test_preds)
f1 = metrics.f1_score(test_Y, test_preds)
print("Confusion Matrix =\n", confusion)
print("Precision = ", precision)
print("Recall = ", recall)
print("F1 = ", f1)

Confusion Matrix =
 [[156  83]
 [ 57 303]]
Precision =  0.7849740932642487
Recall =  0.8416666666666667
F1 =  0.8123324396782843


### Support Vector Machine: Kernel = linear, C = 10, Training Accuracy: 73.3, Test Accuracy: 76.62771285475793.

In [15]:
clf = svm.SVC(C=10, kernel="linear").fit(train_X, train_Y)
test_preds = clf.predict(test_X)
confusion = metrics.confusion_matrix(test_Y, test_preds)
precision = metrics.precision_score(test_Y, test_preds)
recall = metrics.recall_score(test_Y, test_preds)
f1 = metrics.f1_score(test_Y, test_preds)
print("Confusion Matrix =\n", confusion)
print("Precision = ", precision)
print("Recall = ", recall)
print("F1 = ", f1)

Confusion Matrix =
 [[170  69]
 [ 71 289]]
Precision =  0.8072625698324022
Recall =  0.8027777777777778
F1 =  0.8050139275766017


### Neural Network: Hidden Layer Sizes = 20, 50, 10, Activation Function = Logistic

In [16]:
clf = neural_network.MLPClassifier(hidden_layer_sizes=(20, 50, 10), activation=i, max_iter=100000).fit(train_X_wbias, train_Y)
train_preds = clf.predict(train_X_wbias)
test_preds = clf.predict(test_X_wbias)
confusion = metrics.confusion_matrix(test_Y, test_preds)
precision = metrics.precision_score(test_Y, test_preds)
recall = metrics.recall_score(test_Y, test_preds)
f1 = metrics.f1_score(test_Y, test_preds)
print("Confusion Matrix =\n", confusion)
print("Precision = ", precision)
print("Recall = ", recall)
print("F1 = ", f1)

Confusion Matrix =
 [[159  80]
 [ 77 283]]
Precision =  0.7796143250688705
Recall =  0.7861111111111111
F1 =  0.7828492392807745


## Case study

Case A: I would sell the Logistic Regression model in this case because I want to maximize the number of positive value (good wine), therefore, I should look for the model with the highest Recall Score.

Case B: I would sell the Logisitic Regression model again in this case. The customer does not want to miss too many good wines, this implies that she wants a good recall score on the good wines. However, she also does not desire when classified good wines to actually taste bad (false positives), which implies a precision scoring. Together, I conclude that the best F1 score will satisfy her the most, which the Logistic Regression model is able to offer.