# CS74 Final Project
### Kevin King, Spring 2022
### Due: Tuesday, June 7th

Import general libraries needed for the final project (more to be imported later):

In [1]:
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint 
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Training and Test Datasets

In [2]:
train_table = pd.read_table('Training.csv', delimiter=',').fillna('NULL')
test_table = pd.read_table('Test.csv', delimiter=',').fillna('NULL')

## Binary Classifier

In [4]:
# train data: y = classes, x = features
train_df = pd.DataFrame(train_table, columns = ['overall', 'reviewText','category', 'summary', 'verified', 'unixReviewTime', 'asin'])
y_train_df = train_df[['overall']].copy().astype(float)
x_train_df = train_df[['reviewText', 'category', 'summary', 'verified', 'unixReviewTime', 'asin']].copy()

# test data: x = features
test_df = pd.DataFrame(test_table, columns = ['overall', 'reviewText','category', 'summary', 'verified', 'unixReviewTime', 'asin'])
x_test_df = test_df[['reviewText', 'category', 'summary', 'verified', 'unixReviewTime', 'asin']].copy()

# vectorize train and test features - 2 different ones to improve accuracy
# vectorizer for reviewText
vectorizer_rt = TfidfVectorizer()
train_rt = vectorizer_rt.fit_transform(x_train_df.reviewText.tolist())
test_rt = vectorizer_rt.transform(x_test_df.reviewText.tolist())

#vectorizer for summary
vectorizer_sum = TfidfVectorizer()
train_sum = vectorizer_sum.fit_transform(x_train_df.summary.tolist())
test_sum = vectorizer_sum.transform(x_test_df.summary.tolist())

# train features, test features
train_features = hstack((train_rt, train_sum)).tocsr()
test_features = hstack((test_rt, test_sum)).tocsr()


#### Setup and Methods:

In [109]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


# prints the predictions to a csv file for Kaggle
def make_csv(model_method, y_pred, type_name, num):
    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{type_name}_{model_method}_{num}.csv', index=False)

# creates labels based on cutoff
def construct_labels(train_df, cutoff):
    train_label = train_df.overall.copy()
    train_label.loc[train_df.overall > cutoff] = 1
    train_label.loc[train_df.overall <= cutoff] = 0
    return train_label

# gets accuracy, precision, etc. and adds to results dictionary
def evaluate(results, y_labels, y_pred):
    # get values for accuracy, precision, recall, f1
    assert len(y_labels) == len(y_pred), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(y_labels, y_pred, average='macro')
    accuracy = accuracy_score(y_labels, y_pred)
    roc_auc = roc_auc_score(y_labels, y_pred)
    c_matrix = confusion_matrix(y_labels, y_pred)
    
    # append values to results dictionary
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1'].append(f1)
    results['ROC_AUC'].append(roc_auc)
    results['Confusion Matrix'].append(c_matrix)
    
    return results
    

#### Binary Classifier Method

In [114]:
def binary_classifier(model, cutoff, submission):  
    kf = KFold(n_splits=5)
    
    # results dictionary
    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : [],
               'ROC_AUC': [],
               'Confusion Matrix': []}
    
    train_labels = construct_labels(y_train_df, cutoff)
    matrix = np.zeros((2,2))
    
    # to submit to Kaggle
    if submission == True:
        model.fit(train_features, train_labels)
        y_pred = model.predict(test_features).astype(int)
        print(f"File {cutoff} ready for submission\n")
    
    # not for Kaggle - testing purposes
    elif submission == False:
        # uses k_folds to get F1 average
        for train_index, test_index in kf.split(train_features, train_labels):
            x_train = train_features[train_index]
            x_test = train_features[test_index]
            y_train = train_labels[train_index]
            y_test = train_labels[test_index]

            model.fit(x_train, y_train)
            y_pred = model.predict(x_test).astype(int)

            results = evaluate(results, y_test, y_pred)

        # compute output results
        f1_avg = sum(results['F1']) / len(results['F1'])
        accuracy_avg = sum(results['Accuracy']) / len(results['Accuracy'])
        precision_avg = sum(results['Precision']) / len(results['Precision'])
        roc_auc_avg = sum(results['ROC_AUC']) / len(results['ROC_AUC'])
        
        # confusion matrix
        matrix = np.zeros((2,2))
        for item in results['Confusion Matrix']:
            matrix = np.add(matrix, item)
        matrix_avg = matrix / len(results['Confusion Matrix'])

        # print out results
        print(f"Accuracy Average:{accuracy_avg}")
        print(f"Precision Average: {precision_avg}")
        print(f"F1 Score Average: {f1_avg}")
        print(f"ROC AUC Score Average: {roc_auc_avg}")
        print(f"Confusion Matrix Average:\n {matrix_avg[0]}\n {matrix_avg[1]}")
        print("\n")

    return y_pred

#### Import SK-Learn Models

In [115]:
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC

#### Cutoff 1

In [116]:
# switch to True for Kaggle submission mode
submission = False
cutoff = 1

print(f"BINARY CLASSIFIER - CUTOFF {cutoff}\n")

print(f"Logistic Regression Model: \n")
model = LogisticRegression()
model_name = "Logistic Regression"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Perceptron Model: \n")
model = Perceptron()
model_name = "Perceptron"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Linear SVC Model: \n")
model = LinearSVC()
model_name = "SVC"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Best Model: \n")
model = LogisticRegression(fit_intercept = False)
model_name = "Best"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

BINARY CLASSIFIER - CUTOFF 1

Logistic Regression Model: 

Accuracy Average:0.7908153870452184
Precision Average: 0.7559915567622599
F1 Score Average: 0.6799931086307246
ROC AUC Score Average: 0.6595720993613837
Confusion Matrix Average:
 [483.8 707.6]
 [ 513.6 4132.8]


Perceptron Model: 

Accuracy Average:0.7462094858242974
Precision Average: 0.6717359736701012
F1 Score Average: 0.6501400701552121
ROC AUC Score Average: 0.6569316921524655
Confusion Matrix Average:
 [570.4 621. ]
 [ 860.6 3785.8]


Linear SVC Model: 

Accuracy Average:0.7674507575710889
Precision Average: 0.7126321087861431
F1 Score Average: 0.670847304058305
ROC AUC Score Average: 0.6644959452385926
Confusion Matrix Average:
 [549.  642.4]
 [ 715.2 3931.2]


Best Model: 

Accuracy Average:0.7849914571389952
Precision Average: 0.7406847617912109
F1 Score Average: 0.6839033625538858
ROC AUC Score Average: 0.6695940162488385
Confusion Matrix Average:
 [528.2 663.2]
 [ 592.  4054.4]




#### Cutoff 2

In [122]:
# switch to True for Kaggle submission mode
submission = False
cutoff = 2

print(f"BINARY CLASSIFIER - CUTOFF {cutoff}\n")

print(f"Logistic Regression Model: \n")
model = LogisticRegression()
model_name = "Logistic Regression"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Perceptron Model: \n")
model = Perceptron()
model_name = "Perceptron"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Linear SVC Model: \n")
model = LinearSVC()
model_name = "SVC"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Best Model: \n")
model = LogisticRegression(fit_intercept = False, C=0.4, max_iter=1000)
model_name = "Best"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

BINARY CLASSIFIER - CUTOFF 2

Logistic Regression Model: 

Accuracy Average:0.7330550821586056
Precision Average: 0.7259076373737119
F1 Score Average: 0.7230177962191434
ROC AUC Score Average: 0.727084514926615
Confusion Matrix Average:
 [1678.6  704.6]
 [ 853.8 2600.8]


Perceptron Model: 

Accuracy Average:0.6898193019533809
Precision Average: 0.6803327140171975
F1 Score Average: 0.6777505273651421
ROC AUC Score Average: 0.6842525121166843
Confusion Matrix Average:
 [1614.6  768.6]
 [1042.2 2412.4]


Linear SVC Model: 

Accuracy Average:0.717638315496065
Precision Average: 0.709694186987321
F1 Score Average: 0.7070989557045472
ROC AUC Score Average: 0.7125236334364298
Confusion Matrix Average:
 [1681.   702.2]
 [ 946.2 2508.4]


Best Model: 

Accuracy Average:0.7374058402755267
Precision Average: 0.7293974312670842
F1 Score Average: 0.7271298528296007
ROC AUC Score Average: 0.7308487870780336
Confusion Matrix Average:
 [1674.4  708.8]
 [ 824.2 2630.4]




#### Cutoff 3

In [124]:
# switch to True for Kaggle submission mode
submission = False
cutoff = 3

print(f"BINARY CLASSIFIER - CUTOFF {cutoff}\n")

print(f"Logistic Regression Model: \n")
model = LogisticRegression()
model_name = "Logistic Regression"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Perceptron Model: \n")
model = Perceptron()
model_name = "Perceptron"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Linear SVC Model: \n")
model = LinearSVC()
model_name = "SVC"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Best Model: \n")
model = LogisticRegression(fit_intercept = False, max_iter=100, solver='saga')
model_name = "Best"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

BINARY CLASSIFIER - CUTOFF 3

Logistic Regression Model: 

Accuracy Average:0.7884504604153385
Precision Average: 0.7865316862097389
F1 Score Average: 0.7538160045349208
ROC AUC Score Average: 0.7510590028827466
Confusion Matrix Average:
 [3208.2  347.4]
 [ 887.6 1394.6]


Perceptron Model: 

Accuracy Average:0.744632582438418
Precision Average: 0.7257967623158794
F1 Score Average: 0.7150280097210108
ROC AUC Score Average: 0.7155361727849183
Confusion Matrix Average:
 [2954.6  601. ]
 [ 889.8 1392.4]


Linear SVC Model: 

Accuracy Average:0.7712181853919688
Precision Average: 0.7579095943570767
F1 Score Average: 0.7408103431285216
ROC AUC Score Average: 0.7402185125224704
Confusion Matrix Average:
 [3072.8  482.8]
 [ 852.8 1429.4]


Best Model: 

Accuracy Average:0.7885875758141865
Precision Average: 0.7822326065209326
F1 Score Average: 0.7569151314346408
ROC AUC Score Average: 0.7551918840997422
Confusion Matrix Average:
 [3167.   388.6]
 [ 845.6 1436.6]




#### Cutoff 4

In [119]:
# switch to True for Kaggle submission mode
submission = False
cutoff = 4

print(f"BINARY CLASSIFIER - CUTOFF {cutoff}\n")

print(f"Logistic Regression Model: \n")
model = LogisticRegression()
model_name = "Logistic Regression"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Perceptron Model: \n")
model = Perceptron()
model_name = "Perceptron"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Linear SVC Model: \n")
model = LinearSVC()
model_name = "SVC"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

print(f"Best Model: \n")
model = LogisticRegression(fit_intercept = False)
model_name = "Best"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "binary", cutoff)

BINARY CLASSIFIER - CUTOFF 4

Logistic Regression Model: 

Accuracy Average:0.8741284805680506
Precision Average: 0.8588986918360751
F1 Score Average: 0.7429660240813721
ROC AUC Score Average: 0.7075559835684679
Confusion Matrix Average:
 [4612.4   97. ]
 [637.8 490.6]


Perceptron Model: 

Accuracy Average:0.846310658465567
Precision Average: 0.7574091949902111
F1 Score Average: 0.7271409683389495
ROC AUC Score Average: 0.7191343998626767
Confusion Matrix Average:
 [4369.   340.4]
 [556.8 571.6]


Linear SVC Model: 

Accuracy Average:0.8652216433857489
Precision Average: 0.8074012068999933
F1 Score Average: 0.7450814748750274
ROC AUC Score Average: 0.7219175201320026
Confusion Matrix Average:
 [4501.6  207.8]
 [579.  549.4]


Best Model: 

Accuracy Average:0.8770410412412624
Precision Average: 0.8401897055976473
F1 Score Average: 0.7580952429093939
ROC AUC Score Average: 0.726447576079344
Confusion Matrix Average:
 [4572.   137.4]
 [580.4 548. ]




## Multiclass Classification

In [125]:
def multiclass(model_method, submission):
    kf = KFold(n_splits=5) # can change
    
    train_labels = train_df[['overall']].copy().astype(int)
    
    # Kaggle submission mode
    if submission == True:
        model.fit(train_features, train_labels)
        y_pred = model.predict(test_features).astype(int)
        print(f"File is ready for submission")
        
    elif submission == False:
        # uses k_folds to get F1 average
        for train_index, test_index in kf.split(train_features, train_labels):
            x_train = train_features[train_index]
            x_test = train_features[test_index]
            y_train = train_labels[train_index]
            y_test = train_labels[test_index]

            model.fit(x_train, y_train)
            y_pred = model.predict(x_test).astype(int)

            results = evaluate(results, y_test, y_pred)

        # compute output results
        f1_avg = sum(results['F1']) / len(results['F1'])
        accuracy_avg = sum(results['Accuracy']) / len(results['Accuracy'])
        precision_avg = sum(results['Precision']) / len(results['Precision'])
        roc_auc_avg = sum(results['ROC_AUC']) / len(results['ROC_AUC'])
        
        # confusion matrix
        matrix = np.zeros((2,2))
        for item in results['Confusion Matrix']:
            matrix = np.add(matrix, item)
        matrix_avg = matrix / len(results['Confusion Matrix'])
        
        # print outputs
        print(f"Accuracy Average:{accuracy_avg}")
        print(f"Precision Average: {precision_avg}")
        print(f"F1 Score Average: {f1_avg}")
        print(f"ROC AUC Score Average: {roc_auc_avg}")
        print(f"Confusion Matrix Average:\n {matrix_avg[0]}\n {matrix_avg[1]}")
        print("\n")
    
    return y_pred

#### Run the Multiclass Classifier

In [132]:
# switch to True for Kaggle submission mode
submission = False;

# different types of models

print(f"MULTICLASS\n")

print(f"Logistic Regression Model: \n")
model = LogisticRegression()
model_name = "Logistic Regression"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "multiclass", cutoff)

print(f"Perceptron Model: \n")
model = Perceptron()
model_name = "Perceptron"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "multiclass", cutoff)

print(f"Linear SVC Model: \n")
model = LinearSVC()
model_name = "SVC"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "multiclass", cutoff)

print(f"Best Model: \n")
model = LogisticRegression(fit_intercept=False, multi_class='ovr', class_weight='balanced')
model_name = "Best"
predictions = binary_classifier(model, cutoff, submission)
make_csv(model_name, predictions, "multiclass", cutoff)

MULTICLASS

Logistic Regression Model: 

Accuracy Average:0.7884504604153385
Precision Average: 0.7865316862097389
F1 Score Average: 0.7538160045349208
ROC AUC Score Average: 0.7510590028827466
Confusion Matrix Average:
 [3208.2  347.4]
 [ 887.6 1394.6]


Perceptron Model: 

Accuracy Average:0.744632582438418
Precision Average: 0.7257967623158794
F1 Score Average: 0.7150280097210108
ROC AUC Score Average: 0.7155361727849183
Confusion Matrix Average:
 [2954.6  601. ]
 [ 889.8 1392.4]


Linear SVC Model: 

Accuracy Average:0.7712181853919688
Precision Average: 0.7579095943570767
F1 Score Average: 0.7408103431285216
ROC AUC Score Average: 0.7402185125224704
Confusion Matrix Average:
 [3072.8  482.8]
 [ 852.8 1429.4]


Best Model: 

Accuracy Average:0.7882111746174171
Precision Average: 0.7737577867577881
F1 Score Average: 0.7629052167138334
ROC AUC Score Average: 0.7636954899147825
Confusion Matrix Average:
 [3035.   520.6]
 [ 715.8 1566.4]




## Clustering

#### New Test Data

In [14]:
# new test data
test_df = pd.DataFrame(test_table, columns = ['reviewText','category', 'summary'])
y_test_df = test_df[['category']].copy()
x_test_df = test_df[['reviewText','summary']].copy()

# label each category with numbers
labels = y_test_df.category.unique()
num_labels = len(labels)
label_indices = list(range(0, num_labels))
y_test_df.category.replace(labels, label_indices, inplace=True)
true_labels = np.array(y_test_df).flatten()
print(true_labels)

# vectorize features - reviewText, summary
vectorizer_rt = TfidfVectorizer(stop_words='english', min_df=0.2, max_df=0.9)
test_rt = vectorizer_rt.fit_transform(x_test_df.reviewText.tolist())

vectorizer_sum = TfidfVectorizer(stop_words='english', min_df=0.1, max_df=0.9)
test_sum = vectorizer_sum.fit_transform(x_test_df.summary.tolist())

test_features = hstack((test_rt, test_sum)).tocsr()

[0 0 0 ... 5 5 5]


#### Clustering Method

In [21]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score

def clustering(num_clusters):
    # vectorizes train and test features - words to numbers
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(test_features)
    s_score = silhouette_score(test_features, kmeans.labels_)
    r_score = rand_score(true_labels, kmeans.labels_)
    
    return s_score, r_score

#### Run the Clustering Method

In [22]:
num_clusters = num_labels
s, r = clustering(num_clusters)

print("CLUSTERING")
print(f"Silhouette Score: {s}")
print(f"Random Score: {r}")

CLUSTERING
Silhouette Score: 0.9554411607742243
Random Score: 0.6250878466819787
