# CS74 Final Project
### Kevin King, Spring 2022
### Due: Tuesday, June 7th

Import general libraries needed for the final project (more to be imported later):

In [1]:
import autograd.numpy as np
from autograd import grad
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Training and Test Datasets

In [2]:
train_table = pd.read_table('Training.csv', delimiter=',').fillna('NULL')
test_table = pd.read_table('Test.csv', delimiter=',').fillna('NULL')

## Binary Classifier

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# train data: y = classes, x = features
train_df = pd.DataFrame(train_table, columns = ['overall', 'reviewText','category', 'summary', 'verified', 'unixReviewTime', 'asin'])
y_train_df = train_df[['overall']].copy().astype(float)
x_train_df = train_df[['reviewText', 'category', 'summary', 'verified', 'unixReviewTime', 'asin']].copy()

# test data: x = features
test_df = pd.DataFrame(test_table, columns = ['overall', 'reviewText','category', 'summary', 'verified', 'unixReviewTime', 'asin'])
x_test_df = test_df[['reviewText', 'category', 'summary', 'verified', 'unixReviewTime', 'asin']].copy()

# vectorize train and test features - 2 different ones to improve accuracy
# vectorizer for reviewText
vectorizer_rt = TfidfVectorizer()
train_rt = vectorizer_rt.fit_transform(x_train_df.reviewText.tolist())
test_rt = vectorizer_rt.transform(x_test_df.reviewText.tolist())

#vectorizer for summary
vectorizer_sum = TfidfVectorizer()
train_sum = vectorizer_sum.fit_transform(x_train_df.summary.tolist())
test_sum = vectorizer_sum.transform(x_test_df.summary.tolist())

# train labels, train features, test features
train_features = hstack((train_rt, train_sum)).tocsr()
test_features = hstack((test_rt, test_sum)).tocsr()

#### Setup and Methods:

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold


# prints the predictions to a csv file for Kaggle
def make_csv(model_method, y_pred, type_name, num):
    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{type_name}_{model_method}_{num}.csv', index=False)
    
def construct_labels(cutoff):
    train_label = y_train_df.overall.copy()
    train_label.loc[y_train_df.overall > cutoff] = 1
    train_label.loc[y_train_df.overall <= cutoff] = 0
    return train_label

def evaluate(y_test, y_pred):
    # results dictionary
    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : []}
    
    # get values for accuracy, precision, recall, f1
    assert len(y_test) == len(y_pred), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    
    # append values to results dictionary
    results['Accuracy'].append(accuracy)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1'].append(f1)
    
    return results
    

#### Binary Classifier Method

In [30]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

def binary_classifier(model_method, cutoff, submission, k_folds):  
    kf = KFold(n_splits=5)
    
    train_labels = construct_labels(cutoff)
    matrix = np.zeros((2,2))
    
    if model_method == "Logistic Regression":
        model = LogisticRegression(fit_intercept=False)
    elif model_method == "Perceptron":
        model = Perceptron(fit_intercept=False)
    elif model_method == "SVC":
        model = LinearSVC()
    
    # to submit to Kaggle
    if submission == True:
        model.fit(train_features, train_labels)
        y_pred = model.predict(test_features).astype(int)
        print(f"File {cutoff} ready for submission")
    
    # not for Kaggle - testing purposes
    elif submission == False:
        # uses k_folds to get F1 average
        if k_folds == True:
            for train_index, test_index in kf.split(train_features, train_labels):
                x_train = train_features[train_index]
                x_test = train_features[test_index]
                y_train = train_labels[train_index]
                y_test = train_labels[test_index]

                model.fit(x_train, y_train)
                y_pred = model.predict(x_test).astype(int)
                results = evaluate(y_test, y_pred)
                
                # confusion matrix
                matrix += confusion_matrix(y_test, y_pred)

                # compute output results
                f1_avg = sum(results['F1']) / len(results['F1'])
                accuracy_avg = sum(results['Accuracy']) / len(results['Accuracy'])

                print(f"Cutoff: {cutoff}")
                print(f"Accuracy Average:{accuracy_avg}")
                print(f"F1 Average: {f1_avg}")
                print("Confusion Matrix: \n", matrix)
                print("\n")
                
            matrix = matrix / len(results['F1'])
        
        # uses train_test_split 
        else:
            x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1)
            model.fit(x_train, y_train)
            y_pred = model.predict(x_test).astype(int)

            results = evaluate(y_test, y_pred)
            matrix += (y_test, y_pred)
            
            print(f"Cutoff: {cutoff}")
            print(f"F1 Macro: {results['F1']}")
            print(f"Accuracy:{results['Average']}")
            print("Confusion Matrix: \n", matrix)
            print("\n")
            
        
    
    return y_pred

#### Run the Binary Classifier
If submission mode, change it to True. If needed for testing purposes, change to False.
If you want K-Folds method for testing, make k_folds False.

In [31]:
submission = False
k_folds = True
max_cutoff = 4

# different types of models
models_list = ["Logistic Regression", "Perceptron", "SVC"]

for model in models_list:
    model_method = logreg
    print(f"BINARY CLASSIFIER - {model_method} Model\n")

    # run binary classifier
    while cutoff <= max_cutoff:
        predictions = binary_classifier(model_method, cutoff, submission, k_folds)
        make_csv(model_method, predictions, "binary", cutoff)
        cutoff += 1

BINARY CLASSIFIER - Logistic Regression Model

Cutoff: 1
Accuracy Average:0.7389516957862282
F1 Average: 0.6494404356482083
Confusion Matrix: 
 [[ 682. 1308.]
 [ 216. 3632.]]


Cutoff: 1
Accuracy Average:0.5125042822884549
F1 Average: 0.42238225530485235
Confusion Matrix: 
 [[1025. 1961.]
 [2409. 6281.]]


Cutoff: 1
Accuracy Average:0.8764988009592326
F1 Average: 0.7538423552874298
Confusion Matrix: 
 [[ 1523.  2447.]
 [ 2644. 10900.]]


Cutoff: 1
Accuracy Average:0.8963686193902021
F1 Average: 0.789156009772157
Confusion Matrix: 
 [[ 2058.  2908.]
 [ 2788. 15598.]]


Cutoff: 1
Accuracy Average:0.9006338872708584
F1 Average: 0.8046957567567817
Confusion Matrix: 
 [[ 2641.  3316.]
 [ 2960. 20272.]]


Cutoff: 2
Accuracy Average:0.8165467625899281
F1 Average: 0.8163376286793621
Confusion Matrix: 
 [[2285.  712.]
 [ 359. 2482.]]


Cutoff: 2
Accuracy Average:0.5008564576909901
F1 Average: 0.4959840923268871
Confusion Matrix: 
 [[4034. 1881.]
 [2104. 3657.]]


Cutoff: 2
Accuracy Average:0.62

## Multiclass Classification

In [7]:
from sklearn.metrics import f1_score

def multiclass(model_method, submission):
    kf = KFold(n_splits=5) # change?
    
    if model_method == "Logistic Regression":
        model = LogisticRegression(multi_class='multinomial', fit_intercept=False)
    elif model_method == "Perceptron":
        model = Perceptron()
    elif model_method == "SVC":
        model = LinearSVC(multi_class='ovr')
    
    train_labels = train_df[['overall']].copy().astype(int)
    
    if submission == True:
        model.fit(train_features, train_labels)
        y_pred = model.predict(test_features).astype(int)
        print(f"File is ready for submission")
        
    elif submission == False:
        for train_index, test_index in kf.split(train_features, train_labels):
            x_train = train_features[train_index]
            x_test = train_features[test_index]
            y_train = train_labels.iloc[train_index]
            y_test = train_labels.iloc[test_index]

            model.fit(x_train, y_train)
            y_pred = model.predict(x_test).astype(int)
            results = evaluate(y_test, y_pred)

        print(f"Cutoff: {cutoff}")
        f1_avg = sum(results['F1'])/len(results['F1'])
        print(f"F1 Average (k-fold): {f1_avg}")
        print("\n")
    
    return y_pred

#### Run the Multiclass Classifier

In [8]:
submission = False;

# different types of models
logreg = "Logistic Regression"
perceptron = "Perceptron"
svc = "SVC"

model_method = logreg
print(f"MULTICLASS - {model_method} MODEL\n")

# run the multiclass method
if submission == True:
    predictions = multiclass(model_method, submission)
    make_csv(model_method, predictions, "multiclass", 5)
    
elif submission == False:
    predictions = multiclass(model_method, submission)
    make_csv(model_method, predictions, "multiclass", 5)

MULTICLASS - Logistic Regression MODEL

Cutoff: 5
F1 Average (k-fold): 0.5752187239537705




## Clustering

#### New Test Data

In [44]:
# test data
test_df = pd.DataFrame(test_table, columns = ['reviewText','category'])
y_test_df = test_df[['category']].copy()
x_test_df = test_df[['reviewText']].copy()

# label each category with numbers
labels = y_test_df.category.unique()
num_labels = len(labels)
label_indices = list(range(0, num_labels))
y_test_df.category.replace(labels, label_indices, inplace=True)
true_labels = np.array(y_test_df).flatten()

# vectorize clusters
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.1, max_df=0.9)
test_features = vectorizer.fit_transform(x_test_df.reviewText.tolist())

#### Clustering Method

In [45]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import rand_score

def clustering(num_clusters):
    # vectorizes train and test features - words to numbers
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(test_features)
    s_score = silhouette_score(test_features, kmeans.labels_)
    r_score = rand_score(true_labels, kmeans.labels_)
    
    return s_score, r_score

#### Run the Clustering Method

In [46]:
num_clusters = 6
s, r = clustering(num_clusters)

print("CLUSTERING")
print(f"Silhouette Score: {s}")
print(f"Random Score: {r}")

CLUSTERING
Silhouette Score: 0.5997005311524475
Random Score: 0.6591235583216023
