# CS74 Final Project
### Kevin King, Spring 2022
### Due: Tuesday, June 7th

Import libraries needed for the final project:

In [1]:
import autograd.numpy as np
from autograd import grad
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

Import training and test datasets:

In [2]:
train_table = pd.read_table('Training.csv', delimiter=',').fillna('NULL')
test_table = pd.read_table('Test.csv', delimiter=',').fillna('NULL')

### Binary Classifier

In [3]:
# prints the predictions to a csv file for Kaggle
def to_csv(y_pred, type):
    type_string = ""
    if type == "binary":
        type_string = "binary_class"
    elif type == "multiclass":
        type_string = "multiclass"
        
    out = {'id': range(len(y_pred)), 'predicted': y_pred}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{type_string}_{cutoff}.csv', index=False)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# train data: y = classes, x = features
train_df = pd.DataFrame(train_table, columns = ['overall', 'reviewText','category', 'verified'])
y_train_df = train_df[['overall']].copy().astype(float)
x_train_df = train_df[['reviewText', 'category', 'verified']].copy()

# test data: x = features
test_df = pd.DataFrame(test_table, columns = ['overall', 'reviewText','category', 'verified'])
x_test_df = test_df[['reviewText', 'category', 'verified']].copy()


In [15]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# vectorizes train and test features - words to numbers
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(x_train_df.reviewText.tolist())
test_features = vectorizer.transform(x_test_df.reviewText.tolist())

def construct_labels(cutoff):
    y_label = y_train_df.overall.copy()
    y_label.loc[y_train_df.overall > cutoff] = 1
    y_label.loc[y_train_df.overall <= cutoff] = 0
    return y_label

def evaluate(y_test, y_pred):
    assert len(y_test) == len(y_pred), 'labels array and predictions array must be the same length'
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy, precision, recall, f1

def binary_classifier(cutoff, submission):
    kf = KFold(n_splits=5) # change n_splits?
    
    # results dictionary
    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : []}
    
    train_labels = construct_labels(cutoff)
    model = LogisticRegression(fit_intercept=False)
    
    if submission == True:
        model.fit(train_features, train_labels)
        y_pred = model.predict(test_features).astype(int)
    
    elif submission == False:
        for train_index, test_index in kf.split(train_features, train_labels):
            x_train = train_features[train_index]
            x_test = train_features[test_index]
            y_train = train_labels[train_index]
            y_test = train_labels[test_index]

            model.fit(x_train, y_train)

            y_pred = model.predict(x_test).astype(int)
            a, p, r, f1 = evaluate(y_test, y_pred)
            results['Accuracy'].append(a)
            results['Precision'].append(p)
            results['Recall'].append(r)
            results['F1'].append(f1)

        f1_avg = sum(results['F1'])/len(results['F1'])
        print(f"F1 Average (k-fold): {f1_avg}")
        
    return results, y_pred

submission = False
binary_classifier(1, submission)

F1 Average (k-fold): 0.5916945263872193


({'Accuracy': [0.696985268927715,
   0.434737923946557,
   0.8247687564234327,
   0.8538883179170949,
   0.8600308377591228],
  'Precision': [0.6726949259939236,
   0.4035637177150162,
   0.6691966644563426,
   0.7530204652905287,
   0.7588472530227138],
  'Recall': [0.5902247725112046,
   0.33106465515096317,
   0.6183296150019597,
   0.654327346560965,
   0.6909403992098928],
  'F1': [0.5827172566536343,
   0.34196980740763006,
   0.6347171141676222,
   0.6832384720099887,
   0.7158299816972216]},
 array([1, 1, 1, ..., 1, 1, 1]))

Run the binary classifier for the possible cutoffs

In [385]:
# print(y_pred[0:100])
# print(np.count_nonzero(y_pred==0))

cutoff = 1
max_cutoffs = 4
submission = False;

if submission == True:
    while cutoff <= max_cutoffs:
        print(f"Cutoff: {cutoff}")
        y_pred, y_test = binary_classifier(cutoff, submission)

        to_csv(y_pred, "binary")
        cutoff += 1
    
elif submission == False:
    while cutoff <= max_cutoffs:
        print(f"Cutoff: {cutoff}")
        
        y_pred, y_test = binary_classifier(cutoff, submission)
        results = evaluate(y_pred, y_test)
        f1 = results['F1']
            
        print(f"F1 Macro: {f1}")
        print("\n")

        to_csv(y_pred, "binary")
        cutoff += 1

Cutoff: 1
F1 Macro: [0.6940908989590531]


Cutoff: 2
F1 Macro: [0.7794862271742495]


Cutoff: 3
F1 Macro: [0.791538140330554]


Cutoff: 4
F1 Macro: [0.6866609727788364]


