# CS74 Final Project
### Kevin King, Spring 2022
### Due: Tuesday, June 7th

Import libraries needed for the final project:

In [432]:
import autograd.numpy as np
from autograd import grad 
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint 
%matplotlib inline

Import training and test datasets:

In [433]:
train = pd.read_table('Training.csv', delimiter=',').fillna('NULL')
test = pd.read_table('Test.csv', delimiter=',').fillna('NULL')
# print(train['overall'])

### Binary Classifiers

#### Vectorizing the Features (x values)

In [434]:
cutoff = 3
# classes = 1,2,3,4,5
max_cutoff = 4

train_df = pd.DataFrame(train, columns = ['overall', 'reviewText','category'])
x_train = train_df[['reviewText', 'category']].copy()
y_train = train_df[['overall']].copy().astype(float)

test_df = pd.DataFrame(test, columns = ['overall', 'reviewText','category'])
x_test = test_df[['reviewText', 'category']].copy()

y_label = y_train.copy()
y_label.loc[y_train.overall>cutoff] = 1
y_label.loc[y_train.overall<=cutoff] = 0

#### In order to get numeric features for our text column, we will use Scikit-Learn's TFIDF vectorizer function (from TF-IDF_Tutorial.ipynb)

In [435]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initalize vectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.9)

# Fit vectorizer and transform text into features 
test_features = vectorizer.fit_transform(y_label.reviewText.tolist())
test_matrix = pd.DataFrame(test_features.todense())

# comment/uncomment in order to get the table
test_matrix

AttributeError: 'DataFrame' object has no attribute 'reviewText'

#### Sort the Labels into Binary Classes (y values)

In [429]:
from sklearn.linear_model import LogisticRegression 

model = LogisticRegression()
model.fit(test_matrix, y_label)
y_predict = model.predict(test_matrix)
print(y_predict)


  y = column_or_1d(y, warn=True)


ValueError: Found input variables with inconsistent numbers of samples: [4500, 29189]

#### Test features in the test file

In [431]:

# select features of the test file
x_test = pd.DataFrame([test['reviewText'], test['category']]).T
x_test.columns = ['text', 'category']
print(x_train)

y_train_labeled = y_train.copy()
y_train_labeled.loc[y_train.overall > cutoff] = 1
y_train_labeled.loc[y_train.overall <= cutoff] = 0


train_features = vectorizer.fit_transform(x_train.text.tolist())

# Extract the feature names (for visualization purposes)
extracted_features = vectorizer.get_feature_names_out()

# Create a dataframe showing what the TF-IDF feature values are for each word in each samples
review_text_dense_matrix = pd.DataFrame(train_features.todense())
review_text_dense_matrix.columns = extracted_features
review_text_dense_matrix.index = ['sample_'+str(x) for x in review_text_dense_matrix.index]

# Fit vectorizer and transform text into features 
test_features = vectorizer.transform(x_test.text.tolist())
test_dense_matrix = pd.DataFrame(test_features.todense())

test_dense_matrix

model = LogisticRegression()
model.fit(train_features, x_train_labeled.overall.tolist())
print(model.predict(test_features))


                                              reviewText    category
0          all of the reviews for this product are fake.  automotive
1                                 wrong part. our fault.  automotive
2                       this wire set it really sucks!!!  automotive
3      first use, it leaked instantly. even at 5 buck...  automotive
4                                             didn't fit  automotive
...                                                  ...         ...
29184  this is the same plush toy that the official d...        toys
29185  my grandson loved this. it is a great toy, he ...        toys
29186  my kiddo loves them! we are a rock climbing fa...        toys
29187  i bought this for my niece (age 2) and mailed ...        toys
29188  my daughter will love this! she's a huge ninja...        toys

[29189 rows x 2 columns]


AttributeError: 'DataFrame' object has no attribute 'text'

#### Test random line from test file

In [None]:
# get random line from the test file
num_lines = test[test.columns[0]].count()
rand_line = np.random.randint(0, num_lines)
test_sample = [test['reviewText'][rand_line]]
# print(f"Test Sample: {test_sample}")

# Transform into TF-IDF representation using vectorizers we fit earlier 
test_feature = vectorizer.transform(test_sample)
# Create visualization of output matrix
dense_matrix = pd.DataFrame(test_feature.todense())
dense_matrix.columns = extracted_features
dense_matrix.index = ['test_sample']

# dense_matrix


In [409]:
# draft code

# train_import = np.genfromtxt('Training.csv', delimiter=',', dtype=None, invalid_raise=False, filling_values='',
#                    names=('overall','verified','reviewTime','reviewerID','asin','reviewerName',
#                          'reviewText','summary','unixReviewTime','vote','image','style','category'))

# test_import = np.genfromtxt('Test.csv', delimiter=',', dtype=None, invalid_raise=False, filling_values='',
#                    names=('verified','reviewTime','reviewerID','asin','reviewerName',
#                          'reviewText','summary','unixReviewTime','vote','image','style','category'))

# y_train = all the classes of the train file
y_train = pd.DataFrame([train['overall']]).T
y_train.columns = ['overall']

# lines in train data below or equal to the cutoff
good = train[train['overall'] <= cutoff]

# lines in train data above the cutoff
bad = train[train['overall']> cutoff] 

# features of the train data below or equal to the cutoff
x_train = pd.DataFrame([train['overall'], train['reviewText'], train['category']]).T
x_train.columns = ['overall', 'text', 'category']
print(x_train)

# for item in train['category']:
#     print(item)
# separates by above and below cutoff

# good_category = pd.DataFrame([good['overall'], good['category']]).T
# good_category.columns = ['overall', 'category']

# good_reviewText = pd.DataFrame([good['overall'], good['reviewText']]).T
# good_reviewText.columns = ['overall', 'text']

      overall                                               text    category
0           1      all of the reviews for this product are fake.  automotive
1           1                             wrong part. our fault.  automotive
2           1                   this wire set it really sucks!!!  automotive
3           1  first use, it leaked instantly. even at 5 buck...  automotive
4           1                                         didn't fit  automotive
...       ...                                                ...         ...
29184       5  this is the same plush toy that the official d...        toys
29185       5  my grandson loved this. it is a great toy, he ...        toys
29186       5  my kiddo loves them! we are a rock climbing fa...        toys
29187       5  i bought this for my niece (age 2) and mailed ...        toys
29188       5  my daughter will love this! she's a huge ninja...        toys

[29189 rows x 3 columns]


### Other Draft Code - transferred from "CS74_final.ipynb"

In [None]:
def hyperparameter_tuning(cutoff, y_label):
    # Hyperparameter Tuning
    x_train, x_test, y_train, y_test = train_test_split(train_features, y_label, test_size=0.2)
    
    c_list = list(range(0,10,1))
                                                       
    params = {
        'penalty': ['l2'],
        'C': c_list,
        'solver': ['lbfgs','liblinear'],
    }

    logreg = LogisticRegression(max_iter=100)
    
    clf = GridSearchCV(logreg,                # model
                       param_grid = params,   # hyperparameters
                       scoring='f1_macro',        # metric for scoring
                       cv=6, refit=True) 

    clf.fit(x_train, y_train)
    
    return clf

def print_and_csv_ht(cutoff, clf):
    print("Tuned Hyperparameters :", clf.best_params_)
    print("F1 Macro :",clf.best_score_)
    print(clf.best_estimator_)
    
    prediction = clf.predict(test_features).astype(int)
    out = {'id': range(len(prediction)), 'predicted': prediction}
    outdf = pd.DataFrame(out)
    outdf.to_csv(f'{cutoff}.csv', index=False)

def binary_classifier(cutoff, submission):
    # labels classes 
    y_label = construct_labels(cutoff)
    
    # model
    model = LogisticRegression(fit_intercept=False, solver='saga') # change max_iter
    
    # results dictionary
    results = {'Accuracy' : [],
               'Precision' : [],
               'Recall' : [],
               'F1' : []}
    
    # model prediction
    if submission == True:
        model.fit(train_features, y_label)
        y_pred = model.predict(test_features).astype(int) # use this when submitting to Kaggle
    
    elif submission == False:
        x_train, x_test, y_train, y_test = train_test_split(train_features, y_label, test_size=0.1)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test).astype(int)
    
        a, p, r, f1 = evaluate(y_test, y_pred)
        results['Accuracy'].append(a)
        results['Precision'].append(p)
        results['Recall'].append(r)
        results['F1'].append(f1)
        
    return results, y_pred
    
# print(y_pred[0:100])
# print(np.count_nonzero(y_pred==0))

cutoff = 1
max_cutoffs = 4
submission = False;

if submission == True:
    while cutoff <= max_cutoffs:
        print(f"Cutoff: {cutoff}")
        results, y_pred = binary_classifier(cutoff, submission)
        make_csv(y_pred, "binary", cutoff)
        cutoff += 1
    
elif submission == False:
    while cutoff <= max_cutoffs:
        print(f"Cutoff: {cutoff}")
        results, y_pred = binary_classifier(cutoff, submission)
        f1 = results['F1']
        print(f"F1 Macro: {f1}")
        print("\n")

        make_csv(y_pred, "binary", cutoff)
        cutoff += 1