# Hard-Easy Binary Models

In [1]:
import sys
import json
import pandas as pd
import numpy as np

In [2]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
with open('df_by_usr.json', 'r', encoding='UTF-8') as f:
    data = f.readlines()
    df_by_usr_data = list(map(json.loads, data))
    
df_by_usr = pd.DataFrame(df_by_usr_data)

In [4]:
df_by_usr.head(3)

Unnamed: 0,avg_star_delta,reviewer_label,text_agg,user_id
0,-0.318182,med,"Ummm, due to the star ratings on Yelp we made ...",--3WaS23LcIXtxyFULJHTA
1,-0.947368,hard,"In my opinion, this restaurant has the best fo...",--4rAAfZnEIAKJE80aIiYg
2,0.333333,med,All is right with the world. After going to th...,--CIuK7sUpaNzalLAlHJKA


In [5]:
df_hard_easy = df_by_usr[df_by_usr.reviewer_label != 'med'] 

In [6]:
df_hard_easy.head(5)

Unnamed: 0,avg_star_delta,reviewer_label,text_agg,user_id
1,-0.947368,hard,"In my opinion, this restaurant has the best fo...",--4rAAfZnEIAKJE80aIiYg
3,0.9,easy,One of the best buffets I have been to for the...,--HCoE1ghaAlcaAfshICgw
5,-0.666667,hard,"At 1200+ reviews, there's basically nothing to...",--WLHsm-AC4jcol2gOkmCQ
6,-0.65625,hard,"Popular sandwich place, located on the Vegas s...",-00kdEIhCt-ODaV4BS-EAg
10,-0.5,hard,The food was pretty good. \n\nPrices a little ...,-0Xu57zrI3Rxi8wGZsnWKQ


In [7]:
train_data, test_data, train_target, test_target = train_test_split(df_hard_easy['text_agg'],
                                                                    df_hard_easy['reviewer_label'])

In [8]:
train_data[0:5]

31038    The food is good but the owners are so rude  a...
4089     Just got out of meetings and wanted Thai food....
13002    This is the original Fat willy's here in the v...
4106     I had a chance to stop by and meet the new GM ...
24613    Best AYCE sushi in Mississauga. Sushi quality ...
Name: text_agg, dtype: object

In [9]:
train_target[0:5]

31038    hard
4089     easy
13002    easy
4106     easy
24613    easy
Name: reviewer_label, dtype: object

In [10]:
str_test_data = []
for text in test_data:
    str_test_data.append(str(text))

In [11]:
str_train_data = []
for text in train_data:
    str_train_data.append(str(text))

In [12]:
vectorizer = TfidfVectorizer(binary=True, ngram_range=(1,2), max_df=0.95) 

In [13]:
train_features = vectorizer.fit_transform(str_train_data)

In [14]:
test_features = vectorizer.transform(str_test_data)

In [15]:
from sklearn.linear_model import PassiveAggressiveClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html
# http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf

In [16]:
pac = PassiveAggressiveClassifier().fit(train_features, train_target)



In [17]:
pac.score(test_features, test_target)

0.9491758241758241

In [19]:
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression(C=100000000.0, class_weight=None, dual=False,
             fit_intercept=True, intercept_scaling=1, max_iter=100,
             multi_class='multinomial', n_jobs=1, penalty='l2',
             random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
             warm_start=False)
logistic.fit(train_features, train_target)
print(logistic.score(test_features, test_target))

0.9378434065934066


In [30]:
from sklearn.linear_model import SGDClassifier

sdgc = SGDClassifier(loss='log', penalty='elasticnet', l1_ratio=0.2)
sdgc.fit(train_features, train_target)
print(sdgc.score(test_features, test_target))



0.9271978021978022


In [20]:
pac2 = PassiveAggressiveClassifier(fit_intercept=False).fit(train_features, train_target)
pac2.score(test_features, test_target)



0.9488324175824175

In [21]:
from sklearn.model_selection import GridSearchCV

grid_params = dict(C=np.arange(0.5, 1.5, 0.5))
gs_pac = PassiveAggressiveClassifier()
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=10)

gs.fit(train_features, train_target)
gs.score(test_features, test_target)



0.946771978021978

In [22]:
gs.best_params_

{'C': 1.0}

In [25]:
grid_params = dict(C=np.arange(0.9, 1.1, 0.5))
gs_pac = PassiveAggressiveClassifier()
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=5)

gs.fit(train_features, train_target)
print(gs.score(test_features, test_target))
print(gs.best_params_)



0.9478021978021978
{'C': 0.9}


In [26]:
gs.score(test_features, test_target)

0.9478021978021978

In [27]:
grid_params = dict(C=np.arange(0.9, 1.1, 0.01))
gs_pac = PassiveAggressiveClassifier(fit_intercept=False)
gs = GridSearchCV(estimator=gs_pac,
                  param_grid=grid_params,
                  cv=5)

gs.fit(train_features, train_target)
print(gs.score(test_features, test_target))
print(gs.best_params_)



0.9481456043956044
{'C': 0.9700000000000001}


In [29]:
# Best Model thus far for only hard and easy reviewers 
pac_best = PassiveAggressiveClassifier(C=0.97).fit(train_features, train_target)
pac_best.score(test_features, test_target)

0.9491758241758241