# Amazon Fine Food Reviews - Helpfulness

In [1]:
from time import time
from operator import itemgetter
#import nltk
import sqlite3
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model.stochastic_gradient import SGDClassifier

#### Import data  

In [2]:
con = sqlite3.connect('./input/database.sqlite')
df = pd.read_sql_query("""
SELECT Id, UserId, HelpfulnessNumerator, HelpfulnessDenominator, Score, Summary, Text, Time
FROM Reviews where HelpfulnessDenominator > 10
""", con)

#### Pre-processing

In [3]:
df['Time'] = pd.to_datetime(df['Time'], unit='s')
df['WordCount'] = map(lambda x: len(x.split()), df['Text'])
df['CharCount'] = map(lambda x: len(x), df['Text'])
df['SentenceCount'] = map(lambda x: len(x.splitlines()), df['Text'])
df['ARI'] = 4.71 * (df['CharCount'].astype(float) / df['WordCount']) + 0.5 * (df['WordCount'].astype(float) / df['SentenceCount']) - 21.43
df['Year'] = pd.DatetimeIndex(df['Time']).year
df['Month'] = pd.DatetimeIndex(df['Time']).month
le = LabelEncoder()
df['UserId_le'] = le.fit_transform(df['UserId'])
df['HelpfulnessDenominator'].fillna(0, inplace=True)
df['HelpfulnessNumerator'].fillna(0, inplace=True)
df['HelpfulnessRatio'] = np.divide(df['HelpfulnessNumerator'].astype(float), df['HelpfulnessDenominator'])
df['HelpfulnessRatio'].fillna(0, inplace=True)
df.drop('HelpfulnessNumerator', axis=1, inplace=True)
df['HelpfulnessRatio'] = [1 if x > 1.0 else x for x in  df['HelpfulnessRatio']]
df['HelpfulnessLabel'] = np.where(df['HelpfulnessRatio'] > 0.8, 1, 0) 
X_train, X_test, y_train, y_test = train_test_split(df[['Id', 'UserId_le', 'HelpfulnessDenominator', 'Score', 'WordCount', 'CharCount', 'SentenceCount', 'ARI']], df['HelpfulnessLabel'], test_size=0.2, random_state=432)

In [4]:
scaler = StandardScaler()
#perform feature scaling to be more effective 
scaler.fit(X_train)
X_train = scaler.transform(X_train)
clf = SGDClassifier(n_iter = 25)
cross_val_score(clf, X_train, y_train)

array([ 0.79594689,  0.79437456,  0.79762321])

In [5]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [6]:
param_grid = {
    'loss' : ['hinge', 'epsilon_insensitive', 'squared_hinge'],
    'penalty' : ['l2', 'l1', 'elasticnet'],
    'learning_rate' : ['constant', 'optimal', 'invscaling'],
    'alpha' : 10.0**-np.arange(1,7),
    'n_iter' : [10, 25, 30],
    'average' : [True, False],
    'eta0' : 10.0**-np.arange(1,4),
    'random_state' : [43]
}
clf = SGDClassifier()
grid_search = GridSearchCV(clf, param_grid, n_jobs=-1)
start = time()
grid_search.fit(X_train, y_train)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_)))
report(grid_search.cv_results_)

GridSearchCV took 404.00 seconds for 24 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.801 (std: 0.001)
Parameters: {'n_iter': 10, 'eta0': 0.001, 'loss': 'hinge', 'average': True, 'penalty': 'elasticnet', 'random_state': 43, 'alpha': 0.001, 'learning_rate': 'constant'}

Model with rank: 2
Mean validation score: 0.801 (std: 0.001)
Parameters: {'n_iter': 10, 'eta0': 0.10000000000000001, 'loss': 'hinge', 'average': True, 'penalty': 'l1', 'random_state': 43, 'alpha': 0.0001, 'learning_rate': 'invscaling'}

Model with rank: 3
Mean validation score: 0.801 (std: 0.001)
Parameters: {'n_iter': 10, 'eta0': 0.10000000000000001, 'loss': 'hinge', 'average': False, 'penalty': 'elasticnet', 'random_state': 43, 'alpha': 9.9999999999999995e-07, 'learning_rate': 'invscaling'}



### Test Error rate

In [7]:
X_test = scaler.transform(X_test)
grid_search.score(X_test, y_test)

0.79361751688795712