In [1]:
import pandas as pd
import gzip

def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Location of the gzip file to read.

    Yields:
        The object obtained by evaluating each line as a Python expression.
    """
    # The context manager closes the handle when iteration finishes or the
    # generator is closed; a bare gzip.open() left the file open.
    with gzip.open(path, 'rb') as g:
        for line in g:
            # SECURITY NOTE: eval() executes whatever expression the line
            # contains. Only feed this trusted files; if the data is strict
            # JSON or plain literals, json.loads / ast.literal_eval are the
            # safe alternatives.
            yield eval(line)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their zero-based position in the stream and the
    mapping is handed to ``DataFrame.from_dict`` with ``orient='index'``,
    so those positions become the index of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Read the gzip-compressed review file into a DataFrame.
ratings = getDF('reviews_Beauty_5.json.gz')

In [2]:
# Give the raw columns the names used by the rest of the notebook.
# The renamed frame is rebound to `ratings` instead of mutating it with
# inplace=True: the cell then reads as "new frame from old frame", and since
# rename ignores labels that are not present, re-running it is harmless.
ratings = ratings.rename(columns={'reviewerID': 'user_id',
                                  'asin': 'item_id',
                                  'reviewerName': 'user_name',
                                  'reviewText': 'review_text',
                                  'summary': 'review_summary',
                                  'overall': 'score'})


In [3]:
# Swap the raw user/item identifiers for their integer category codes.
# `.values` hands pandas a bare array, so no index alignment takes place.
ratings['user_id'] = ratings['user_id'].astype('category').cat.codes.values
ratings['item_id'] = ratings['item_id'].astype('category').cat.codes.values
# Add IDs for embeddings (plain copies of the encoded id columns).
ratings['user_emb_id'] = ratings.user_id
ratings['item_emb_id'] = ratings.item_id

In [4]:
import spacy
import numpy as np
import sklearn.metrics
# Load the 'en' spaCy pipeline, requesting the named GloVe vector package.
nlp_en = spacy.load('en', vectors='en_glove_cc_300_1m')
# One vector per review summary, stacked row-wise into a 2-D array.
example_reviews = ratings['review_summary'].tolist()
example_embs = np.vstack([nlp_en(summary).vector for summary in example_reviews])
#sklearn.metrics.pairwise.cosine_similarity(example_embs)

In [5]:
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    """Fit and return a ``MultinomialNB`` model built with ``alpha=0.01``.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.

    Returns:
        The fitted estimator.
    """
    from sklearn import naive_bayes
    clf = naive_bayes.MultinomialNB(alpha=0.01)
    clf.fit(train_x, train_y)
    return clf

# KNN Classifier
def knn_classifier(train_x, train_y):
    """Fit and return a ``KNeighborsClassifier`` with default settings.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.

    Returns:
        The fitted estimator.
    """
    from sklearn.neighbors import KNeighborsClassifier
    # scikit-learn estimators return themselves from fit(), so the call can
    # be chained directly onto the constructor.
    return KNeighborsClassifier().fit(train_x, train_y)

# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    """Fit and return a ``LogisticRegression`` model using ``penalty='l2'``.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.

    Returns:
        The fitted estimator.
    """
    from sklearn import linear_model
    estimator = linear_model.LogisticRegression(penalty='l2')
    estimator.fit(train_x, train_y)
    return estimator
 

# Random Forest Classifier
def random_forest_classifier(train_x, train_y, n_estimators=8, random_state=None):
    """Fit and return a ``RandomForestClassifier``.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.
        n_estimators: Forwarded to the estimator. Defaults to 8, the value
            that used to be hard-coded.
        random_state: Forwarded to the estimator. The default ``None`` keeps
            the previous unseeded behaviour; pass an int to make the fitted
            forest repeatable between runs.

    Returns:
        The fitted estimator.
    """
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)
    model.fit(train_x, train_y)
    return model
 
# Decision Tree Classifier 
def decision_tree_classifier(train_x, train_y):
    """Fit and return a ``DecisionTreeClassifier`` with default settings.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.

    Returns:
        The fitted estimator.
    """
    from sklearn.tree import DecisionTreeClassifier
    fitted_tree = DecisionTreeClassifier()
    fitted_tree.fit(train_x, train_y)
    return fitted_tree
 

# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    """Fit and return a ``GradientBoostingClassifier`` with ``n_estimators=200``.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.

    Returns:
        The fitted estimator.
    """
    from sklearn import ensemble
    booster = ensemble.GradientBoostingClassifier(n_estimators=200)
    booster.fit(train_x, train_y)
    return booster


# SVM Classifier
def svm_classifier(train_x, train_y):
    """Fit and return an ``SVC`` built with ``kernel='rbf', probability=True``.

    Args:
        train_x: Training features passed straight to ``fit``.
        train_y: Training labels passed straight to ``fit``.

    Returns:
        The fitted estimator.
    """
    from sklearn import svm
    machine = svm.SVC(kernel='rbf', probability=True)
    machine.fit(train_x, train_y)
    return machine
 


In [6]:
# Binarise the rating in two passes over the same column:
#   1. every score that is not >= 4 is replaced by 0;
#   2. every score that is not < 4 is replaced by 1.
# A missing score fails both comparisons, so it ends up as 1.
scoregeq4 = (
    ratings.score
    .where(ratings.score >= 4, 0)
    .where(ratings.score < 4, 1)
)


In [7]:
# Turn the label series into a single column and put it in front of the
# embedding columns, so one split keeps labels and features row-aligned.
a = np.array(scoregeq4)[np.newaxis].T
data = np.append(a, example_embs, axis=1)
from sklearn.model_selection import train_test_split
# A fixed random_state makes the 80/20 split, and every metric computed from
# it, identical on each Restart & Run All; without it the split changed on
# every execution.
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [8]:
# Column 0 is taken as the target; all remaining columns are the features.
train_y, train_x = train[:, 0], train[:, 1:]
test_y, test_x = test[:, 0], test[:, 1:]

In [10]:
import time
from sklearn import metrics
if __name__ == '__main__':

    # Short label -> function that fits a model on (train_x, train_y) and
    # returns it.
    classifiers = {
        'NB': naive_bayes_classifier,
        'KNN': knn_classifier,
        'LR': logistic_regression_classifier,
        'RF': random_forest_classifier,
        'DT': decision_tree_classifier,
        'SVM': svm_classifier,
        'GBDT': gradient_boosting_classifier,
    }

    # Only these entries are trained and scored below.
    test_classifiers = ['LR', 'RF', 'DT']
    # Precision and recall are printed only when the training labels take
    # exactly two distinct values.
    is_binary_class = (np.unique(train_y).size == 2)

    for classifier in test_classifiers:
        print('******************* %s ********************' % classifier)

        # Wall-clock time of the fit alone.
        start_time = time.time()
        model = classifiers[classifier](train_x, train_y)
        print('training took %fs!' % (time.time() - start_time))

        predict = model.predict(test_x)
        if is_binary_class:
            precision = metrics.precision_score(test_y, predict)
            recall = metrics.recall_score(test_y, predict)
            print('precision: %.2f%%, recall: %.2f%%'
                  % (100 * precision, 100 * recall))
        accuracy = metrics.accuracy_score(test_y, predict)
        print('accuracy: %.2f%%' % (100 * accuracy))

******************* LR ********************
training took 69.624401s!
precision: 84.96%, recall: 95.34%
accuracy: 83.22%
******************* RF ********************


  from numpy.core.umath_tests import inner1d


training took 30.648948s!
precision: 86.28%, recall: 91.43%
accuracy: 82.00%
******************* DT ********************
training took 157.062050s!
precision: 86.01%, recall: 84.92%
accuracy: 77.49%
