# Amazon reviews Classification

In [1]:
# Necessary Imports

import numpy as np
import gzip
import random
from sklearn.metrics import f1_score
from collections import defaultdict
from sklearn.svm import LinearSVC, LinearSVR
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import kendalltau
from scipy.sparse import vstack



In [2]:
# Directory holding the gzipped review corpora. File names are appended to it
# with plain string concatenation, so the trailing slash is required.
amazon_review_dir = "../amazon_reviews/"

[Data source](http://jmcauley.ucsd.edu/data/amazon/)

I downloaded the 5-core review sets for the following product types: "Video Games", "Beauty", "Cell Phones and Accessories", and "Musical Instruments".

First load in the reviews:

In [10]:
def load_amazon_reviews_no_dups(corpus_path,test_size=2000,seed=42):
    '''Load a gzipped Amazon review corpus and split it into train and test sets.

    Every line of the file is parsed into one review. After shuffling,
    test_size reviews are sampled as the test set; the remaining reviews become
    training data provided that they were neither written by a reviewer nor
    written about a product that appears in the test set.

    corpus_path: path to the gzipped corpus, one review per line
    test_size:   number of reviews to sample for the test set
    seed:        seed for the module-level random generator; note that this
                 re-seeds the global `random` state as a side effect

    Returns (train_set, test_set), two lists of reviews.
    Raises ValueError (from random.sample) if test_size exceeds the number of
    reviews in the corpus.
    '''
    # NOTE(review): eval() executes whatever Python expression each line holds.
    # Only use this on corpus files from a trusted source.
    # The context manager closes the gzip handle even if parsing fails.
    with gzip.open(corpus_path, 'r') as corpus_file:
        all_reviews = [eval(line) for line in corpus_file]

    random.seed(seed)
    random.shuffle(all_reviews)
    test_set = random.sample(all_reviews, test_size)

    # Reviewers and products seen in the test set must not leak into training.
    test_reviewer = {review["reviewerID"] for review in test_set}
    test_product = {review["asin"] for review in test_set}

    train_set = [
        review for review in all_reviews
        if review["reviewerID"] not in test_reviewer
        and review["asin"] not in test_product
    ]

    return train_set,test_set

Use SVM regression (`LinearSVR`) as a baseline:

In [29]:
def prepare_for_vectorizer_regression(reviews):
    '''
    Split reviews into parallel lists of texts and overall ratings.

    Each text is the review's "summary" and "reviewText" joined by a single
    space; each rating is the review's "overall" value.
    Returns (texts, ratings) in the same order as the input reviews.
    '''
    # One pass over the input; the rating is read before the text fields.
    rating_text_pairs = [
        (entry["overall"], entry["summary"] + " " + entry["reviewText"])
        for entry in reviews
    ]
    documents = [text for _, text in rating_text_pairs]
    scores = [rating for rating, _ in rating_text_pairs]
    return documents, scores

def prepare_for_regression(train,test,max_n=2):
    '''Vectorize train and test reviews into n-gram count matrices.

    The vocabulary (n-grams of length 1 to max_n occurring in at least two
    training texts) is fitted on the training texts only and then applied to
    the test texts.

    Returns X_train, train_class, X_test, test_class, where the *_class values
    are the lists of overall ratings of the corresponding reviews.
    '''
    train_docs, train_ratings = prepare_for_vectorizer_regression(train)
    test_docs, test_ratings = prepare_for_vectorizer_regression(test)

    ngram_counter = CountVectorizer(min_df=2, ngram_range=(1, max_n))
    train_matrix = ngram_counter.fit_transform(train_docs)
    test_matrix = ngram_counter.transform(test_docs)

    return train_matrix, train_ratings, test_matrix, test_ratings


def calculate_kdtau(path, max_n = 2):
    '''
    Evaluate a LinearSVR rating regressor on the dataset at the given path.

    path is appended to amazon_review_dir to locate the corpus; max_n is the
    longest n-gram used as a feature.
    Returns a tuple: (Kendall's Tau between the test ratings and the
    predictions, shape of the training feature matrix).
    '''
    corpus_path = amazon_review_dir + path
    train_reviews, test_reviews = load_amazon_reviews_no_dups(corpus_path)
    X_train, y_train, X_test, y_test = prepare_for_regression(
        train_reviews, test_reviews, max_n = max_n
    )

    regressor = LinearSVR()
    regressor.fit(X_train, y_train)
    predicted = regressor.predict(X_test)

    return kendalltau(y_test, predicted), X_train.shape


In [28]:
# Kendall's Tau result and training-matrix shape per dataset, keyed by display name.
# NOTE(review): `names` and `paths` are not defined in any cell visible in this
# notebook; they must already exist in the kernel before this cell runs.
regression_results = {}

for name, path in zip(names, paths):
    print("For", name)
    regression_results[name] = calculate_kdtau(path)
    kdtau, trainshape = regression_results[name]
    print("Kendall's Tau:", kdtau)
    print("trainshape:", trainshape)
    print("-----")
    


For Video games
Kendall's Tau: KendalltauResult(correlation=0.40585498119116903, pvalue=1.510440335427315e-126)
trainshape: (126333, 1128763)
-----
For Beauty
Kendall's Tau: KendalltauResult(correlation=0.4434249962697953, pvalue=2.3135221071252014e-148)
trainshape: (118829, 496856)
-----
For Cellphones and accessories
Kendall's Tau: KendalltauResult(correlation=0.44195392960366864, pvalue=3.805334351230243e-149)
trainshape: (110419, 495892)
-----


Use the SVM ranking method (pairwise classification with `LinearSVC`):

In [32]:
def convert_to_pairwise(data,ratings):
    '''Convert a collection of data with ordinal ratings into a pairwise classification task.

    For every row i of data, a comparison row j with a different rating is
    drawn at random (using the module-level random generator). The new
    datapoint is the feature difference data[i] - data[j]; its label is 1 when
    ratings[i] > ratings[j] and 0 otherwise.

    data:    2-D matrix whose rows can be indexed as data[i, :] and subtracted,
             with row differences stackable by scipy.sparse.vstack
    ratings: sequence of ratings, indexable by row number

    Returns (new_matrix, labels): the stacked difference rows and the list of
    0/1 labels, one per row of data.
    Raises ValueError if the rows do not contain at least two distinct
    ratings, since no comparison datapoint could ever be found.
    '''
    n_rows = data.shape[0]

    # Without this guard the rejection-sampling loop below would spin forever
    # when every rating is identical.
    if len({ratings[i] for i in range(n_rows)}) < 2:
        raise ValueError(
            "convert_to_pairwise needs at least two distinct ratings"
        )

    candidates = range(n_rows)  # hoisted: the same for every draw
    new_feature_vector = []
    labels = []

    for i in range(n_rows):

        # Coarse progress report for large corpora.
        if i % 10000 == 0:
            print(i)

        current_rating = ratings[i]

        # Rejection sampling: redraw until the partner has a different rating.
        while True:
            diff_ind = random.choice(candidates)
            diff_rating = ratings[diff_ind]
            if diff_rating != current_rating:
                break

        new_feature_vector.append(data[i, :] - data[diff_ind, :])
        labels.append(1 if current_rating > diff_rating else 0)

    print('end iteration')
    new_matrix = vstack(new_feature_vector)
    return new_matrix, labels
            
        
             


In [70]:
def calculate_kdtau_pairwise(path, max_n = 2):
    '''
    Evaluate the pairwise ranking approach on the dataset at the given path.

    The training matrix is converted into pairwise difference datapoints, a
    LinearSVC is fitted on them, and test reviews are scored by projecting
    their features onto the learned weight vector.
    Returns a tuple: (Kendall's Tau between the test ratings and those scores,
    shape of the original training feature matrix).
    '''
    corpus_path = amazon_review_dir + path
    train_reviews, test_reviews = load_amazon_reviews_no_dups(corpus_path)
    X_train, y_train, X_test, y_test = prepare_for_regression(
        train_reviews, test_reviews, max_n = max_n
    )

    pair_features, pair_labels = convert_to_pairwise(X_train, y_train)

    ranker = LinearSVC()
    ranker.fit(pair_features, pair_labels)

    # Ranking score of each test review: its features times the weight vector.
    scores = X_test @ ranker.coef_.T

    return kendalltau(y_test, scores), X_train.shape

In [71]:
# Kendall's Tau result and training-matrix shape per dataset for the pairwise
# ranking model, keyed by display name.
# NOTE(review): `names` and `paths` are not defined in any cell visible in this
# notebook; they must already exist in the kernel before this cell runs.
pairwise_results = {}

for name, path in zip(names, paths):
    print("For", name)
    pairwise_results[name] = calculate_kdtau_pairwise(path)
    kdtau, trainshape = pairwise_results[name]
    print("Kendall's Tau:", kdtau)
    print("trainshape:", trainshape)
    print("-----")

For Video games
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
end iteration
Kendall's Tau: KendalltauResult(correlation=0.49280871188427516, pvalue=1.2787581938933162e-185)
trainshape: (126333, 1128763)
-----
For Beauty
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
end iteration
Kendall's Tau: KendalltauResult(correlation=0.5404327850813158, pvalue=2.241975583784509e-219)
trainshape: (118829, 496856)
-----
For Cellphones and accessories
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
end iteration
Kendall's Tau: KendalltauResult(correlation=0.5501302804219165, pvalue=5.866806050815261e-230)
trainshape: (110419, 495892)
-----
