In [1]:
# Show the kernel's working directory; the relative 'archive-6' paths used below resolve against it.
!pwd

/home/torrensk


In [2]:
import pandas as pd
import os
import re

import multiprocessing as mp
# Report the CPU count seen by multiprocessing; the same value sizes the worker pool further down.
print("Number of processors: ", mp.cpu_count())

Number of processors:  48


## Importing, processing and cleaning all data in parallel

In [6]:
def import_file(f):
    """Load one Amazon-reviews TSV from ``./archive-6/`` and return its cleaned rows.

    Parameters
    ----------
    f : str
        Bare file name inside ``./archive-6/``. It must look like
        ``amazon_reviews_us_<Category>_v1...`` or ``amazon_reviews_<category>_US_v1...``;
        the captured part becomes the value of the ``category`` column.

    Returns
    -------
    list
        A one-element list whose single item is the list of rows
        (``df.values.tolist()``). Each row is ordered as
        ``[product_title, star_rating, helpful_votes, total_votes,
        verified_purchase, review_headline, review_body, category]``.
        The outer wrapper is kept so existing callers that flatten twice
        keep working.

    Raises
    ------
    ValueError
        If no category can be derived from the file name.
    """
    match = re.search('amazon_reviews_us_(.+?)_v1|amazon_reviews_(.+?)_US_v1', f)
    if match is None:
        raise ValueError("Cannot derive a review category from file name: " + str(f))
    # Each alternative of the pattern has its own capture group, so exactly one
    # of the two is populated; group(1) alone is None for "<category>_US" names.
    category = match.group(1) or match.group(2)

    col_list = ['product_title', 'star_rating',
        'helpful_votes', 'total_votes', 'verified_purchase',
        'review_headline', 'review_body']
    path = "./archive-6/" + str(f)
    try:
        # pandas >= 1.3 spelling; 'warn' matches the old error_bad_lines=False
        # behaviour (skip malformed lines but still report them).
        df = pd.read_csv(path, sep='\t', on_bad_lines='warn', usecols=col_list)
    except TypeError:
        # Older pandas does not know on_bad_lines; fall back to the legacy flag.
        df = pd.read_csv(path, sep='\t', error_bad_lines=False, usecols=col_list)

    # Keep only rows where every review_* column is present.
    has_review_text = df.filter(like='review_').notnull().all(axis=1)
    # usecols does not control column order, so select col_list explicitly:
    # downstream code maps the row values to a schema by position.
    df = df.loc[has_review_text, col_list].assign(
        category=category,
        # Encode the Y/N flag as 1/0.
        verified_purchase=lambda d: d['verified_purchase'].eq('Y').mul(1),
    )
    print(f + ": " + str(df.shape))
    return [df.values.tolist()]

In [10]:
# One worker per CPU. Despite the variable name, mp.Pool starts processes, not threads.
num_threads = mp.cpu_count()
pool = mp.Pool(processes=num_threads)

amazon_reviews_us_Digital_Video_Games_v1_00.tsv: (144720, 8)
amazon_reviews_us_Major_Appliances_v1_00.tsv: (96832, 8)
amazon_reviews_us_Video_v1_00.tsv: (380549, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Baby_v1_00.tsv: (1749022, 8)


In [11]:
# Hand only regular, non-hidden files to the workers: os.listdir also returns
# sub-directories and dotfiles (e.g. checkpoint folders), which import_file
# cannot parse. Sorting makes the order of the results reproducible.
files = sorted(
    name for name in os.listdir('archive-6')
    if not name.startswith('.') and os.path.isfile(os.path.join('archive-6', name))
)
# One import_file call per file, spread over the pool; results keep the order of `files`.
r = pool.map(import_file, files)

In [12]:
# Strip the one-element wrapper each import_file call returns: one list of rows per file.
flat_list = [
    per_file_rows
    for worker_result in r
    for per_file_rows in worker_result
]

In [13]:
# Concatenate the per-file row lists into a single list of rows.
flat_list_2 = [
    row
    for per_file_rows in flat_list
    for row in per_file_rows
]

In [14]:
# Total number of review rows collected across all files.
len(flat_list_2)

2371123

In [15]:
# close() stops the pool from accepting new tasks; join() then waits for the
# worker processes to exit so none are left behind for the rest of the session.
pool.close()
pool.join()

## Convert to PySpark DataFrame

In [16]:
import pyspark

In [17]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start one named 'amazon_reviews'.
spark = (
    SparkSession.builder
    .appName('amazon_reviews')
    .getOrCreate()
)



In [18]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType,FloatType
# Every column is declared as a string here; the numeric ones are cast to
# IntegerType in the Modeling section. Field order must match the row layout
# of the list being loaded.
schema = StructType([
    StructField(name=column_name, dataType=StringType())
    for column_name in (
        "product_title",
        "star_rating",
        "helpful_votes",
        "total_votes",
        "verified_purchase",
        "review_headline",
        "review_body",
        "category",
    )
])

In [None]:
# Ship the driver-side list of rows to Spark as an RDD, then attach the schema to get a DataFrame.
parallize_reviews = spark.sparkContext.parallelize(flat_list_2)
reviews_df = spark.createDataFrame(parallize_reviews, schema=schema)

In [13]:
# Peek at the first five rows.
reviews_df.show(n=5, truncate=True)

+-----------------------------------------------------------------------------------------------+-----------+-------------+-----------+-----------------+---------------+---------------------------------------------------------------------------------------------------------------------------------+----------------+
|product_title                                                                                  |star_rating|helpful_votes|total_votes|verified_purchase|review_headline|review_body                                                                                                                      |category        |
+-----------------------------------------------------------------------------------------------+-----------+-------------+-----------+-----------------+---------------+---------------------------------------------------------------------------------------------------------------------------------+----------------+
|Best Hand Clothes Wringer                       

## Modeling

In [15]:
# The schema loaded these columns as strings; cast them to integers, in the same order as before.
for int_column in ("star_rating", "helpful_votes", "total_votes", "verified_purchase"):
    reviews_df = reviews_df.withColumn(int_column, reviews_df[int_column].cast(IntegerType()))

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, StringIndexer, VectorAssembler, Normalizer
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

# --- Feature stages, grouped by source column ---------------------------------
# Free-text columns: tokenise -> remove stop words -> Word2Vec embedding.

# product_title -> 3-dimensional vector
tokenizer_pt = Tokenizer(inputCol='product_title', outputCol='pt_token')
remover_pt = StopWordsRemover(inputCol='pt_token', outputCol='pt_stop')
w2v_pt = Word2Vec(vectorSize=3, minCount=0, inputCol="pt_stop", outputCol="pt_vec")

# review_headline -> 3-dimensional vector
tokenizer_rh = Tokenizer(inputCol='review_headline', outputCol='rh_token')
remover_rh = StopWordsRemover(inputCol='rh_token', outputCol='rh_stop')
w2v_rh = Word2Vec(vectorSize=3, minCount=0, inputCol="rh_stop", outputCol="rh_vec")

# review_body -> 5-dimensional vector
tokenizer_rb = Tokenizer(inputCol='review_body', outputCol='rb_token')
remover_rb = StopWordsRemover(inputCol='rb_token', outputCol='rb_stop')
w2v_rb = Word2Vec(vectorSize=5, minCount=0, inputCol="rb_stop", outputCol="rb_vec")

# category -> 1-dimensional vector (tokenised only, no stop-word removal)
tokenizer_cat = Tokenizer(inputCol='category', outputCol='cat_token')
w2v_cat = Word2Vec(vectorSize=1, minCount=0, inputCol="cat_token", outputCol="cat_vec")

# Combine the numeric columns with the embeddings, then normalise the assembled vector.
assembler = VectorAssembler(
    inputCols=['helpful_votes', 'total_votes', 'verified_purchase',
               'pt_vec', 'rh_vec', 'rb_vec', 'cat_vec'],
    outputCol='features',
)
normalizer = Normalizer(inputCol='features', outputCol='norm_features')

# --- Classifiers ----------------------------------------------------------------
# star_rating is used directly as the label column (no StringIndexer stage).
# The classifiers are not pipeline stages; they are fitted separately later.
lr = LogisticRegression(featuresCol='norm_features', labelCol='star_rating')
dtc = DecisionTreeClassifier(featuresCol='norm_features', labelCol='star_rating')
rfc = RandomForestClassifier(featuresCol='norm_features', labelCol='star_rating')

# --- Feature pipeline (unfitted) ------------------------------------------------
# Stage order: all tokenisers, all stop-word removers, all embeddings, assembler, normaliser.
pipeline = Pipeline(stages=[
    tokenizer_pt, tokenizer_rh, tokenizer_rb, tokenizer_cat,
    remover_pt, remover_rh, remover_rb,
    w2v_pt, w2v_rh, w2v_rb, w2v_cat,
    assembler, normalizer,
])

In [17]:
# Fit the feature pipeline, apply it, and keep only the feature vector and the label.
# NOTE(review): the pipeline is fitted on all of reviews_df, i.e. before the
# train/test split below, so the Word2Vec stages also see rows that end up in the test set.
final_data = (
    pipeline
    .fit(reviews_df)
    .transform(reviews_df)
    .select('norm_features', 'star_rating')
)

In [18]:
# Split the featurised dataset 70/30 into train and test sets.
# A fixed seed makes the split, and therefore the reported accuracies, reproducible across runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Train the models (it's three models, so it might take some time).
# They are fitted one after another, in the order lr, dtc, rfc, on the same training split.
lr_model, dtc_model, rfc_model = [
    estimator.fit(train_data)
    for estimator in (lr, dtc, rfc)
]

In [20]:
# Score the held-out test split with each fitted model.
lr_predictions, dtc_predictions, rfc_predictions = [
    fitted_model.transform(test_data)
    for fitted_model in (lr_model, dtc_model, rfc_model)
]

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare each model's prediction column with the true star_rating and compute test accuracy.
acc_evaluator = MulticlassClassificationEvaluator(
    labelCol="star_rating",
    predictionCol="prediction",
    metricName="accuracy",
)
lr_acc, dtc_acc, rfc_acc = [
    acc_evaluator.evaluate(predictions)
    for predictions in (lr_predictions, dtc_predictions, rfc_predictions)
]

In [22]:
# Build the whole report (header, then a rule followed by one line per model)
# and emit it with a single print call.
print("\n".join(
    ["Here are the results!"]
    + [
        line
        for template, accuracy in (
            ('A logistic regression classifier had an accuracy of: {0:2.2f}%', lr_acc),
            ('A single decision tree had an accuracy of: {0:2.2f}%', dtc_acc),
            ('A random forest ensemble had an accuracy of: {0:2.2f}%', rfc_acc),
        )
        for line in ('-'*80, template.format(accuracy*100))
    ]
))

Here are the results!
--------------------------------------------------------------------------------
A logistic regression classifier had an accuracy of: 63.27%
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 63.00%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 64.27%
