In [1]:
!pwd

/home/torrensk


In [2]:
import pandas as pd
import os
import re

import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  24


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
spark = SparkSession.builder.appName('amazon').getOrCreate()

schema = StructType([ \
    StructField("marketplace",       StringType(),    True), \
    StructField("customer_id",       StringType(),    True), \
    StructField("review_id",         StringType(),    True), \
    StructField("product_id",        StringType(),    True), \
    StructField("product_parent",    StringType(),    True), \
    StructField("product_title",     StringType(),    True), \
    StructField("product_category",  StringType(),    True), \
    StructField("star_rating",       IntegerType(),   True), \
    StructField("helpful_votes",     IntegerType(),   True), \
    StructField("total_votes",       IntegerType(),   True), \
    StructField("vine",              StringType(),    True), \
    StructField("verified_purchase", StringType(),    True), \
    StructField("review_headline",   StringType(),    True), \
    StructField("review_body",       StringType(),    True), \
    StructField("review_date",       TimestampType(), True), \
  ])



In [6]:
path = 'archive-5/'

In [None]:
files = [#'archive-5/amazon_reviews_multilingual_US_v1_00.tsv',
'archive-5/amazon_reviews_us_Apparel_v1_00.tsv',
'archive-5/amazon_reviews_us_Automotive_v1_00.tsv',
'archive-5/amazon_reviews_us_Baby_v1_00.tsv',
'archive-5/amazon_reviews_us_Beauty_v1_00.tsv',
'archive-5/amazon_reviews_us_Books_v1_02.tsv',
'archive-5/amazon_reviews_us_Camera_v1_00.tsv',
#'archive-5/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv',
#'archive-5/amazon_reviews_us_Digital_Music_Purchase_v1_00.tsv',
#'archive-5/amazon_reviews_us_Digital_Software_v1_00.tsv',
#'archive-5/amazon_reviews_us_Digital_Video_Download_v1_00.tsv',
#'archive-5/amazon_reviews_us_Digital_Video_Games_v1_00.tsv',
'archive-5/amazon_reviews_us_Electronics_v1_00.tsv',
'archive-5/amazon_reviews_us_Furniture_v1_00.tsv',
#'archive-5/amazon_reviews_us_Gift_Card_v1_00.tsv',
'archive-5/amazon_reviews_us_Grocery_v1_00.tsv',
'archive-5/amazon_reviews_us_Health_Personal_Care_v1_00.tsv',
'archive-5/amazon_reviews_us_Major_Appliances_v1_00.tsv',
#'archive-5/amazon_reviews_us_Mobile_Apps_v1_00.tsv',
#'archive-5/amazon_reviews_us_Mobile_Electronics_v1_00.tsv',
#'archive-5/amazon_reviews_us_Music_v1_00.tsv',
'archive-5/amazon_reviews_us_Musical_Instruments_v1_00.tsv',
'archive-5/amazon_reviews_us_Office_Products_v1_00.tsv',
'archive-5/amazon_reviews_us_Outdoors_v1_00.tsv',
#'archive-5/amazon_reviews_us_PC_v1_00.tsv',
#'archive-5/amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv',
'archive-5/amazon_reviews_us_Pet_Products_v1_00.tsv',
'archive-5/amazon_reviews_us_Shoes_v1_00.tsv',
'archive-5/amazon_reviews_us_Software_v1_00.tsv',
'archive-5/amazon_reviews_us_Sports_v1_00.tsv',
'archive-5/amazon_reviews_us_Tools_v1_00.tsv',
'archive-5/amazon_reviews_us_Toys_v1_00.tsv',
#'archive-5/amazon_reviews_us_Video_DVD_v1_00.tsv',
#'archive-5/amazon_reviews_us_Video_Games_v1_00.tsv',
#'archive-5/amazon_reviews_us_Video_v1_00.tsv',
'archive-5/amazon_reviews_us_Watches_v1_00.tsv',
#'archive-5/amazon_reviews_us_Wireless_v1_00.tsv']

In [7]:
data = spark.read.csv(path, schema=schema, header=True, sep='\t', mode='DROPMALFORMED')

In [8]:
data.count()

109830520

In [9]:
path2 = 'archive-6/'

In [10]:
data2 = spark.read.csv(path2, schema=schema, header=True, sep='\t', mode='DROPMALFORMED')

In [11]:
data2.count()

2375868

In [16]:
#filter out missing review titles and bodies
data2 = data2.filter(data2.review_body.isNotNull())
data2 = data2.filter(data2.review_headline.isNotNull())
data2 = data2.select('product_title', 'star_rating',
        'helpful_votes', 'total_votes', 'verified_purchase',
        'review_headline', 'review_body', 'product_category')
print(data2.count())
print(data2.columns)

2375640
['product_title', 'star_rating', 'helpful_votes', 'total_votes', 'verified_purchase', 'review_headline', 'review_body', 'product_category']


In [17]:
data2 = data2.withColumn("verified_purchase",data2["verified_purchase"].cast(IntegerType()))

In [18]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, StringIndexer, VectorAssembler, Normalizer, OneHotEncoder
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

onehotencoder_v_purch = OneHotEncoder(inputCol="verified_purchase", outputCol="ver_purch")
tokenizer_pt = Tokenizer(inputCol='product_title', outputCol='pt_token')
tokenizer_rh = Tokenizer(inputCol='review_headline', outputCol='rh_token')
tokenizer_rb = Tokenizer(inputCol='review_body', outputCol='rb_token')
tokenizer_cat = Tokenizer(inputCol='product_category', outputCol='cat_token')
remover_pt = StopWordsRemover(inputCol='pt_token', outputCol='pt_stop')
remover_rh = StopWordsRemover(inputCol='rh_token', outputCol='rh_stop')
remover_rb = StopWordsRemover(inputCol='rb_token', outputCol='rb_stop')
w2v_pt = Word2Vec(vectorSize=3, minCount=0, inputCol="pt_stop", outputCol="pt_vec")
w2v_rh = Word2Vec(vectorSize=3, minCount=0, inputCol="rh_stop", outputCol="rh_vec")
w2v_rb = Word2Vec(vectorSize=5, minCount=0, inputCol="rb_stop", outputCol="rb_vec")
w2v_cat = Word2Vec(vectorSize=1, minCount=0, inputCol="cat_token", outputCol="cat_vec")
# labeler = StringIndexer(inputCol='star_rating',outputCol='label', stringOrderType='alphabetAsc')
assembler = VectorAssembler(inputCols=['helpful_votes', 'total_votes', 'ver_purch', 'pt_vec', 'rh_vec', 'rb_vec', 'cat_vec'], outputCol='features')
normalizer = Normalizer(inputCol='features', outputCol='norm_features')


lr = LogisticRegression(featuresCol='norm_features', labelCol='star_rating')
dtc = DecisionTreeClassifier(featuresCol='norm_features', labelCol='star_rating')
rfc = RandomForestClassifier(featuresCol='norm_features',labelCol='star_rating')

# build your pipeline
pipeline = Pipeline(stages=[onehotencoder_v_purch, tokenizer_pt, tokenizer_rh, tokenizer_rb, tokenizer_cat,
                            remover_pt, remover_rh, remover_rb,
                            w2v_pt, w2v_rh, w2v_rb, w2v_cat
                            #,labeler
                            , assembler, normalizer])#.fit(reviews_df)

In [19]:
# run your pipeline
final_data = pipeline.fit(data2).transform(data2).select('norm_features', 'star_rating')

Py4JJavaError: An error occurred while calling o106.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 18.0 failed 1 times, most recent failure: Lost task 15.0 in stage 18.0 (TID 517) (cn364 executor driver): java.lang.NullPointerException: Value at index 0 is null
	at org.apache.spark.sql.errors.QueryExecutionErrors$.valueIsNullError(QueryExecutionErrors.scala:1545)
	at org.apache.spark.sql.Row.getAnyValAs(Row.scala:527)
	at org.apache.spark.sql.Row.getDouble(Row.scala:271)
	at org.apache.spark.sql.Row.getDouble$(Row.scala:271)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.getDouble(rows.scala:166)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$3(OneHotEncoder.scala:520)
	at scala.runtime.java8.JFunction1$mcDI$sp.apply(JFunction1$mcDI$sp.java:23)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$2(OneHotEncoder.scala:520)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2309)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1183)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1177)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1222)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.getOutputAttrGroupFromData(OneHotEncoder.scala:521)
	at org.apache.spark.ml.feature.OneHotEncoder.fit(OneHotEncoder.scala:196)
	at org.apache.spark.ml.feature.OneHotEncoder.fit(OneHotEncoder.scala:128)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.NullPointerException: Value at index 0 is null
	at org.apache.spark.sql.errors.QueryExecutionErrors$.valueIsNullError(QueryExecutionErrors.scala:1545)
	at org.apache.spark.sql.Row.getAnyValAs(Row.scala:527)
	at org.apache.spark.sql.Row.getDouble(Row.scala:271)
	at org.apache.spark.sql.Row.getDouble$(Row.scala:271)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.getDouble(rows.scala:166)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$3(OneHotEncoder.scala:520)
	at scala.runtime.java8.JFunction1$mcDI$sp.apply(JFunction1$mcDI$sp.java:23)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.AbstractTraversable.map(Traversable.scala:108)
	at org.apache.spark.ml.feature.OneHotEncoderCommon$.$anonfun$getOutputAttrGroupFromData$2(OneHotEncoder.scala:520)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$3(RDD.scala:1230)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$5(RDD.scala:1231)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


## Importing, processing and cleaning all data in parallel

In [3]:
def import_file(f):
    reviews = []
    
    category = re.search('amazon_reviews_us_(.+?)_v1|amazon_reviews_(.+?)_US_v1', f).group(1)
    
    col_list = ['product_title', 'star_rating',
        'helpful_votes', 'total_votes', 'verified_purchase',
        'review_headline', 'review_body']
    df = pd.read_csv("./archive-5/" + str(f), sep='\t', error_bad_lines=False, usecols=col_list)
    
    df['category'] = category
    df = df[df.filter(like='review_').notnull().all(1)]
    df.verified_purchase = df.verified_purchase.eq('Y').mul(1)
    print(f + ": " + str(df.shape))
    reviews.append(df.values.tolist())
    return(reviews)

In [4]:
num_threads = mp.cpu_count()
pool = mp.Pool(num_threads)

amazon_reviews_us_Gift_Card_v1_00.tsv: (148305, 8)
amazon_reviews_us_Major_Appliances_v1_00.tsv: (96832, 8)
amazon_reviews_us_Mobile_Electronics_v1_00.tsv: (104849, 8)
amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv: (85925, 8)
amazon_reviews_us_Digital_Video_Games_v1_00.tsv: (144720, 8)
amazon_reviews_us_Digital_Software_v1_00.tsv: (101836, 8)
amazon_reviews_us_Software_v1_00.tsv: (341247, 8)
amazon_reviews_us_Video_v1_00.tsv: (380549, 8)
amazon_reviews_us_Furniture_v1_00.tsv: (791595, 8)
amazon_reviews_us_Watches_v1_00.tsv: (960082, 8)
amazon_reviews_us_Musical_Instruments_v1_00.tsv: (903981, 8)
amazon_reviews_us_Digital_Music_Purchase_v1_00.tsv: (1681462, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Tools_v1_00.tsv: (1740042, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Baby_v1_00.tsv: (1749022, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Outdoors_v1_00.tsv: (2299744, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Grocery_v1_00.tsv: (2393351, 8)
amazon_reviews_us_Camera_v1_00.tsv: (1800805, 8)
amazon_reviews_us_Pet_Products_v1_00.tsv: (2639755, 8)
amazon_reviews_us_Video_Games_v1_00.tsv: (1780193, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Office_Products_v1_00.tsv: (2640226, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Digital_Video_Download_v1_00.tsv: (3998252, 8)
amazon_reviews_us_Automotive_v1_00.tsv: (3510465, 8)
amazon_reviews_us_Mobile_Apps_v1_00.tsv: (5007900, 8)
amazon_reviews_us_Shoes_v1_00.tsv: (4358476, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Sports_v1_00.tsv: (4832781, 8)
amazon_reviews_us_Electronics_v1_00.tsv: (3090984, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Beauty_v1_00.tsv: (5094132, 8)
amazon_reviews_us_Toys_v1_00.tsv: (4859523, 8)
amazon_reviews_us_Apparel_v1_00.tsv: (5881113, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Health_Personal_Care_v1_00.tsv: (5312732, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Books_v1_02.tsv: (3105329, 8)
amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv: (5100394, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_Music_v1_00.tsv: (4740654, 8)
amazon_reviews_multilingual_US_v1_00.tsv: (6900804, 8)
amazon_reviews_us_Video_DVD_v1_00.tsv: (5049215, 8)


  self._target(*self._args, **self._kwargs)


amazon_reviews_us_PC_v1_00.tsv: (6906772, 8)
amazon_reviews_us_Wireless_v1_00.tsv: (8991085, 8)


Process ForkPoolWorker-7:
Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 125, in worker
    put((job, i, result))
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 347, in put
    self._writer.send_bytes(obj)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/connection.py", line 393, in _send_bytes
    header = struct.pack("!i", n)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 25

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/connection.py", line 393, in _send_bytes
    header = struct.pack("!i", n)
struct.error: 'i' format requires -2147483648 <= number <= 2147483647

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 130, in worker
    put((job, i, (False, wrapped)))
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 347, in put
    self._writer.send_bytes(obj)
  File "/accre/arch/easybuild/

Process ForkPoolWorker-30:
Process ForkPoolWorker-39:
Process ForkPoolWorker-116:
Process ForkPoolWorker-180:
Process ForkPoolWorker-40:
Process ForkPoolWorker-150:
Process ForkPoolWorker-211:
Process ForkPoolWorker-13:
Process ForkPoolWorker-217:
Process ForkPoolWorker-184:
Process ForkPoolWorker-146:
Process ForkPoolWorker-202:
Process ForkPoolWorker-115:
Process ForkPoolWorker-179:
Process ForkPoolWorker-41:
Process ForkPoolWorker-210:
Process ForkPoolWorker-222:
Process ForkPoolWorker-201:
Process ForkPoolWorker-114:
Process ForkPoolWorker-9:
Process ForkPoolWorker-130:
Process ForkPoolWorker-209:
Process ForkPoolWorker-214:
Process ForkPoolWorker-28:
Process ForkPoolWorker-178:
Process ForkPoolWorker-98:
Process ForkPoolWorker-15:
Process ForkPoolWorker-43:
Process ForkPoolWorker-183:
Process ForkPoolWorker-159:
Process ForkPoolWorker-148:
Process ForkPoolWorker-221:
Process ForkPoolWorker-213:
Process ForkPoolWorker-219:
Process ForkPoolWorker-189:
Process ForkPoolWorker-131:
Pro

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arc

Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, i

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multipr

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multipr

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlo

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semloc

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchron

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
KeyboardInterrupt
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinD

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, i

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiproc

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self.

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semloc

KeyboardInterrupt
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinD

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    t

  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/queues.py", line 3

In [None]:
files = os.listdir('archive-5')
r = pool.map(import_file, files)

Process ForkPoolWorker-257:
Process ForkPoolWorker-263:
Process ForkPoolWorker-262:
Process ForkPoolWorker-258:
Process ForkPoolWorker-259:
Process ForkPoolWorker-260:
Process ForkPoolWorker-261:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/accre/arch/easybuild/software/BinDist/Anaconda3/5.0.1/lib/python3.6/multiprocessing/pool.py", line 

KeyboardInterrupt: 

Process ForkPoolWorker-358:
Process ForkPoolWorker-349:
Process ForkPoolWorker-359:
Process ForkPoolWorker-357:
Process ForkPoolWorker-363:
Process ForkPoolWorker-316:
Process ForkPoolWorker-362:
Process ForkPoolWorker-364:
Process ForkPoolWorker-360:
Process ForkPoolWorker-272:
Process ForkPoolWorker-341:
Process ForkPoolWorker-267:
Process ForkPoolWorker-365:
Process ForkPoolWorker-302:
Process ForkPoolWorker-268:
Process ForkPoolWorker-361:
Process ForkPoolWorker-279:
Process ForkPoolWorker-348:
Process ForkPoolWorker-352:
Process ForkPoolWorker-280:
Process ForkPoolWorker-353:
Process ForkPoolWorker-346:
Process ForkPoolWorker-347:
Process ForkPoolWorker-337:
Process ForkPoolWorker-309:
Process ForkPoolWorker-333:
Process ForkPoolWorker-273:
Process ForkPoolWorker-270:
Process ForkPoolWorker-277:
Process ForkPoolWorker-292:
Process ForkPoolWorker-334:
Process ForkPoolWorker-293:
Process ForkPoolWorker-275:
Process ForkPoolWorker-326:
Process ForkPoolWorker-342:
Process ForkPoolWork

In [None]:
flat_list = [item for sublist in r for item in sublist]

In [None]:
flat_list_2 = [item for sublist in flat_list for item in sublist]

In [None]:
len(flat_list_2)

In [None]:
pool.close()

In [None]:
l = flat_list_2[1:100000]

In [None]:
len(r)

## Convert to Pyspark Dataframe

In [9]:
import pyspark

In [10]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('amazon_reviews').getOrCreate()



In [11]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType,FloatType
schema = StructType(
    [
     StructField(name="product_title", dataType=StringType()),
     StructField(name="star_rating", dataType=StringType()),
     StructField(name="helpful_votes", dataType=StringType()), 
     StructField(name="total_votes", dataType=StringType()), 
     StructField(name="verified_purchase", dataType=StringType()), 
     StructField(name="review_headline", dataType=StringType()), 
     StructField(name="review_body", dataType=StringType()), 
     StructField(name="category", dataType=StringType()),    
    ]
)

In [12]:
parallize_reviews = spark.sparkContext.parallelize(l)
reviews_df = spark.createDataFrame(parallize_reviews, schema)

In [13]:
reviews_df.show(5, truncate = False)

+-----------------------------------------------------------------------------------------------+-----------+-------------+-----------+-----------------+---------------+---------------------------------------------------------------------------------------------------------------------------------+----------------+
|product_title                                                                                  |star_rating|helpful_votes|total_votes|verified_purchase|review_headline|review_body                                                                                                                      |category        |
+-----------------------------------------------------------------------------------------------+-----------+-------------+-----------+-----------------+---------------+---------------------------------------------------------------------------------------------------------------------------------+----------------+
|Best Hand Clothes Wringer                       

## Modeling

In [15]:
reviews_df = reviews_df.withColumn("star_rating",reviews_df["star_rating"].cast(IntegerType()))
reviews_df = reviews_df.withColumn("helpful_votes",reviews_df["helpful_votes"].cast(IntegerType()))
reviews_df = reviews_df.withColumn("total_votes",reviews_df["total_votes"].cast(IntegerType()))
reviews_df = reviews_df.withColumn("verified_purchase",reviews_df["verified_purchase"].cast(IntegerType()))

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, StringIndexer, VectorAssembler, Normalizer
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier

tokenizer_pt = Tokenizer(inputCol='product_title', outputCol='pt_token')
tokenizer_rh = Tokenizer(inputCol='review_headline', outputCol='rh_token')
tokenizer_rb = Tokenizer(inputCol='review_body', outputCol='rb_token')
tokenizer_cat = Tokenizer(inputCol='category', outputCol='cat_token')
remover_pt = StopWordsRemover(inputCol='pt_token', outputCol='pt_stop')
remover_rh = StopWordsRemover(inputCol='rh_token', outputCol='rh_stop')
remover_rb = StopWordsRemover(inputCol='rb_token', outputCol='rb_stop')
w2v_pt = Word2Vec(vectorSize=3, minCount=0, inputCol="pt_stop", outputCol="pt_vec")
w2v_rh = Word2Vec(vectorSize=3, minCount=0, inputCol="rh_stop", outputCol="rh_vec")
w2v_rb = Word2Vec(vectorSize=5, minCount=0, inputCol="rb_stop", outputCol="rb_vec")
w2v_cat = Word2Vec(vectorSize=1, minCount=0, inputCol="cat_token", outputCol="cat_vec")
# labeler = StringIndexer(inputCol='star_rating',outputCol='label', stringOrderType='alphabetAsc')
assembler = VectorAssembler(inputCols=['helpful_votes', 'total_votes', 'verified_purchase', 'pt_vec', 'rh_vec', 'rb_vec', 'cat_vec'], outputCol='features')
normalizer = Normalizer(inputCol='features', outputCol='norm_features')


lr = LogisticRegression(featuresCol='norm_features', labelCol='star_rating')
dtc = DecisionTreeClassifier(featuresCol='norm_features', labelCol='star_rating')
rfc = RandomForestClassifier(featuresCol='norm_features',labelCol='star_rating')

# build your pipeline
pipeline = Pipeline(stages=[tokenizer_pt, tokenizer_rh, tokenizer_rb, tokenizer_cat,
                            remover_pt, remover_rh, remover_rb,
                            w2v_pt, w2v_rh, w2v_rb, w2v_cat
                            #,labeler
                            , assembler, normalizer])#.fit(reviews_df)

In [17]:
# run your pipeline
final_data = pipeline.fit(reviews_df).transform(reviews_df).select('norm_features', 'star_rating')

In [18]:
# split your training set into 0.7/0.3 (train/test)
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [19]:
# Train the models (its three models, so it might take some time)
lr_model = lr.fit(train_data)
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)

In [20]:
lr_predictions = lr_model.transform(test_data)
dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute test error
acc_evaluator = MulticlassClassificationEvaluator(labelCol="star_rating", predictionCol="prediction", metricName="accuracy")
lr_acc = acc_evaluator.evaluate(lr_predictions)
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

In [22]:
print("Here are the results!")
print('-'*80)
print('A logistic regression classifier had an accuracy of: {0:2.2f}%'.format(lr_acc*100))
print('-'*80)
print('A single decision tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))
print('-'*80)
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

Here are the results!
--------------------------------------------------------------------------------
A logistic regression classifier had an accuracy of: 63.27%
--------------------------------------------------------------------------------
A single decision tree had an accuracy of: 63.00%
--------------------------------------------------------------------------------
A random forest ensemble had an accuracy of: 64.27%
