In [1]:
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import * 
from pyspark import SparkContext, SparkConf

In [2]:
#conf = SparkConf().setAll([('spark.executor.memory', '5g'), 
#                           ('spark.driver.memory','5g'),
#                           ('spark.driver.maxResultSize','0')])


spark = SparkSession \
            .builder.master('local[*]')\
            .appName("reviews")\
            .getOrCreate()

sqlContext = SQLContext(sparkContext=spark.sparkContext, 
                        sparkSession=spark)

In [3]:
train_spark  = spark.read.csv("kindle_reduced.csv",inferSchema="true", header="true", escape="_")

In [4]:
train_spark.count()

983

In [5]:
train=train_spark.drop("reviewerName", "unixReviewTime")

In [6]:
train.show()

+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+
|      asin|helpful|overall|          reviewText|          reviewTime|          reviewerID|             summary|
+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+
|B00J4S6YWC| [0, 0]|      5|ARC provided by a...|         06 21, 2014|       AUSBN91MCI3WM|  A Very Sexy Cruise|
|B00HCZUBH8| [2, 4]|      5|Wild Ride by Nanc...|          03 3, 2014|      A141H51I3H4B1S|A Changing Gears ...|
|B006RZNR3Y| [0, 0]|      5|Well thought out ...|         07 10, 2014|       AP8TKDM76TROZ|"We don't take ki...|
|B00J47H8H8| [3, 4]|      3|This is book four...|         03 21, 2014|      A19DWIC1T7127Y| I'm losing interest|
|B00LRZLRMM| [0, 0]|      5|I really enjoyed ...|         07 14, 2014|       AM5P5MI4PU2KH|      Boner Brothers|
|B00DWGFFBI| [3, 4]|      5|PLEASURING THE LA...|         11 19, 2013|       AM09IO8QXEB1B|SENSU

In [7]:
type(train)

pyspark.sql.dataframe.DataFrame

In [8]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

#tokenize text (make words into an array)
tokenizer = Tokenizer(inputCol='reviewText', outputCol='words_token')
df_words_token = tokenizer.transform(train).select('reviewerID', 'words_token')

#remove basic words
remover = StopWordsRemover(inputCol='words_token', outputCol='words_clean')
train_no_stop=remover.transform(df_words_token).select('reviewerID', 'words_clean')

In [9]:
train_no_stop.show(5)
#much better

+--------------+--------------------+
|    reviewerID|         words_clean|
+--------------+--------------------+
| AUSBN91MCI3WM|[arc, provided, a...|
|A141H51I3H4B1S|[wild, ride, nanc...|
| AP8TKDM76TROZ|[well, thought, s...|
|A19DWIC1T7127Y|[book, four, five...|
| AM5P5MI4PU2KH|[really, enjoyed,...|
+--------------+--------------------+
only showing top 5 rows



In [10]:
words_clean=train_no_stop.drop("reviewerID")

Thanks to: https://stackoverflow.com/questions/53579444/efficient-text-preprocessing-using-pyspark-clean-tokenize-stopwords-stemming

In [11]:
from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="words_clean", outputCol="model")
word2Vec.setMaxIter(10)
#Word2Vec...
word2Vec.getMaxIter()
10
word2Vec.clear(word2Vec.maxIter)
model = word2Vec.fit(words_clean)
model.getMinCount()
5
model.setInputCol("words_clean")
#Word2VecModel...
model.getVectors().show()


+----------+--------------------+
|      word|              vector|
+----------+--------------------+
|   serious|[0.00281329010613...|
|       of.|[0.03748139366507...|
|  terrible|[-0.0173931941390...|
|purchasing|[0.02821515314280...|
|   michael|[-0.0443890839815...|
|     looks|[-0.0124790789559...|
|     alpha|[0.07029642909765...|
|   choice.|[-0.0078909918665...|
|   action.|[0.01338462624698...|
|     ideas|[-0.0057330969721...|
|     sweet|[-0.3039416074752...|
|      used|[-0.0325959771871...|
|       eye|[0.06601892411708...|
|   allowed|[-0.0328284129500...|
|    great.|[-0.0603251904249...|
|  opinions|[0.09353712946176...|
| beautiful|[-0.1893045157194...|
| providing|[-0.0372162051498...|
|   writing|[-0.1420489549636...|
|     sarah|[-0.1177786514163...|
+----------+--------------------+
only showing top 20 rows



In [12]:
train = train.withColumn(
    'HelpfulRecords',
    when(col('helpful') == '[0, 0]', 0)\
    .otherwise(col('helpful'))
)

In [13]:
train.show()

+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText|          reviewTime|          reviewerID|             summary|HelpfulRecords|
+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------+
|B00J4S6YWC| [0, 0]|      5|ARC provided by a...|         06 21, 2014|       AUSBN91MCI3WM|  A Very Sexy Cruise|             0|
|B00HCZUBH8| [2, 4]|      5|Wild Ride by Nanc...|          03 3, 2014|      A141H51I3H4B1S|A Changing Gears ...|        [2, 4]|
|B006RZNR3Y| [0, 0]|      5|Well thought out ...|         07 10, 2014|       AP8TKDM76TROZ|"We don't take ki...|             0|
|B00J47H8H8| [3, 4]|      3|This is book four...|         03 21, 2014|      A19DWIC1T7127Y| I'm losing interest|        [3, 4]|
|B00LRZLRMM| [0, 0]|      5|I really enjoyed ...|         07 14, 2014|       AM5P5MI4PU2KH|      Boner B

In [14]:
help=train.select("HelpfulRecords")

In [15]:
help.show()

+--------------+
|HelpfulRecords|
+--------------+
|             0|
|        [2, 4]|
|             0|
|        [3, 4]|
|             0|
|        [3, 4]|
|        [4, 4]|
|        [0, 1]|
|             0|
|             0|
|             0|
|             0|
|             0|
|             0|
|        [0, 1]|
|        [1, 2]|
|        [1, 1]|
|             0|
|             0|
|        [2, 2]|
+--------------+
only showing top 20 rows



In [16]:
def frac(x):
    if len(x)>1:
        return len(x)
    else:
        return float(x)

In [17]:
from pyspark.sql.functions import udf

#udf = udf(lambda x: (x[1].cast(DoubleType()/x[4].castDoubleType()) if len(x) > 1 else x, StringType())  
udf = udf(lambda x: x[:1] if len(x) > 1 else x, StringType())  
df = train.withColumn('HelpfulRecords', udf(train['HelpFulRecords']).cast(IntegerType()))

In [18]:
df.show()

+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText|          reviewTime|          reviewerID|             summary|HelpfulRecords|
+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+--------------+
|B00J4S6YWC| [0, 0]|      5|ARC provided by a...|         06 21, 2014|       AUSBN91MCI3WM|  A Very Sexy Cruise|             0|
|B00HCZUBH8| [2, 4]|      5|Wild Ride by Nanc...|          03 3, 2014|      A141H51I3H4B1S|A Changing Gears ...|          null|
|B006RZNR3Y| [0, 0]|      5|Well thought out ...|         07 10, 2014|       AP8TKDM76TROZ|"We don't take ki...|             0|
|B00J47H8H8| [3, 4]|      3|This is book four...|         03 21, 2014|      A19DWIC1T7127Y| I'm losing interest|          null|
|B00LRZLRMM| [0, 0]|      5|I really enjoyed ...|         07 14, 2014|       AM5P5MI4PU2KH|      Boner B

In [19]:
df1=df.toPandas()

In [20]:
col = df1.HelpfulRecords

In [21]:
col.idxmax()

0

In [22]:
df1[834:]

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,summary,HelpfulRecords
834,B004WBJ676,"[161, 164]",2,I purchased this book because of the 200 great...,"03 14, 2012",A2YQHT0W72J4HT,A disappointing selection.,
835,B00K9SPJPO,"[0, 0]",5,I was drawn to this book because I know a lot ...,"05 19, 2014",AXC8GBRXGMRZG,informational and interesting,0.0
836,B00IGKCOCI,"[1, 1]",5,"First of all, I have to reveal that I love coc...","02 17, 2014",AA5HLEE8ND8TJ,Delicious,
837,B00F289E8Y,"[0, 0]",3,"This was a decent book, but not at all what I ...","12 30, 2013",A1168ILANAUMV8,This book was not what I expected.,0.0
838,B00GH4GCQ4,"[2, 2]",5,If your a fan of the GREAT Sandra Brown or Joa...,"01 26, 2014",AZQD4Y7VI8EAR,Loved this book!!!,
...,...,...,...,...,...,...,...,...
978,B00KA0AGJK,"[0, 1]",4,I loved the first BA novel and had been waitin...,"05 13, 2014",A334K0B8FVV77A,Fair follow up,
979,B00IXWHY5Q,"[1, 1]",5,"There are 3 novellas in this, each one is a st...","04 2, 2014",A15FF9W3JCFXV6,Off the world collection,
980,B00ASP8XNS,"[0, 0]",4,This was a good novel to read. Wished it was a...,"03 29, 2014",A2OXH8SW5T7PHJ,Knowing his secret,0.0
981,B008O0QUU2,"[0, 0]",5,Kristen Ashley came through once again! Breath...,"04 17, 2013",A22EWLW1PM53PE,Breathe,0.0


In [23]:
df2=train.toPandas()

In [24]:
import ast
def div(x):
    lit= ast.literal_eval(x)
    if lit[1]!=0:
        return lit[0]/lit[1]
    else:
        return 0

In [25]:
df2

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,summary,HelpfulRecords
0,B00J4S6YWC,"[0, 0]",5,ARC provided by author in exchange for an hone...,"06 21, 2014",AUSBN91MCI3WM,A Very Sexy Cruise,0
1,B00HCZUBH8,"[2, 4]",5,Wild Ride by Nancy WarrenChanging Gears Series...,"03 3, 2014",A141H51I3H4B1S,A Changing Gears Novel,"[2, 4]"
2,B006RZNR3Y,"[0, 0]",5,"Well thought out story, with many things going...","07 10, 2014",AP8TKDM76TROZ,"""We don't take kindly to """"no""""!""",0
3,B00J47H8H8,"[3, 4]",3,This is book four of a five part serial. By n...,"03 21, 2014",A19DWIC1T7127Y,I'm losing interest,"[3, 4]"
4,B00LRZLRMM,"[0, 0]",5,I really enjoyed this book. It kept me interes...,"07 14, 2014",AM5P5MI4PU2KH,Boner Brothers,0
...,...,...,...,...,...,...,...,...
978,B00KA0AGJK,"[0, 1]",4,I loved the first BA novel and had been waitin...,"05 13, 2014",A334K0B8FVV77A,Fair follow up,"[0, 1]"
979,B00IXWHY5Q,"[1, 1]",5,"There are 3 novellas in this, each one is a st...","04 2, 2014",A15FF9W3JCFXV6,Off the world collection,"[1, 1]"
980,B00ASP8XNS,"[0, 0]",4,This was a good novel to read. Wished it was a...,"03 29, 2014",A2OXH8SW5T7PHJ,Knowing his secret,0
981,B008O0QUU2,"[0, 0]",5,Kristen Ashley came through once again! Breath...,"04 17, 2013",A22EWLW1PM53PE,Breathe,0


In [26]:
df2["HelpfulRecords"]=df2["helpful"].apply(div)

In [27]:
df2

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,summary,HelpfulRecords
0,B00J4S6YWC,"[0, 0]",5,ARC provided by author in exchange for an hone...,"06 21, 2014",AUSBN91MCI3WM,A Very Sexy Cruise,0.00
1,B00HCZUBH8,"[2, 4]",5,Wild Ride by Nancy WarrenChanging Gears Series...,"03 3, 2014",A141H51I3H4B1S,A Changing Gears Novel,0.50
2,B006RZNR3Y,"[0, 0]",5,"Well thought out story, with many things going...","07 10, 2014",AP8TKDM76TROZ,"""We don't take kindly to """"no""""!""",0.00
3,B00J47H8H8,"[3, 4]",3,This is book four of a five part serial. By n...,"03 21, 2014",A19DWIC1T7127Y,I'm losing interest,0.75
4,B00LRZLRMM,"[0, 0]",5,I really enjoyed this book. It kept me interes...,"07 14, 2014",AM5P5MI4PU2KH,Boner Brothers,0.00
...,...,...,...,...,...,...,...,...
978,B00KA0AGJK,"[0, 1]",4,I loved the first BA novel and had been waitin...,"05 13, 2014",A334K0B8FVV77A,Fair follow up,0.00
979,B00IXWHY5Q,"[1, 1]",5,"There are 3 novellas in this, each one is a st...","04 2, 2014",A15FF9W3JCFXV6,Off the world collection,1.00
980,B00ASP8XNS,"[0, 0]",4,This was a good novel to read. Wished it was a...,"03 29, 2014",A2OXH8SW5T7PHJ,Knowing his secret,0.00
981,B008O0QUU2,"[0, 0]",5,Kristen Ashley came through once again! Breath...,"04 17, 2013",A22EWLW1PM53PE,Breathe,0.00


In [28]:
df2['weight'] = df2['HelpfulRecords'] + 1
df2

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,summary,HelpfulRecords,weight
0,B00J4S6YWC,"[0, 0]",5,ARC provided by author in exchange for an hone...,"06 21, 2014",AUSBN91MCI3WM,A Very Sexy Cruise,0.00,1.00
1,B00HCZUBH8,"[2, 4]",5,Wild Ride by Nancy WarrenChanging Gears Series...,"03 3, 2014",A141H51I3H4B1S,A Changing Gears Novel,0.50,1.50
2,B006RZNR3Y,"[0, 0]",5,"Well thought out story, with many things going...","07 10, 2014",AP8TKDM76TROZ,"""We don't take kindly to """"no""""!""",0.00,1.00
3,B00J47H8H8,"[3, 4]",3,This is book four of a five part serial. By n...,"03 21, 2014",A19DWIC1T7127Y,I'm losing interest,0.75,1.75
4,B00LRZLRMM,"[0, 0]",5,I really enjoyed this book. It kept me interes...,"07 14, 2014",AM5P5MI4PU2KH,Boner Brothers,0.00,1.00
...,...,...,...,...,...,...,...,...,...
978,B00KA0AGJK,"[0, 1]",4,I loved the first BA novel and had been waitin...,"05 13, 2014",A334K0B8FVV77A,Fair follow up,0.00,1.00
979,B00IXWHY5Q,"[1, 1]",5,"There are 3 novellas in this, each one is a st...","04 2, 2014",A15FF9W3JCFXV6,Off the world collection,1.00,2.00
980,B00ASP8XNS,"[0, 0]",4,This was a good novel to read. Wished it was a...,"03 29, 2014",A2OXH8SW5T7PHJ,Knowing his secret,0.00,1.00
981,B008O0QUU2,"[0, 0]",5,Kristen Ashley came through once again! Breath...,"04 17, 2013",A22EWLW1PM53PE,Breathe,0.00,1.00


In [29]:
df2['weighted_rating'] = df2['overall'] * df2['weight']
df2

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,summary,HelpfulRecords,weight,weighted_rating
0,B00J4S6YWC,"[0, 0]",5,ARC provided by author in exchange for an hone...,"06 21, 2014",AUSBN91MCI3WM,A Very Sexy Cruise,0.00,1.00,5.00
1,B00HCZUBH8,"[2, 4]",5,Wild Ride by Nancy WarrenChanging Gears Series...,"03 3, 2014",A141H51I3H4B1S,A Changing Gears Novel,0.50,1.50,7.50
2,B006RZNR3Y,"[0, 0]",5,"Well thought out story, with many things going...","07 10, 2014",AP8TKDM76TROZ,"""We don't take kindly to """"no""""!""",0.00,1.00,5.00
3,B00J47H8H8,"[3, 4]",3,This is book four of a five part serial. By n...,"03 21, 2014",A19DWIC1T7127Y,I'm losing interest,0.75,1.75,5.25
4,B00LRZLRMM,"[0, 0]",5,I really enjoyed this book. It kept me interes...,"07 14, 2014",AM5P5MI4PU2KH,Boner Brothers,0.00,1.00,5.00
...,...,...,...,...,...,...,...,...,...,...
978,B00KA0AGJK,"[0, 1]",4,I loved the first BA novel and had been waitin...,"05 13, 2014",A334K0B8FVV77A,Fair follow up,0.00,1.00,4.00
979,B00IXWHY5Q,"[1, 1]",5,"There are 3 novellas in this, each one is a st...","04 2, 2014",A15FF9W3JCFXV6,Off the world collection,1.00,2.00,10.00
980,B00ASP8XNS,"[0, 0]",4,This was a good novel to read. Wished it was a...,"03 29, 2014",A2OXH8SW5T7PHJ,Knowing his secret,0.00,1.00,4.00
981,B008O0QUU2,"[0, 0]",5,Kristen Ashley came through once again! Breath...,"04 17, 2013",A22EWLW1PM53PE,Breathe,0.00,1.00,5.00


In [30]:
new_df = df2.groupby(['asin']).agg({'weight': 'sum','weighted_rating': 'sum'})
new_df

Unnamed: 0_level_0,weight,weighted_rating
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B001892EIS,1.0,4.0
B001B4G89Q,1.0,4.0
B001E50WMG,1.0,5.0
B001TJ1O4W,1.0,2.0
B001VLXML6,1.0,3.0
...,...,...
B00LK2SJVE,1.0,5.0
B00LPPDXC6,1.0,5.0
B00LR3JXFC,2.0,10.0
B00LRZLRMM,1.0,5.0


In [31]:
# build weighted_rating column
new_df['group_weighted_rating'] = new_df['weighted_rating'] / new_df['weight'] 
new_df

Unnamed: 0_level_0,weight,weighted_rating,group_weighted_rating
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B001892EIS,1.0,4.0,4.0
B001B4G89Q,1.0,4.0,4.0
B001E50WMG,1.0,5.0,5.0
B001TJ1O4W,1.0,2.0,2.0
B001VLXML6,1.0,3.0,3.0
...,...,...,...
B00LK2SJVE,1.0,5.0,5.0
B00LPPDXC6,1.0,5.0,5.0
B00LR3JXFC,2.0,10.0,5.0
B00LRZLRMM,1.0,5.0,5.0


In [32]:
new_df

Unnamed: 0_level_0,weight,weighted_rating,group_weighted_rating
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B001892EIS,1.0,4.0,4.0
B001B4G89Q,1.0,4.0,4.0
B001E50WMG,1.0,5.0,5.0
B001TJ1O4W,1.0,2.0,2.0
B001VLXML6,1.0,3.0,3.0
...,...,...,...
B00LK2SJVE,1.0,5.0,5.0
B00LPPDXC6,1.0,5.0,5.0
B00LR3JXFC,2.0,10.0,5.0
B00LRZLRMM,1.0,5.0,5.0


In [33]:
new_df = new_df.drop(['weighted_rating','weight'], axis = 1)
new_df = new_df.rename(columns = {'group_weighted_rating':'weighted_rating'})
new_df

Unnamed: 0_level_0,weighted_rating
asin,Unnamed: 1_level_1
B001892EIS,4.0
B001B4G89Q,4.0
B001E50WMG,5.0
B001TJ1O4W,2.0
B001VLXML6,3.0
...,...
B00LK2SJVE,5.0
B00LPPDXC6,5.0
B00LR3JXFC,5.0
B00LRZLRMM,5.0


In [34]:
df_merge = new_df.reset_index()
df_merge

Unnamed: 0,asin,weighted_rating
0,B001892EIS,4.0
1,B001B4G89Q,4.0
2,B001E50WMG,5.0
3,B001TJ1O4W,2.0
4,B001VLXML6,3.0
...,...,...
961,B00LK2SJVE,5.0
962,B00LPPDXC6,5.0
963,B00LR3JXFC,5.0
964,B00LRZLRMM,5.0


In [45]:
df2 = df2.drop(['weighted_rating'], axis = 1)

In [52]:
import pandas as pd
result_df = pd.concat([df2, df_merge], keys=['asin'], join='outer')
result_df

Unnamed: 0,Unnamed: 1,asin,helpful,overall,reviewText,reviewTime,reviewerID,summary,HelpfulRecords,weight
asin,0,B00J4S6YWC,"[0, 0]",5,ARC provided by author in exchange for an hone...,"06 21, 2014",AUSBN91MCI3WM,A Very Sexy Cruise,0.00,1.00
asin,1,B00HCZUBH8,"[2, 4]",5,Wild Ride by Nancy WarrenChanging Gears Series...,"03 3, 2014",A141H51I3H4B1S,A Changing Gears Novel,0.50,1.50
asin,2,B006RZNR3Y,"[0, 0]",5,"Well thought out story, with many things going...","07 10, 2014",AP8TKDM76TROZ,"""We don't take kindly to """"no""""!""",0.00,1.00
asin,3,B00J47H8H8,"[3, 4]",3,This is book four of a five part serial. By n...,"03 21, 2014",A19DWIC1T7127Y,I'm losing interest,0.75,1.75
asin,4,B00LRZLRMM,"[0, 0]",5,I really enjoyed this book. It kept me interes...,"07 14, 2014",AM5P5MI4PU2KH,Boner Brothers,0.00,1.00
asin,...,...,...,...,...,...,...,...,...,...
asin,978,B00KA0AGJK,"[0, 1]",4,I loved the first BA novel and had been waitin...,"05 13, 2014",A334K0B8FVV77A,Fair follow up,0.00,1.00
asin,979,B00IXWHY5Q,"[1, 1]",5,"There are 3 novellas in this, each one is a st...","04 2, 2014",A15FF9W3JCFXV6,Off the world collection,1.00,2.00
asin,980,B00ASP8XNS,"[0, 0]",4,This was a good novel to read. Wished it was a...,"03 29, 2014",A2OXH8SW5T7PHJ,Knowing his secret,0.00,1.00
asin,981,B008O0QUU2,"[0, 0]",5,Kristen Ashley came through once again! Breath...,"04 17, 2013",A22EWLW1PM53PE,Breathe,0.00,1.00


In [47]:
df_spark = spark.createDataFrame(result_df)
df_spark.show()

+--------------+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+------+
|HelpfulRecords|      asin|helpful|overall|          reviewText|          reviewTime|          reviewerID|             summary|weight|
+--------------+----------+-------+-------+--------------------+--------------------+--------------------+--------------------+------+
|           0.0|B00J4S6YWC| [0, 0]|      5|ARC provided by a...|         06 21, 2014|       AUSBN91MCI3WM|  A Very Sexy Cruise|   1.0|
|           0.5|B00HCZUBH8| [2, 4]|      5|Wild Ride by Nanc...|          03 3, 2014|      A141H51I3H4B1S|A Changing Gears ...|   1.5|
|           0.0|B006RZNR3Y| [0, 0]|      5|Well thought out ...|         07 10, 2014|       AP8TKDM76TROZ|"We don't take ki...|   1.0|
|          0.75|B00J47H8H8| [3, 4]|      3|This is book four...|         03 21, 2014|      A19DWIC1T7127Y| I'm losing interest|  1.75|
|           0.0|B00LRZLRMM| [0, 0]|      5|I really enj

In [38]:
 #df2.show(20)

In [39]:
# type(df2)