In [1]:
import pandas as pd
import pyspark 
from pyspark.sql.functions import col
from pyspark.sql import SQLContext

In [2]:
# Configure a local-master builder, then reuse the active session or start a new one.
session_builder = pyspark.sql.SparkSession.builder.master("local")
spark = session_builder.getOrCreate()

Loading in the Amazon Instant Video reviews (and their metadata) along with the already cleaned comic reviews.

In [3]:
# Read the three JSON inputs in order: video reviews, video metadata, comic reviews.
video_reviews_df, meta_video_reviews, comic_reviews_df = [
    spark.read.json(json_path)
    for json_path in (
        'data/reviews_Amazon_Instant_Video.json',
        'data/meta_Amazon_Instant_Video.json',
        'data/comic_reviews_wtitle.json',
    )
]

In [4]:
# Peek at a single row to confirm the comic review columns.
comic_reviews_df.show(n=1)

+----------+--------------------+-------+-------------+--------------------+
|      asin|               imUrl|overall|   reviewerID|               title|
+----------+--------------------+-------+-------------+--------------------+
|0345507460|http://ecx.images...|    5.0|ACO26JQ366659|The Dresden Files...|
+----------+--------------------+-------+-------------+--------------------+
only showing top 1 row



Extracting the comic reviewers' IDs and then keeping only the video reviews written by those reviewers.

In [5]:
# Collect the de-duplicated reviewer IDs from the comic reviews onto the driver (as Row objects).
reviewer_id_column = comic_reviews_df.select('reviewerID')
comic_reviewers = reviewer_id_column.distinct().collect()

In [6]:
# Unwrap each collected Row into its single reviewerID value.
# This cell rebinds `comic_reviewers` from itself, so it must be safe to re-run:
# entries that are no longer Row objects (already unwrapped by an earlier run)
# are passed through unchanged instead of being indexed again, which would
# silently reduce every ID string to its first character.
comic_reviewers = [
    r[0] if isinstance(r, pyspark.sql.Row) else r
    for r in comic_reviewers
]

In [7]:
# Keep only the video reviews whose reviewerID appears in the comic reviewer list.
is_comic_reviewer = col('reviewerID').isin(comic_reviewers)
video_reviews = video_reviews_df.where(is_comic_reviewer)

In [8]:
# Number of Instant Video reviews whose reviewerID is in the comic reviewer list.
video_reviews.count()

5986

In [9]:
# Project down to the three columns needed and convert the result to pandas.
wanted_columns = ('asin', 'overall', 'reviewerID')
all_video_reviews = video_reviews.select(*wanted_columns).toPandas()

Neither title nor image information is available in the Instant Video metadata, so these columns are filled with placeholder values for now.

In [10]:
# Add the title and image URL columns with constant placeholder values,
# in the same order as before (title first, then imUrl).
placeholder_values = {'title': '', 'imUrl': 'N/A - Amazon Instant Video'}
for column_name, placeholder in placeholder_values.items():
    all_video_reviews[column_name] = placeholder

In [11]:
# (rows, columns) of the video reviews frame after adding the two placeholder columns.
all_video_reviews.shape

(5986, 5)

In [12]:
# Preview the first five rows, including the placeholder columns.
all_video_reviews.head(5)

Unnamed: 0,asin,overall,reviewerID,title,imUrl
0,B000GIOPK2,5.0,A2XNOB1T796Y6B,,N/A - Amazon Instant Video
1,B000GIOPK2,5.0,A3T0DNK02KT55Q,,N/A - Amazon Instant Video
2,B000GIOPK2,3.0,AJKWF4W7QD4NS,,N/A - Amazon Instant Video
3,B000H00VBQ,5.0,A16XRPF40679KG,,N/A - Amazon Instant Video
4,B000H0X79O,5.0,A1ZY828BYZGA98,,N/A - Amazon Instant Video


Loading in the already cleaned movie reviews and adding these new reviews to it.

In [13]:
# Read the movie reviews JSON with Spark, then materialise it as a pandas frame.
movie_reviews_spark = spark.read.json('data/movie_reviews_wtitle.json')
movie_reviews_df = movie_reviews_spark.toPandas()

In [14]:
# (rows, columns) of the movie reviews frame before the video reviews are appended.
movie_reviews_df.shape

(103757, 5)

In [15]:
# Stack the movie reviews on top of the video reviews. sort=True orders the
# columns alphabetically, and ignore_index=True gives the result a fresh
# 0..n-1 index.
frames_to_stack = [movie_reviews_df, all_video_reviews]
all_movies_df = pd.concat(frames_to_stack, axis=0, sort=True, ignore_index=True)

In [16]:
# Check the last five rows of the combined frame (the appended video reviews).
all_movies_df.tail(5)

Unnamed: 0,asin,imUrl,overall,reviewerID,title
109738,B00LM493J2,N/A - Amazon Instant Video,4.0,A1S1BJFTA644TU,
109739,B00LPWPMCS,N/A - Amazon Instant Video,4.0,AW3VZ5O895LRK,
109740,B00LPWPMCS,N/A - Amazon Instant Video,4.0,A3DN9249F3X716,
109741,B00LPWPMCS,N/A - Amazon Instant Video,4.0,A2HVL790PBWYTU,
109742,B00LPWPMCS,N/A - Amazon Instant Video,3.0,A2EKSOLTKBGWTK,


Export to preserve

In [17]:
# Convert the combined pandas frame back to a Spark DataFrame and write it out
# as JSON from a single partition (one part file in the output directory).
# mode("overwrite") replaces the output of a previous run; with the default
# error-if-exists save mode this cell fails whenever the notebook is re-run
# and "data/all_movie_reviews" is already present.
all_movies = spark.createDataFrame(all_movies_df)
all_movies.repartition(1).write.mode("overwrite").json("data/all_movie_reviews")