In [1]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql.functions import to_date
import pyspark.sql.functions as f
import pandas as pd

In [2]:
# Start a Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("sc")
    .getOrCreate()
)

## Mobile Electronics

In [3]:
# Read in data: register the remote gzip-compressed TSV with Spark, then load the
# local copy that SparkFiles resolves, inferring column types from the contents.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz"), sep="\t", header=True, inferSchema=True)

# Convert review_date to a date column.
# The pattern must be 'yyyy-MM-dd': in Spark datetime patterns 'MM' is month while
# 'mm' is minute-of-hour, and the dates use '-' separators (e.g. 2015-08-31).
# The previous 'yyyy/mm/dd' pattern only appeared to work when the column had
# already been inferred as a date/timestamp; on a string column it cannot parse.
df = df.withColumn("review_date", to_date(df["review_date"], 'yyyy-MM-dd'))
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|  product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   20422322| R8MEA6IGAHO0B|B00MC4CED8|     217304173|BlackVue DR600GW-PMP|Mobile_Electronics|          5|            0|          0|   N|                Y|         Very Happy!|As advertised. Ev...| 2015-08-31|
|         US|   40835037|R31LOQ8JGLPRLK|B00OQMFG1Q|     137313254|GENSSI GSM / GPS ...|Mobile_Electronics|      

In [4]:
# Build the vine table: just the review id, rating, vote counts and vine flag,
# with exact duplicate rows removed
vine_columns = ["review_id", "star_rating", "helpful_votes", "total_votes", "vine"]
vine = df.select(vine_columns).distinct()
vine.show()
print(vine.count())

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1CUNSRHBYLK9Z|          5|            0|          0|   N|
|R1R92GU8HX3MS3|          5|            0|          0|   N|
|R3T04LU0Y3ISRP|          5|            0|          0|   N|
|R35NXSJYS0KO21|          5|            2|          2|   N|
|R34AP4T5722YRA|          5|            0|          0|   N|
|R1OZ2C46Y5YWYA|          4|            0|          0|   N|
|R2LCMB4NBNV8QA|          4|            0|          0|   N|
|R33JXV99QFS9RD|          3|            0|          0|   N|
|R1Y3W0LSNMJLMA|          5|            0|          0|   N|
|R3NCC7HYC1RQXW|          3|            0|          0|   N|
|R3RW7L5N5DH1ZO|          1|            0|          0|   N|
| R7WE4DW38VLTF|          5|            0|          0|   N|
|R1N4TO70TA41N5|          2|            0|          0|   N|
|R155SD2MBVL3G7|          5|            

In [5]:
# Collect the Spark vine table into a pandas DataFrame and discard every row
# that has a null in any column
vine_df = vine.toPandas().dropna(axis=0)

vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine
0,R1CUNSRHBYLK9Z,5,0,0,N
1,R1R92GU8HX3MS3,5,0,0,N
2,R3T04LU0Y3ISRP,5,0,0,N
3,R35NXSJYS0KO21,5,2,2,N
4,R34AP4T5722YRA,5,0,0,N


In [6]:
# Non-null count per column; after the dropna above each of these equals the row count
vine_df.count(axis=0)

review_id        104975
star_rating      104975
helpful_votes    104975
total_votes      104975
vine             104975
dtype: int64

In [7]:
# Calculate average star_rating, # 5-star reviews, helpful votes, and total_votes by vine status
by_vine = vine_df.groupby("vine")
avg_star_rating = by_vine["star_rating"].mean()
# Sum a boolean "is 5-star" mask per vine status instead of filtering first:
# a status with no 5-star reviews then reports 0 rather than dropping out of the
# result and showing up as NaN when the summary table is assembled.
num_five_star_reviews = vine_df["star_rating"].eq(5).groupby(vine_df["vine"]).sum()
avg_helpful_votes = by_vine["helpful_votes"].mean()
avg_total_votes = by_vine["total_votes"].mean()

In [8]:
# Assemble the per-vine-status metrics into one summary table (one row per vine value)
vine_summary = pd.DataFrame(
    dict(
        avg_star_rating=avg_star_rating,
        num_five_star_reviews=num_five_star_reviews,
        avg_helpful_votes=avg_helpful_votes,
        avg_total_votes=avg_total_votes,
    )
)
vine_summary

Unnamed: 0_level_0,avg_star_rating,num_five_star_reviews,avg_helpful_votes,avg_total_votes
vine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3.763141,52249,1.237345,1.607125
Y,3.888889,6,41.888889,50.5


## Major Appliances

In [9]:
# Read in data: register the remote gzip-compressed TSV with Spark, then load the
# local copy that SparkFiles resolves, inferring column types from the contents.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Major_Appliances_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("amazon_reviews_us_Major_Appliances_v1_00.tsv.gz"), sep="\t", header=True, inferSchema=True)

# Convert review_date to a date column.
# The pattern must be 'yyyy-MM-dd': in Spark datetime patterns 'MM' is month while
# 'mm' is minute-of-hour, and the dates use '-' separators (e.g. 2015-08-31).
# The previous 'yyyy/mm/dd' pattern only appeared to work when the column had
# already been inferred as a date/timestamp; on a string column it cannot parse.
df = df.withColumn("review_date", to_date(df["review_date"], 'yyyy-MM-dd'))
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   16199106|R203HPW78Z7N4K|B0067WNSZY|     633038551|FGGF3032MW Galler...|Major Appliances|          5|            0|          0|   N|                Y|If you need a new...|What a great stov...| 2015-08-31|
|         US|   16374060|R2EAIGVLEALSP3|B002QSXK60|     811766671|Best Hand Clothes...|Major Appliances|          5|    

In [10]:
# Build the vine table: just the review id, rating, vote counts and vine flag,
# with exact duplicate rows removed
vine_columns = ["review_id", "star_rating", "helpful_votes", "total_votes", "vine"]
vine = df.select(vine_columns).distinct()
vine.show()
print(vine.count())

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
| RY52KZABZK8QF|          1|            0|          0|   N|
|R2U9J9KVB8S140|          5|            1|          1|   N|
|R145WADN41BMII|          3|            0|          1|   N|
| R1EGFDXPYGROC|          5|           28|         32|   N|
|R3LSXWYAIE6WBG|          1|            0|          2|   N|
|R1UW7UNV8WVVDM|          5|            8|         12|   N|
| RLF34QU0SEFIC|          4|            0|          0|   N|
|R36FQD22P658I4|          5|            0|          0|   N|
|R3LTMSZ0JFM20H|          5|            0|          0|   N|
|R21RFI3YONUSXF|          2|            0|          1|   N|
|R1YYVIO5OG2AQ0|          5|            1|          2|   N|
| R7W2CF7W8LM1N|          5|            0|          0|   N|
|R2WNIYQ3L9MN9Q|          1|            4|          4|   N|
| RG9X7NVNUBZ0V|          1|            

In [11]:
# Collect the Spark vine table into a pandas DataFrame and discard every row
# that has a null in any column
vine_df = vine.toPandas().dropna(axis=0)

vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine
0,RY52KZABZK8QF,1,0,0,N
1,R2U9J9KVB8S140,5,1,1,N
2,R145WADN41BMII,3,0,1,N
3,R1EGFDXPYGROC,5,28,32,N
4,R3LSXWYAIE6WBG,1,0,2,N


In [12]:
# Non-null count per column; after the dropna above each of these equals the row count
vine_df.count(axis=0)

review_id        96901
star_rating      96901
helpful_votes    96901
total_votes      96901
vine             96901
dtype: int64

In [13]:
# Calculate average star_rating, # 5-star reviews, helpful votes, and total_votes by vine status
by_vine = vine_df.groupby("vine")
avg_star_rating = by_vine["star_rating"].mean()
# Sum a boolean "is 5-star" mask per vine status instead of filtering first:
# a status with no 5-star reviews then reports 0 rather than dropping out of the
# result and showing up as NaN when the summary table is assembled.
num_five_star_reviews = vine_df["star_rating"].eq(5).groupby(vine_df["vine"]).sum()
avg_helpful_votes = by_vine["helpful_votes"].mean()
avg_total_votes = by_vine["total_votes"].mean()

In [14]:
# Assemble the per-vine-status metrics into one summary table (one row per vine value)
vine_summary = pd.DataFrame(
    dict(
        avg_star_rating=avg_star_rating,
        num_five_star_reviews=num_five_star_reviews,
        avg_helpful_votes=avg_helpful_votes,
        avg_total_votes=avg_total_votes,
    )
)
vine_summary

Unnamed: 0_level_0,avg_star_rating,num_five_star_reviews,avg_helpful_votes,avg_total_votes
vine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,3.714805,49592,4.315779,5.173528
Y,4.254032,112,14.274194,16.33871
