In [1]:
import os

# latest spark version: 3.3.1
spark_version = 'spark-3.3.1'
os.environ['SPARK_VERSION']=spark_version

# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2022-11-16 20:45:32--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2022-11-16 20:45:32 (10.5 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("CloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [4]:
from pyspark import SparkFiles

# load in the file we wanna read
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# create the DataFrame
data_DF = spark.read.option('header','true').csv(SparkFiles.get("amazon_reviews_us_Video_Games_v1_00.tsv.gz"), inferSchema=True, sep='\t', timestampFormat="yyyy/mm/dd")
data_DF.show(10)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   12039526| RTIS3L2M1F5SM|B001CXYMFS|     737716809|Thrustmaster T-Fl...|     Video Games|          5|            0|          0|   N|                Y|an amazing joysti...|Used this for Eli...| 2015-08-31|
|         US|    9636577| R1ZV7R40OLHKD|B00M920ND6|     569686175|Tonsee 6 buttons ...|     Video Games|          5|    

are Vine reviews truly trustworthy? Your task is to investigate whether Vine reviews are free of bias. Use either PySpark or, for an extra challenge, SQL to analyze the data.

* If you choose SQL, first use Spark on Colab to extract and transform the data and then load it into a SQL table on your RDS account. Perform your analysis with SQL queries on RDS.

* While there are no strict requirements for the analysis, consider steps you can take to reduce noisy data, such as filtering for reviews that meet a certain number of helpful votes, total votes, or both.

* Submit a summary of your findings and analysis.

In [6]:
# select the columns we want to compare: star rating, helpful votes, total votes, vine and verified purchase
df_select = data_DF.select(["star_rating", "helpful_votes", "total_votes", "vine", "verified_purchase"])
df_select.show()

+-----------+-------------+-----------+----+-----------------+
|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+-----------+-------------+-----------+----+-----------------+
|          5|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          1|            0|          1|   N|                Y|
|          3|            0|          0|   N|                Y|
|          4|            0|          0|   N|                Y|
|          1|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          4|            0|          0|   N|                Y|
|          5|            0|          0|   N|                N|
|          1|            0|          0|   N|                Y|
|          2|            0|          0|   N|           

In [7]:
# drop N/A and Duplicates
df_select = df_select.dropna(how='any')
df_select.dropDuplicates()
df_select.show()

+-----------+-------------+-----------+----+-----------------+
|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+-----------+-------------+-----------+----+-----------------+
|          5|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          1|            0|          1|   N|                Y|
|          3|            0|          0|   N|                Y|
|          4|            0|          0|   N|                Y|
|          1|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          5|            0|          0|   N|                Y|
|          4|            0|          0|   N|                Y|
|          5|            0|          0|   N|                N|
|          1|            0|          0|   N|                Y|
|          2|            0|          0|   N|           

In [8]:
# filter if there are over 10 total votes and if the average of helpful votes is over .5
df_vine = df_select.filter("total_votes>=10").filter(df_select["helpful_votes"]/df_select["total_votes"] >= 0.5)
df_vine.show()

+-----------+-------------+-----------+----+-----------------+
|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+-----------+-------------+-----------+----+-----------------+
|          1|           21|         34|   N|                N|
|          1|           21|         35|   N|                Y|
|          5|           16|         17|   N|                Y|
|          1|           13|         15|   N|                N|
|          5|           14|         18|   N|                Y|
|          1|          147|        175|   N|                Y|
|          4|           13|         15|   N|                N|
|          2|           55|         60|   N|                N|
|          1|           51|         65|   N|                Y|
|          1|           14|         16|   N|                Y|
|          4|           31|         36|   N|                N|
|          1|           10|         16|   N|                Y|
|          5|           10|         12|   N|           

## Now we start the analysis

In [9]:
# setting viariables for if the customer is a vine reviewer or not
from pyspark.sql.functions import col, avg
df_paid = df_vine.filter("vine='Y'")
df_unpaid = df_vine.filter("vine='N'")

In [10]:
# show statistics with .describe() for the paid vine reviewers
df_paid.describe().show()

+-------+------------------+------------------+------------------+----+-----------------+
|summary|       star_rating|     helpful_votes|       total_votes|vine|verified_purchase|
+-------+------------------+------------------+------------------+----+-----------------+
|  count|               209|               209|               209| 209|              209|
|   mean| 4.100478468899522| 30.38755980861244|35.047846889952154|null|             null|
| stddev|0.9876398444142312|48.892720457429824|  52.0989960745545|null|             null|
|    min|                 1|                 5|                10|   Y|                N|
|    max|                 5|               347|               362|   Y|                Y|
+-------+------------------+------------------+------------------+----+-----------------+



In [11]:
# show statistics with .describe() for the unpaid reviewers
df_unpaid.describe().show()

+-------+------------------+-----------------+------------------+-----+-----------------+
|summary|       star_rating|    helpful_votes|       total_votes| vine|verified_purchase|
+-------+------------------+-----------------+------------------+-----+-----------------+
|  count|             97866|            97866|             97866|97866|            97866|
|   mean|3.3428974311814112|25.77585678376556|30.945905626060124| null|             null|
| stddev|1.6335240548991206|77.77514663937731| 84.59013094414958| null|             null|
|    min|                 1|                5|                10|    N|                N|
|    max|                 5|            10498|             10780|    N|                Y|
+-------+------------------+-----------------+------------------+-----+-----------------+



### 5 star Paid Reviews

In [12]:
# counting vine reviewers who gave 5 stars to products
five_star = df_paid[df_paid['star_rating'] == 5].count()
five_star

93

In [21]:
# counting all star levels given by vine reviewers
all_paid = df_paid.count()
all_paid

209

In [16]:
# percent of 5 star reviewers in Vine
pct_paid = (five_star/all_paid)*100
print(f"% of paid 5 star Vine reviewers: {pct_paid}")

% of paid 5 star Vine reviewers: 44.49760765550239


### 5 star Unpaid Reviews

In [17]:
# counting unpaid reviewers who gave 5 stars to products
unpaid_five_star = df_unpaid[df_paid['star_rating'] == 5].count()
unpaid_five_star

37451

In [18]:
# counting all star levels given by unpaid reviewers
all_unpaid = df_unpaid.count()
all_unpaid

97866

In [19]:
# percent of 5 star reviewers who were not paid
pct_unpaid = (unpaid_five_star/all_unpaid)*100
print(f"% of paid 5 star unpaid reviewers: {pct_unpaid}")

% of paid 5 star unpaid reviewers: 38.26763125089408


### Vine Analysis

Are Vine reviews truly trustworthy when it comes to Video Games?

* Percent of 5 Star Ratings:
  * 45% of Vine Reviewers
  * 38% of Unpaid Reviewers
As almost half of Vine reviewers give 5 star reviews, I would be cautious to trust their opinions.

* The Average Total Votes show as:
  * 35 for Vine reviewers
  * 30 for Unpaid reviewers
The Average total votes are close in number which is great baseline for the rest of our comparison.

* The Standard Deviation for Helpful Votes shows as:
  *  30 for Paid Helpful Votes 
  *  77 for Unpaid Helpful Votes
For this metric, the Unpaid reviews have a wider SD so a bit more unreliable than the Vine reviews.
  
* The Average Star Rating:
  * 4.1 for Vine Reviewers
  * 3.3 for Unpaid Reviewers
While these numbers may look close together, we must remember this is a 5 star scale. Based on this alone, I'd be more likely to trust the Unpaid reviews.
  
With the information given below, I'd warn agains relying heavily on Vine reviews. 
