<a href="https://colab.research.google.com/github/kk-deng/Big-Data-Challenge/blob/main/Big_Data_Level_1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Config for Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.2/spark-3.0.2-bin-hadoop2.7.tgz
!tar xf spark-3.0.2-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.2-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession 

spark = SparkSession.builder.appName("Basics").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

# For connection to Postgres 
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2021-03-21 02:53:00--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2021-03-21 02:53:00 (10.0 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [3]:
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Gift_Card_v1_00.tsv.gz"

from pyspark import SparkFiles
spark.sparkContext.addFile(url)
spark_df = spark.read.csv(SparkFiles.get("amazon_reviews_us_Gift_Card_v1_00.tsv.gz"), sep="\t", header=True)
spark_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   24371595|R27ZP1F1CD0C3Y|B004LLIL5A|     346014806|Amazon eGift Card...|       Gift Card|          5|            0|          0|   N|                Y|          Five Stars|Great birthday gi...| 2015-08-31|
|         US|   42489718| RJ7RSBCHUDNNE|B004LLIKVU|     473048287|Amazon.com eGift ...|       Gift Card|          5|    

In [4]:
# Outputting the number of rows
spark_df.count()

149086

# Transform to match the table schema

In [5]:
# Removed duplicate rows
spark_df = spark_df.dropDuplicates()
spark_df.count()

149086

In [6]:
# Keep and rename necessary columns
review_id_df = spark_df.select(["review_id", "customer_id", "product_id", "product_parent", "review_date"])
review_id_df.show()

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
| R611C9E5BW4HG|   27816201|B004LLIKVU|     473048287| 2015-08-29|
| RXUB6Z9UQTAIP|   38605041|B00B2TFURQ|     527289417| 2015-08-26|
|R139JH0UAW526N|   48625966|B0145WHYKC|     473048287| 2015-08-24|
|R18SJ9HP6Z6STH|   16730696|B00BWDH368|     473048287| 2015-08-23|
|R1BFDNRYHMMGP2|   10739227|B00A4EK2XW|     129396926| 2015-08-22|
|R3E3NJPAI9AWCO|   14943029|B00BWDH368|     473048287| 2015-08-20|
| R1V2NT6FJ98IC|   45686995|B00G4IURXW|     606034204| 2015-08-19|
|R36UOU2AGKRGCL|   18487107|B004LLIKVU|     473048287| 2015-08-18|
|R3KQOQ7DK5G1KA|     208401|B004LLIL7S|     420267372| 2015-08-18|
|R189L6EAQ9BDZS|   14468969|B004LLIKVU|     473048287| 2015-08-16|
|R2212GYLIVH8UO|    6679426|B00US9QTGM|     298664776| 2015-08-15|
|R2E5JCHJZEWONV|   26913738|B00A48G0D4|     848703272| 2015-08

In [7]:
# For Products table
products_df = spark_df.select(["product_id", "product_title"])
products_df.show()

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B004LLIKVU|Amazon.com eGift ...|
|B00B2TFURQ|Amazon Gift Card ...|
|B0145WHYKC|Amazon.com eGift ...|
|B00BWDH368|Amazon.com eGift ...|
|B00A4EK2XW|Amazon eGift Card...|
|B00BWDH368|Amazon.com eGift ...|
|B00G4IURXW|Amazon eGift Card...|
|B004LLIKVU|Amazon.com eGift ...|
|B004LLIL7S|Amazon eGift Card...|
|B004LLIKVU|Amazon.com eGift ...|
|B00US9QTGM|    Amazon Allowance|
|B00A48G0D4|Amazon eGift Card...|
|B00IX1I3G6|Amazon.com Gift C...|
|B00CT77IQG|Amazon.com eGift ...|
|B007V6ETDK|Amazon eGift Card...|
|B004LLIKVU|Amazon.com eGift ...|
|BT00CTOZG0|Amazon.com $200 G...|
|B0066AZGJI|Amazon eGift Card...|
|B00XO1OZTY|Amazon eGift Card...|
|B004KNWX80|Amazon Gift Card ...|
+----------+--------------------+
only showing top 20 rows



In [8]:
# For Customers table
customers_df = spark_df.groupby("customer_id").agg({
    "customer_id": "count"
    }).withColumnRenamed("count(customer_id)", "customer_count")
customers_df.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   14943029|             1|
|   18725832|             1|
|   34013003|             1|
|    1640914|             1|
|   15529426|             1|
|   40415835|             1|
|   52911287|             1|
|    2382013|             1|
|    2271984|             1|
|    8874786|             1|
|    1258580|             1|
|   29623200|             1|
|   50549783|             1|
|   16117964|             1|
|   44184246|             1|
|   26426615|             1|
|   51033380|             1|
|   13903041|             1|
|   15674100|             1|
|   51695279|             1|
+-----------+--------------+
only showing top 20 rows



In [9]:
# For Review table
reviews_df = spark_df.select(["review_id", "review_headline", "review_body"])
reviews_df.show()

+--------------+--------------------+--------------------+
|     review_id|     review_headline|         review_body|
+--------------+--------------------+--------------------+
| R611C9E5BW4HG|               great|        Classic gift|
| RXUB6Z9UQTAIP|Wonderful instant...|I needed a &#34;T...|
|R139JH0UAW526N|          Five Stars|           Excellent|
|R18SJ9HP6Z6STH|Do you have a fri...|Do you have a fri...|
|R1BFDNRYHMMGP2|PERFECT GIFT........|Desperate to find...|
|R3E3NJPAI9AWCO|Easy transaction....|Easy transaction....|
| R1V2NT6FJ98IC|    Great Gift Idea!|Great gift idea! ...|
|R36UOU2AGKRGCL|          Five Stars|      excellent.....|
|R3KQOQ7DK5G1KA|          Five Stars|                good|
|R189L6EAQ9BDZS|Simple, quick and...|Simple to buy. Ea...|
|R2212GYLIVH8UO|          Five Stars|EXCELLENT SERVICE!!!|
|R2E5JCHJZEWONV|          Five Stars|              Excell|
|R30P7PDF27UKLW|          Five Stars|Very good and arr...|
| RQOVC04LN1IL3|I have no real op...|Since I send them..

In [12]:
# For Vine table
vine_df = spark_df.select(["review_id", "star_rating", "helpful_votes", "total_votes", "vine"])
vine_df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
| R611C9E5BW4HG|          5|            0|          0|   N|
| RXUB6Z9UQTAIP|          5|            0|          0|   N|
|R139JH0UAW526N|          5|            0|          0|   N|
|R18SJ9HP6Z6STH|          5|            0|          0|   N|
|R1BFDNRYHMMGP2|          5|            0|          0|   N|
|R3E3NJPAI9AWCO|          5|            0|          0|   N|
| R1V2NT6FJ98IC|          5|            0|          0|   N|
|R36UOU2AGKRGCL|          5|            0|          0|   N|
|R3KQOQ7DK5G1KA|          5|            0|          0|   N|
|R189L6EAQ9BDZS|          5|            0|          0|   N|
|R2212GYLIVH8UO|          5|            0|          0|   N|
|R2E5JCHJZEWONV|          5|            0|          0|   N|
|R30P7PDF27UKLW|          5|            0|          0|   N|
| RQOVC04LN1IL3|          5|            

# Load data to AWS RDS

In [16]:
server = "big-data-challenge-2.cifpjfaljnoi.ca-central-1.rds.amazonaws.com"
database = "postgres"
port = "5432"
username = "postgres"
password = "postgres"

conn = f"postgres://{username}:{password}@{server}:{port}/{database}"

from sqlalchemy import create_engine 
engine = create_engine(conn, echo=False)

  """)


In [13]:
# Review ID table
pd_review_id_df = review_id_df.toPandas()
pd_review_id_df.head()

Unnamed: 0,review_id,customer_id,product_id,product_parent,review_date
0,R611C9E5BW4HG,27816201,B004LLIKVU,473048287,2015-08-29
1,RXUB6Z9UQTAIP,38605041,B00B2TFURQ,527289417,2015-08-26
2,R139JH0UAW526N,48625966,B0145WHYKC,473048287,2015-08-24
3,R18SJ9HP6Z6STH,16730696,B00BWDH368,473048287,2015-08-23
4,R1BFDNRYHMMGP2,10739227,B00A4EK2XW,129396926,2015-08-22


In [14]:
pd_review_id_df.set_index("review_id", inplace=True)
pd_review_id_df.head()

Unnamed: 0_level_0,customer_id,product_id,product_parent,review_date
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
R611C9E5BW4HG,27816201,B004LLIKVU,473048287,2015-08-29
RXUB6Z9UQTAIP,38605041,B00B2TFURQ,527289417,2015-08-26
R139JH0UAW526N,48625966,B0145WHYKC,473048287,2015-08-24
R18SJ9HP6Z6STH,16730696,B00BWDH368,473048287,2015-08-23
R1BFDNRYHMMGP2,10739227,B00A4EK2XW,129396926,2015-08-22


In [None]:
pd_review_id_df.to_sql("review_id_table", con=engine)