In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
#Establish connection with  POSTGRES
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar


--2020-06-27 00:56:22--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-06-27 00:56:23 (3.11 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [3]:
# Create SparkSession using its connection with Postgres
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("amazonCloudETL").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

## EXTRACT

In [4]:
# Load in file from amazon aws url
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Sports_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

sports_df = spark.read.csv(SparkFiles.get("amazon_reviews_us_Sports_v1_00.tsv.gz"),header= True, sep="\t",inferSchema=True, timestampFormat="yyyy-mm-dd")

# Show DataFrame
sports_df.show(2, truncate=False)

+-----------+-----------+--------------+----------+--------------+-----------------------------------------------------------------------------------------------+----------------+-----------+-------------+-----------+----+-----------------+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|marketplace|customer_id|review_id     |product_id|product_parent|product_title                                                                                  |product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|review_headline  |review_body                                                                                                                                                                   |review_date        |
+-----------+-----------+--------------+----------+--------------+--------------------------

**Number of records in the dataframe**

In [5]:
#Count the number of records (rows) in the dataset
print(sports_df.count())

4850360


Examine the schema

In [6]:
sports_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: timestamp (nullable = true)



## TRANSFORM

In [7]:
#Remove NA values
sports_df2 = sports_df.dropna()
sports_df2.show(2, truncate=False)

+-----------+-----------+--------------+----------+--------------+-----------------------------------------------------------------------------------------------+----------------+-----------+-------------+-----------+----+-----------------+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|marketplace|customer_id|review_id     |product_id|product_parent|product_title                                                                                  |product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|review_headline  |review_body                                                                                                                                                                   |review_date        |
+-----------+-----------+--------------+----------+--------------+--------------------------

In [8]:
#Count the number of records (rows) in the dataset (without NA)
print(sports_df2.count())

4848999


In [10]:
#Change review_date column timestamp format to date
from pyspark.sql.functions import col, to_date

sports_df3 = sports_df2.withColumn("review_date", to_date(col("review_date"),"yyyy-mm-dd"))
sports_df3.show(2, truncate=False)

+-----------+-----------+--------------+----------+--------------+-----------------------------------------------------------------------------------------------+----------------+-----------+-------------+-----------+----+-----------------+-----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|marketplace|customer_id|review_id     |product_id|product_parent|product_title                                                                                  |product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|review_headline  |review_body                                                                                                                                                                   |review_date|
+-----------+-----------+--------------+----------+--------------+------------------------------------------

Create "review_id_table"

In [11]:
#Create df to match "review_id_table"
reviewID_sportsDF = sports_df3.select(["review_id","customer_id","product_id","product_parent","review_date"])
reviewID_sportsDF.show(2, truncate=False)

+--------------+-----------+----------+--------------+-----------+
|review_id     |customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R1WBPB8MDCCN8F|48945260   |B012P7UPSM|409940130     |2015-01-31 |
|R32M0YEWV77XG8|5782091    |B001GQ3VHG|657746679     |2015-01-31 |
+--------------+-----------+----------+--------------+-----------+
only showing top 2 rows



In [12]:
#Validate Schema matches the table established in postgres DB
reviewID_sportsDF.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- review_date: date (nullable = true)



Create "products" table

In [13]:
#Create df to match "products" table
products_sportsDF = sports_df3.select(["product_id","product_title"])
products_sportsDF.show(2, truncate=False)

+----------+-----------------------------------------------------------------------------------------------+
|product_id|product_title                                                                                  |
+----------+-----------------------------------------------------------------------------------------------+
|B012P7UPSM|Chicago Blackhawks Adult Cuff Knit Beanie w/ Pom One Size Fits All NHL Authentic Hat Cap - OSFA|
|B001GQ3VHG|Copag Poker Size Regular Index 1546 Playing Cards 2 decks (Black Gold Setup)                   |
+----------+-----------------------------------------------------------------------------------------------+
only showing top 2 rows



In [14]:
#Drop duplicates to contain only unique values
products_sportsDF = products_sportsDF.dropDuplicates()
# products_sportsDF.count()

In [15]:
#Validate Schema matches the table established in postgres DB
products_sportsDF.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_title: string (nullable = true)



Create "customers" table

In [16]:
#Create df to match "customers" table
#First, create "customer_count" column
customers_count = sports_df3.groupBy("customer_id").count() #creates a df with 2 columns, "customer_id" and "count"

customers_sportsDF = customers_count.withColumnRenamed("count","customer_count")
customers_sportsDF.show(2,truncate= False)

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|35669025   |1             |
|48198949   |30            |
+-----------+--------------+
only showing top 2 rows



In [17]:
#Validate Schema matches the table established in postgres DB
customers_sportsDF.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_count: long (nullable = false)



In [18]:
#Format customer_count column. Change from Long to Integer.
from pyspark.sql.types import IntegerType

customers_sportsDF = customers_sportsDF.withColumn("customer_count",col("customer_count").cast(IntegerType()))

#Validate schema
customers_sportsDF.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_count: integer (nullable = false)



Create "vine" table

In [19]:
#Create df to match "vine" table
vine_sportsDF = sports_df3.select(["review_id","star_rating","helpful_votes","total_votes","vine"])
vine_sportsDF.show(5, truncate=False)

+--------------+-----------+-------------+-----------+----+
|review_id     |star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1WBPB8MDCCN8F|5          |0            |0          |N   |
|R32M0YEWV77XG8|5          |1            |1          |N   |
|RR8V7WR27NXJ5 |1          |0            |0          |N   |
|R1MHO5V9Z932AY|5          |0            |0          |N   |
|R16PD71086BD2V|5          |0            |1          |N   |
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [20]:
#Validate Schema matches the table established in postgres DB
vine_sportsDF.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)



In [21]:
#Format star_rating column. Change from String to Integer.
from pyspark.sql.types import IntegerType

vine_sportsDF = vine_sportsDF.withColumn("star_rating",col("star_rating").cast(IntegerType()))
vine_sportsDF.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)



## LOAD

Postgres Setup

In [22]:
# Configure settings for RDS

mode = "append"
jdbc_url="jdbc:postgresql://mypostgresdb.cw6xrdxbjex8.us-east-2.rds.amazonaws.com:5432/bigdataHW_db" 

config = {"user":"root", "password":"basededatos", "driver":"org.postgresql.Driver"}

Write DataFrame to RDS

In [28]:
# Write DataFrame to review_id_table table in RDS

reviewID_sportsDF.write.jdbc(url=jdbc_url, table='review_id_table', mode=mode, properties=config)

In [24]:
# Write DataFrame to products table in RDS

products_sportsDF.write.jdbc(url=jdbc_url, table='products', mode=mode, properties=config)

In [25]:
# Write DataFrame to customers table in RDS

customers_sportsDF.write.jdbc(url=jdbc_url, table='customers', mode=mode, properties=config)

In [26]:
# Write DataFrame to vine_table table in RDS

vine_sportsDF.write.jdbc(url=jdbc_url, table='vine_table', mode=mode, properties=config)