# Import Dependencies

In [1]:
import os

# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.0.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop3.2"

# Start a SparkSession
import findspark
findspark.init()

Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:13 http://ppa.launchpad

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2021-11-14 18:36:52--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.3’


2021-11-14 18:36:52 (5.10 MB/s) - ‘postgresql-42.2.9.jar.3’ saved [914037/914037]



In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the session, with the PostgreSQL JDBC jar on the driver classpath
spark = (
    SparkSession.builder
    .appName("dataset_2_ETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

# Load Amazon Data into Spark DataFrame

In [4]:
# Register the gzipped TSV of e-book reviews (hosted on S3) with the Spark context
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz"
spark.sparkContext.addFile(path=url)

In [5]:
# Explicit schema for the review file: (column name, Spark type) pairs in column order,
# every column nullable
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, DateType

schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("marketplace", StringType()),
        ("customer_id", IntegerType()),
        ("review_id", StringType()),
        ("product_id", StringType()),
        ("product_parent", IntegerType()),
        ("product_title", StringType()),
        ("product_category", StringType()),
        ("star_rating", IntegerType()),
        ("helpful_votes", IntegerType()),
        ("total_votes", IntegerType()),
        ("vine", StringType()),
        ("verified_purchase", StringType()),
        ("review_headline", StringType()),
        ("review_body", StringType()),
        ("review_date", DateType()),
    ]
]

final = StructType(fields=schema)

In [6]:
# Parse the registered tab-separated file with the explicit schema; the first line is a header
ebook_df = spark.read.csv(
    SparkFiles.get("amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tsv.gz"),
    schema=final,
    sep="\t",
    header=True,
)

# Preview the DataFrame without shortening long cell values
ebook_df.show(truncate=False)

+-----------+-----------+--------------+----------+--------------+---------------------------------------------------------------------+----------------------+-----------+-------------+-----------+----+-----------------+---------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Set Tables for Import to Database

In [7]:
# Number of rows in the dataset, counted over the customer_id column
(
    ebook_df
    .select("customer_id")
    .count()
)

5101693

In [8]:
# One row per customer_id, with the number of rows that id appears in as customer_count
customers_df = (
    ebook_df
    .select("customer_id")
    .groupBy("customer_id")
    .count()
    .withColumnRenamed("count", "customer_count")
)
customers_df.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   27134317|           306|
|   36063094|            15|
|   49422311|             1|
|   18978114|             1|
|   13455482|             2|
|   32153206|             1|
|   10274872|             4|
|   20977654|             1|
|   48042607|             1|
|   11376183|             7|
|   31762963|             1|
|   11481508|             4|
|   12362676|             2|
|   31176171|            10|
|   44699860|            23|
|   52734389|             8|
|   24303040|             3|
|   26322451|            77|
|   14521576|            30|
|   44269342|            10|
+-----------+--------------+
only showing top 20 rows



In [9]:
# Print the column names, types and nullability of customers_df
customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_count: long (nullable = false)



In [10]:
# Keep a single row per product_id, then keep only the id and its title
from pyspark.sql import Row

products_df = (
    ebook_df
    .dropDuplicates(["product_id"])
    .select("product_id", "product_title")
)

products_df.show(truncate=False)

+----------+-----------------------------------------------------------------------------------------------------------------------+
|product_id|product_title                                                                                                          |
+----------+-----------------------------------------------------------------------------------------------------------------------+
|B000FA5S66|Killing Cousins (Torie O'Shea Mysteries)                                                                               |
|B000FC0RS8|Mindshadow (Star Trek: The Original Series Book 27)                                                                    |
|B000FC0SBY|No True Gentleman                                                                                                      |
|B000FC0ZB2|World on Fire: How Exporting Free Market Democracy Breeds Ethnic Hatred and Global Instability                         |
|B000FC13O0|Scandalous Again: Switching Places #1 (Switching Places S

In [11]:
# Print the column names, types and nullability of products_df
products_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_title: string (nullable = true)



In [12]:
# Columns that make up the review table: the review id plus its customer, product and date
review_id_table_df = ebook_df.select(
    "review_id",
    "customer_id",
    "product_id",
    "product_parent",
    "review_date",
)
review_id_table_df.show(truncate=False)

+--------------+-----------+----------+--------------+-----------+
|review_id     |customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|RGYFDX8QXKEIR |33605939   |B007KO2MLO|328837464     |2013-09-09 |
|R13CBGTMNV9R8Z|34058393   |B005FLODDE|764276359     |2013-09-09 |
|R7DRFHC0F71O0 |39601147   |B00EA3L35O|535606445     |2013-09-09 |
|R27LUKEXU3KBXQ|17351407   |B00BL3JV50|240053004     |2013-09-09 |
|R1VXTPUYMNU687|10463387   |B00CXU7U80|931529805     |2013-09-09 |
|R30DKW1GJWLPZC|50484904   |B004EWGS5G|442453110     |2013-09-09 |
|R18DPFG2FALJI9|7145636    |B00BNRJAT6|856774152     |2013-09-09 |
|R24D677N5WBW5Q|6285538    |B007FZOXJM|5589837       |2013-09-09 |
|R2FCJ9BQLSIOR3|10278048   |B00B6AK7LU|362701357     |2013-09-09 |
|R1R6K4MAKDWTXI|16568972   |B00EVMMLU0|342745087     |2013-09-09 |
|R3R5DILCWM8J7B|26001763   |B00E4W4984|186117550     |2013-09-09 |
|RR5K72IZOCOFE |48203259   |B005A1JBB8|947574172     |2013-09-

In [13]:
# Print the column names, types and nullability of review_id_table_df
review_id_table_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- review_date: date (nullable = true)



In [14]:
# Columns that make up the vine table: the review id with its rating, vote counts and vine flag
vine_table_df = ebook_df.select(
    "review_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
)
vine_table_df.show(truncate=False)

+--------------+-----------+-------------+-----------+----+
|review_id     |star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|RGYFDX8QXKEIR |4          |0            |0          |N   |
|R13CBGTMNV9R8Z|4          |1            |2          |N   |
|R7DRFHC0F71O0 |5          |0            |0          |N   |
|R27LUKEXU3KBXQ|5          |1            |1          |N   |
|R1VXTPUYMNU687|5          |1            |2          |N   |
|R30DKW1GJWLPZC|3          |1            |2          |N   |
|R18DPFG2FALJI9|5          |0            |0          |N   |
|R24D677N5WBW5Q|5          |0            |0          |N   |
|R2FCJ9BQLSIOR3|5          |0            |0          |N   |
|R1R6K4MAKDWTXI|4          |0            |0          |N   |
|R3R5DILCWM8J7B|5          |0            |0          |N   |
|RR5K72IZOCOFE |4          |0            |0          |N   |
|R3K9PJU5GLDY3O|5          |1            |2          |N   |
|R1KTZMCDOJXAEK|5          |0           

In [15]:
# Print the column names, types and nullability of vine_table_df
vine_table_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)



# Push to AWS RDS Instance

In [16]:
# Configure settings for RDS
# mode: how the JDBC writes treat an existing table ("append" adds rows to it)
mode = "append"
# jdbc_url: PostgreSQL endpoint, port and database name of the target instance
jdbc_url="jdbc:postgresql://mypostgresdb.cqdxhnrowhog.us-east-1.rds.amazonaws.com:5432/homework_level_1_db"
# configuration: connection properties handed to every DataFrame.write.jdbc call below.
# The password is read from the RDS_PASSWORD environment variable when it is set, so the
# real secret never has to be typed into (or saved with) the notebook; "<password>" is
# only a placeholder fallback and will not authenticate.
configuration = {"user": "root",
          "password": os.environ.get("RDS_PASSWORD", "<password>"),
          "driver": "org.postgresql.Driver"}

In [17]:
# Send review_id_table_df to the review_id_table table in RDS over JDBC
review_id_table_df.write.jdbc(
    url=jdbc_url,
    table='review_id_table',
    mode=mode,
    properties=configuration,
)

In [18]:
# Send products_df to the products table in RDS over JDBC
products_df.write.jdbc(
    url=jdbc_url,
    table='products',
    mode=mode,
    properties=configuration,
)

In [19]:
# Send customers_df to the customers table in RDS over JDBC
customers_df.write.jdbc(
    url=jdbc_url,
    table='customers',
    mode=mode,
    properties=configuration,
)

In [20]:
# Send vine_table_df to the vine_table table in RDS over JDBC
vine_table_df.write.jdbc(
    url=jdbc_url,
    table='vine_table',
    mode=mode,
    properties=configuration,
)