In [13]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [14]:
#Establish connection with  POSTGRES
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-06-28 01:24:44--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.2’


2020-06-28 01:24:45 (4.72 MB/s) - ‘postgresql-42.2.9.jar.2’ saved [914037/914037]



In [15]:
# Build (or reuse) the SparkSession with the Postgres JDBC driver jar placed
# on the driver classpath.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("amazonCloud_VineAnalysis")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

## Postgres Setup to read DB and Load Table Contents to Spark Dataframe



In [16]:
# Configure settings for RDS.
# Each value can be supplied through an environment variable so credentials do
# not have to live in the notebook:
#   RDS_JDBC_URL  - full JDBC URL of the Postgres database
#   RDS_USER      - database user
#   RDS_PASSWORD  - database password
# NOTE(review): the literal fallbacks are kept only so existing runs behave
# exactly as before; remove them (and rotate the password) before sharing or
# committing this notebook.
import os

jdbc_url = os.environ.get(
    "RDS_JDBC_URL",
    "jdbc:postgresql://mypostgresdb.cw6xrdxbjex8.us-east-2.rds.amazonaws.com:5432/bigdataHW_db",
)

# Connection properties handed to spark.read.jdbc(..., properties=config).
config = {
    "user": os.environ.get("RDS_USER", "root"),
    "password": os.environ.get("RDS_PASSWORD", "basededatos"),
    "driver": "org.postgresql.Driver",
}

Read Sports Tables

In [17]:
# Load the four sports tables over JDBC, one DataFrame per table, in the
# order: reviews, products, customers, vine.
reviewID_sportsDF, products_sportsDF, customers_sportsDF, vine_sportsDF = (
    spark.read.jdbc(url=jdbc_url, table=table_name, properties=config)
    for table_name in ("review_id_table", "products", "customers", "vine_table")
)

# Call .printSchema() on any of the frames above to inspect its columns.

Read Outdoors Tables

In [18]:
# Load the four outdoors tables over JDBC (the "2"-suffixed tables).
reviewID_outdoorsDF = spark.read.jdbc(
    url=jdbc_url, table='review_id_table2', properties=config
)
products_outdoorsDF = spark.read.jdbc(
    url=jdbc_url, table='products2', properties=config
)
customers_outdoorsDF = spark.read.jdbc(
    url=jdbc_url, table='customers2', properties=config
)
vine_outdoorsDF = spark.read.jdbc(
    url=jdbc_url, table='vine_table2', properties=config
)

# Print each schema in the same order the frames were loaded.
for outdoors_frame in (
    reviewID_outdoorsDF,
    products_outdoorsDF,
    customers_outdoorsDF,
    vine_outdoorsDF,
):
    outdoors_frame.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- review_date: date (nullable = true)

root
 |-- product_id: string (nullable = true)
 |-- product_title: string (nullable = true)

root
 |-- customer_id: integer (nullable = true)
 |-- customer_count: integer (nullable = true)

root
 |-- review_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)



## OUTDOORS DB ANALYSIS

**1. HOW MANY CUSTOMERS ARE CATEGORIZED AS "VINE VOICES" / BELONG TO THE "VINE PROGRAM"?**

In [94]:
# Keep only the reviews whose `vine` flag is "Y"
from pyspark.sql.functions import col

is_vine_review = col("vine") == "Y"
vine_reviews_outdoors = vine_outdoorsDF.where(is_vine_review)


In [95]:
# Attach review metadata (customer, product, date) to each Vine review by
# inner-joining on the shared review_id column.
review_vine_OutDf = vine_reviews_outdoors.join(
    reviewID_outdoorsDF,
    on="review_id",
    how="inner",
)

In [96]:
# Bring in the per-customer data by inner-joining the Vine reviews with the
# customers table on customer_id.
review_vine_customers_OutDf = review_vine_OutDf.join(
    customers_outdoorsDF,
    on="customer_id",
    how="inner",
)

# Display the resulting column names as the cell output.
review_vine_customers_OutDf.columns

['customer_id',
 'review_id',
 'star_rating',
 'helpful_votes',
 'total_votes',
 'vine',
 'product_id',
 'product_parent',
 'review_date',
 'customer_count']

In [97]:
# One row per Vine reviewer, with the number of Vine reviews they wrote.
vine_customers_Outdoors = (
    review_vine_customers_OutDf
    .groupBy("customer_id")
    .count()
)

# The number of grouped rows is the number of distinct Vine customers.
print(f"Total of customers belonging to the Vine Program (for outdoors products reviews): {vine_customers_Outdoors.count()}")

Total of customers belonging to the Vine Program (for outdoors products reviews): 1773


**2. WHO ARE THE TOP 20 VINE CUSTOMERS WITH THE MOST REVIEWS?**

In [98]:
from pyspark.sql.functions import desc

# The 20 Vine customers with the highest review counts.
# The DataFrame is assigned first and displayed in a separate statement:
# .show() returns None, so chaining it onto the assignment would leave the
# variable empty and unusable by later cells.
vine_customers_OutdoorsTop20 = vine_customers_Outdoors.orderBy(desc("count")).limit(20)
vine_customers_OutdoorsTop20.show(truncate=False)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|48428870   |9    |
|50272083   |9    |
|13722260   |8    |
|38232031   |8    |
|31302915   |8    |
|49346733   |8    |
|45351550   |8    |
|36695257   |8    |
|29531597   |8    |
|11556116   |8    |
|38462413   |8    |
|49299352   |7    |
|51624877   |7    |
|50760025   |7    |
|21084288   |7    |
|49980229   |7    |
|51409121   |7    |
|40013398   |7    |
|21028674   |7    |
|52536258   |6    |
+-----------+-----+



**3. WHAT IS THE AVERAGE STAR RATING FOR EACH OF THE TOP 20 VINE CUSTOMERS WITH THE MOST REVIEWS?**

In [99]:
# Average star rating for the 20 Vine customers with the most reviews
from pyspark.sql.functions import format_number

# Per customer: number of reviews and mean star rating.
vine_customers_rates_Outdoor = (
    review_vine_customers_OutDf
    .groupBy("customer_id")
    .agg({"review_id": "count", "star_rating": "avg"})
)

# Rank by review count, render the mean with two decimals, show the first 20.
avg_rating_2dp = format_number("avg(star_rating)", 2).alias("AVG Star Rating")
(
    vine_customers_rates_Outdoor
    .orderBy(col("count(review_id)").desc())
    .select("customer_id", "count(review_id)", avg_rating_2dp)
    .limit(20)
    .show(truncate=False)
)

+-----------+----------------+---------------+
|customer_id|count(review_id)|AVG Star Rating|
+-----------+----------------+---------------+
|48428870   |9               |5.00           |
|50272083   |9               |3.89           |
|13722260   |8               |4.12           |
|38232031   |8               |4.75           |
|31302915   |8               |4.62           |
|49346733   |8               |3.75           |
|45351550   |8               |4.62           |
|36695257   |8               |4.38           |
|29531597   |8               |4.50           |
|11556116   |8               |4.38           |
|38462413   |8               |3.25           |
|49299352   |7               |4.43           |
|51624877   |7               |4.43           |
|50760025   |7               |3.71           |
|21084288   |7               |4.14           |
|49980229   |7               |4.29           |
|51409121   |7               |3.71           |
|40013398   |7               |4.71           |
|21028674   |

**4. WHAT IS THE AVERAGE STAR RATING FOR THE CUSTOMER WITH THE MOST HELPFUL VOTES?**

In [101]:
# Per customer: total helpful votes and mean star rating over their Vine reviews.
vine_customers_helpvotes_Outdoor = (
    review_vine_customers_OutDf
    .groupBy("customer_id")
    .agg({"helpful_votes": "sum", "star_rating": "avg"})
)

# Show only the customer whose reviews collected the most helpful votes.
(
    vine_customers_helpvotes_Outdoor
    .orderBy(vine_customers_helpvotes_Outdoor["sum(helpful_votes)"].desc())
    .limit(1)
    .show(truncate=False)
)

+-----------+----------------+------------------+
|customer_id|avg(star_rating)|sum(helpful_votes)|
+-----------+----------------+------------------+
|45351550   |4.625           |474               |
+-----------+----------------+------------------+



**5. WHAT IS THE AVERAGE STAR RATING OF THE VINE CUSTOMER WITH THE FEWEST REVIEWS, AND HOW MANY HELPFUL AND TOTAL VOTES DO THEY HAVE?**

In [102]:
# Per customer: review count, total votes, helpful votes and mean star rating.
vine_customers_Outdoor_TtlInfo = (
    review_vine_customers_OutDf
    .groupBy("customer_id")
    .agg({"review_id": "count", "total_votes": "sum", "helpful_votes": "sum", "star_rating": "avg"})
)

# Sort by review count first and by helpful votes second in ONE orderBy call.
# Chaining two separate .orderBy() calls re-sorts the whole frame by the last
# key only, which would ignore the review count entirely.
# NOTE(review): several customers may tie on both keys; which of them is shown
# is then arbitrary.
(
    vine_customers_Outdoor_TtlInfo
    .orderBy("count(review_id)", "sum(helpful_votes)")
    .limit(1)
    .show()
)


+-----------+----------------+----------------+----------------+------------------+
|customer_id|count(review_id)|sum(total_votes)|avg(star_rating)|sum(helpful_votes)|
+-----------+----------------+----------------+----------------+------------------+
|   38679000|               1|               1|             5.0|                 0|
+-----------+----------------+----------------+----------------+------------------+

