In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
#Establish connection with  POSTGRES
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-06-28 09:02:44--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.1’


2020-06-28 09:02:45 (4.77 MB/s) - ‘postgresql-42.2.9.jar.1’ saved [914037/914037]



In [3]:
# Create SparkSession using its connection with Postgres
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("amazonCloud_VineAnalysis").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

## Postgres Setup to read DB and Load Table Contents to Spark Dataframe



In [4]:
# Configure settings for RDS

jdbc_url="jdbc:postgresql://mypostgresdb.cw6xrdxbjex8.us-east-2.rds.amazonaws.com:5432/bigdataHW_db" 

config = {"user":"root", "password":"basededatos", "driver":"org.postgresql.Driver"}

Read Sports Tables

In [5]:
reviewID_sportsDF = spark.read.jdbc(url=jdbc_url, table='review_id_table', properties=config)
products_sportsDF = spark.read.jdbc(url=jdbc_url, table='products', properties=config)
customers_sportsDF = spark.read.jdbc(url=jdbc_url, table='customers', properties=config)
vine_sportsDF = spark.read.jdbc(url=jdbc_url, table='vine_table', properties=config)

# reviewID_sportsDF.printSchema()
# products_sportsDF.printSchema()
# customers_sportsDF.printSchema()
# vine_sportsDF.printSchema()

Read Outdoors Tables

In [6]:
reviewID_outdoorsDF = spark.read.jdbc(url=jdbc_url, table='review_id_table2', properties=config)
products_outdoorsDF = spark.read.jdbc(url=jdbc_url, table='products2', properties=config)
customers_outdoorsDF = spark.read.jdbc(url=jdbc_url, table='customers2', properties=config)
vine_outdoorsDF = spark.read.jdbc(url=jdbc_url, table='vine_table2', properties=config)

# reviewID_outdoorsDF.printSchema()
# products_outdoorsDF.printSchema()
# customers_outdoorsDF.printSchema()
# vine_outdoorsDF.printSchema()

## OUTDOORS DB ANALYSIS

In [7]:
print(vine_outdoorsDF.count())
vine_outdoorsDF.select("vine").distinct().show()

2302173
+----+
|vine|
+----+
|   Y|
|   N|
+----+



In [8]:
#Identify reviews categorized as vine 
from pyspark.sql.functions import col
vine_reviews_outdoors = vine_outdoorsDF.filter(col("vine") == "Y")
print(f"Total reviews categorized as vine: {vine_reviews_outdoors.count()}")
# vine_reviews_outdoors.show(10, truncate=False)

Total reviews categorized as vine: 3137


In [9]:
print(f"Percentage of vine reviews among all sports reviews: {round((vine_reviews_outdoors.count()/vine_outdoorsDF.count())*100,2)}%")

Percentage of vine reviews among all sports reviews: 0.14%


In [10]:
# Join DFs of reviews
review_vine_OutDf= vine_reviews_outdoors.join(reviewID_outdoorsDF, on="review_id", how="inner")
# review_vine_OutDf.show(10, truncate=False)

In [11]:
# Join DFs of reviews and customers
review_vine_customers_OutDf= review_vine_OutDf.join(customers_outdoorsDF, on="customer_id", how="inner")
# review_vine_customers_OutDf.show(10, truncate=False)
# review_vine_customers_OutDf.columns

In [12]:
# Dataframe of Outdoor Products by Customer

vine_customers_Outdoor_TtlInfo= review_vine_customers_OutDf.groupBy("customer_id").agg({"review_id":"count","total_votes":"sum","helpful_votes":"sum","star_rating": "avg","product_id":"count"})
vine_customers_Outdoor_TtlInfo.show(5)
vine_customers_Outdoor_TtlInfo2 = vine_customers_Outdoor_TtlInfo.toDF("Customer ID", "Total Reviews", "Helpful votes", "Average Star Rating", "Total Products reviewed", "Total Votes")
vine_customers_Outdoor_TtlInfo2.show(5)

+-----------+----------------+------------------+------------------+-----------------+----------------+
|customer_id|count(review_id)|sum(helpful_votes)|  avg(star_rating)|count(product_id)|sum(total_votes)|
+-----------+----------------+------------------+------------------+-----------------+----------------+
|   17481726|               3|                12|3.6666666666666665|                3|              13|
|   18800155|               2|                 1|               4.0|                2|               2|
|   38679000|               1|                 0|               5.0|                1|               1|
|   41549558|               5|                 3|               5.0|                5|               6|
|   49214860|               3|               123|               4.0|                3|             125|
+-----------+----------------+------------------+------------------+-----------------+----------------+
only showing top 5 rows

+-----------+-------------+------------

**1. HOW MANY CUSTOMERS ARE CATEGORIZED AS "VINE VOICES" / BELONG TO THE "VINE PROGRAM"?**

In [13]:
# total vine customers
vine_customers_Outdoor_TtlInfo2.cache()
print(f"Total of customers belonging to the Vine Program (for outdoors products reviews): {vine_customers_Outdoor_TtlInfo2.count()}")

Total of customers belonging to the Vine Program (for outdoors products reviews): 1773


**2. WHO ARE THE TOP 20 VINE CUSTOMERS WITH MORE REVIEWS?**

In [14]:
#TOP 20 Vine Customers with more reviews
from pyspark.sql.functions import desc
vine_customers_Outdoor_TtlInfo2.orderBy(vine_customers_Outdoor_TtlInfo2["Total Reviews"].desc()).select("Customer ID","Total Reviews").limit(20).show()

+-----------+-------------+
|Customer ID|Total Reviews|
+-----------+-------------+
|   48428870|            9|
|   50272083|            9|
|   13722260|            8|
|   38232031|            8|
|   31302915|            8|
|   49346733|            8|
|   45351550|            8|
|   36695257|            8|
|   29531597|            8|
|   11556116|            8|
|   38462413|            8|
|   49299352|            7|
|   51624877|            7|
|   50760025|            7|
|   21084288|            7|
|   49980229|            7|
|   51409121|            7|
|   40013398|            7|
|   21028674|            7|
|   52536258|            6|
+-----------+-------------+



**3. WHAT IS THE AVERAGE STAR RATINGS FOR THE TOP 20 CUSTOMERS WITH MORE REVIEWS?**

In [15]:
#Average Star Ratings for Top 20 vine customers
vine_customers_Outdoor_TtlInfo2.orderBy(vine_customers_Outdoor_TtlInfo2["Total Reviews"].desc()).select("Customer ID","Total Reviews", "Average Star Rating").limit(20).show()

+-----------+-------------+-------------------+
|Customer ID|Total Reviews|Average Star Rating|
+-----------+-------------+-------------------+
|   48428870|            9|                5.0|
|   50272083|            9|  3.888888888888889|
|   13722260|            8|              4.125|
|   38232031|            8|               4.75|
|   31302915|            8|              4.625|
|   49346733|            8|               3.75|
|   45351550|            8|              4.625|
|   36695257|            8|              4.375|
|   29531597|            8|                4.5|
|   11556116|            8|              4.375|
|   38462413|            8|               3.25|
|   49299352|            7|  4.428571428571429|
|   51624877|            7|  4.428571428571429|
|   50760025|            7| 3.7142857142857144|
|   21084288|            7|  4.142857142857143|
|   49980229|            7|  4.285714285714286|
|   51409121|            7| 3.7142857142857144|
|   40013398|            7|  4.714285714

**4. WHAT IS THE AVERAGE STAR RATING FOR THE VINE CUSTOMER WITH THE MOST HELPFUL VOTES?**

In [16]:
vine_customers_Outdoor_TtlInfo2.orderBy(vine_customers_Outdoor_TtlInfo2["Helpful votes"].desc()).select("Customer ID","Helpful votes","Average Star Rating").limit(1).show()

+-----------+-------------+-------------------+
|Customer ID|Helpful votes|Average Star Rating|
+-----------+-------------+-------------------+
|   45351550|          474|              4.625|
+-----------+-------------+-------------------+



**5. WHAT IS THE AVERAGE STAR RATING OF THE VINE CUSTOMER WITH LESS REVIEWS AND HOW MANY HELPFUL AND TOTAL VOTES HAS?**

In [17]:
vine_customers_Outdoor_TtlInfo2.orderBy("Total Reviews").limit(1).show()

+-----------+-------------+-------------+-------------------+-----------------------+-----------+
|Customer ID|Total Reviews|Helpful votes|Average Star Rating|Total Products reviewed|Total Votes|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+
|   38679000|            1|            0|                5.0|                      1|          1|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+



## SPORTS DB ANALYSIS

In [18]:
#Identify reviews categorized as vine 
from pyspark.sql.functions import col
vine_reviews_sports = vine_sportsDF.filter(col("vine") == "Y")

print(f"Total reviews categorized as vine: {vine_reviews_sports.count()}")
# vine_reviews_sports.show(10, truncate=False)

Total reviews categorized as vine: 10080


In [19]:
print(f"Percentage of vine reviews among all sports reviews: {round((vine_reviews_sports.count()/vine_sportsDF.count())*100,2)}%")

Percentage of vine reviews among all sports reviews: 0.21%


In [20]:
# Join DFs of reviews
review_vine_SportDf= vine_reviews_sports.join(reviewID_sportsDF, on="review_id", how="inner")
# review_vine_SportDf.show(10, truncate=False)

In [21]:
# Join DFs of reviews and customers
review_vine_customers_SportsDf= review_vine_SportDf.join(customers_sportsDF, on="customer_id", how="inner")

# review_vine_customers_SportsDf.show(10, truncate=False)
# review_vine_customers_SportsDf.columns

In [22]:
# Dataframe of Sports Products by Customer

vine_customers_Sports_TtlInfo= review_vine_customers_SportsDf.groupBy("customer_id").agg({"review_id":"count","total_votes":"sum","helpful_votes":"sum","star_rating": "avg","product_id":"count"})
vine_customers_Sports_TtlInfo.show(5)
vine_customers_Sports_TtlInfo2 = vine_customers_Sports_TtlInfo.toDF("Customer ID", "Total Reviews", "Helpful votes", "Average Star Rating", "Total Products reviewed", "Total Votes")
vine_customers_Sports_TtlInfo2.show(5)

+-----------+----------------+------------------+-----------------+-----------------+----------------+
|customer_id|count(review_id)|sum(helpful_votes)| avg(star_rating)|count(product_id)|sum(total_votes)|
+-----------+----------------+------------------+-----------------+-----------------+----------------+
|   17171509|               1|                 1|              5.0|                1|               2|
|   17481726|               5|                16|              4.0|                5|              20|
|   18800155|               2|                 0|              5.0|                2|               0|
|   22978817|               7|                 5|4.285714285714286|                7|               9|
|   38679000|               3|                 0|4.666666666666667|                3|               2|
+-----------+----------------+------------------+-----------------+-----------------+----------------+
only showing top 5 rows

+-----------+-------------+-------------+-------

**1. HOW MANY CUSTOMERS ARE CATEGORIZED AS "VINE VOICES" / BELONG TO THE "VINE PROGRAM"?**

In [23]:
# total vine customers
vine_customers_Sports_TtlInfo2.cache()
print(f"Total of customers belonging to the Vine Program (for outdoors products reviews): {vine_customers_Sports_TtlInfo2.count()}")

Total of customers belonging to the Vine Program (for outdoors products reviews): 2873


**2. WHO ARE THE TOP 20 VINE CUSTOMERS WITH MORE REVIEWS?**

In [24]:
#TOP 20 Vine Customers with more reviews
from pyspark.sql.functions import desc
vine_customers_Sports_TtlInfo2.orderBy(vine_customers_Sports_TtlInfo2["Total Reviews"].desc()).select("Customer ID","Total Reviews").limit(20).show()

+-----------+-------------+
|Customer ID|Total Reviews|
+-----------+-------------+
|   20789642|           48|
|   49620639|           45|
|   50699505|           45|
|   51286530|           41|
|   52215985|           35|
|   50260445|           34|
|   27598356|           33|
|   52281467|           31|
|   49346733|           29|
|   50227539|           29|
|   35297289|           26|
|   50666371|           25|
|   52188216|           24|
|   26955164|           24|
|   21954496|           24|
|   52166758|           23|
|   21155796|           23|
|   52113744|           23|
|   51409121|           23|
|   31476218|           23|
+-----------+-------------+



**3. WHAT IS THE AVERAGE STAR RATINGS FOR THE TOP 20 CUSTOMERS WITH MORE REVIEWS?**

In [25]:
#Average Star Ratings for Top 20 vine customers
vine_customers_Sports_TtlInfo2.orderBy(vine_customers_Sports_TtlInfo2["Total Reviews"].desc()).select("Customer ID","Total Reviews", "Average Star Rating").limit(20).show()

+-----------+-------------+-------------------+
|Customer ID|Total Reviews|Average Star Rating|
+-----------+-------------+-------------------+
|   20789642|           48| 3.8958333333333335|
|   49620639|           45|  4.222222222222222|
|   50699505|           45|  4.644444444444445|
|   51286530|           41|  4.048780487804878|
|   52215985|           35|  4.714285714285714|
|   50260445|           34|  4.294117647058823|
|   27598356|           33|  4.818181818181818|
|   52281467|           31| 3.4838709677419355|
|   49346733|           29| 3.8275862068965516|
|   50227539|           29|                4.0|
|   35297289|           26| 4.1923076923076925|
|   50666371|           25|               3.92|
|   52188216|           24|              3.875|
|   26955164|           24|  4.666666666666667|
|   21954496|           24|              3.875|
|   52166758|           23|  4.217391304347826|
|   21155796|           23|  4.739130434782608|
|   52113744|           23|  3.782608695

**4. WHAT IS THE AVERAGE STAR RATING FOR THE VINE CUSTOMER WITH THE MOST HELPFUL VOTES?**

In [26]:
vine_customers_Sports_TtlInfo2.orderBy(vine_customers_Sports_TtlInfo2["Helpful votes"].desc()).select("Customer ID","Helpful votes","Average Star Rating").limit(1).show()

+-----------+-------------+-------------------+
|Customer ID|Helpful votes|Average Star Rating|
+-----------+-------------+-------------------+
|   52490988|          737|                3.5|
+-----------+-------------+-------------------+



**5. WHAT IS THE AVERAGE STAR RATING OF THE VINE CUSTOMER WITH LESS REVIEWS AND HOW MANY HELPFUL AND TOTAL VOTES HAS?**

In [27]:
vine_customers_Sports_TtlInfo2.orderBy("Total Reviews").limit(1).show()

+-----------+-------------+-------------+-------------------+-----------------------+-----------+
|Customer ID|Total Reviews|Helpful votes|Average Star Rating|Total Products reviewed|Total Votes|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+
|   17171509|            1|            1|                5.0|                      1|          2|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+



##Combined Tables

In [28]:
from pyspark.sql.functions import lit


vine_customers_Outdoor_TtlInfo3 = vine_customers_Outdoor_TtlInfo2.withColumn("Type of Product", lit("Outdoor"))
vine_customers_Outdoor_TtlInfo3.show(3)
vine_customers_Sports_TtlInfo3 = vine_customers_Sports_TtlInfo2.withColumn("Type of Product", lit("Sports"))
vine_customers_Sports_TtlInfo3.show(3)


+-----------+-------------+-------------+-------------------+-----------------------+-----------+---------------+
|Customer ID|Total Reviews|Helpful votes|Average Star Rating|Total Products reviewed|Total Votes|Type of Product|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+---------------+
|   17481726|            3|           12| 3.6666666666666665|                      3|         13|        Outdoor|
|   18800155|            2|            1|                4.0|                      2|          2|        Outdoor|
|   38679000|            1|            0|                5.0|                      1|          1|        Outdoor|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+---------------+
only showing top 3 rows

+-----------+-------------+-------------+-------------------+-----------------------+-----------+---------------+
|Customer ID|Total Reviews|Helpful votes|Average Star Rating|To

In [29]:
# Join DFs of customers
combinedDF = vine_customers_Outdoor_TtlInfo3.union(vine_customers_Sports_TtlInfo3)

combinedDF.cache()

combinedDF.show(10, truncate=False)

+-----------+-------------+-------------+-------------------+-----------------------+-----------+---------------+
|Customer ID|Total Reviews|Helpful votes|Average Star Rating|Total Products reviewed|Total Votes|Type of Product|
+-----------+-------------+-------------+-------------------+-----------------------+-----------+---------------+
|17481726   |3            |12           |3.6666666666666665 |3                      |13         |Outdoor        |
|18800155   |2            |1            |4.0                |2                      |2          |Outdoor        |
|38679000   |1            |0            |5.0                |1                      |1          |Outdoor        |
|41549558   |5            |3            |5.0                |5                      |6          |Outdoor        |
|49214860   |3            |123          |4.0                |3                      |125        |Outdoor        |
|50152643   |1            |0            |5.0                |1                      |1  

**Total reviews by Vine Customers for Sports and Outdoors Products**

In [30]:
print(f"Total reviews (for sports and outdoors products): {combinedDF.count()}")

Total reviews (for sports and outdoors products): 4646


**Total of Vine Customers (withing sports and outdoors products)**

In [31]:
ttlVine_Customers =combinedDF.groupBy("Customer ID").avg().count()
print(f"Total of customers belonging to the Vine Program (for outdoors products reviews): {ttlVine_Customers}")

Total of customers belonging to the Vine Program (for outdoors products reviews): 3236


**Top 10 Vine Customers with most Products Reviewed and higher votes**

In [32]:

combinedDF.orderBy(combinedDF["Total Votes"].desc()).orderBy(combinedDF["Total Products reviewed"].desc()).select("Customer ID","Total Products reviewed","Total Votes","Type of Product","Average Star Rating").limit(10).show()

+-----------+-----------------------+-----------+---------------+-------------------+
|Customer ID|Total Products reviewed|Total Votes|Type of Product|Average Star Rating|
+-----------+-----------------------+-----------+---------------+-------------------+
|   20789642|                     48|         30|         Sports| 3.8958333333333335|
|   49620639|                     45|         92|         Sports|  4.222222222222222|
|   50699505|                     45|        183|         Sports|  4.644444444444445|
|   51286530|                     41|         66|         Sports|  4.048780487804878|
|   52215985|                     35|         71|         Sports|  4.714285714285714|
|   50260445|                     34|         54|         Sports|  4.294117647058823|
|   27598356|                     33|         24|         Sports|  4.818181818181818|
|   52281467|                     31|        194|         Sports| 3.4838709677419355|
|   49346733|                     29|         23|     

**Customers grouped by product type (outdoors and sports)**

In [33]:
customers_byProduct = combinedDF.groupBy("Customer ID", "Type of Product").agg({"Type of Product":"count","Total Reviews":"count", "Helpful votes":"sum", "Average Star Rating":"avg", "Total Products reviewed":"sum", "Total Votes":"sum"})
# customers_byProduct.show(2)

customers_byProduct2 = customers_byProduct.toDF("Customer ID", "Type of Product","Total Votes","Total Reviews", "Average Star Rating","Total Products reviewed","Count of Type of Products","Helpful votes")
# customers_byProduct2.show(2)

customers_byProduct2.orderBy(customers_byProduct2["Total Products reviewed"].desc()).orderBy(customers_byProduct2["Total Votes"].desc()).select("Customer ID", "Type of Product","Total Votes","Total Products reviewed","Average Star Rating").show(10)

+-----------+---------------+-----------+-----------------------+-------------------+
|Customer ID|Type of Product|Total Votes|Total Products reviewed|Average Star Rating|
+-----------+---------------+-----------+-----------------------+-------------------+
|   50359177|         Sports|        928|                      1|                1.0|
|   52490988|         Sports|        776|                      4|                3.5|
|   32228920|         Sports|        591|                      2|                4.5|
|   16995624|         Sports|        518|                      3| 3.3333333333333335|
|   45351550|        Outdoor|        513|                      8|              4.625|
|   26955164|         Sports|        513|                     24|  4.666666666666667|
|   20981134|         Sports|        474|                      9|  4.777777777777778|
|   52433849|         Sports|        432|                      8|              2.875|
|   51835976|         Sports|        413|             

In [34]:
customers_byProduct3 = combinedDF.groupBy("Customer ID", "Type of Product").agg({"Type of Product":"count"})
customers_byProduct3.orderBy("count(Type of Product)").show(10)

+-----------+---------------+----------------------+
|Customer ID|Type of Product|count(Type of Product)|
+-----------+---------------+----------------------+
|   53094985|        Outdoor|                     1|
|   35753769|        Outdoor|                     1|
|   52791740|        Outdoor|                     1|
|   12803566|        Outdoor|                     1|
|   50046574|        Outdoor|                     1|
|   52876492|         Sports|                     1|
|   38727871|         Sports|                     1|
|   36043471|         Sports|                     1|
|   26565724|         Sports|                     1|
|   49096095|         Sports|                     1|
+-----------+---------------+----------------------+
only showing top 10 rows

