<a href="https://colab.research.google.com/github/karenbennis/Xy/blob/Blake/xy_database_prepper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Java, Spark, and Findspark ---IMPORTED WITH EVERY COLAB NOTEBOOK
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-07-12 23:23:34--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.5’


2020-07-12 23:23:34 (4.78 MB/s) - ‘postgresql-42.2.9.jar.5’ saved [914037/914037]



In [3]:
# Start Spark Session(Creating spark application with name defined by appName()) ---IMPORTED WITH EVERY COLAB NOTEBOOK
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("database_transformation").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()

In [4]:
# Import pandas
import pandas as pd
# Import struct files that we need to modify datatypes
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, DateType
# Import unix timestamp 
from pyspark.sql.functions import unix_timestamp, to_date, col

In [5]:
file_path = "https://raw.githubusercontent.com/karenbennis/Xy/Data_ETL/yelp.csv"
pd_df = pd.read_csv(file_path)
pd_df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [6]:
mySchema = StructType([ StructField("business_id", StringType(), True)\
                       ,StructField("date", StringType(), True)\
                       ,StructField("review_id", StringType(), True)\
                       ,StructField("stars", IntegerType(), True)\
                       ,StructField("text", StringType(), True)\
                       ,StructField("type", StringType(), True)\
                       ,StructField("user_id", StringType(), True)\
                       ,StructField("cool", IntegerType(), True)\
                       ,StructField("useful", IntegerType(), True)\
                       ,StructField("funny", IntegerType(), True)])

In [7]:
df = spark.createDataFrame(pd_df,schema=mySchema)

In [8]:
df.show()

+--------------------+----------+--------------------+-----+--------------------+------+--------------------+----+------+-----+
|         business_id|      date|           review_id|stars|                text|  type|             user_id|cool|useful|funny|
+--------------------+----------+--------------------+-----+--------------------+------+--------------------+----+------+-----+
|9yKzy9PApeiPPOUJE...|2011-01-26|fWKvX83p0-ka4JS3d...|    5|My wife took me h...|review|rLtl8ZkDX5vH5nAx9...|   2|     5|    0|
|ZRJwVLyzEJq1VAihD...|2011-07-27|IjZ33sJrzXqU-0X6U...|    5|I have no idea wh...|review|0a2KyEL0d3Yb1V6ai...|   0|     0|    0|
|6oRAC4uyJCsJl1X0W...|2012-06-14|IESLBzqUCLdSzSqm0...|    4|love the gyro pla...|review|0hT2KtfLiobPvh6cD...|   0|     1|    0|
|_1QQZuf4zZOyFCvXc...|2010-05-27|G-WvGaISbqqaMHlNn...|    5|Rosie, Dakota, an...|review|uZetl9T0NcROGOyFf...|   1|     2|    0|
|6ozycU1RpktNG2-1B...|2012-01-05|1uJFq2r5QfJG_6ExM...|    5|General Manager S...|review|vYmM4KTsC8ZfQBg-

In [9]:
df = df.withColumn('date', 
                   to_date(unix_timestamp(col('date'), 'yyyy-mm-dd').cast("timestamp")))

In [10]:
df.show(3)

+--------------------+----------+--------------------+-----+--------------------+------+--------------------+----+------+-----+
|         business_id|      date|           review_id|stars|                text|  type|             user_id|cool|useful|funny|
+--------------------+----------+--------------------+-----+--------------------+------+--------------------+----+------+-----+
|9yKzy9PApeiPPOUJE...|2011-01-26|fWKvX83p0-ka4JS3d...|    5|My wife took me h...|review|rLtl8ZkDX5vH5nAx9...|   2|     5|    0|
|ZRJwVLyzEJq1VAihD...|2011-01-27|IjZ33sJrzXqU-0X6U...|    5|I have no idea wh...|review|0a2KyEL0d3Yb1V6ai...|   0|     0|    0|
|6oRAC4uyJCsJl1X0W...|2012-01-14|IESLBzqUCLdSzSqm0...|    4|love the gyro pla...|review|0hT2KtfLiobPvh6cD...|   0|     1|    0|
+--------------------+----------+--------------------+-----+--------------------+------+--------------------+----+------+-----+
only showing top 3 rows



In [11]:
df.dtypes

[('business_id', 'string'),
 ('date', 'date'),
 ('review_id', 'string'),
 ('stars', 'int'),
 ('text', 'string'),
 ('type', 'string'),
 ('user_id', 'string'),
 ('cool', 'int'),
 ('useful', 'int'),
 ('funny', 'int')]

In [12]:
df = df.selectExpr("business_id as business_id", "date as review_date", "review_id as review_id", "stars as stars",
                    "text as review_text", "type as review_type", "user_id as user_id", "cool as cool", "useful as useful","funny as funny")

In [13]:
df.show()

+--------------------+-----------+--------------------+-----+--------------------+-----------+--------------------+----+------+-----+
|         business_id|review_date|           review_id|stars|         review_text|review_type|             user_id|cool|useful|funny|
+--------------------+-----------+--------------------+-----+--------------------+-----------+--------------------+----+------+-----+
|9yKzy9PApeiPPOUJE...| 2011-01-26|fWKvX83p0-ka4JS3d...|    5|My wife took me h...|     review|rLtl8ZkDX5vH5nAx9...|   2|     5|    0|
|ZRJwVLyzEJq1VAihD...| 2011-01-27|IjZ33sJrzXqU-0X6U...|    5|I have no idea wh...|     review|0a2KyEL0d3Yb1V6ai...|   0|     0|    0|
|6oRAC4uyJCsJl1X0W...| 2012-01-14|IESLBzqUCLdSzSqm0...|    4|love the gyro pla...|     review|0hT2KtfLiobPvh6cD...|   0|     1|    0|
|_1QQZuf4zZOyFCvXc...| 2010-01-27|G-WvGaISbqqaMHlNn...|    5|Rosie, Dakota, an...|     review|uZetl9T0NcROGOyFf...|   1|     2|    0|
|6ozycU1RpktNG2-1B...| 2012-01-05|1uJFq2r5QfJG_6ExM...|    5|G

In [14]:
# Create dataframe matching the review table schema
review_df = df.select(['review_id', 'review_text', 'stars', 'cool', 'useful', 'funny', 'review_date', 'review_type'])

In [15]:
# Create dataframe matching the business table schema
business_df = df.select(['review_id', 'business_id'])

In [16]:
# Create dataframe matching the user table schema
user_df = df.select(['review_id', 'user_id'])

In [17]:
review_df.show(3)

+--------------------+--------------------+-----+----+------+-----+-----------+-----------+
|           review_id|         review_text|stars|cool|useful|funny|review_date|review_type|
+--------------------+--------------------+-----+----+------+-----+-----------+-----------+
|fWKvX83p0-ka4JS3d...|My wife took me h...|    5|   2|     5|    0| 2011-01-26|     review|
|IjZ33sJrzXqU-0X6U...|I have no idea wh...|    5|   0|     0|    0| 2011-01-27|     review|
|IESLBzqUCLdSzSqm0...|love the gyro pla...|    4|   0|     1|    0| 2012-01-14|     review|
+--------------------+--------------------+-----+----+------+-----+-----------+-----------+
only showing top 3 rows



In [18]:
business_df.show(3)

+--------------------+--------------------+
|           review_id|         business_id|
+--------------------+--------------------+
|fWKvX83p0-ka4JS3d...|9yKzy9PApeiPPOUJE...|
|IjZ33sJrzXqU-0X6U...|ZRJwVLyzEJq1VAihD...|
|IESLBzqUCLdSzSqm0...|6oRAC4uyJCsJl1X0W...|
+--------------------+--------------------+
only showing top 3 rows



In [19]:
user_df.show(3)

+--------------------+--------------------+
|           review_id|             user_id|
+--------------------+--------------------+
|fWKvX83p0-ka4JS3d...|rLtl8ZkDX5vH5nAx9...|
|IjZ33sJrzXqU-0X6U...|0a2KyEL0d3Yb1V6ai...|
|IESLBzqUCLdSzSqm0...|0hT2KtfLiobPvh6cD...|
+--------------------+--------------------+
only showing top 3 rows



In [20]:
# gcloud login and check the DB
!gcloud auth login
!gcloud config set project 'xy-yelp'
!gcloud sql instances describe 'xy-yelp'

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&code_challenge=npfCUFHwcZCaNpqYNfE4WW8-YiYPPQ1mpYc7pMpMLLA&code_challenge_method=S256&access_type=offline&response_type=code&prompt=select_account


Enter verification code: 4/1wFysw-sOqnqXSstzQeGrMT0ant37P0YVF-HFe1e1pqKVxqVccDxnZM

You are now logged in as [belnap.b.01@gmail.com].
Your current project is [xy-yelp].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID
Updated property [core/project].
backendType: SECOND_GEN
connectionName: xy-yelp:northamerica-northeast1:xy-yelp
databas

In [21]:
# download and initialize the psql proxy
!wget https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O cloud_sql_proxy
!chmod +x cloud_sql_proxy
# "connectionName" is from the previous block
!nohup ./cloud_sql_proxy -instances="xy-yelp:northamerica-northeast1:xy-yelp"=tcp:5432 &
!sleep 30s

cloud_sql_proxy: Text file busy
nohup: appending output to 'nohup.out'


In [22]:
db_password = 'kjhbyelpdb'

In [23]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://127.0.0.1:5432/xy_yelp_db"
config = {"user":"postgres", 
          "password": db_password, 
          "driver":"org.postgresql.Driver"}

In [24]:
# Write DataFrame to review table in RDS
review_df.write.jdbc(url=jdbc_url, table='review', mode=mode, properties=config)

In [25]:
# Write DataFrame to business table in RDS
business_df.write.jdbc(url=jdbc_url, table='business', mode=mode, properties=config)

In [26]:
# Write DataFrame to business table in RDS
user_df.write.jdbc(url=jdbc_url, table='yelp_user', mode=mode, properties=config)

In [27]:
# Read data from database
review_df2 = spark.read \
    .jdbc(url=jdbc_url, table='review',
          properties=config)

In [28]:
review_df2.show(2)

+--------------------+--------------------+-----+----+------+-----+-----------+-----------+
|           review_id|         review_text|stars|cool|useful|funny|review_date|review_type|
+--------------------+--------------------+-----+----+------+-----+-----------+-----------+
|fWKvX83p0-ka4JS3d...|My wife took me h...|    5|   2|     5|    0| 2011-01-26|     review|
|IjZ33sJrzXqU-0X6U...|I have no idea wh...|    5|   0|     0|    0| 2011-01-27|     review|
+--------------------+--------------------+-----+----+------+-----+-----------+-----------+
only showing top 2 rows



In [29]:
review_df2.count()

10000

In [30]:
business_df2 = spark.read \
    .jdbc(url=jdbc_url, table='business',
          properties=config)

In [31]:
business_df2.show(2)

+--------------------+--------------------+
|           review_id|         business_id|
+--------------------+--------------------+
|fWKvX83p0-ka4JS3d...|9yKzy9PApeiPPOUJE...|
|IjZ33sJrzXqU-0X6U...|ZRJwVLyzEJq1VAihD...|
+--------------------+--------------------+
only showing top 2 rows



In [32]:
business_df2.count()

10000

In [33]:
user_df2 = spark.read \
    .jdbc(url=jdbc_url, table='yelp_user',
          properties=config)

In [34]:
user_df2.show(2)

+--------------------+--------------------+
|           review_id|             user_id|
+--------------------+--------------------+
|GJGUHAAONtBSBj53c...|Z3c7xGRfeV-uMkSbA...|
|nQH2KAvAeOJOYKX99...|ryjqXdp68i2I9JPOp...|
+--------------------+--------------------+
only showing top 2 rows



In [35]:
user_df2.count()

10000

In [44]:
#df_df = review_df2.join(business_df2, review_df2.review_id == business_df2.review_id, how="inner").drop(_df2.review_id)
review_df2 = review_df2.join(business_df2, on="review_id", how="inner")
review_df2.show(10)

+--------------------+--------------------+-----+----+------+-----+-----------+-----------+--------------------+
|           review_id|         review_text|stars|cool|useful|funny|review_date|review_type|         business_id|
+--------------------+--------------------+-----+----+------+-----+-----------+-----------+--------------------+
|-7yxrdY13ay15rGB7...|I have been going...|    5|   0|     0|    0| 2010-01-16|     review|Lh9nz0KYyzE-YRbKu...|
|-Be0UUGYuiDJVAM_Y...|Since Im big into...|    4|   0|     2|    2| 2011-01-25|     review|pa6K7DGByxBXxcVJ5...|
|-nQHHXi-d_yuW301_...|A pleasant place ...|    2|   0|     0|    0| 2011-01-12|     review|GIGI8bJfN6HyPzmEW...|
|2L30O7G8IQ6HILpR0...|part of a social ...|    5|   0|     0|    0| 2010-01-24|     review|qiwajZigq_2twTmYo...|
|4x5yLG7_yGLuN-w6f...|I love every plac...|    4|   0|     1|    0| 2011-01-02|     review|9yKzy9PApeiPPOUJE...|
|5h0EVAee-RDbbKfhd...|A great value for...|    5|   2|     1|    0| 2012-01-21|     review|4VzaY

In [45]:
review_df2 = review_df2.join(user_df2, on="review_id", how="inner")
review_df2.show(10)

+--------------------+--------------------+-----+----+------+-----+-----------+-----------+--------------------+--------------------+
|           review_id|         review_text|stars|cool|useful|funny|review_date|review_type|         business_id|             user_id|
+--------------------+--------------------+-----+----+------+-----+-----------+-----------+--------------------+--------------------+
|-7yxrdY13ay15rGB7...|I have been going...|    5|   0|     0|    0| 2010-01-16|     review|Lh9nz0KYyzE-YRbKu...|ayKW9eWwGFcrtJaHc...|
|-Be0UUGYuiDJVAM_Y...|Since Im big into...|    4|   0|     2|    2| 2011-01-25|     review|pa6K7DGByxBXxcVJ5...|_4lqpCYCqOQzbB6xQ...|
|-nQHHXi-d_yuW301_...|A pleasant place ...|    2|   0|     0|    0| 2011-01-12|     review|GIGI8bJfN6HyPzmEW...|4QORbyhfN01oKR_Gg...|
|2L30O7G8IQ6HILpR0...|part of a social ...|    5|   0|     0|    0| 2010-01-24|     review|qiwajZigq_2twTmYo...|ST8Yzlk2MqKlcaLqL...|
|4x5yLG7_yGLuN-w6f...|I love every plac...|    4|   0|     1| 