# ETL Notebook

This notebook is to do some DataFrame visualisation for our ETL process. We will run connect to the Database and run our SQL queries to extract our defined dimensions.

This can also be our testbed for potential transforms that we will be conducting for our NLP models.

In [39]:
# Import libraries
import mysql.connector
import pandas as pd
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
from uuid import uuid4
load_dotenv()

True

In [40]:
db_host = os.getenv("DATABASE_ENDPOINT")
db_user = os.getenv("DATABASE_USERNAME")
db_pw = os.getenv("DATABASE_PASSWORD")
db_name = os.getenv("DATABASE_NAME")
db_port = os.getenv("DATABASE_PORT")

dwh_host = os.getenv("DATAWH_ENDPOINT")
dwh_user = os.getenv("DATAWH_USERNAME")
dwh_pw = os.getenv("DATAWH_PASSWORD")
dwh_name = os.getenv("DATAWH_NAME")
dwh_port = os.getenv("DATAWH_PORT")

# housekeeping
db_datawarehouse = mysql.connector.connect(
	host=dwh_host,
	user=dwh_user,
	passwd=dwh_pw,
	database=dwh_name,
    auth_plugin=dwh_pw
)

cursor = db_datawarehouse.cursor()
cursor.execute('DROP TABLE IF EXISTS review;')
cursor.execute('DROP TABLE IF EXISTS fact;')
cursor.execute('DROP TABLE IF EXISTS time;')

db_datawarehouse.commit()
db_datawarehouse.close()

In [41]:
db_tripadvisor = mysql.connector.connect(
	host=db_host,
	user=db_user,
	passwd=db_pw,
	database=db_name
)

engine = create_engine(f'mysql://{dwh_user}:{dwh_pw}@{dwh_host}:{dwh_port}/{dwh_name}', echo=False)

dwh = engine.connect()

In [42]:
# Check if the OverallID column exists
cursor = db_tripadvisor.cursor()
cursor.execute("SHOW COLUMNS FROM tripadvisor_reviews LIKE 'OverallID'")
if cursor.fetchone() is None:
    # If OverallID column doesn't exist, add it
    cursor.execute('ALTER TABLE tripadvisor_reviews ADD OverallID INT AUTO_INCREMENT PRIMARY KEY')


In [43]:
# Check tripadvisor_reviews table

str_sql = '''
SELECT *
FROM tripadvisor_reviews
'''

df = pd.read_sql(sql=str_sql, con=db_tripadvisor)

df

  df = pd.read_sql(sql=str_sql, con=db_tripadvisor)


Unnamed: 0,Review Title,Review Text,Date of Stay,Author Contribution,Rating,OverallID
0,Must see in Singapore,A must not miss place for tourists to visit wh...,2024-03-01,73,5.0,1
1,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,2024-03-01,3,5.0,2
2,Professional service,Visited the hotel for some drinks and what I r...,2024-03-01,2,5.0,3
3,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,2024-03-01,1,5.0,4
4,Nice touch.,While the initial check in experience was not ...,2024-03-01,2,5.0,5
...,...,...,...,...,...,...
11227,A good hotel,Nice location and very near to a shopping mall...,2014-08-01,97,5.0,11228
11228,Swim in heaven,Everytime I check into the Sand I love going u...,2015-03-01,13,4.0,11229
11229,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,2015-01-01,68,5.0,11230
11230,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,2015-01-01,59,5.0,11231


In [44]:
# Time dimension
# TimeID
# StayDate
# StayDateYear
# StayDateMonth
# StayDateDay
# StayDateDayOfWeek
# StayDateWeek

time_sql = """
SELECT `OverallID` as OverallID, `Date of Stay` as StayDate, YEAR(`Date of Stay`) AS StayDateYear, MONTH(`Date of Stay`) AS StayDateMonth, Day(`Date of Stay`) AS StayDateDay, IF((DayOfWeek(`Date of Stay`) - 1) = 0, 7, DayOfWeek(`Date of Stay`) - 1) As StayDateDayOfWeek, WEEK(`Date of Stay`) AS StayDateWeek
FROM tripadvisor_reviews;
"""

df = pd.read_sql(sql=time_sql, con=db_tripadvisor)
df['TimeID'] = df['StayDate'].apply(lambda x: str(uuid4())[:12])
cols = df.columns.to_list()
cols = cols[-1:] + cols[:-1]
df = df[cols]
df

  df = pd.read_sql(sql=time_sql, con=db_tripadvisor)


Unnamed: 0,TimeID,OverallID,StayDate,StayDateYear,StayDateMonth,StayDateDay,StayDateDayOfWeek,StayDateWeek
0,f2481d1b-a8d,1,2024-03-01,2024,3,1,5,8
1,02efc6be-a65,2,2024-03-01,2024,3,1,5,8
2,e0962631-8e0,3,2024-03-01,2024,3,1,5,8
3,cbca0c49-673,4,2024-03-01,2024,3,1,5,8
4,88ef0121-775,5,2024-03-01,2024,3,1,5,8
...,...,...,...,...,...,...,...,...
11227,68ded1dc-e0c,11228,2014-08-01,2014,8,1,5,30
11228,b52db812-bdf,11229,2015-03-01,2015,3,1,7,9
11229,2a2bac24-7ba,11230,2015-01-01,2015,1,1,4,0
11230,d3ce3729-329,11231,2015-01-01,2015,1,1,4,0


In [45]:
# Load Time Dimension 
df.to_sql(name='time', con = dwh, if_exists='replace')
dwh.commit()

In [46]:
# Review Dimension
# ReviewID
# ReviewText
# ReviewTitle
# ReviewRating

review_sql = """
SELECT `OverallID` AS OverallID, `Review Title` AS ReviewTitle, `Review Text` AS ReviewText, `Rating` AS ReviewRating, `Author Contribution` AS ReviewerContribution
FROM tripadvisor_reviews;
"""

df = pd.read_sql(sql=review_sql, con=db_tripadvisor)
df['ReviewID'] = df['ReviewTitle'].apply(lambda x: str(uuid4())[:12])
cols = df.columns.to_list()
cols = cols[-1:] + cols[:-1]
df = df[cols]
df

  df = pd.read_sql(sql=review_sql, con=db_tripadvisor)


Unnamed: 0,ReviewID,OverallID,ReviewTitle,ReviewText,ReviewRating,ReviewerContribution
0,ee9f7923-a5c,1,Must see in Singapore,A must not miss place for tourists to visit wh...,5.0,73
1,4e36a725-bf7,2,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,5.0,3
2,18a27b40-64f,3,Professional service,Visited the hotel for some drinks and what I r...,5.0,2
3,e34847fa-d73,4,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,5.0,1
4,c3f71000-fad,5,Nice touch.,While the initial check in experience was not ...,5.0,2
...,...,...,...,...,...,...
11227,b14651ae-c6e,11228,A good hotel,Nice location and very near to a shopping mall...,5.0,97
11228,6a9a8a4b-12f,11229,Swim in heaven,Everytime I check into the Sand I love going u...,4.0,13
11229,2c1b69d7-03e,11230,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,5.0,68
11230,3708f845-e0f,11231,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,5.0,59


In [47]:
# Load Time Dimension 
df.to_sql(name='review', con = dwh, if_exists='replace')
dwh.commit()

In [48]:
# Ingest fact
# im not v sure
fact_sql = '''
SELECT review.OverallID, review.ReviewID, time.TimeID
FROM review
INNER JOIN time ON review.OverallID = time.OverallID
'''
df = pd.read_sql(sql = fact_sql, con=dwh)
df


Unnamed: 0,OverallID,ReviewID,TimeID
0,1,ee9f7923-a5c,f2481d1b-a8d
1,2,4e36a725-bf7,02efc6be-a65
2,3,18a27b40-64f,e0962631-8e0
3,4,e34847fa-d73,cbca0c49-673
4,5,c3f71000-fad,88ef0121-775
...,...,...,...
11227,11215,2ae5b81d-e1a,4e52c84b-b3b
11228,11221,e6b83466-a33,63100ce3-5bf
11229,11224,44fa279c-cc1,f506cbda-9de
11230,11227,4ef19610-5a4,7290eac9-75e


In [49]:
df.to_sql(name='fact', con=dwh, if_exists='replace')
dwh.commit()

In [50]:
db_tripadvisor.close()
dwh.close()

In [51]:
#  housekeeping - set up primary and foreign keys in datawarehouse

db_datawarehouse = mysql.connector.connect(
	host=dwh_host,
	user=dwh_user,
	passwd=dwh_pw,
	database=dwh_name,
    auth_plugin=dwh_pw
)

cursor = db_datawarehouse.cursor()
cursor.execute('ALTER TABLE time ADD PRIMARY KEY (TimeID(12));')
cursor.execute('ALTER TABLE review ADD PRIMARY KEY (ReviewID(12));')

db_datawarehouse.commit()
db_datawarehouse.close()

In [53]:
# check
db_datawarehouse = mysql.connector.connect(
	host=dwh_host,
	user=dwh_user,
	passwd=dwh_pw,
	database=dwh_name,
    auth_plugin=dwh_pw
)

time_Sql = '''
SELECT * FROM time ORDER BY OverallID
'''
df = pd.read_sql(sql=time_Sql, con=db_datawarehouse)

df

  df = pd.read_sql(sql=time_Sql, con=db_datawarehouse)


Unnamed: 0,index,TimeID,OverallID,StayDate,StayDateYear,StayDateMonth,StayDateDay,StayDateDayOfWeek,StayDateWeek
0,0,f2481d1b-a8d,1,2024-03-01,2024,3,1,5,8
1,1,02efc6be-a65,2,2024-03-01,2024,3,1,5,8
2,2,e0962631-8e0,3,2024-03-01,2024,3,1,5,8
3,3,cbca0c49-673,4,2024-03-01,2024,3,1,5,8
4,4,88ef0121-775,5,2024-03-01,2024,3,1,5,8
...,...,...,...,...,...,...,...,...,...
11227,11227,68ded1dc-e0c,11228,2014-08-01,2014,8,1,5,30
11228,11228,b52db812-bdf,11229,2015-03-01,2015,3,1,7,9
11229,11229,2a2bac24-7ba,11230,2015-01-01,2015,1,1,4,0
11230,11230,d3ce3729-329,11231,2015-01-01,2015,1,1,4,0


In [54]:
fact_Sql = '''
SELECT * FROM fact ORDER BY OverallID
'''
df = pd.read_sql(sql=fact_Sql, con=db_datawarehouse)

df

  df = pd.read_sql(sql=fact_Sql, con=db_datawarehouse)


Unnamed: 0,index,OverallID,ReviewID,TimeID
0,0,1,ee9f7923-a5c,f2481d1b-a8d
1,1,2,4e36a725-bf7,02efc6be-a65
2,2,3,18a27b40-64f,e0962631-8e0
3,3,4,e34847fa-d73,cbca0c49-673
4,4,5,c3f71000-fad,88ef0121-775
...,...,...,...,...
11227,9397,11228,b14651ae-c6e,68ded1dc-e0c
11228,11231,11229,6a9a8a4b-12f,b52db812-bdf
11229,5834,11230,2c1b69d7-03e,2a2bac24-7ba
11230,9398,11231,3708f845-e0f,d3ce3729-329


In [55]:
review_Sql = '''
SELECT * FROM review ORDER BY OverallID
'''
df = pd.read_sql(sql=review_Sql, con=db_datawarehouse)

df

  df = pd.read_sql(sql=review_Sql, con=db_datawarehouse)


Unnamed: 0,index,ReviewID,OverallID,ReviewTitle,ReviewText,ReviewRating,ReviewerContribution
0,0,ee9f7923-a5c,1,Must see in Singapore,A must not miss place for tourists to visit wh...,5.0,73
1,1,4e36a725-bf7,2,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,5.0,3
2,2,18a27b40-64f,3,Professional service,Visited the hotel for some drinks and what I r...,5.0,2
3,3,e34847fa-d73,4,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,5.0,1
4,4,c3f71000-fad,5,Nice touch.,While the initial check in experience was not ...,5.0,2
...,...,...,...,...,...,...,...
11227,11227,b14651ae-c6e,11228,A good hotel,Nice location and very near to a shopping mall...,5.0,97
11228,11228,6a9a8a4b-12f,11229,Swim in heaven,Everytime I check into the Sand I love going u...,4.0,13
11229,11229,2c1b69d7-03e,11230,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,5.0,68
11230,11230,3708f845-e0f,11231,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,5.0,59
