# ETL Notebook

This notebook provides DataFrame visualisation for our ETL process. We will connect to the database and run our SQL queries to extract our defined dimensions.

This can also be our testbed for potential transforms that we will be conducting for our NLP models.

In [100]:
# Import libraries
import os
from urllib.parse import quote
from uuid import uuid4

import mysql.connector
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
load_dotenv()

True

In [101]:
# Connection settings, all read from environment variables.

# Source database (queried through the `db_tripadvisor` connection below).
db_host = os.getenv("DATABASE_ENDPOINT")
db_user = os.getenv("DATABASE_USERNAME")
db_pw = os.getenv("DATABASE_PASSWORD")
db_name = os.getenv("DATABASE_NAME")
db_port = os.getenv("DATABASE_PORT")

# Data warehouse (reached through the SQLAlchemy `engine` / `dwh` below).
dwh_host = os.getenv("DATAWH_ENDPOINT")
dwh_user = os.getenv("DATAWH_USERNAME")
dwh_pw = os.getenv("DATAWH_PASSWORD")
dwh_name = os.getenv("DATAWH_NAME")
dwh_port = os.getenv("DATAWH_PORT")

# These four values are interpolated into the warehouse URL; an unset variable
# would otherwise silently become the literal text "None" inside the URL.
_dwh_url_parts = {
    "DATAWH_ENDPOINT": dwh_host,
    "DATAWH_USERNAME": dwh_user,
    "DATAWH_PASSWORD": dwh_pw,
    "DATAWH_NAME": dwh_name,
}
_missing = [var for var, value in _dwh_url_parts.items() if value is None]
if _missing:
    raise RuntimeError("Missing environment variables: " + ", ".join(_missing))

# DATABASE_PORT is honoured when set; otherwise fall back to MySQL's default
# port (3306), which is what the connector uses when no port is given.
db_tripadvisor = mysql.connector.connect(
    host=db_host,
    port=int(db_port) if db_port else 3306,
    user=db_user,
    passwd=db_pw,
    database=db_name,
)

# Percent-encode the credentials so characters such as '@', '/' or ':' in the
# user name or password cannot corrupt the URL. The port segment is only added
# when DATAWH_PORT is set.
dwh_netloc = f'{dwh_host}:{dwh_port}' if dwh_port else dwh_host
engine = create_engine(
    f'mysql://{quote(dwh_user, safe="")}:{quote(dwh_pw, safe="")}@{dwh_netloc}/{dwh_name}',
    echo=False,
)

dwh = engine.connect()

In [102]:
# Sanity check: read every column of tripadvisor_reviews from the source
# connection and display the resulting frame.

str_sql = """
SELECT *
FROM tripadvisor_reviews
"""

df = pd.read_sql(str_sql, db_tripadvisor)
df

  df = pd.read_sql(sql=str_sql, con=db_tripadvisor)


Unnamed: 0,Review Title,Review Text,Date of Stay,Author Contribution,Rating
0,Must see in Singapore,A must not miss place for tourists to visit wh...,2024-03-01,73,5.0
1,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,2024-03-01,1,5.0
2,Nice touch.,While the initial check in experience was not ...,2024-03-01,2,5.0
3,Amazing experience at MBS with superb hospital...,I booked the Sands Premiere Suite to celebrate...,2024-03-01,1,5.0
4,Incredibile,"Nina is amazing as all the team, I loved her h...",2024-03-01,2,5.0
...,...,...,...,...,...
11227,Aweseome view over the Harbour,Amazing design ! I wouldnt recommend to sleep...,2014-03-01,131,4.0
11228,Excellent Property but poor services,Stayed in this hotel during the period Novembe...,2014-03-01,15,3.0
11229,Stunning &amp; Mesmeric,When you walk out on to that roof for the firs...,2014-03-01,25,5.0
11230,A Slice of Heaven,One can say what they want about this property...,2014-02-01,5,4.0


In [103]:
# Time dimension
# Columns of the resulting frame, in order:
#   TimeID             surrogate key: a random UUID4, stored as a string
#   StayDate           `Date of Stay` from the source table
#   StayDateYear       YEAR(StayDate)
#   StayDateMonth      MONTH(StayDate)
#   StayDateDay        DAY(StayDate)
#   StayDateDayOfWeek  DAYOFWEEK(StayDate) - 1, with 0 remapped to 7
#                      (MySQL's DAYOFWEEK is 1 = Sunday, so this yields
#                      1 = Monday ... 7 = Sunday)
#   StayDateWeek       WEEK(StayDate), using the server's default week mode
#
# NOTE(review): the query has no DISTINCT / GROUP BY, so it returns one row per
# row of tripadvisor_reviews and the same date can appear many times, each with
# its own TimeID - confirm this is intended for a dimension table.

time_sql = """
SELECT `Date of Stay` as StayDate, YEAR(`Date of Stay`) AS StayDateYear, MONTH(`Date of Stay`) AS StayDateMonth, Day(`Date of Stay`) AS StayDateDay, IF((DayOfWeek(`Date of Stay`) - 1) = 0, 7, DayOfWeek(`Date of Stay`) - 1) As StayDateDayOfWeek, WEEK(`Date of Stay`) AS StayDateWeek
FROM tripadvisor_reviews;
"""

df = pd.read_sql(sql=time_sql, con=db_tripadvisor)

# One fresh UUID per row, inserted as the first column. The key is stored as
# plain text rather than uuid.UUID objects so the column has a well-defined
# string type and the later load does not depend on how the database driver
# happens to serialise UUID objects.
df.insert(0, 'TimeID', [str(uuid4()) for _ in range(len(df))])
df

  df = pd.read_sql(sql=time_sql, con=db_tripadvisor)


Unnamed: 0,TimeID,StayDate,StayDateYear,StayDateMonth,StayDateDay,StayDateDayOfWeek,StayDateWeek
0,a1635722-1307-4f01-9958-f3b5727e81ea,2024-03-01,2024,3,1,5,8
1,7015953b-652f-43e1-981d-c35e6747a904,2024-03-01,2024,3,1,5,8
2,f67e562f-48b2-49fa-a1fa-3cfc2b189581,2024-03-01,2024,3,1,5,8
3,8ddfb8aa-e1c0-421b-9fe7-bbfb6390cda8,2024-03-01,2024,3,1,5,8
4,beb51f42-d6a1-4c8d-ae60-7ab3356d7428,2024-03-01,2024,3,1,5,8
...,...,...,...,...,...,...,...
11227,156f4c10-09ea-450b-a70c-f6bbad7845aa,2014-03-01,2014,3,1,6,8
11228,4802b545-5a92-44e0-8b88-bfdeb1113878,2014-03-01,2014,3,1,6,8
11229,549df57a-4599-4e69-a63a-cfb1f5d90cc9,2014-03-01,2014,3,1,6,8
11230,f6d8f247-7a98-4377-b65f-c53ec592a255,2014-02-01,2014,2,1,6,4


In [104]:
# Write the current frame to the warehouse as table `time`, replacing the table
# if it already exists, then commit on the warehouse connection.
df.to_sql('time', dwh, if_exists='replace')
dwh.commit()

In [105]:
# Review dimension
# Columns of the resulting frame, in order:
#   ReviewID              surrogate key: a random UUID4, stored as a string
#   ReviewTitle           `Review Title` from the source table
#   ReviewText            `Review Text`
#   ReviewRating          `Rating`
#   ReviewerContribution  `Author Contribution`

review_sql = """
SELECT `Review Title` AS ReviewTitle, `Review Text` AS ReviewText, `Rating` AS ReviewRating, `Author Contribution` AS ReviewerContribution
FROM tripadvisor_reviews;
"""

df = pd.read_sql(sql=review_sql, con=db_tripadvisor)

# One fresh UUID per row, inserted as the first column. The key is stored as
# plain text rather than uuid.UUID objects so the column has a well-defined
# string type and the later load does not depend on how the database driver
# happens to serialise UUID objects.
df.insert(0, 'ReviewID', [str(uuid4()) for _ in range(len(df))])
df

  df = pd.read_sql(sql=review_sql, con=db_tripadvisor)


Unnamed: 0,ReviewID,ReviewTitle,ReviewText,ReviewRating,ReviewerContribution
0,6a4e13f3-d066-40c8-80d3-f04a14d7c078,Must see in Singapore,A must not miss place for tourists to visit wh...,5.0,73
1,f2e6f464-189a-4c0b-b291-c1276a22ce7e,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,5.0,1
2,5b152e2a-fdb8-4c90-85c2-2c782e28794a,Nice touch.,While the initial check in experience was not ...,5.0,2
3,3030de92-e942-482d-8e46-1f1dea81ead0,Amazing experience at MBS with superb hospital...,I booked the Sands Premiere Suite to celebrate...,5.0,1
4,fe13da7c-09d5-44b7-928d-f6c06ad50447,Incredibile,"Nina is amazing as all the team, I loved her h...",5.0,2
...,...,...,...,...,...
11227,39833875-e805-4cb2-a580-4612adf36fa1,Aweseome view over the Harbour,Amazing design ! I wouldnt recommend to sleep...,4.0,131
11228,5b5e9e50-464a-44be-b6e1-c584b3e8fcc4,Excellent Property but poor services,Stayed in this hotel during the period Novembe...,3.0,15
11229,1b0ad660-ce88-4b38-ae81-bd93f2c738f0,Stunning &amp; Mesmeric,When you walk out on to that roof for the firs...,5.0,25
11230,6bdda380-7b8e-454b-897a-c82d1edaa54d,A Slice of Heaven,One can say what they want about this property...,4.0,5


In [106]:
# Load Review Dimension
# Write the current frame to the warehouse as table `review`, replacing the
# table if it already exists, then commit on the warehouse connection.
df.to_sql('review', dwh, if_exists='replace')
dwh.commit()

In [107]:
# TODO: ingest the fact table
# (approach not decided yet)

In [108]:
# Release the database resources opened in the connection cell.
db_tripadvisor.close()
dwh.close()
# Closing `dwh` only hands the connection back to the engine's pool;
# dispose() closes the pooled connections so none stay open against the warehouse.
engine.dispose()

In [109]:
# Housekeeping - add primary keys to the `time` and `review` tables in the
# data warehouse. (No foreign keys are created here yet.)

# Connect with the same warehouse settings used for the SQLAlchemy engine.
# The port falls back to MySQL's default (3306) when DATAWH_PORT is unset.
# No auth_plugin is passed: the previous code supplied the password as the
# plugin name, which is not a valid plugin and put the secret in the wrong
# argument; the connector negotiates the plugin with the server by itself.
db_datawarehouse = mysql.connector.connect(
    host=dwh_host,
    port=int(dwh_port) if dwh_port else 3306,
    user=dwh_user,
    passwd=dwh_pw,
    database=dwh_name,
)

try:
    cursor = db_datawarehouse.cursor()
    try:
        # `(12)` is a key prefix length: only the first 12 characters of each ID
        # are indexed, so uniqueness is enforced on that prefix only.
        # Presumably needed because the ID columns were created as TEXT by
        # `to_sql` - TODO confirm the column types in the warehouse.
        cursor.execute('ALTER TABLE time ADD PRIMARY KEY (TimeID(12));')
        cursor.execute('ALTER TABLE review ADD PRIMARY KEY (ReviewID(12));')
        db_datawarehouse.commit()
    finally:
        cursor.close()
finally:
    # Always close the connection, even if one of the ALTER statements fails.
    db_datawarehouse.close()