# ETL Notebook

This notebook provides DataFrame visualisation for our ETL process. We will connect to the database and run our SQL queries to extract our defined dimensions.

This can also be our testbed for potential transforms that we will be conducting for our NLP models.

In [9]:
# Import libraries
# Everything the notebook uses is imported in this single cell.
import mysql.connector
import pandas as pd
import os
from sqlalchemy import create_engine
from dotenv import load_dotenv
from uuid import uuid4
import pandas as pd  # NOTE(review): duplicate of the pandas import above; harmless
import numpy as np
from datetime import datetime
import re
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Download the NLTK data packages for the text-processing helpers.
# NOTE(review): 'vader_lexicon' is not referenced by any cell visible in this
# notebook - confirm it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('vader_lexicon')
from textblob import TextBlob

# Suppress all Python warnings for the rest of the session.
import warnings
warnings.filterwarnings("ignore")

# Load environment variables (python-dotenv); the DATABASE_* / DATAWH_*
# settings are read with os.getenv in the connection cell.
load_dotenv()

True

In [27]:
# Connection settings are read from the environment.
# Source database (raw TripAdvisor reviews).
db_host = os.getenv("DATABASE_ENDPOINT")
db_user = os.getenv("DATABASE_USERNAME")
db_pw = os.getenv("DATABASE_PASSWORD")
db_name = os.getenv("DATABASE_NAME")
db_port = os.getenv("DATABASE_PORT")

# Target data warehouse.
dwh_host = os.getenv("DATAWH_ENDPOINT")
dwh_user = os.getenv("DATAWH_USERNAME")
dwh_pw = os.getenv("DATAWH_PASSWORD")
dwh_name = os.getenv("DATAWH_NAME")
dwh_port = os.getenv("DATAWH_PORT")

# housekeeping: drop the warehouse tables left over from a previous run.
# The connection used to be opened with auth_plugin=dwh_pw, i.e. the password
# was handed over as the authentication plugin *name*. The argument is dropped
# so the connector negotiates the plugin with the server.
# NOTE(review): the port is not passed here, while the SQLAlchemy URL below
# uses dwh_port - confirm the warehouse listens on the connector's default.
db_datawarehouse = mysql.connector.connect(
    host=dwh_host,
    user=dwh_user,
    passwd=dwh_pw,
    database=dwh_name,
)
try:
    cursor = db_datawarehouse.cursor()
    try:
        # fact is dropped first: a later cell adds foreign keys from fact to
        # review and time, which would block dropping those two tables.
        cursor.execute('DROP TABLE IF EXISTS fact;')

        cursor.execute('DROP TABLE IF EXISTS review;')
        cursor.execute('DROP TABLE IF EXISTS time;')
    finally:
        cursor.close()
finally:
    # Release the DBAPI connection before the name is rebound below;
    # previously it was simply abandoned while still open.
    db_datawarehouse.close()

# SQLAlchemy connection to the same warehouse.
engine = create_engine(f'mysql://{dwh_user}:{dwh_pw}@{dwh_host}:{dwh_port}/{dwh_name}', echo=False, future=True)
db_datawarehouse = engine.connect()

db_datawarehouse

<sqlalchemy.engine.base.Connection at 0x297b0cbc0>

In [12]:
# Connection to the source database holding tripadvisor_reviews.
db_tripadvisor = mysql.connector.connect(
    host=db_host,
    user=db_user,
    passwd=db_pw,
    database=db_name
)

# SQLAlchemy engine + connection to the warehouse; `dwh` is the handle the
# to_sql / read_sql cells below write to and read from.
engine = create_engine(f'mysql://{dwh_user}:{dwh_pw}@{dwh_host}:{dwh_port}/{dwh_name}', echo=False)

dwh = engine.connect()

# housekeeping
# auth_plugin=dwh_pw used to be passed here, which supplies the password as the
# authentication plugin name; the argument is removed so the connector
# negotiates the plugin with the server.
# NOTE(review): none of the visible cells use this connection before
# db_datawarehouse is rebound in the key-setup cell - confirm it is needed.
db_datawarehouse = mysql.connector.connect(
    host=dwh_host,
    user=dwh_user,
    passwd=dwh_pw,
    database=dwh_name,
)

In [13]:
# Check if the OverallID column exists
cursor = db_tripadvisor.cursor()
try:
    cursor.execute("SHOW COLUMNS FROM tripadvisor_reviews LIKE 'OverallID'")
    # fetchall() drains the whole result set, so no unread result is left on
    # the connection when the next statement (or the next cell) uses it.
    if not cursor.fetchall():
        # If OverallID column doesn't exist, add it
        cursor.execute('ALTER TABLE tripadvisor_reviews ADD OverallID INT AUTO_INCREMENT PRIMARY KEY')
finally:
    # Always release the cursor, even when one of the statements fails.
    cursor.close()


In [14]:
# Preview the raw source table before building any dimension from it.

str_sql = '''
SELECT *
FROM tripadvisor_reviews
'''

df = pd.read_sql(str_sql, db_tripadvisor)

df

Unnamed: 0,ReviewTitle,ReviewText,DateOfStay,AuthorContribution,Rating,OverallID
0,Must see in Singapore,A must not miss place for tourists to visit wh...,2024-03-01,73,5.0,1
1,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,2024-03-01,3,5.0,2
2,Professional service,Visited the hotel for some drinks and what I r...,2024-03-01,2,5.0,3
3,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,2024-03-01,1,5.0,4
4,Nice touch.,While the initial check in experience was not ...,2024-03-01,2,5.0,5
...,...,...,...,...,...,...
11227,A good hotel,Nice location and very near to a shopping mall...,2014-08-01,97,5.0,11228
11228,Swim in heaven,Everytime I check into the Sand I love going u...,2015-03-01,13,4.0,11229
11229,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,2015-01-01,68,5.0,11230
11230,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,2015-01-01,59,5.0,11231


In [7]:
# Time dimension
# TimeID
# StayDate
# StayDateYear
# StayDateMonth
# StayDateDay
# StayDateDayOfWeek
# StayDateWeek

# Open a fresh connection to the source database for this extract.
db_tripadvisor = mysql.connector.connect(
    host=db_host,
    user=db_user,
    passwd=db_pw,
    database=db_name
)

# The source column is `DateOfStay` (see the table preview above); the query
# previously referenced `Date of Stay`, which is not a column of that table.
# StayDateDayOfWeek remaps MySQL's DAYOFWEEK (1 = Sunday ... 7 = Saturday)
# to Monday = 1 ... Sunday = 7.
time_sql = '''
SELECT `OverallID` AS OverallID,
       `DateOfStay` AS StayDate,
       YEAR(`DateOfStay`) AS StayDateYear,
       MONTH(`DateOfStay`) AS StayDateMonth,
       DAY(`DateOfStay`) AS StayDateDay,
       IF((DAYOFWEEK(`DateOfStay`) - 1) = 0, 7, DAYOFWEEK(`DateOfStay`) - 1) AS StayDateDayOfWeek,
       WEEK(`DateOfStay`) AS StayDateWeek
FROM tripadvisor_reviews
'''

df = pd.read_sql(sql=time_sql, con=db_tripadvisor)

# One surrogate key per row: the first 12 characters of a random UUID4,
# inserted as the leading column.
df.insert(0, 'TimeID', [str(uuid4())[:12] for _ in range(len(df))])

# change to datetime type
df["StayDate"] = pd.to_datetime(df['StayDate'], format='%Y-%m-%d')
df

Unnamed: 0,TimeID,OverallID,StayDate,StayDateYear,StayDateMonth,StayDateDay,StayDateDayOfWeek,StayDateWeek
0,261b6583-581,1,2024-03-01,2024,3,1,5,8
1,21628c62-084,2,2024-03-01,2024,3,1,5,8
2,28a9e4d5-60b,3,2024-03-01,2024,3,1,5,8
3,ed89d546-259,4,2024-03-01,2024,3,1,5,8
4,a1d230fc-9b7,5,2024-03-01,2024,3,1,5,8
...,...,...,...,...,...,...,...,...
11227,235b8cd8-6c1,11228,2014-08-01,2014,8,1,5,30
11228,62a85fde-8ff,11229,2015-03-01,2015,3,1,7,9
11229,044ff923-b3a,11230,2015-01-01,2015,1,1,4,0
11230,49c259ad-b59,11231,2015-01-01,2015,1,1,4,0


In [8]:
# Write the time dimension to the warehouse table `time`, replacing the table
# if it already exists, then commit on the warehouse connection.
df.to_sql('time', dwh, if_exists='replace')
dwh.commit()

In [13]:
# to clean reviews
def process_text(text):
    """Normalise a piece of review text for NLP use.

    The text is lower-cased and tokenised; tokens made up only of the letters
    a-z/A-Z are kept, English stop words are discarded, and each remaining
    token is lemmatised. The result is the lemmas joined by single spaces.
    """
    wordnet = WordNetLemmatizer()

    tokens = word_tokenize(text.lower())
    letter_tokens = [token for token in tokens if re.match('^[a-zA-Z]+$', token)]

    english_stop_words = set(stopwords.words('english'))
    lemmas = [
        wordnet.lemmatize(token)
        for token in letter_tokens
        if token not in english_stop_words
    ]

    return " ".join(lemmas)

def clean_text(text):
    """Return ``text`` reduced to lower-cased letters and whitespace.

    Every character that is neither alphabetic nor whitespace (digits,
    punctuation, symbols) is dropped; the kept characters are lower-cased
    one by one and concatenated in their original order.
    """
    kept = []
    for ch in text:
        if not (ch.isalpha() or ch.isspace()):
            continue
        kept.append(ch.lower())
    return ''.join(kept)


In [24]:
# Review dimension - columns built in this cell:
#   ReviewID, ReviewText, ReviewTitle, ReviewRating
# plus the cleaned title/text and their TextBlob polarity scores.

db_tripadvisor = mysql.connector.connect(
    host=db_host, user=db_user, passwd=db_pw, database=db_name
)

review_sql = f"""
    SELECT tripadvisor_reviews.OverallID AS OverallID, 
            tripadvisor_reviews.ReviewTitle AS ReviewTitle,
            tripadvisor_reviews.ReviewText AS ReviewText, 
            tripadvisor_reviews.Rating AS ReviewRating, 
            tripadvisor_reviews.AuthorContribution AS ReviewerContribution
    FROM tripadvisor_reviews
"""


def _polarity(text):
    """TextBlob sentiment polarity of ``text``."""
    return TextBlob(text).sentiment.polarity


df = pd.read_sql(review_sql, db_tripadvisor)

# Surrogate key: first 12 characters of a random UUID4, as the leading column.
df.insert(0, 'ReviewID', [str(uuid4())[:12] for _ in range(len(df))])

# Integer ratings and cleaned copies of the title and body.
df = df.assign(
    ReviewRating=df['ReviewRating'].astype(int),
    CleanReviewTitle=df['ReviewTitle'].apply(clean_text),
    CleanReviewText=df['ReviewText'].apply(clean_text),
)

# Sentiment polarity of the cleaned title and body.
df = df.assign(
    TextBlob_Title=df['CleanReviewTitle'].apply(_polarity),
    TextBlob_Review=df['CleanReviewText'].apply(_polarity),
)

df

Unnamed: 0,ReviewID,OverallID,ReviewTitle,ReviewText,ReviewRating,ReviewerContribution,CleanReviewTitle,CleanReviewText,TextBlob_Title,TextBlob_Review
0,8d35d805-454,1,Must see in Singapore,A must not miss place for tourists to visit wh...,5,73,must see in singapore,a must not miss place for tourists to visit wh...,0.000,0.266667
1,51eb6399-b49,2,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,5,3,i recently had the most rejuvenating spa,i recently had the most rejuvenating spa exper...,0.250,0.357143
2,189f3973-806,3,Professional service,Visited the hotel for some drinks and what I r...,5,2,professional service,visited the hotel for some drinks and what i r...,0.100,0.450833
3,b35f1224-399,4,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,5,1,marina bay world class,amazing hotel and loved the facilities being s...,0.000,0.650000
4,68ead1c7-d57,5,Nice touch.,While the initial check in experience was not ...,5,2,nice touch,while the initial check in experience was not ...,0.600,0.227857
...,...,...,...,...,...,...,...,...,...,...
11227,30e0b036-536,11228,A good hotel,Nice location and very near to a shopping mall...,5,97,a good hotel,nice location and very near to a shopping mall...,0.700,0.209479
11228,00a5d6c3-705,11229,Swim in heaven,Everytime I check into the Sand I love going u...,4,13,swim in heaven,everytime i check into the sand i love going u...,0.000,0.177381
11229,e51af08d-170,11230,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,5,68,unmatcheable property,i stayed here for nights in november hotel...,0.000,0.438095
11230,43653f8f-f4b,11231,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,5,59,beautiful experience and amazing architecture,this is a great place to visit even quickly if...,0.725,0.338333


In [19]:
# Write the review dimension to the warehouse table `review`, replacing the
# table if it already exists, then commit on the warehouse connection.
df.to_sql('review', dwh, if_exists='replace')
dwh.commit()

In [21]:
# Build the fact rows inside the warehouse: join the loaded review and time
# tables on OverallID and keep the keys plus both polarity scores.
fact_sql = f'''
SELECT review.OverallID, review.ReviewID, time.TimeID, review.TextBlob_Title, review.TextBlob_Review
FROM review
INNER JOIN time ON review.OverallID = time.OverallID
'''
df = pd.read_sql(fact_sql, dwh)
df


Unnamed: 0,OverallID,ReviewID,TimeID,TextBlob_Title,TextBlob_Review
0,1,b4a7d9a2-0c5,261b6583-581,0.00,0.228571
1,2,3919c141-85f,21628c62-084,0.00,0.333333
2,3,4befe03b-f30,28a9e4d5-60b,0.10,0.362500
3,4,6921d87a-a3a,ed89d546-259,0.00,0.650000
4,5,58962fc0-b25,a1d230fc-9b7,0.60,0.255000
...,...,...,...,...,...
11227,11215,d4f7502a-efa,d23d0049-5cc,0.55,0.212942
11228,11221,2aca7d58-78c,c181e60f-100,0.10,-0.312500
11229,11224,18b2cea4-af8,f5dffdb3-ee7,0.00,0.644444
11230,11227,aaa388cb-a20,3b4b7b08-22e,0.00,0.450000


In [22]:
# Write the fact rows to the warehouse table `fact` (replacing any existing
# table) and commit.
df.to_sql('fact', dwh, if_exists='replace')
dwh.commit()

In [23]:
# Close the source connection first, then the warehouse connection.
for _connection in (db_tripadvisor, dwh):
    _connection.close()

In [29]:
#  housekeeping - set up primary and foreign keys in datawarehouse

# auth_plugin=dwh_pw used to be passed here, which supplies the password as the
# authentication plugin name; the argument is removed so the connector
# negotiates the plugin with the server.
db_datawarehouse = mysql.connector.connect(
    host=dwh_host,
    user=dwh_user,
    passwd=dwh_pw,
    database=dwh_name,
)

try:
    cursor = db_datawarehouse.cursor()
    try:
        # Primary keys must exist on review and time before fact can
        # reference them, so the statements run in this order.
        for statement in (
            'ALTER TABLE review ADD PRIMARY KEY (OverallID);',
            'ALTER TABLE time ADD PRIMARY KEY (OverallID);',
            'ALTER TABLE fact ADD PRIMARY KEY (OverallID);',
            'ALTER TABLE fact ADD FOREIGN KEY (OverallID) REFERENCES review(OverallID);',
            'ALTER TABLE fact ADD FOREIGN KEY (OverallID) REFERENCES time(OverallID);',
        ):
            cursor.execute(statement)
    finally:
        cursor.close()

    db_datawarehouse.commit()
finally:
    # Close the connection even if one of the ALTER statements fails.
    db_datawarehouse.close()

In [30]:
# check: read the loaded time table back from the warehouse.
# auth_plugin=dwh_pw used to be passed here, which supplies the password as the
# authentication plugin name; the argument is removed so the connector
# negotiates the plugin with the server.
db_datawarehouse = mysql.connector.connect(
    host=dwh_host,
    user=dwh_user,
    passwd=dwh_pw,
    database=dwh_name,
)

time_Sql = '''
SELECT * FROM time ORDER BY OverallID
'''
df = pd.read_sql(sql=time_Sql, con=db_datawarehouse)

df

Unnamed: 0,index,TimeID,OverallID,StayDate,StayDateYear,StayDateMonth,StayDateDay,StayDateDayOfWeek,StayDateWeek
0,0,261b6583-581,1,2024-03-01,2024,3,1,5,8
1,1,21628c62-084,2,2024-03-01,2024,3,1,5,8
2,2,28a9e4d5-60b,3,2024-03-01,2024,3,1,5,8
3,3,ed89d546-259,4,2024-03-01,2024,3,1,5,8
4,4,a1d230fc-9b7,5,2024-03-01,2024,3,1,5,8
...,...,...,...,...,...,...,...,...,...
11227,11227,235b8cd8-6c1,11228,2014-08-01,2014,8,1,5,30
11228,11228,62a85fde-8ff,11229,2015-03-01,2015,3,1,7,9
11229,11229,044ff923-b3a,11230,2015-01-01,2015,1,1,4,0
11230,11230,49c259ad-b59,11231,2015-01-01,2015,1,1,4,0


In [31]:
# check: read the loaded fact table back, ordered by OverallID.
fact_Sql = '''
SELECT * FROM fact ORDER BY OverallID
'''
df = pd.read_sql(fact_Sql, db_datawarehouse)

df

Unnamed: 0,index,OverallID,ReviewID,TimeID,TextBlob_Title,TextBlob_Review
0,0,1,b4a7d9a2-0c5,261b6583-581,0.000,0.228571
1,1,2,3919c141-85f,21628c62-084,0.000,0.333333
2,2,3,4befe03b-f30,28a9e4d5-60b,0.100,0.362500
3,3,4,6921d87a-a3a,ed89d546-259,0.000,0.650000
4,4,5,58962fc0-b25,a1d230fc-9b7,0.600,0.255000
...,...,...,...,...,...,...
11227,9397,11228,b07a43e8-b50,235b8cd8-6c1,0.700,0.258929
11228,11231,11229,f886c732-3b8,62a85fde-8ff,0.000,0.305952
11229,5834,11230,21c18015-c84,044ff923-b3a,0.000,0.358333
11230,9398,11231,c7cc92b0-e9d,49c259ad-b59,0.725,0.422917


In [32]:
# check: read the loaded review table back, ordered by OverallID.
review_Sql = '''
SELECT * FROM review ORDER BY OverallID
'''
df = pd.read_sql(review_Sql, db_datawarehouse)

df

Unnamed: 0,index,ReviewID,OverallID,ReviewTitle,ReviewText,ReviewRating,ReviewerContribution,CleanReviewTitle,CleanReviewText,TextBlob_Title,TextBlob_Review
0,0,b4a7d9a2-0c5,1,Must see in Singapore,A must not miss place for tourists to visit wh...,5,73,must see singapore,must miss place tourist visit singapore pretty...,0.000,0.228571
1,1,3919c141-85f,2,I recently had the most rejuvenating spa,I recently had the most rejuvenating spa exper...,5,3,recently rejuvenating spa,recently rejuvenating spa experience hour flig...,0.000,0.333333
2,2,4befe03b-f30,3,Professional service,Visited the hotel for some drinks and what I r...,5,2,professional service,visited hotel drink really wanted say wonderfu...,0.100,0.362500
3,3,6921d87a-a3a,4,Marina Bay world class,Amazing hotel and loved the facilities. Being ...,5,1,marina bay world class,amazing hotel loved facility someone enjoys gy...,0.000,0.650000
4,4,58962fc0-b25,5,Nice touch.,While the initial check in experience was not ...,5,2,nice touch,initial check experience expected nicole manag...,0.600,0.255000
...,...,...,...,...,...,...,...,...,...,...,...
11227,11227,b07a43e8-b50,11228,A good hotel,Nice location and very near to a shopping mall...,5,97,good hotel,nice location near shopping mall big swimming ...,0.700,0.258929
11228,11228,f886c732-3b8,11229,Swim in heaven,Everytime I check into the Sand I love going u...,4,13,swim heaven,everytime check sand love going pool enjoy lon...,0.000,0.305952
11229,11229,21c18015-c84,11230,Unmatcheable Property,I stayed here for 2 nights in November'14. ...,5,68,unmatcheable property,stayed night hotel real superb beauty jewel si...,0.000,0.358333
11230,11230,c7cc92b0-e9d,11231,Beautiful experience and amazing architecture,This is a great place to visit even quickly if...,5,59,beautiful experience amazing architecture,great place visit even quickly hour layover ai...,0.725,0.422917
