In [31]:
from urllib.parse import quote, quote_plus

import mysql.connector
import pandas as pd
from mysql.connector import Error
from sqlalchemy import create_engine

In [32]:
def connection(hostname, username, password, dbname):
    """Open a MySQL connection through ``mysql.connector``.

    Args:
        hostname: Server host, passed as ``host``.
        username: Account name, passed as ``user``.
        password: Plain-text password. It is handed to the driver unchanged:
            it is a keyword argument, not part of a URL, so percent-encoding
            it would alter any password containing reserved characters.
        dbname: Default database, passed as ``database``.

    Returns:
        The connection object returned by ``mysql.connector.connect``, or
        ``None`` when connecting raised ``mysql.connector.Error`` (the error
        is printed rather than re-raised).
    """
    # Stays None when the connection attempt fails
    con = None

    # Create connection while checking for any errors
    try:
        con = mysql.connector.connect(
            host=hostname,
            user=username,
            password=password,
            database=dbname,
        )
        print("Connection successful!")
    except Error as e:
        print(f"The error {e} has occured.")

    return con

def engine(hostname, username, password, dbname, port):
    """Create a SQLAlchemy engine for MySQL using the PyMySQL driver.

    Args:
        hostname: Server host.
        username: Account name (plain text; encoded here).
        password: Plain-text password (encoded here).
        dbname: Database name placed in the URL path.
        port: Server port (string or int).

    Returns:
        The engine returned by ``create_engine``.
    """
    # The credentials are embedded in a URL, so reserved characters such as
    # '@', ':' or '/' must be percent-encoded or the URL is split in the
    # wrong place. safe="" makes sure '/' is encoded as well.
    user = quote(str(username), safe="")
    secret = quote(str(password), safe="")

    return create_engine(f'mysql+pymysql://{user}:{secret}@{hostname}:{port}/{dbname}')

In [33]:
def csv_to_staging(datafile):
    """Load a CSV file into the ``feedback_source.feedback`` staging table.

    Any existing ``feedback`` table is replaced (``if_exists="replace"``).

    Args:
        datafile: Path of the CSV file; fields are separated by ``', '``
            (comma followed by a space).

    Returns:
        None.
    """
    # NOTE(review): the database credentials are hard-coded here; consider
    # loading them from environment variables or an untracked config file.
    eng = engine("localhost", "root", "Layaldbroot1997", "feedback_source", "3306")

    try:
        # A separator longer than one character needs the python parser
        df = pd.read_csv(datafile, sep=', ', engine='python')

        # Load dataframe to staging schema
        df.to_sql(name="feedback", con=eng, schema="feedback_source", if_exists="replace", index=False)
    finally:
        # Release the engine's pooled connections even if the load fails
        eng.dispose()

# Stage the sentiment-analysis CSV; this runs as soon as the cell is executed.
csv_to_staging("Data/sentiment-analysis.csv")

def extract_source():
    """Read every row of ``feedback_source.feedback`` into a DataFrame.

    Returns:
        pandas.DataFrame whose columns are named after the cursor's
        result-set description.

    Raises:
        ConnectionError: if ``connection`` returned ``None`` (i.e. the
            connection attempt failed and was only printed).
    """
    # NOTE(review): the database credentials are hard-coded here; consider
    # loading them from environment variables or an untracked config file.
    conn = connection("localhost", "root", "Layaldbroot1997", "feedback_source")
    if conn is None:
        # Fail with a clear message instead of an AttributeError on None
        raise ConnectionError("Could not connect to the feedback_source database.")

    try:
        cursor = conn.cursor()
        try:
            # The query extracts all data in the staging schema
            cursor.execute("Select * from feedback_source.feedback")
            data = cursor.fetchall()

            # First item of each description entry is the column name
            column_names = [col[0] for col in cursor.description]
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails
        conn.close()

    return pd.DataFrame(data, columns=column_names)

In [34]:
def transform_feedback():
    """Clean the staged feedback rows and return them as a new DataFrame.

    Steps: copy the extracted frame, drop duplicate rows, rename the two
    quote-carrying column names, strip ``"`` from ``Text`` and
    ``Confidence Score`` and ``@`` from ``User ID``, then convert
    ``Date/Time`` to datetime and ``Confidence Score`` to float.

    Returns:
        pandas.DataFrame with the cleaned data.
    """
    # Work on an independent, de-duplicated copy of the staged data
    feedback = extract_source().copy().drop_duplicates()

    # Normalise the two column names that carry a stray double quote
    header_fixes = {'"Text': 'Text', 'Confidence Score"': 'Confidence Score'}
    feedback = feedback.rename(columns=header_fixes)

    # (column, regex) pairs: every match of the regex is removed from the column
    strip_patterns = (
        ('Text', r'\"'),
        ('Confidence Score', r'\"'),
        ('User ID', r'@'),
    )
    for column, pattern in strip_patterns:
        feedback.loc[:, column] = feedback[column].str.replace(pattern, '', regex=True)

    # Cast to the proper data types once the values are clean
    feedback['Date/Time'] = pd.to_datetime(feedback['Date/Time'])
    feedback['Confidence Score'] = feedback['Confidence Score'].astype(dtype='float')

    return feedback

In [35]:
def load_sources():
    """Append the distinct ``Source`` values to ``feedback_dwh.dim_source``.

    Each distinct value gets a sequential ``id`` starting at 1, in order of
    first appearance in the transformed data.

    Returns:
        None.
    """
    # Load transformed data into a dataframe
    df = transform_feedback()

    # NOTE(review): the database credentials are hard-coded here; consider
    # loading them from environment variables or an untracked config file.
    eng = engine("localhost", "root", "Layaldbroot1997", "feedback_dwh", "3306")

    try:
        # Load distinct sources to a dataframe
        sources = df[['Source']].drop_duplicates().reset_index(drop=True)

        # Generate ids for the sources
        sources['id'] = range(1, len(sources) + 1)

        # Organize the columns
        sources = sources[['id', 'Source']]

        # NOTE(review): with if_exists="append" the ids restart at 1 on every
        # call, so re-running this cell inserts the same rows again - confirm
        # how dim_source is expected to handle that.
        sources.to_sql(name="dim_source", con=eng, schema="feedback_dwh", if_exists="append", index=False)
    finally:
        # Release the engine's pooled connections even if the load fails
        eng.dispose()
# Populate the source dimension; this runs as soon as the cell is executed.
load_sources()

Connection successful!


In [37]:
def load_locations():
    """Append the distinct ``Location`` values to ``feedback_dwh.dim_location``.

    Each distinct value gets a sequential ``id`` starting at 1, in order of
    first appearance in the transformed data.

    Returns:
        None.
    """
    # Load transformed data into a dataframe
    df = transform_feedback()

    # NOTE(review): the database credentials are hard-coded here; consider
    # loading them from environment variables or an untracked config file.
    eng = engine("localhost", "root", "Layaldbroot1997", "feedback_dwh", "3306")

    try:
        # Load distinct locations to a dataframe
        locations = df[['Location']].drop_duplicates().reset_index(drop=True)

        # Generate ids for the locations
        locations['id'] = range(1, len(locations) + 1)

        # Organize the columns
        locations = locations[['id', 'Location']]

        # NOTE(review): with if_exists="append" the ids restart at 1 on every
        # call, so re-running this cell inserts the same rows again - confirm
        # how dim_location is expected to handle that.
        locations.to_sql(name="dim_location", con=eng, schema="feedback_dwh", if_exists="append", index=False)
    finally:
        # Release the engine's pooled connections even if the load fails
        eng.dispose()
# Populate the location dimension; this runs as soon as the cell is executed.
load_locations()

Connection successful!


In [38]:
def load_sentiments():
    """Append the distinct ``Sentiment`` values to ``feedback_dwh.dim_sentiment``.

    Each distinct value gets a sequential ``id`` starting at 1, in order of
    first appearance in the transformed data.

    Returns:
        None.
    """
    # Load transformed data into a dataframe
    df = transform_feedback()

    # NOTE(review): the database credentials are hard-coded here; consider
    # loading them from environment variables or an untracked config file.
    eng = engine("localhost", "root", "Layaldbroot1997", "feedback_dwh", "3306")

    try:
        # Load distinct sentiments to a dataframe
        sentiments = df[['Sentiment']].drop_duplicates().reset_index(drop=True)

        # Generate ids for the sentiments
        sentiments['id'] = range(1, len(sentiments) + 1)

        # Organize the columns
        sentiments = sentiments[['id', 'Sentiment']]

        # NOTE(review): with if_exists="append" the ids restart at 1 on every
        # call, so re-running this cell inserts the same rows again - confirm
        # how dim_sentiment is expected to handle that.
        sentiments.to_sql(name="dim_sentiment", con=eng, schema="feedback_dwh", if_exists="append", index=False)
    finally:
        # Release the engine's pooled connections even if the load fails
        eng.dispose()
# Populate the sentiment dimension; this runs as soon as the cell is executed.
load_sentiments()

Connection successful!
