## Midterm Project

### Prerequisites:
This notebook uses the SQLAlchemy database connectivity library, together with the PyMySQL driver (the connection strings below use the `mysql+pymysql://` scheme), to connect to MySQL databases; therefore, you must first install both libraries into your Python environment by executing the following command in a Terminal window.

- `python -m pip install sqlalchemy pymysql`

#### Import the Necessary Libraries

In [1]:
import os

import numpy
import pandas as pd
from sqlalchemy import create_engine, text

#### Declare & Assign Connection Variables for the MySQL Server & Databases

In [2]:
# Connection settings for the MySQL server. Each value can be overridden through an
# environment variable so that real credentials do not have to be saved in the
# notebook; the fallbacks are the original local defaults.
host_name = os.environ.get("MYSQL_HOST", "localhost")
# NOTE(review): none of the connection strings in this notebook include the port — confirm whether it should be used.
port = os.environ.get("MYSQL_PORT", "3306")
user_id = os.environ.get("MYSQL_USER", "root")
pwd = os.environ.get("MYSQL_PWD", "Passw0rd123")

# Source schema to extract from, and destination data-warehouse schema to load into.
src_dbname = "sakila"
dst_dbname = "sakila_dw2"

#### Define Functions for Getting Data From and Setting Data Into Databases

In [3]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Run a SQL query against a MySQL database and return the result as a DataFrame.

    Args:
        user_id: MySQL account name.
        pwd: Password for ``user_id``.
        host_name: Host name of the MySQL server (no port is added to the URL).
        db_name: Database (schema) named in the connection URL.
        sql_query: SQL text handed to ``pandas.read_sql``.

    Returns:
        pandas.DataFrame holding the rows produced by ``sql_query``.
    """
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # The context manager closes the connection even when the query raises.
        with sqlEngine.connect() as connection:
            return pd.read_sql(sql_query, connection)
    finally:
        # A new engine is created on every call, so release its connection pool too.
        sqlEngine.dispose()


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a table in a MySQL database.

    Args:
        user_id: MySQL account name.
        pwd: Password for ``user_id``.
        host_name: Host name of the MySQL server (no port is added to the URL).
        db_name: Database (schema) named in the connection URL.
        df: DataFrame whose rows are written; its index is not written.
        table_name: Name of the destination table.
        pk_column: Column declared as PRIMARY KEY after an "insert".
        db_operation: ``"insert"`` replaces any existing table with ``df`` and then
            adds the primary key; ``"update"`` appends the rows of ``df``.

    Raises:
        ValueError: If ``db_operation`` is neither ``"insert"`` nor ``"update"``.
    """
    # Fail fast on a misspelled operation instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"db_operation must be 'insert' or 'update', got {db_operation!r}"
        )

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    try:
        # begin() commits on success, rolls back on error, and always closes the connection.
        with sqlEngine.begin() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                # text() is required for string SQL on SQLAlchemy 2.x and also works on 1.x.
                connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            else:
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A new engine is created on every call, so release its connection pool too.
        sqlEngine.dispose()

#### Create the New Data Warehouse Database and Switch the Connection Context to It
*Drop* the database if it already exists, then *create* the new **sakila_dw2** database and *use* it as the target of all subsequent operations.

In [4]:
# Connect at server level (no schema in the URL) so the warehouse schema can be
# dropped and recreated from scratch.
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

try:
    # begin() closes the connection even if one of the statements fails.
    with sqlEngine.begin() as connection:
        # text() is required for string SQL on SQLAlchemy 2.x and also works on 1.x.
        connection.execute(text(f"DROP DATABASE IF EXISTS `{dst_dbname}`;"))
        connection.execute(text(f"CREATE DATABASE `{dst_dbname}`;"))
        # USE only affects this connection; get_dataframe()/set_dataframe() name
        # the database in their own connection URL.
        connection.execute(text(f"USE `{dst_dbname}`;"))
finally:
    sqlEngine.dispose()

### 1.0. Create & Populate the Dimension Tables


#### 1.1. Extract Data from the Source Database Tables
Fetch data for each dimension table from the **sakila** database using the **get_dataframe()** function.

In [5]:
# Actors: read every row of the source actor table.
sql_actors = "SELECT * FROM sakila.actor;"
df_actors = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_actors,
)
df_actors.head(2)

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,PENELOPE,GUINESS,2006-02-14 23:34:33
1,2,NICK,WAHLBERG,2006-02-14 23:34:33


In [6]:
# Category: read every row of the source category table.
sql_category = "SELECT * FROM sakila.category;"
df_category = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_category,
)
df_category.head(2)

Unnamed: 0,category_id,name,last_update
0,1,Action,2006-02-14 23:46:27
1,2,Animation,2006-02-14 23:46:27


In [7]:
# Customers: read every row of the source customer table.
sql_customers = "SELECT * FROM sakila.customer;"
df_customers = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_customers,
)
df_customers.head(2)

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,active,create_date,last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-14 23:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-14 23:57:20


In [8]:
# Film: read every row of the source film table.
sql_film = "SELECT * FROM sakila.film;"
df_film = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_film,
)
df_film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 00:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 00:03:42


In [9]:
# Film actor: read every row of the film-to-actor link table.
sql_film_actor = "SELECT * FROM sakila.film_actor;"
df_film_actor = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_film_actor,
)
df_film_actor.head(2)

Unnamed: 0,actor_id,film_id,last_update
0,1,1,2006-02-15 00:05:03
1,1,23,2006-02-15 00:05:03


In [10]:
# Film category: read every row of the film-to-category link table.
sql_film_category = "SELECT * FROM sakila.film_category;"
df_film_category = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_film_category,
)
df_film_category.head(2)

Unnamed: 0,film_id,category_id,last_update
0,1,6,2006-02-15 00:07:09
1,2,11,2006-02-15 00:07:09


In [11]:
# Inventory: read every row of the source inventory table.
sql_inventory = "SELECT * FROM sakila.inventory;"
df_inventory = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_inventory,
)
df_inventory.head(2)

Unnamed: 0,inventory_id,film_id,store_id,last_update
0,1,1,1,2006-02-15 00:09:17
1,2,1,1,2006-02-15 00:09:17


In [12]:
# Payment: read every row of the source payment table.
sql_payment = "SELECT * FROM sakila.payment;"
df_payment = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_payment,
)
df_payment.head(2)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update
0,1,1,1,76,2.99,2005-05-25 11:30:37,2006-02-15 17:12:30
1,2,1,1,573,0.99,2005-05-28 10:35:23,2006-02-15 17:12:30


In [13]:
# Seed of the fact table: a second, independent read of the payment table that
# the later merge steps build on.
sql_fact_table = "SELECT * FROM sakila.payment;"
sakila_fact_table = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_fact_table,
)
sakila_fact_table.head(2)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update
0,1,1,1,76,2.99,2005-05-25 11:30:37,2006-02-15 17:12:30
1,2,1,1,573,0.99,2005-05-28 10:35:23,2006-02-15 17:12:30


In [14]:
# Rental: read every row of the source rental table.
sql_rental = "SELECT * FROM sakila.rental;"
df_rental = get_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=src_dbname,
    sql_query=sql_rental,
)
df_rental.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 16:30:53
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 16:30:53


#### 1.2. Create the Date Dimension Table
At this point, we have to **execute the script from Lab 2c** that creates and populates a **Date Dimension** table.  Be certain to target this script to the new data warehouse database we just created **(sakila_dw2)**.  Later in this notebook we will integrate the **dim_date** table with the fact table by performing **lookup operations** to retrieve the surrogate primary keys from the **dim_date** table that correspond with each **date** typed column in the fact table.

#### 1.3. Perform Any Necessary Transformations


In [15]:
# 1. List the source columns that the dimension table does not need.
drop_cols = ['last_update']

# 2. Remove those columns from the DataFrame.
df_actors = df_actors.drop(columns=drop_cols)

# 3. Add a 1-based, ever-incrementing surrogate key as the first column.
df_actors.insert(loc=0, column="actor_key", value=range(1, len(df_actors) + 1))

# 4. Display the first 2 rows to validate the result.
df_actors.head(2)

Unnamed: 0,actor_key,actor_id,first_name,last_name
0,1,1,PENELOPE,GUINESS
1,2,2,NICK,WAHLBERG


In [16]:
# Category: drop the audit column and add a 1-based surrogate key as the first column.
drop_cols = ['last_update']
df_category = df_category.drop(columns=drop_cols)
df_category.insert(loc=0, column="category_key", value=range(1, len(df_category) + 1))
df_category.head(2)

Unnamed: 0,category_key,category_id,name
0,1,1,Action
1,2,2,Animation


In [17]:
# Customer: keep only the id and name columns, then add a 1-based surrogate key.
drop_cols = ['store_id', 'email', 'address_id', 'active', 'create_date', 'last_update']
df_customers = df_customers.drop(columns=drop_cols)
df_customers.insert(loc=0, column="customer_key", value=range(1, len(df_customers) + 1))
df_customers.head(2)

Unnamed: 0,customer_key,customer_id,first_name,last_name
0,1,1,MARY,SMITH
1,2,2,PATRICIA,JOHNSON


In [18]:
# Film: drop the language, replacement-cost and audit columns, then add a 1-based surrogate key.
drop_cols = ['language_id', 'original_language_id', 'replacement_cost', 'last_update']
df_film = df_film.drop(columns=drop_cols)
df_film.insert(loc=0, column="film_key", value=range(1, len(df_film) + 1))
df_film.head(2)

Unnamed: 0,film_key,film_id,title,description,release_year,rental_duration,rental_rate,length,rating,special_features
0,1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,PG,"Deleted Scenes,Behind the Scenes"
1,2,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,4.99,48,G,"Trailers,Deleted Scenes"


In [19]:
# Film actor: drop the audit column and add a 1-based surrogate key as the first column.
drop_cols = ['last_update']
df_film_actor = df_film_actor.drop(columns=drop_cols)
df_film_actor.insert(loc=0, column="film_actor_key", value=range(1, len(df_film_actor) + 1))
df_film_actor.head(2)

Unnamed: 0,film_actor_key,actor_id,film_id
0,1,1,1
1,2,1,23


In [20]:
# Table combining films and actors.

# Collapse first/last name into one display name. df_actors keeps this shape
# (actor_key, actor_id, actor_name) for the rest of the notebook.
df_actors['actor_name'] = df_actors['first_name'] + ' ' + df_actors['last_name']
df_actors = df_actors.drop(columns=['first_name', 'last_name'])

# One row per film_id, with that film's cast as a single comma-separated string.
actors_per_film = (
    pd.merge(df_film_actor, df_actors, on='actor_id', how='left')
    .groupby('film_id')['actor_name']
    .agg(lambda names: ', '.join(names))
    .reset_index()
)

# how='right' keeps every row of df_film. The previous left merge started from the
# per-film actor list, so a film with no film_actor rows was dropped here and left
# NaN film attributes when the fact table was later merged on film_id.
# Such films get a missing (NaN) actor_name.
joined_film_actors = pd.merge(actors_per_film, df_film, on='film_id', how='right')
joined_film_actors.insert(0, "joined_film_actors_key", range(1, joined_film_actors.shape[0]+1))
joined_film_actors.head(2)

Unnamed: 0,film_id,actor_name,film_key,title,description,release_year,rental_duration,rental_rate,length,rating,special_features
0,1,"PENELOPE GUINESS, CHRISTIAN GABLE, LUCILLE TRA...",1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,PG,"Deleted Scenes,Behind the Scenes"
1,2,"BOB FAWCETT, MINNIE ZELLWEGER, SEAN GUINESS, C...",2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,4.99,48,G,"Trailers,Deleted Scenes"


In [23]:
# Film category: drop the audit column and add a 1-based surrogate key as the first column.
drop_cols = ['last_update']
df_film_category = df_film_category.drop(columns=drop_cols)
df_film_category.insert(loc=0, column="film_category_key", value=range(1, len(df_film_category) + 1))
df_film_category.head(2)

Unnamed: 0,film_category_key,film_id,category_id
0,1,1,6
1,2,2,11


In [24]:
# Inventory: keep only inventory_id and film_id, then add a 1-based surrogate key.
drop_cols = ['last_update', 'store_id']
df_inventory = df_inventory.drop(columns=drop_cols)
df_inventory.insert(loc=0, column="inventory_key", value=range(1, len(df_inventory) + 1))
df_inventory.head(2)

Unnamed: 0,inventory_key,inventory_id,film_id
0,1,1,1
1,2,2,1


In [25]:
# Payment: drop the audit and staff columns, then add a 1-based surrogate key.
drop_cols = ['last_update', 'staff_id']
df_payment = df_payment.drop(columns=drop_cols)
df_payment.insert(loc=0, column="payment_key", value=range(1, len(df_payment) + 1))
df_payment.head(2)

Unnamed: 0,payment_key,payment_id,customer_id,rental_id,amount,payment_date
0,1,1,1,76,2.99,2005-05-25 11:30:37
1,2,2,1,573,0.99,2005-05-28 10:35:23


In [26]:
# Fact table seed: apply the same column drops and surrogate key as the payment frame
# so it is ready for the later merges.
drop_cols = ['last_update', 'staff_id']
sakila_fact_table = sakila_fact_table.drop(columns=drop_cols)
sakila_fact_table.insert(loc=0, column="payment_key", value=range(1, len(sakila_fact_table) + 1))
sakila_fact_table.head(2)

Unnamed: 0,payment_key,payment_id,customer_id,rental_id,amount,payment_date
0,1,1,1,76,2.99,2005-05-25 11:30:37
1,2,2,1,573,0.99,2005-05-28 10:35:23


In [27]:
# Rental: keep only rental_id, inventory_id and customer_id, then add a 1-based surrogate key.
drop_cols = ['last_update', 'rental_date', 'return_date', 'staff_id']
df_rental = df_rental.drop(columns=drop_cols)
df_rental.insert(loc=0, column="rental_key", value=range(1, len(df_rental) + 1))
df_rental.head(2)

Unnamed: 0,rental_key,rental_id,inventory_id,customer_id
0,1,1,367,130
1,2,2,1525,459


#### 1.4. Load the Transformed DataFrames into the New Data Warehouse by Creating New Tables


In [28]:
# "insert" makes set_dataframe() replace each table and add its primary key.
db_operation = "insert"

# One (table name, DataFrame, primary-key column) entry per table to load:
# actors, category, customers, film, film_actor, film_category, inventory,
# payment, rental, the joined film/actor table, and the preliminary fact table.
tables = [
    ('dim_actors', df_actors, 'actor_key'),
    ('dim_category', df_category, 'category_key'),
    ('dim_customers', df_customers, 'customer_key'),
    ('dim_film', df_film, 'film_key'),
    ('dim_film_actor', df_film_actor, 'film_actor_key'),
    ('dim_film_category', df_film_category, 'film_category_key'),
    ('dim_inventory', df_inventory, 'inventory_key'),
    ('dim_payment', df_payment, 'payment_key'),
    ('dim_rental', df_rental, 'rental_key'),
    ('joined_film_actors', joined_film_actors, 'joined_film_actors_key'),
    ('sakila_fact_table', sakila_fact_table, 'payment_key'),
]

In [29]:
# Write each listed DataFrame into the data warehouse schema.
for table_name, dataframe, primary_key in tables:
    set_dataframe(
        user_id=user_id,
        pwd=pwd,
        host_name=host_name,
        db_name=dst_dbname,
        df=dataframe,
        table_name=table_name,
        pk_column=primary_key,
        db_operation=db_operation,
    )

### 2.0. Populate the Fact Table


In [30]:
# Attach rental data to each payment, keep the payment's customer_id, and drop the
# keys that are no longer needed (only inventory_id is retained from the rental).
sakila_fact_table = (
    pd.merge(sakila_fact_table, df_rental, on='rental_id', how='left')
    .rename(columns={"customer_id_x": "customer_id"})
    .drop(columns=['rental_id', 'customer_id_y', 'payment_id', 'payment_key', 'rental_key'])
)
sakila_fact_table.head(2)

Unnamed: 0,customer_id,amount,payment_date,inventory_id
0,1,2.99,2005-05-25 11:30:37,3021
1,1,0.99,2005-05-28 10:35:23,4020


In [31]:
# Bring in the customer name columns; the customer surrogate key is not kept.
sakila_fact_table = (
    pd.merge(df_customers, sakila_fact_table, on='customer_id', how='left')
    .drop(columns=['customer_key'])
)
sakila_fact_table.head(2)

Unnamed: 0,customer_id,first_name,last_name,amount,payment_date,inventory_id
0,1,MARY,SMITH,2.99,2005-05-25 11:30:37,3021
1,1,MARY,SMITH,0.99,2005-05-28 10:35:23,4020


In [32]:
# Resolve inventory_id to film_id (the right merge keeps every fact row), then drop
# the inventory and customer identifiers.
sakila_fact_table = (
    pd.merge(df_inventory, sakila_fact_table, on='inventory_id', how='right')
    .drop(columns=['inventory_key', 'inventory_id', 'customer_id'])
)
sakila_fact_table.head(2)

Unnamed: 0,film_id,first_name,last_name,amount,payment_date
0,663,MARY,SMITH,2.99,2005-05-25 11:30:37
1,875,MARY,SMITH,0.99,2005-05-28 10:35:23


In [34]:
# Add the film attributes and the per-film actor list by film_id.
sakila_fact_table = (
    pd.merge(sakila_fact_table, joined_film_actors, on='film_id', how='left')
    .drop(columns=['joined_film_actors_key'])
)
sakila_fact_table.head(2)

Unnamed: 0,film_id,first_name,last_name,amount,payment_date,actor_name,film_key,title,description,release_year,rental_duration,rental_rate,length,rating,special_features
0,663,MARY,SMITH,2.99,2005-05-25 11:30:37,"LUCILLE TRACY, VAL BOLGER, MILLA KEITEL, SEAN ...",663.0,PATIENT SISTER,A Emotional Epistle of a Squirrel And a Robot ...,2006.0,7.0,0.99,99.0,NC-17,"Trailers,Commentaries"
1,875,MARY,SMITH,0.99,2005-05-28 10:35:23,"BURT DUKAKIS, NICK STALLONE, MINNIE ZELLWEGER,...",875.0,TALENTED HOMICIDE,A Lacklusture Panorama of a Dentist And a Fore...,2006.0,6.0,0.99,173.0,PG,"Commentaries,Deleted Scenes,Behind the Scenes"


In [35]:
# Look up each film's category_id through the film/category link table.
sakila_fact_table = (
    pd.merge(sakila_fact_table, df_film_category, on='film_id', how='left')
    .drop(columns=['film_category_key'])
)
sakila_fact_table.head(2)

Unnamed: 0,film_id,first_name,last_name,amount,payment_date,actor_name,film_key,title,description,release_year,rental_duration,rental_rate,length,rating,special_features,category_id
0,663,MARY,SMITH,2.99,2005-05-25 11:30:37,"LUCILLE TRACY, VAL BOLGER, MILLA KEITEL, SEAN ...",663.0,PATIENT SISTER,A Emotional Epistle of a Squirrel And a Robot ...,2006.0,7.0,0.99,99.0,NC-17,"Trailers,Commentaries",4
1,875,MARY,SMITH,0.99,2005-05-28 10:35:23,"BURT DUKAKIS, NICK STALLONE, MINNIE ZELLWEGER,...",875.0,TALENTED HOMICIDE,A Lacklusture Panorama of a Dentist And a Fore...,2006.0,6.0,0.99,173.0,PG,"Commentaries,Deleted Scenes,Behind the Scenes",15


In [36]:
# Replace category_id with the category's name, exposed as the "category" column.
sakila_fact_table = (
    pd.merge(sakila_fact_table, df_category, on='category_id', how='left')
    .rename(columns={"name": "category"})
    .drop(columns=['category_key', 'category_id'])
)
sakila_fact_table.head(2)

Unnamed: 0,film_id,first_name,last_name,amount,payment_date,actor_name,film_key,title,description,release_year,rental_duration,rental_rate,length,rating,special_features,category
0,663,MARY,SMITH,2.99,2005-05-25 11:30:37,"LUCILLE TRACY, VAL BOLGER, MILLA KEITEL, SEAN ...",663.0,PATIENT SISTER,A Emotional Epistle of a Squirrel And a Robot ...,2006.0,7.0,0.99,99.0,NC-17,"Trailers,Commentaries",Classics
1,875,MARY,SMITH,0.99,2005-05-28 10:35:23,"BURT DUKAKIS, NICK STALLONE, MINNIE ZELLWEGER,...",875.0,TALENTED HOMICIDE,A Lacklusture Panorama of a Dentist And a Fore...,2006.0,6.0,0.99,173.0,PG,"Commentaries,Deleted Scenes,Behind the Scenes",Sports


##### Lookup the DateKeys from the Date Dimension Table.


In [37]:
# Read the date dimension from the data warehouse schema. The schema name comes from
# dst_dbname, and the connection targets that same schema instead of the source one.
sql_dim_date = f"SELECT date_key, full_date FROM {dst_dbname}.dim_date;"
df_dim_date = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_date)

# Reduce full_date to plain date objects so it can be matched against date-only columns.
df_dim_date.full_date = df_dim_date.full_date.astype('datetime64[ns]').dt.date
df_dim_date.head(2)

Unnamed: 0,date_key,full_date
0,20050524,2005-05-24
1,20050525,2005-05-25


Next, for each **date** typed column in the Fact table, lookup the corresponding Primary Key column. Be certain to cast each **date** column to the **datetime64[ns]** data type using the **.astype()** function that's native to Pandas DataFrame columns. Also, extract the **date** portion using the **.dt.date** attribute.

In [38]:
# Look up the surrogate primary key (date_key) that corresponds to the "payment_date" column.

# Lookup frame whose columns are named after the fact-table column being resolved.
df_dim_payment_date = df_dim_date.rename(
    columns={"date_key": "payment_date_key", "full_date": "payment_date"}
)

# Reduce payment timestamps to plain dates, matching the date-only lookup column.
sakila_fact_table['payment_date'] = sakila_fact_table['payment_date'].astype('datetime64[ns]').dt.date

# Swap the date itself for its key.
sakila_fact_table = (
    pd.merge(sakila_fact_table, df_dim_payment_date, on='payment_date', how='left')
    .drop(columns=['payment_date'])
)
sakila_fact_table.head(2)

Unnamed: 0,film_id,first_name,last_name,amount,actor_name,film_key,title,description,release_year,rental_duration,rental_rate,length,rating,special_features,category,payment_date_key
0,663,MARY,SMITH,2.99,"LUCILLE TRACY, VAL BOLGER, MILLA KEITEL, SEAN ...",663.0,PATIENT SISTER,A Emotional Epistle of a Squirrel And a Robot ...,2006.0,7.0,0.99,99.0,NC-17,"Trailers,Commentaries",Classics,20050525
1,875,MARY,SMITH,0.99,"BURT DUKAKIS, NICK STALLONE, MINNIE ZELLWEGER,...",875.0,TALENTED HOMICIDE,A Lacklusture Panorama of a Dentist And a Fore...,2006.0,6.0,0.99,173.0,PG,"Commentaries,Deleted Scenes,Behind the Scenes",Sports,20050528


#### 2.1. Perform any Additional Transformations
In this step we can prepare the DataFrame so that it defines exactly what we want to see created in the database.  Issues may include dropping unwanted columns, reordering the columns, and in our case, creating a new column to serve as the primary key.

In [40]:
# Combine the customer's first and last name into a single column.
sakila_fact_table['customer_name'] = sakila_fact_table['first_name'] + ' ' + sakila_fact_table['last_name']

# Drop the separate name columns and the film identifiers.
sakila_fact_table = sakila_fact_table.drop(columns=['first_name', 'last_name', 'film_id', 'film_key'])

# Put the remaining columns in their final order.
ordered_columns = [
    'payment_date_key', 'rental_duration', 'rental_rate', 'amount',
    'customer_name', 'title', 'description', 'category', 'actor_name',
    'release_year', 'length', 'rating', 'special_features',
]
sakila_fact_table = sakila_fact_table[ordered_columns]

# Add a 1-based, ever-incrementing surrogate key as the first column.
sakila_fact_table.insert(loc=0, column="fact_order_key", value=range(1, len(sakila_fact_table) + 1))

sakila_fact_table.head(2)

Unnamed: 0,fact_order_key,payment_date_key,rental_duration,rental_rate,amount,customer_name,title,description,category,actor_name,release_year,length,rating,special_features
0,1,20050525,7.0,0.99,2.99,MARY SMITH,PATIENT SISTER,A Emotional Epistle of a Squirrel And a Robot ...,Classics,"LUCILLE TRACY, VAL BOLGER, MILLA KEITEL, SEAN ...",2006.0,99.0,NC-17,"Trailers,Commentaries"
1,2,20050528,6.0,0.99,0.99,MARY SMITH,TALENTED HOMICIDE,A Lacklusture Panorama of a Dentist And a Fore...,Sports,"BURT DUKAKIS, NICK STALLONE, MINNIE ZELLWEGER,...",2006.0,173.0,PG,"Commentaries,Deleted Scenes,Behind the Scenes"


In [None]:
# Preview the first rows only, instead of rendering the whole fact table into the notebook.
sakila_fact_table.head()

#### 2.2. Write the DataFrame Back to the Database

In [42]:
# Replace the fact table in the data warehouse with the finished DataFrame and
# declare fact_order_key as its primary key.
table_name = "sakila_fact_table"
primary_key = "fact_order_key"
db_operation = "insert"

set_dataframe(
    user_id=user_id,
    pwd=pwd,
    host_name=host_name,
    db_name=dst_dbname,
    df=sakila_fact_table,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)