In [1]:
import configparser
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
config = configparser.ConfigParser()

In [3]:
config.read('clusterdvd.config')

[]

In [4]:
config['POSTGRES']['PG_HOST']

'172.17.0.2'

In [5]:
db = config['POSTGRES']['PG_DB']
user = config['POSTGRES']['PG_UNAME']
passwd = config['POSTGRES']['PG_PASS']
port = config['POSTGRES']['PG_PORT']
host = config['POSTGRES']['PG_HOST']

Using the pandas internal read_sql method to get the data from the DB

In [7]:
credentials = "postgresql://{}:{}@{}:{}/{}".format(user,passwd,host,port,db)

In [8]:
credentials

'postgresql://postgres:1234@172.17.0.2:5432/dvdrental'

In [9]:
filmTable = pd.read_sql("""SELECT * FROM film""",con=credentials)

In [10]:
filmTable.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,133,Chamber Italian,A Fateful Reflection of a Moose And a Husband ...,2006,1,7,4.99,117,14.99,NC-17,2013-05-26 14:50:58.951,[Trailers],'chamber':1 'fate':4 'husband':11 'italian':2 ...
1,384,Grosse Wonderful,A Epic Drama of a Cat And a Explorer who must ...,2006,1,5,4.99,49,19.99,R,2013-05-26 14:50:58.951,[Behind the Scenes],'australia':18 'cat':8 'drama':5 'epic':4 'exp...


Getting the schema from the existing tables

In [24]:
def getSchema(tableName, credentials):
    schema = pd.read_sql("""SELECT * FROM information_schema.columns where table_name='{}'""".format(tableName),con=credentials)
    return schema

In [26]:
getSchema('inventory',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,inventory,inventory_id,1,nextval('inventory_inventory_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,inventory,film_id,2,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,inventory,store_id,3,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,inventory,last_update,4,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES


In [38]:
def queryTable(query, credentials):
    try:
        pd.read_sql(query, credentials)
        return
    except Exception as e:
        print(e)

In [34]:
createDim1 = """CREATE TABLE dimdate(
                date_key integer NOT NULL PRIMARY KEY,
                date date NOT NULL,
                year smallint NOT NULL,
                quarter smallint NOT NULL,
                month smallint NOT NULL,
                day smallint NOT NULL,
                week smallint NOT NULL,
                is_weekend boolean
            )"""

In [87]:
createDim2 = """CREATE TABLE dimcustomer(
                customer_key SERIAL PRIMARY KEY,
                customer_id  SMALLINT NOT NULL,
                first_name VARCHAR(45) NOT NULL,
                last_name VARCHAR(45) NOT NULL,
                email VARCHAR(60),
                address VARCHAR(50) NOT NULL,
                address2 VARCHAR(50),
                district VARCHAR(50) NOT NULL,
                city VARCHAR(50) NOT NULL,
                country VARCHAR(50) NOT NULL,
                postal_code VARCHAR(10),
                phone VARCHAR(20) NOT NULL,
                active smallint NOT NULL,
                create_date timestamp NOT NULL,
                start_date date NOT NULL,
                end_date date NOT NULL
            )"""

In [68]:
createDim3 = """CREATE TABLE dimmovie(
                movie_key SERIAL PRIMARY KEY,
                film_id  SMALLINT NOT NULL,
                title VARCHAR(45) NOT NULL,
                description TEXT,
                release_year YEAR,
                language VARCHAR(20),
                original_language VARCHAR(20),
                rental_duration SMALLINT NOT NULL,
                length smallINT NOT NULL,
                rating VARCHAR(5) NOT NULL,
                special_features VARCHAR(60) NOT NULL
            )"""

In [70]:
createDim4 = """CREATE TABLE dimstore(
                store_key SERIAL PRIMARY KEY,
                store_id  SMALLINT NOT NULL,
                manager_first_name VARCHAR(45) NOT NULL,
                manager_last_name VARCHAR(45) NOT NULL,
                address VARCHAR(50) NOT NULL,
                address2 VARCHAR(50),
                district VARCHAR(20) NOT NULL,
                city VARCHAR(20) NOT NULL,
                country VARCHAR(50) NOT NULL,
                postal_code VARCHAR(10),
                start_date DATE NOT NULL,
                end_date DATE NOT NULL
                )"""

Making the the table 

In [35]:
queryTable(createDim1, credentials)

This result object does not return rows. It has been closed automatically.


In [88]:
queryTable(createDim2, credentials)

This result object does not return rows. It has been closed automatically.


In [69]:
queryTable(createDim3, credentials)

This result object does not return rows. It has been closed automatically.


In [71]:
queryTable(createDim4, credentials)

This result object does not return rows. It has been closed automatically.


In [55]:
def queryBase(query,credentials):
    requiredTable = pd.read_sql(query,con=credentials)
    return requiredTable

In [56]:
query = """SELECT * FROM dimdate"""

In [58]:
queryBase(query,credentials).head(2)

Unnamed: 0,date_key,date,year,quarter,month,day,week,is_weekend
0,70214,2007-02-14,2007,1,2,14,7,False
1,70317,2007-03-17,2007,1,3,17,11,True


Executing insertion into the single dimdate table from payment table

In [53]:
insertQuery = """
        INSERT INTO dimdate (date_key, date, year, quarter, month, day, week, is_weekend)
        SELECT 
            DISTINCT(TO_CHAR(payment_date::DATE, 'yyMMDD')::integer) as date_key,
            date(payment_date) as date,
            EXTRACT(year FROM payment_date) as year,
            EXTRACT(quarter FROM payment_date) as quarter,
            EXTRACT(month FROM payment_date) as month,
            EXTRACT(day FROM payment_date) as day,
            EXTRACT(week FROM payment_date) as week,
            CASE WHEN EXTRACT(ISODOW FROM payment_date) IN (6,7) THEN true else false END;
        FROM payment;
"""

In [54]:
queryTable(insertQuery, credentials)

This result object does not return rows. It has been closed automatically.


Multiple tables are joined and the data is extracted from it
and dimcustomer is filled. All the below "getSchema" function calls were executed when the necessary columns where required to be checked. 

### Major Benefit 1
The benefit of writing functions that they can be called when you need to refer during the Query is written. There is no need to switch between multiple windows, or use mouse to get the data. 

In [72]:
getSchema('country',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,country,country_id,1,nextval('country_country_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,country,last_update,3,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,country,country,2,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


In [73]:
getSchema('address',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,address,last_update,8,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,address,city_id,5,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,address,address_id,1,nextval('address_address_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,address,district,4,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,address,phone,7,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,address,postal_code,6,,YES,character varying,10.0,40.0,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,address,address,2,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,address,address2,3,,YES,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


In [74]:
getSchema('customer',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,customer,active,10,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,customer,store_id,2,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,customer,create_date,8,('now'::text)::date,NO,date,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,customer,last_update,9,now(),YES,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,customer,customer_id,1,nextval('customer_customer_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,customer,address_id,6,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,customer,activebool,7,true,NO,boolean,,,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,customer,first_name,3,,NO,character varying,45.0,180.0,...,NO,,,,,,NO,NEVER,,YES
8,dvdrental,public,customer,last_name,4,,NO,character varying,45.0,180.0,...,NO,,,,,,NO,NEVER,,YES
9,dvdrental,public,customer,email,5,,YES,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


In [75]:
getSchema('address',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,address,last_update,8,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,address,city_id,5,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,address,address_id,1,nextval('address_address_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,address,district,4,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,address,phone,7,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,address,postal_code,6,,YES,character varying,10.0,40.0,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,address,address,2,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,address,address2,3,,YES,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


In [76]:
getSchema('city',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,city,city_id,1,nextval('city_city_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,city,country_id,3,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,city,last_update,4,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,city,city,2,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


In [77]:
getSchema('country',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,country,country_id,1,nextval('country_country_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,country,last_update,3,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,country,country,2,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


In [86]:
getSchema('dimcustomer',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,dimcustomer,customer_key,1,nextval('dimcustomer_customer_key_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,dimcustomer,customer_id,2,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,dimcustomer,active,13,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,dimcustomer,create_date,14,,NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,dimcustomer,start_date,15,,NO,date,,,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,dimcustomer,end_date,16,,NO,date,,,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,dimcustomer,address2,7,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,dimcustomer,district,8,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
8,dvdrental,public,dimcustomer,city,9,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
9,dvdrental,public,dimcustomer,country,10,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


### Major Benefit 2:
When doing the join and need to refer the table columns, the functions written inside the jupyter environment can be used for querying the tables

In [78]:
checkQuery_error = """
        SELECT c.customer_id as customer_key,
                c.customer_id, c.first_name, c.last_name, c.email,
                a.address, a.address2, a.district, ci.city, co.country, 
                postal_code, a.phone, c.active, c.create_date,now() as start_date,
                now() as end_date
        FROM customer AS c
        JOIN address ON c.address_id = a.address_id
        JOIN city ON a.city_id = ci.city_id
        JOIN country ON ci.country_id = co.country_id;
"""
#The tables are not having aliases... Observe

In [82]:
checkQuery = """
        SELECT c.customer_id as customer_key,
                c.customer_id, c.first_name, c.last_name, c.email,
                a.address, a.address2, a.district, ci.city, co.country, 
                postal_code, a.phone, c.active, c.create_date,now() as start_date,
                now() as end_date
        FROM customer AS c
        JOIN address AS a ON c.address_id = a.address_id
        JOIN city AS ci ON a.city_id = ci.city_id
        JOIN country AS co ON ci.country_id = co.country_id;
"""
#this executes

### Major Benefit 3:
After writing complicated select queries they can be wrong due to various reasons. If required the "Wrong" queries can be retained as a reference...

In [83]:
queryBase(checkQuery,credentials).head(2)

Unnamed: 0,customer_key,customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date,start_date,end_date
0,524,524,Jared,Ely,jared.ely@sakilacustomer.org,1003 Qinhuangdao Street,,West Java,Purwakarta,Indonesia,25972,35533115997,1,2006-02-14,2022-11-01 11:05:07.699618+00:00,2022-11-01 11:05:07.699618+00:00
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2006-02-14,2022-11-01 11:05:07.699618+00:00,2022-11-01 11:05:07.699618+00:00


In [84]:
insertCustomer = """
            INSERT INTO dimcustomer(customer_key,customer_id,first_name,
            last_name,email,address,address2,district,city,country,
            postal_code,phone,active,create_date,start_date,end_date)
            SELECT c.customer_id as customer_key,
                c.customer_id, c.first_name, c.last_name, c.email,
                a.address, a.address2, a.district, ci.city, co.country, 
                postal_code, a.phone, c.active, c.create_date,now() as start_date,
                now() as end_date
            FROM customer AS c
            JOIN address AS a ON c.address_id = a.address_id
            JOIN city AS ci ON a.city_id = ci.city_id
            JOIN country AS co ON ci.country_id = co.country_id;
"""

#### Inserting data into dimcustomer table and consecutively checking contents

In [89]:
queryTable(insertCustomer,credentials)

This result object does not return rows. It has been closed automatically.


In [91]:
queryCheckdimCustomer = """SELECT * FROM dimcustomer"""
queryBase(queryCheckdimCustomer,credentials).head(2)

Unnamed: 0,customer_key,customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date,start_date,end_date
0,524,524,Jared,Ely,jared.ely@sakilacustomer.org,1003 Qinhuangdao Street,,West Java,Purwakarta,Indonesia,25972,35533115997,1,2006-02-14,2022-11-01,2022-11-01
1,1,1,Mary,Smith,mary.smith@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2006-02-14,2022-11-01,2022-11-01


Referring the tables that we are planning to join, here referring inventory table

In [92]:
getSchema('dimstore',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,dimstore,store_key,1,nextval('dimstore_store_key_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,dimstore,store_id,2,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,dimstore,start_date,11,,NO,date,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,dimstore,end_date,12,,NO,date,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,dimstore,address,5,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,dimstore,address2,6,,YES,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,dimstore,district,7,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,dimstore,city,8,,NO,character varying,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES
8,dvdrental,public,dimstore,country,9,,NO,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES
9,dvdrental,public,dimstore,postal_code,10,,YES,character varying,10.0,40.0,...,NO,,,,,,NO,NEVER,,YES


In [94]:
getSchema('store',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,store,store_id,1,nextval('store_store_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,store,manager_staff_id,2,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,store,address_id,3,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,store,last_update,4,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES


In [96]:
getSchema('staff',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,staff,picture,11,,YES,bytea,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,staff,address_id,4,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,staff,store_id,6,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,staff,active,7,true,NO,boolean,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,staff,last_update,10,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,staff,staff_id,1,nextval('staff_staff_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,staff,first_name,2,,NO,character varying,45.0,180.0,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,staff,last_name,3,,NO,character varying,45.0,180.0,...,NO,,,,,,NO,NEVER,,YES
8,dvdrental,public,staff,password,9,,YES,character varying,40.0,160.0,...,NO,,,,,,NO,NEVER,,YES
9,dvdrental,public,staff,email,5,,YES,character varying,50.0,200.0,...,NO,,,,,,NO,NEVER,,YES


#### Inserting data into dimstore table and consecutively checking contents

In [110]:
insertStore = """
        INSERT INTO dimstore(store_key,store_id,start_date,end_date,
            address,address2,district,city,country,
            postal_code,manager_first_name,manager_last_name)
            SELECT s.store_id as store_key,
                s.store_id, 
                now() as start_date,
                now() as end_date,
                a.address, 
                a.address2, 
                a.district, 
                ci.city, 
                co.country, 
                postal_code, 
                st.first_name as manager_first_name, 
                st.last_name as manager_last_name 
            FROM store AS s
            JOIN staff AS st ON s.manager_staff_id = st.staff_id
            JOIN address AS a ON st.address_id = a.address_id
            JOIN city AS ci ON a.city_id = ci.city_id
            JOIN country AS co ON ci.country_id = co.country_id;
"""
#before FROM there comma should not be there

In [111]:
queryTable(insertStore,credentials)

This result object does not return rows. It has been closed automatically.


In [112]:
queryCheckdimStore = """SELECT * FROM dimstore"""
queryBase(queryCheckdimStore,credentials).head(2)

Unnamed: 0,store_key,store_id,manager_first_name,manager_last_name,address,address2,district,city,country,postal_code,start_date,end_date
0,1,1,Mike,Hillyer,23 Workhaven Lane,,Alberta,Lethbridge,Canada,,2022-11-01,2022-11-01
1,2,2,Jon,Stephens,1411 Lillydale Drive,,QLD,Woodridge,Australia,,2022-11-01,2022-11-01


Insert the factsales table, this contains the important step of references

In [113]:
getSchema('film',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,film,fulltext,13,,NO,tsvector,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,film,rating,10,'G'::mpaa_rating,YES,USER-DEFINED,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,film,last_update,11,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,film,film_id,1,nextval('film_film_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,film,release_year,4,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,film,language_id,5,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,film,rental_duration,6,3,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,film,rental_rate,7,4.99,NO,numeric,,,...,NO,,,,,,NO,NEVER,,YES
8,dvdrental,public,film,length,8,,YES,smallint,,,...,NO,,,,,,NO,NEVER,,YES
9,dvdrental,public,film,replacement_cost,9,19.99,NO,numeric,,,...,NO,,,,,,NO,NEVER,,YES


In [114]:
getSchema('language',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,language,language_id,1,nextval('language_language_id_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,language,last_update,3,now(),NO,timestamp without time zone,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,language,name,2,,NO,character,20.0,80.0,...,NO,,,,,,NO,NEVER,,YES


In [117]:
insertMovie = f"""
        INSERT INTO dimmovie(movie_key, film_id, title, description, 
        release_year , language, original_language ,rental_duration ,
        length, rating, special_features)
        SELECT 
            f.film_id as movie_key,
            f.film_id,
            f.title,
            f.description,
            f.release_year,
            l.last_update as language,
            l.name as original_language,
            f.rental_duration,
            f.length,
            f.rating,
            f.special_features
        FROM film as f
        JOIN language as l ON f.language_id = l.language_id
"""

In [118]:
queryTable(insertMovie,credentials)

This result object does not return rows. It has been closed automatically.


In [119]:
queryCheckMovie = """SELECT * FROM dimmovie"""
queryBase(queryCheckMovie,credentials).head(2)
#There is mistake in language column. The last_update date is populated instead of language name.

Unnamed: 0,movie_key,film_id,title,description,release_year,language,original_language,rental_duration,length,rating,special_features
0,133,133,Chamber Italian,A Fateful Reflection of a Moose And a Husband ...,2006,2006-02-15 10:02:19,English,7,117,NC-17,{Trailers}
1,384,384,Grosse Wonderful,A Epic Drama of a Cat And a Explorer who must ...,2006,2006-02-15 10:02:19,English,5,49,R,"{""Behind the Scenes""}"


### Preparing the final factsales

In [127]:
queryBase('SELECT * FROM dimdate', credentials).head(2)

Unnamed: 0,date_key,date,year,quarter,month,day,week,is_weekend
0,70214,2007-02-14,2007,1,2,14,7,False
1,70317,2007-03-17,2007,1,3,17,11,True


In [125]:
getSchema('dimdate',credentials)

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,dimdate,date_key,1,,NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,dimdate,date,2,,NO,date,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,dimdate,year,3,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,dimdate,quarter,4,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,dimdate,month,5,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,dimdate,day,6,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
6,dvdrental,public,dimdate,week,7,,NO,smallint,,,...,NO,,,,,,NO,NEVER,,YES
7,dvdrental,public,dimdate,is_weekend,8,,YES,boolean,,,...,NO,,,,,,NO,NEVER,,YES


In [158]:
createfactSales = """CREATE TABLE factsales(
                    sales_key SERIAL PRIMARY KEY,
                    date_key integer REFERENCES dimdate(date_key),
                    customer_key integer REFERENCES dimcustomer(customer_key),
                    movie_key integer REFERENCES dimmovie(movie_key),
                    store_key integer REFERENCES dimstore(store_key),
                    sales_amount numeric)"""

In [159]:
#Inserting the factsales table and checking it
queryTable(createfactSales, credentials)
getSchema('factsales',credentials)

This result object does not return rows. It has been closed automatically.


Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,character_maximum_length,character_octet_length,...,is_identity,identity_generation,identity_start,identity_increment,identity_maximum,identity_minimum,identity_cycle,is_generated,generation_expression,is_updatable
0,dvdrental,public,factsales,sales_key,1,nextval('factsales_sales_key_seq'::regclass),NO,integer,,,...,NO,,,,,,NO,NEVER,,YES
1,dvdrental,public,factsales,date_key,2,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
2,dvdrental,public,factsales,customer_key,3,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
3,dvdrental,public,factsales,movie_key,4,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
4,dvdrental,public,factsales,store_key,5,,YES,integer,,,...,NO,,,,,,NO,NEVER,,YES
5,dvdrental,public,factsales,sales_amount,6,,YES,numeric,,,...,NO,,,,,,NO,NEVER,,YES


In [163]:
insertSale = """INSERT INTO factsales(date_key, customer_key, movie_key,
                store_key, sales_amount)
                SELECT
                    DISTINCT(TO_CHAR(payment_date::DATE, 'yyMMDD')::integer) as date_key,
                    p.customer_id as customer_key,
                    i.film_id as movie_key,
                    i.store_id as store_key,
                    p.amount as sales_amount
                FROM payment as p
                JOIN rental r ON p.rental_id = r.rental_id
                JOIN inventory i ON r.inventory_id = i.inventory_id;"""

#The distinct keyword is required for the datekey. We are creating it

In [164]:
queryTable(insertSale,credentials)

This result object does not return rows. It has been closed automatically.


In [165]:
queryBase("""SELECT * FROM factsales""", credentials)

Unnamed: 0,sales_key,date_key,customer_key,movie_key,store_key,sales_amount
0,1,70428,177,149,2,1.99
1,2,70412,246,518,2,4.99
2,3,70409,43,608,1,4.99
3,4,70409,174,267,1,4.99
4,5,70218,86,767,2,4.99
...,...,...,...,...,...,...
14591,14592,70302,375,621,1,5.99
14592,14593,70301,192,122,1,0.99
14593,14594,70317,303,899,1,0.99
14594,14595,70318,247,827,2,7.99


### Running the final query

In [168]:
finalQuery = """SELECT dm.title, dd.month, dc.city, SUM(sales_amount) as revenue
                FROM factsales as fs
                JOIN dimmovie as dm ON fs.movie_key = dm.movie_key
                JOIN dimdate as dd ON fs.date_key = dd.date_key
                JOIN dimcustomer as dc ON fs.customer_key = dc.customer_key
                GROUP BY (dm.title, dd.month, dc.city)"""

In [169]:
queryBase(finalQuery,credentials)

Unnamed: 0,title,month,city,revenue
0,French Holiday,2,Bayugan,4.99
1,Jekyll Frogmen,4,Varanasi (Benares),5.99
2,Intentions Empire,4,Pjatigorsk,4.99
3,League Hellfighters,3,Coacalco de Berriozbal,4.99
4,Grit Clockwork,3,Bchar,3.99
...,...,...,...,...
14535,Beneath Rush,3,Namibe,0.99
14536,Liaisons Sweet,3,Tonghae,7.99
14537,Sun Confessions,4,Ashdod,3.99
14538,Name Detective,3,Adoni,4.99


In [170]:
orderQuery = """SELECT dm.title, dd.month, dc.city, SUM(sales_amount) as revenue
                FROM factsales as fs
                JOIN dimmovie as dm ON fs.movie_key = dm.movie_key
                JOIN dimdate as dd ON fs.date_key = dd.date_key
                JOIN dimcustomer as dc ON fs.customer_key = dc.customer_key
                GROUP BY (dm.title, dd.month, dc.city)
                ORDER BY revenue DESC"""

In [171]:
queryBase(orderQuery,credentials)

Unnamed: 0,title,month,city,revenue
0,Caribbean Liberty,4,Ibirit,16.98
1,Eagles Panky,3,Datong,16.98
2,Lust Lock,4,Coatzacoalcos,15.98
3,Innocent Usual,2,Valparai,13.98
4,Breakfast Goldfinger,4,Ife,13.98
...,...,...,...,...
14535,Minority Kiss,5,Battambang,0.00
14536,State Wasteland,5,Erlangen,0.00
14537,Trouble Date,5,Changhwa,0.00
14538,Lawless Vision,5,Nagareyama,0.00
