# ETL SILVER -> GOLD

### O objetivo desse arquivo é realizar a normalização do banco para o Star Schema.

- As etapas serão:

    0. **Conexão com o Banco**
    1. **Criação do Novo Schema e Tabelas**
    2. **Popular as Dimensões**
    3. **Popular Novas Colunas**
    4. **Popular Fato**

### 0. Conexão com o Banco

In [1]:
import pandas as pd
import pyspark.sql as psql
from pyspark.sql import SparkSession

import psycopg2
import os

In [2]:
# Connection settings: host and port are fixed, while the database name,
# user and password are read from the POSTGRES_DB, POSTGRES_USER and
# POSTGRES_PASSWORD environment variables (None when a variable is unset).
DB_CONFIG = dict(
    host='localhost',
    port=5434,
    database=os.environ.get('POSTGRES_DB'),
    user=os.environ.get('POSTGRES_USER'),
    password=os.environ.get('POSTGRES_PASSWORD'),
)

# A single connection and a single cursor are created here and reused by
# the cells that follow.
db = psycopg2.connect(**DB_CONFIG)
cur = db.cursor()

### 1. Criação do Schema e Tabelas

In [3]:
# Drop whatever transaction is currently open on the connection, then make
# sure the dw schema exists and persist that change.
db.rollback()
create_schema_sql = 'CREATE SCHEMA IF NOT EXISTS dw;'
cur.execute(create_schema_sql)
db.commit()

In [4]:
# DDL for the four dimension tables of the dw schema. Each table has an
# identity-generated surrogate primary key (SRK_*), and every statement uses
# IF NOT EXISTS, so running this cell again leaves existing tables untouched.
#   dw.Dim_Rdc     - hse_sze, bat, bed, rms, prc_per_sqf
#   dw.Dim_Lnd_Lot - acr_lot, prc_per_acr, tpe
#   dw.Dim_Add     - ste, cty, srt, zip_cde
#   dw.Dim_Dte     - prv_sld_dte, mon, qtr, yer
# tpe, mon, qtr and yer are not loaded from the silver table; they are filled
# in by the UPDATE statements of step 3.
# NOTE(review): hse_sze, acr_lot and srt are REAL (single precision). If the
# matching silver columns use a wider type, values may be rounded on insert
# and later equality joins could miss - confirm against the silver schema.
create_dim = """
CREATE TABLE IF NOT EXISTS dw.Dim_Rdc (
    SRK_rdc INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    hse_sze REAL,
    bat INTEGER,
    bed INTEGER,
    rms INTEGER,
    prc_per_sqf DOUBLE PRECISION
);

CREATE TABLE IF NOT EXISTS dw.Dim_Lnd_Lot (
    SRK_lot INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    acr_lot REAL,
    prc_per_acr DOUBLE PRECISION,
    tpe TEXT
);

CREATE TABLE IF NOT EXISTS dw.Dim_Add (
    SRK_add INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    ste TEXT,
    cty TEXT,
    srt REAL,
    zip_cde TEXT
);

CREATE TABLE IF NOT EXISTS dw.Dim_Dte (
    SRK_dte INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    prv_sld_dte DATE,
    mon TEXT,
    qtr INTEGER,
    yer INTEGER
);
"""

# The four CREATE statements are sent in one execute call and then committed.
cur.execute(create_dim)
db.commit()

In [5]:
# DDL for the fact table dw.Fat_Rel_Est. It holds prc, sta and brk_by plus
# one foreign key per dimension (SRK_rdc, SRK_lot, SRK_add, SRK_dte). The
# foreign key columns are nullable: deleting a dimension row sets the
# referencing key to NULL, and a change to a dimension key cascades to the
# fact rows. IF NOT EXISTS makes the statement a no-op when the table is
# already present.
create_fato = """
CREATE TABLE IF NOT EXISTS dw.Fat_Rel_Est (
    SRK_rel_est INTEGER GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    prc REAL,
    sta TEXT,
    brk_by REAL,
    SRK_rdc INTEGER,
    SRK_lot INTEGER,
    SRK_add INTEGER,
    SRK_dte INTEGER,
    FOREIGN KEY (SRK_rdc) REFERENCES dw.Dim_Rdc(SRK_rdc)
        ON DELETE SET NULL
        ON UPDATE CASCADE,
    FOREIGN KEY (SRK_lot) REFERENCES dw.Dim_Lnd_Lot(SRK_lot)
        ON DELETE SET NULL
        ON UPDATE CASCADE,
    FOREIGN KEY (SRK_add) REFERENCES dw.Dim_Add(SRK_add)
        ON DELETE SET NULL
        ON UPDATE CASCADE,
    FOREIGN KEY (SRK_dte) REFERENCES dw.Dim_Dte(SRK_dte)
        ON DELETE SET NULL
        ON UPDATE CASCADE
)
"""

# Create the fact table and persist the change.
cur.execute(create_fato)
db.commit()

#### 2. Popular Dimensões

In [20]:
# Discard any transaction still open on the shared connection (for example
# one left behind by a failed statement) before loading the dimensions.
db.rollback()

In [7]:
# Load the address dimension with every distinct (state, city, street,
# zip_code) combination of silver.one_big_table whose state is not NULL.
#
# The NOT EXISTS guard makes the cell safe to re-run: the table has no unique
# constraint on these attributes, so an unconditional INSERT would store every
# address a second time and the fact load would then match each source row to
# several dimension rows. state is never NULL here, so it is compared with "=";
# the remaining attributes may be NULL and are compared with
# IS NOT DISTINCT FROM so that NULL matches NULL.
query = """
INSERT INTO dw.Dim_Add (ste, cty, srt, zip_cde)
    SELECT DISTINCT
        src.state, src.city, src.street, src.zip_code
    FROM silver.one_big_table src
    WHERE src.state IS NOT NULL
      AND NOT EXISTS (
          SELECT 1
          FROM dw.Dim_Add dim
          WHERE dim.ste = src.state
            AND dim.cty IS NOT DISTINCT FROM src.city
            AND dim.srt IS NOT DISTINCT FROM src.street
            AND dim.zip_cde IS NOT DISTINCT FROM src.zip_code
      );
"""

try:
    cur.execute(query)
    db.commit()
except psycopg2.Error:
    # A failed statement leaves the connection in an aborted transaction;
    # roll back so the following cells can still use it, then re-raise.
    db.rollback()
    raise

In [8]:
# Load the residence dimension with every distinct (house_size, bath, bed,
# rooms, price_per_sqft) combination of silver.one_big_table whose house_size
# is not NULL.
#
# The NOT EXISTS guard makes the cell safe to re-run: the table has no unique
# constraint on these attributes, so an unconditional INSERT would store every
# combination a second time and the fact load would then match each source row
# to several dimension rows. house_size is never NULL here, so it is compared
# with "="; the other attributes may be NULL and are compared with
# IS NOT DISTINCT FROM so that NULL matches NULL.
# NOTE(review): hse_sze is REAL and bat/bed/rms are INTEGER. If the silver
# columns are wider, values are rounded on insert and the guard cannot
# recognise those rows on a re-run - confirm against the silver schema.
query = """
INSERT INTO dw.Dim_Rdc (hse_sze, bat, bed, rms, prc_per_sqf)
    SELECT DISTINCT
        src.house_size, src.bath, src.bed, src.rooms, src.price_per_sqft
    FROM silver.one_big_table src
    WHERE src.house_size IS NOT NULL
      AND NOT EXISTS (
          SELECT 1
          FROM dw.Dim_Rdc dim
          WHERE dim.hse_sze = src.house_size
            AND dim.bat IS NOT DISTINCT FROM src.bath
            AND dim.bed IS NOT DISTINCT FROM src.bed
            AND dim.rms IS NOT DISTINCT FROM src.rooms
            AND dim.prc_per_sqf IS NOT DISTINCT FROM src.price_per_sqft
      );
"""

try:
    cur.execute(query)
    db.commit()
except psycopg2.Error:
    # A failed statement leaves the connection in an aborted transaction;
    # roll back so the following cells can still use it, then re-raise.
    db.rollback()
    raise

In [9]:
# Load the land-lot dimension with every distinct (acre_lot, price_per_acre)
# pair of silver.one_big_table whose acre_lot is not NULL. The tpe column is
# left NULL here and filled by a later UPDATE.
#
# The NOT EXISTS guard makes the cell safe to re-run: the table has no unique
# constraint on these attributes, so an unconditional INSERT would store every
# pair a second time and the fact load would then match each source row to
# several dimension rows. acre_lot is never NULL here, so it is compared with
# "="; price_per_acre may be NULL and is compared with IS NOT DISTINCT FROM.
# NOTE(review): acr_lot is REAL. If silver.acre_lot is a wider type, values
# are rounded on insert and the guard cannot recognise those rows on a
# re-run - confirm against the silver schema.
query = """
INSERT INTO dw.Dim_Lnd_Lot (acr_lot, prc_per_acr)
    SELECT DISTINCT
        src.acre_lot, src.price_per_acre
    FROM silver.one_big_table src
    WHERE src.acre_lot IS NOT NULL
      AND NOT EXISTS (
          SELECT 1
          FROM dw.Dim_Lnd_Lot dim
          WHERE dim.acr_lot = src.acre_lot
            AND dim.prc_per_acr IS NOT DISTINCT FROM src.price_per_acre
      );
"""

try:
    cur.execute(query)
    db.commit()
except psycopg2.Error:
    # A failed statement leaves the connection in an aborted transaction;
    # roll back so the following cells can still use it, then re-raise.
    db.rollback()
    raise

In [10]:
# Load the date dimension with every distinct non-NULL prev_sold_date of
# silver.one_big_table. mon, qtr and yer are left NULL here and derived by a
# later UPDATE.
#
# The NOT EXISTS guard makes the cell safe to re-run: the table has no unique
# constraint on prv_sld_dte, so an unconditional INSERT would store every date
# a second time and the fact load would then match each source row to several
# dimension rows.
query = """
INSERT INTO dw.Dim_Dte (prv_sld_dte)
    SELECT DISTINCT
        src.prev_sold_date
    FROM silver.one_big_table src
    WHERE src.prev_sold_date IS NOT NULL
      AND NOT EXISTS (
          SELECT 1
          FROM dw.Dim_Dte dim
          WHERE dim.prv_sld_dte = src.prev_sold_date
      );
"""

try:
    cur.execute(query)
    db.commit()
except psycopg2.Error:
    # A failed statement leaves the connection in an aborted transaction;
    # roll back so the following cells can still use it, then re-raise.
    db.rollback()
    raise

#### 3. Popular Novas Colunas

In [11]:
# Derive the calendar attributes of the date dimension from prv_sld_dte:
# the month name (mon), the quarter (qtr) and the year (yer). Rows without
# a date are skipped.
fill_date_parts_sql = """
UPDATE dw.Dim_Dte
SET
    mon = TO_CHAR(prv_sld_dte, 'TMMonth'),
    qtr = EXTRACT(QUARTER FROM prv_sld_dte)::INTEGER,
    yer = EXTRACT(YEAR FROM prv_sld_dte)::INTEGER
WHERE prv_sld_dte IS NOT NULL;
"""

# Run the update and persist it.
cur.execute(fill_date_parts_sql)
db.commit()

In [12]:
# Classify every land lot that has an acreage: lots of up to 5 acres are
# tagged 'urban', larger ones 'rural'. Rows without acr_lot are skipped.
classify_lot_sql = """
UPDATE dw.Dim_Lnd_Lot
SET tpe = CASE
    WHEN acr_lot <= 5 THEN 'urban'
    WHEN acr_lot > 5 THEN 'rural'
END
WHERE acr_lot IS NOT NULL;
"""

# Run the update and persist it.
cur.execute(classify_lot_sql)
db.commit()

#### 4. Popular Fato

In [21]:
# Load the fact table: one row per row of silver.one_big_table, carrying
# price, brokered_by and status plus the surrogate key of each dimension row
# that describes it.
#
# Join rules
#   * Each dimension was loaded only for rows whose leading attribute
#     (house_size, acre_lot, state, prev_sold_date) is not NULL, so that
#     attribute is compared with "=". A source row with a NULL leading
#     attribute finds no match and its key stays NULL, which is why no
#     explicit "IS NULL THEN NULL" CASE is needed for SRK_rdc / SRK_lot.
#   * The remaining attributes may be NULL in both the source and the
#     dimension. They are compared with IS NOT DISTINCT FROM, which treats
#     NULL as equal to NULL. With a plain "=" a dimension row stored with a
#     NULL attribute (e.g. a missing bath count or city) could never be
#     matched, and the fact row would silently lose its key.
#   * The date key is kept only for rows whose status is 'sold'.
#   * The address alias is "adr" rather than the SQL keyword "add".
# NOTE(review): hse_sze, acr_lot and srt are REAL in the dimensions. If the
# silver columns are wider, rounded values will not compare equal here -
# confirm against the silver schema.
# NOTE(review): this INSERT is not idempotent; re-running the cell appends
# the whole fact set again.
query = """
INSERT INTO dw.Fat_Rel_Est (prc, brk_by, sta, SRK_rdc, SRK_lot, SRK_dte, SRK_add)
    SELECT obt.price, obt.brokered_by, obt.status,
        rdc.SRK_rdc AS SRK_rdc,
        ll.SRK_lot AS SRK_lot,
        CASE
            WHEN obt.status = 'sold' THEN dte.SRK_dte
            ELSE NULL
        END AS SRK_dte,
        adr.SRK_add AS SRK_add
    FROM silver.one_big_table obt

    LEFT JOIN dw.Dim_Rdc rdc ON
        obt.house_size = rdc.hse_sze
        AND obt.bath IS NOT DISTINCT FROM rdc.bat
        AND obt.bed IS NOT DISTINCT FROM rdc.bed
        AND obt.rooms IS NOT DISTINCT FROM rdc.rms
        AND obt.price_per_sqft IS NOT DISTINCT FROM rdc.prc_per_sqf

    LEFT JOIN dw.Dim_Lnd_Lot ll ON
        obt.acre_lot = ll.acr_lot
        AND obt.price_per_acre IS NOT DISTINCT FROM ll.prc_per_acr

    LEFT JOIN dw.Dim_Add adr ON
        obt.state = adr.ste
        AND obt.city IS NOT DISTINCT FROM adr.cty
        AND obt.street IS NOT DISTINCT FROM adr.srt
        AND obt.zip_code IS NOT DISTINCT FROM adr.zip_cde

    LEFT JOIN dw.Dim_Dte dte ON
        obt.prev_sold_date = dte.prv_sld_dte;
"""

try:
    cur.execute(query)
    db.commit()
except psycopg2.Error:
    # A failed statement leaves the connection in an aborted transaction;
    # roll back so the connection stays usable, then re-raise.
    db.rollback()
    raise