## Notebook component to populate a Redshift cluster with our data

In [1]:
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
import os

### Access credentials securely

In [2]:
# Read KEY=VALUE pairs from the local .env file into the process environment,
# so credentials are never written into the notebook itself.
load_dotenv(".env")

# Every setting the rest of the notebook relies on. Checking them up front turns
# a missing entry into one clear error here, instead of a confusing connection
# or COPY failure several cells later (os.environ.get would silently give None).
REQUIRED_ENV_VARS = ("REDSHIFT_USERNAME", "REDSHIFT_PASSWORD", "REDSHIFT_HOST", "IAM_role")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.environ.get(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in .env or export them before running this notebook."
    )

user = os.environ["REDSHIFT_USERNAME"]
password = os.environ["REDSHIFT_PASSWORD"]
host = os.environ["REDSHIFT_HOST"]
# IAM role handed to the COPY statements through the iamrole placeholder.
iamrole = os.environ["IAM_role"]

### Form connection string

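Note: this assumes the Redshift dialect for SQLAlchemy has already been added to the project (the `redshift+redshift_connector` driver used below also needs the `redshift_connector` package installed):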

```bash
poetry add sqlalchemy-redshift

```

In [3]:
# Connection settings for the Redshift cluster. The user, password and host
# values are the ones read from the environment earlier in the notebook.
redshift_settings = dict(
    drivername="redshift+redshift_connector",
    username=user,
    password=password,
    host=host,
    port=5439,
    database="dev",
)

# Assemble the SQLAlchemy URL from its parts, then build the engine on top of it.
url = URL.create(**redshift_settings)
engine = create_engine(url)

### Load JupySQL extension

In [4]:
# Load the `sql` extension (reloading it if it is already loaded) so that the
# %sql, %%sql and %sqlcmd magics used in the following cells are available.
%reload_ext sql

Config,value
displaycon,False
feedback,True
autopandas,True
named_parameters,True


### Use the engine to initialize access to our Redshift cluster under the alias `redshift`

In [5]:
# Hand the SQLAlchemy engine to the %sql magic and register the connection
# under the alias `redshift`.
%sql engine --alias redshift

### Add data to S3 bucket

In [6]:
# Upload both local parquet files to the S3 locations that the COPY statements
# later in the notebook read from.
# NOTE(review): the `nyc-taxi/` key prefix looks unrelated to these
# account/client tables - confirm it is intentional.
! aws s3 cp expanded_data/client_account_district.parquet s3://ploomber-redshift-data/nyc-taxi/client_account_district.parquet
! aws s3 cp expanded_data/account_trans_order.parquet s3://ploomber-redshift-data/nyc-taxi/account_trans_order.parquet

upload: expanded_data/client_account_district.parquet to s3://ploomber-redshift-data/nyc-taxi/client_account_district.parquet
upload: expanded_data/account_trans_order.parquet to s3://ploomber-redshift-data/nyc-taxi/account_trans_order.parquet


### Create table `client_account_district` and table `account_trans_order` from data in S3 bucket

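Ensure your AWS access key ID and secret access key are set up! The `COPY` commands below are authorized through the IAM role read from the `IAM_role` environment variable.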

In [7]:
%%sql
-- Start from a clean slate so the CREATE TABLE and COPY cells can be re-run.
-- CASCADE also removes objects that depend on these tables.
DROP TABLE IF EXISTS account_trans_order CASCADE;
DROP TABLE IF EXISTS client_account_district CASCADE;

In [8]:
%%sql
-- Target table for the account / transaction / order parquet file.
CREATE TABLE IF NOT EXISTS account_trans_order (
    account_id INT,
    frequency VARCHAR(50),
    account_creation_date INT,
    trans_id INT,
    transaction_date INT,
    transaction_type VARCHAR(50),
    operation VARCHAR(50),
    transaction_amount INT,
    balance INT,
    order_id INT,
    bank_to VARCHAR(50),
    account_to INT,
    order_amount INT
);

-- Bulk-load the parquet file from S3 using the IAM role held in the iamrole
-- notebook variable. COPY appends rows, so re-running this cell without
-- dropping the table first loads the data a second time.
COPY account_trans_order
    FROM 's3://ploomber-redshift-data/nyc-taxi/account_trans_order.parquet'
    IAM_ROLE '{{iamrole}}'
    FORMAT AS PARQUET;

In [9]:
%%sql
-- Target table for the client / account / district parquet file.
CREATE TABLE IF NOT EXISTS client_account_district (
    client_id INT,
    birth_number INT,
    account_id INT,
    frequency VARCHAR(50),
    account_creation_date INT,
    district_name VARCHAR(50),
    region VARCHAR(50),
    no_of_inhabitants BIGINT,
    average_salary BIGINT,
    unemployment_rate_95 DOUBLE PRECISION,
    unemployment_rate_96 DOUBLE PRECISION,
    no_of_entrepreneurs_per_1000_inhabitants INT
);

-- Bulk-load the parquet file from S3 using the IAM role held in the iamrole
-- notebook variable. COPY appends rows, so re-running this cell without
-- dropping the table first loads the data a second time.
COPY client_account_district
    FROM 's3://ploomber-redshift-data/nyc-taxi/client_account_district.parquet'
    IAM_ROLE '{{iamrole}}'
    FORMAT AS PARQUET;

### Check for error messages

In [None]:
result = %sql  SELECT * FROM sys_load_error_detail ORDER BY start_time DESC LIMIT 10;

result['error_message'].to_list()[0:10]

## Profile the data

In [10]:
%%sql
-- Preview five rows of the loaded table as a quick sanity check.
SELECT *
FROM client_account_district
LIMIT 5

Unnamed: 0,client_id,birth_number,account_id,frequency,account_creation_date,district_name,region,no_of_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,no_of_entrepreneurs_per_1000_inhabitants
0,660,230322,3889,POPLATEK MESICNE,970703,Jindrichuv Hradec,south Bohemia,93931,8427,1.12,1.54,107
1,661,195115,3889,POPLATEK MESICNE,970703,Jindrichuv Hradec,south Bohemia,93931,8427,1.12,1.54,107
2,662,450807,2939,POPLATEK MESICNE,970628,Kolin,central Bohemia,95616,9307,3.85,4.43,118
3,663,230120,2539,POPLATEK MESICNE,970424,Karlovy Vary,west Bohemia,122603,8991,1.39,2.01,128
4,664,805319,4631,POPLATEK PO OBRATU,961212,Vsetin,north Moravia,148545,8909,4.01,5.56,113


In [None]:
%%sql
-- Preview five rows of the loaded table as a quick sanity check.
SELECT *
FROM account_trans_order
LIMIT 5

In [11]:
# Per-column summary statistics (count, unique, mean, min, max) for the table.
%sqlcmd profile -t client_account_district

Unnamed: 0,client_id,birth_number,account_id,frequency,account_creation_date,district_name,region,no_of_inhabitants,average_salary,unemployment_rate_95,unemployment_rate_96,no_of_entrepreneurs_per_1000_inhabitants
count,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0,645633.0
unique,5308.0,4968.0,4452.0,3.0,1532.0,76.0,8.0,76.0,75.0,70.0,72.0,44.0
mean,3385.0,534775.0,2894.0,,951531.0,,,760299.0,11146.0,1.6496,2.0206,143.0
min,1.0,110820.0,1.0,,930101.0,,,45714.0,8110.0,0.29,0.43,81.0
max,13998.0,875927.0,11382.0,,971229.0,,,1204953.0,12541.0,7.34,9.4,167.0


In [12]:
# Per-column summary statistics (count, unique, mean, min, max) for the table.
%sqlcmd profile -t account_trans_order

Unnamed: 0,account_id,frequency,account_creation_date,trans_id,transaction_date,transaction_type,operation,transaction_amount,balance,order_id,bank_to,account_to,order_amount
count,1715140.0,1715140.0,1715140.0,1715140.0,1715140.0,1715140.0,1448066.0,1715140.0,1715140.0,1715140.0,1715140.0,1715140.0,1715140.0
unique,3758.0,3.0,1468.0,913010.0,2191.0,3.0,5.0,31786.0,104530.0,6471.0,13.0,6446.0,4243.0
mean,3094.0,,944777.0,1331562.0,965826.0,,,5635.0,38774.0,33974.0,,49244253.0,3164.0
min,1.0,,930101.0,1.0,930101.0,,,0.0,-35456.0,29401.0,,399.0,1.0
max,11362.0,,971229.0,3682987.0,981231.0,,,87400.0,209637.0,46338.0,,99994199.0,14882.0


### Clean up data

In [13]:
# Delete the two parquet objects that were uploaded to the S3 bucket earlier in
# the notebook.
! aws s3 rm s3://ploomber-redshift-data/nyc-taxi/client_account_district.parquet
! aws s3 rm s3://ploomber-redshift-data/nyc-taxi/account_trans_order.parquet

### Delete tables

In [None]:
%%sql
-- Final teardown of both tables. IF EXISTS keeps this cell from failing when
-- it is re-run or when a table was never created.
DROP TABLE IF EXISTS client_account_district;
DROP TABLE IF EXISTS account_trans_order;