# ETL

Steps to create data warehouse for analytic process of Sparkify song plays data.
- STEP 0: Activate S3 service and check data on buckets.
- STEP 1: Extract data from S3 to Redshift as Staging tables.
- STEP 2: Transform and Load data into Analytics tables.
- STEP 3: Explore analytic queries.

<img src="images/etl_step.png" width="90%"/>

In [19]:
import pandas as pd
import boto3
import json
import configparser
import pandas as pd
from humanize import naturalsize
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# STEP 0: Activate S3 service and check data on buckets

## 0.1 Activate S3

In [22]:
KEY                = config.get('AWS','KEY')
SECRET             = config.get('AWS','SECRET')
BUCKET_NAME        = config.get("S3", "BUCKET_NAME")

In [23]:
s3 = boto3.resource('s3',
                       region_name="us-west-2",
                       aws_access_key_id=KEY,
                       aws_secret_access_key=SECRET
                   )

## 0.2 Check data on Bucket 

In [24]:
def bucketSummary(bucket_name, prefix):
    #Put s3 obj into list then find length and sum of size in that list
    obj_list = list(s3.Bucket(bucket_name).objects.filter(Prefix=prefix))
    total_files = len(obj_list)
    total_size = naturalsize(sum(obj.size for obj in obj_list))
    print(f"Total_files: {total_files}\nTotal size: {total_size}")

    #select one file path to show in dataframe (Pip install s3fs)
    sample_file_path = f"s3://{bucket_name}/{obj_list[1].key}"
    df = pd.read_json(sample_file_path,  lines=True)
    print("Sample file path: ",sample_file_path )
    display(df.head(2))

In [25]:
bucketSummary(BUCKET_NAME,prefix='log_data')

Total_files: 31
Total size: 3.8 MB
Sample file path:  s3://udacity-dend/log_data/2018/11/2018-11-01-events.json


Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919166796,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",39
1,,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1540344794796,139,,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8


In [26]:
bucketSummary(BUCKET_NAME,prefix='song_data')

Exception during reset or similar
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 693, in _finalize_fairy
    fairy._reset(pool)
  File "/usr/local/lib/python3.7/site-packages/sqlalchemy/pool/base.py", line 880, in _reset
    pool._dialect.do_rollback(self)
  File "/usr/local/lib/python3.7/site-packages/sqlalchemy/engine/default.py", line 538, in do_rollback
    dbapi_connection.rollback()
psycopg2.DatabaseError: SSL SYSCALL error: Operation timed out



Total_files: 14897
Total size: 3.7 MB
Sample file path:  s3://udacity-dend/song_data/A/A/A/TRAAAAK128F9318786.json


Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARJNIUY12298900C91,,,,Adelitas Way,213.9424,1,SOBLFFE12AF72AA5BA,Scream,2009


#  
# STEP 1: Import data from S3 to Redshift as Staging tables

## 1.1 Connect to Database

In [27]:
DB_NAME            = config.get("CLUSTER","DB_NAME")
DB_USER            = config.get("CLUSTER","DB_USER")
DB_PASSWORD        = config.get("CLUSTER","DB_PASSWORD")
DB_PORT            = config.get("CLUSTER","DB_PORT")
HOST               = config.get("CLUSTER","HOST")

In [None]:
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT,DB_NAME)
print(conn_string)
%sql $conn_string

## 1.2 Create staging tables

In [30]:
%%sql 
DROP TABLE IF EXISTS "staging_events";
DROP TABLE IF EXISTS "staging_songs";

CREATE TABLE "staging_events" (
                event_id      INT IDENTITY(0,1)    NOT NULL,
                artist        VARCHAR              NULL,
                auth          VARCHAR              NULL,
                firstName     VARCHAR              NULL,
                gender        CHAR (1)             NULL,
                itemInSession VARCHAR              NULL,
                lastName      VARCHAR              NULL,
                length        NUMERIC              NULL,
                level         VARCHAR              NULL,
                location      VARCHAR              NULL,
                method        VARCHAR              NULL,
                page          VARCHAR              NULL,
                registration  NUMERIC              NULL,
                sessionId     INTEGER              NOT NULL SORTKEY DISTKEY,
                song          VARCHAR              NULL,
                status        INTEGER              NULL,
                ts            NUMERIC              NOT NULL,
                userAgent     VARCHAR              NULL,
                userId        INTEGER              NULL

);

CREATE TABLE IF NOT EXISTS "staging_songs" (
                num_songs           INTEGER         NULL,
                artist_id           VARCHAR         NOT NULL SORTKEY DISTKEY,
                artist_latitude     VARCHAR         NULL,
                artist_longitude    VARCHAR         NULL,
                artist_location     VARCHAR         NULL,
                artist_name         VARCHAR         NULL,
                song_id             VARCHAR         NOT NULL,
                title               VARCHAR         NULL,
                duration            DECIMAL         NULL,
                year                INTEGER         NULL
);

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.


KeyError: 'DEFAULT'

## 1.3 Copy data from S3 to staging table

In [31]:
IAM_ROLE_ARN=config.get("IAM_ROLE","IAM_ROLE_ARN")
LOG_DATA    =config.get("S3","LOG_DATA")
SONG_DATA   =config.get("S3","SONG_DATA")
LOG_JSONPATH=config.get("S3","LOG_JSONPATH")

#### Import log events data

In [32]:
%%time

qry = """
    COPY staging_events FROM {}
    CREDENTIALS 'aws_iam_role={}' 
    FORMAT as json {}
    region 'us-west-2';
""".format(LOG_DATA, IAM_ROLE_ARN, LOG_JSONPATH)

%sql $qry

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


KeyError: 'DEFAULT'

#### Import song data

In [33]:
%%time

qry = """
    COPY staging_songs FROM {}
    credentials 'aws_iam_role={}'
    format as json 'auto'
    region 'us-west-2';
""".format(SONG_DATA, IAM_ROLE_ARN)

%sql $qry

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


KeyError: 'DEFAULT'

## 1.4 Check imported data

#### Check data on log event staging table

In [34]:
%%sql 
SELECT COUNT(*)
FROM "staging_events";

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


KeyError: 'DEFAULT'

In [114]:
%%sql 
SELECT *
FROM "staging_events"
ORDER BY "event_id"
LIMIT 2

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
2 rows affected.


event_id,artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
0,,Logged In,Dominick,M,0,Norris,,free,"Los Angeles-Long Beach-Anaheim, CA",GET,Home,1540975502796,44,,200,1541635950796,"""Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53""",45
1,,Logged In,Theodore,M,0,Smith,,free,"Houston-The Woodlands-Sugar Land, TX",GET,Home,1540306145796,154,,200,1541290555796,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20100101 Firefox/31.0,52


#### Check data on songs staging table

In [115]:
%%sql 
SELECT COUNT(*)
FROM "staging_songs";

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
14896


In [116]:
%%sql 
SELECT *
FROM "staging_songs"
LIMIT 2

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
2 rows affected.


num_songs,artist_id,artist_latitude,artist_longitude,artist_location,artist_name,song_id,title,duration,year
1,AR00MQ31187B9ACD8F,,,,Chris Carrier,SOEWYVW12AB0188813,Calais Douvres,460,0
1,AR039B11187B9B30D0,,,"NEW YORK, New York",John Williams,SOCSLFL12A6701C69A,Escape From The City,229,0


#  

# STEP 2. Transform and Load data into Analytics tables.

## 2.1 Create Analytics tables

In [117]:
%%sql 

DROP TABLE IF EXISTS "songplays";
DROP TABLE IF EXISTS "users";
DROP TABLE IF EXISTS "songs";
DROP TABLE IF EXISTS "artists";
DROP TABLE IF EXISTS "time";

CREATE TABLE IF NOT EXISTS "songplays" (
                songplay_id INTEGER IDENTITY(0,1)   NOT NULL SORTKEY,
                start_time  TIMESTAMP               NOT NULL,
                user_id     VARCHAR(50)             NOT NULL DISTKEY,
                level       VARCHAR(10)             NOT NULL,
                song_id     VARCHAR(40)             NOT NULL,
                artist_id   VARCHAR(50)             NOT NULL,
                session_id  VARCHAR(50)             NOT NULL,
                location    VARCHAR(100)            NULL,
                user_agent  VARCHAR(255)            NULL
                );

CREATE TABLE IF NOT EXISTS "users" (
                user_id     INTEGER                 NOT NULL SORTKEY,
                first_name  VARCHAR(50)             NULL,
                last_name   VARCHAR(80)             NULL,
                gender      VARCHAR(10)             NULL,
                level       VARCHAR(10)             NULL
                ) diststyle all;

CREATE TABLE IF NOT EXISTS "songs" (
                song_id     VARCHAR(50)             NOT NULL SORTKEY,
                title       VARCHAR(500)            NOT NULL,
                artist_id   VARCHAR(50)             NOT NULL,
                year        INTEGER                 NOT NULL,
                duration    DECIMAL(9)              NOT NULL
                );

CREATE TABLE IF NOT EXISTS "artists" (
                artist_id   VARCHAR(50)             NOT NULL SORTKEY,
                name        VARCHAR(500)            NULL,
                location    VARCHAR(500)            NULL,
                latitude    DECIMAL(9)              NULL,
                longitude   DECIMAL(9)              NULL
                ) diststyle all;


CREATE TABLE IF NOT EXISTS "time" (
                start_time  TIMESTAMP               NOT NULL SORTKEY,
                hour        SMALLINT                NULL,
                day         SMALLINT                NULL,
                week        SMALLINT                NULL,
                month       SMALLINT                NULL,
                year        SMALLINT                NULL,
                weekday     SMALLINT                NULL
                ) diststyle all;

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

## 2.2 Insert data

In [118]:
%%sql 

INSERT INTO "songplays" (start_time,user_id,level,song_id,artist_id,session_id,location,user_agent)
SELECT  DISTINCT TIMESTAMP 'epoch' + se.ts/1000* INTERVAL '1 second'   AS start_time,
        se.userId                   AS user_id,
        se.level                    AS level,
        ss.song_id                  AS song_id,
        ss.artist_id                AS artist_id,
        se.sessionId                AS session_id,
        se.location                 AS location,
        se.userAgent                AS user_agent
FROM "staging_events" AS se
JOIN "staging_songs" AS ss ON (se.artist = ss.artist_name)
WHERE se.page = 'NextSong';

    
INSERT INTO "users" (user_id,first_name,last_name,gender,level)
SELECT  DISTINCT se.userId          AS user_id,
        se.firstName                AS first_name,
        se.lastName                 AS last_name,
        se.gender                   AS gender,
        se.level                    AS level
FROM "staging_events" AS se
WHERE se.page = 'NextSong';


INSERT INTO "songs" (song_id,title,artist_id,year,duration)
SELECT  DISTINCT ss.song_id         AS song_id,
        ss.title                    AS title,
        ss.artist_id                AS artist_id,
        ss.year                     AS year,
        ss.duration                 AS duration
FROM "staging_songs" AS ss;


INSERT INTO "artists" (artist_id,name,location,latitude,longitude)
SELECT  DISTINCT ss.artist_id       AS artist_id,
        ss.artist_name              AS name,
        ss.artist_location          AS location,
        ss.artist_latitude          AS latitude,
        ss.artist_longitude         AS longitude
FROM "staging_songs" AS ss;


INSERT INTO "time" (start_time,hour,day,week,month,year,weekday)
SELECT  DISTINCT TIMESTAMP 'epoch' + se.ts/1000* INTERVAL '1 second' AS start_time,
        EXTRACT(hour FROM start_time)    AS hour,
        EXTRACT(day FROM start_time)     AS day,
        EXTRACT(week FROM start_time)    AS week,
        EXTRACT(month FROM start_time)   AS month,
        EXTRACT(year FROM start_time)    AS year,
        EXTRACT(weekday FROM start_time) AS weekday
FROM  "staging_events" AS se
WHERE se.page = 'NextSong';

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
9957 rows affected.
104 rows affected.
14896 rows affected.
10025 rows affected.
6813 rows affected.


[]

## 2.3 Check inserted data

In [8]:
%%sql 
SELECT * FROM "songplays" LIMIT 1;

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


KeyError: 'DEFAULT'

In [120]:
%%sql 
SELECT * FROM "users" LIMIT 1;

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


user_id,first_name,last_name,gender,level
2,Jizelle,Benjamin,F,free


In [121]:
%%sql 
SELECT * FROM "songs" LIMIT 1;

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


song_id,title,artist_id,year,duration
SOAAFHQ12A6D4F836E,Ridin' Rims (Explicit Album Version),AR3CQ2D1187B9B1953,2006,322


In [122]:
%%sql 
SELECT * FROM "artists" LIMIT 1;

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


artist_id,name,location,latitude,longitude
AR00B1I1187FB433EB,Eagle-Eye Cherry,"Stockholm, Sweden",,


In [123]:
%%sql 
SELECT * FROM "time" LIMIT 1;

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


start_time,hour,day,week,month,year,weekday
2018-11-01 21:01:46.796000,21,1,44,11,2018,4


# STEP 3. Analytic queries.

#### Top 10 Songs by number of plays.

In [7]:
%%sql 

SELECT s.title as top_song, a.name as artist, COUNT(sp.songplay_id) AS play_times
FROM "songplays" AS sp
JOIN "songs" AS s on sp.song_id = s.song_id
JOIN "artists" As a on sp.artist_id = a.artist_id
GROUP BY s.title, a.name
ORDER BY play_times DESC
LIMIT 10

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


KeyError: 'DEFAULT'

#### Top 10 Artists by number of song plays.

In [6]:
%%sql 

SELECT a.name AS top_artist, COUNT(sp.songplay_id) AS song_plays
FROM "songplays" AS sp
JOIN "artists" As a on sp.artist_id = a.artist_id
GROUP BY a.name
ORDER BY song_plays DESC
LIMIT 10

 * postgresql://dwhuser:***@dwhcluster.cgnkpe4iotji.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


KeyError: 'DEFAULT'