# ETL Draft
This notebook is to scrap / test code towards the actual ETL

In [1]:
import boto3
import configparser
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import json
%load_ext sql

## Read Configs

In [2]:
config = configparser.ConfigParser()
config.read_file(open('aws.cfg'))
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
PREFERRED_REGION       = config.get('AWS','PREFERRED_REGION')

config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
HOST              = config.get('CLUSTER','HOST')
DB_NAME           = config.get('CLUSTER','DB_NAME')
DB_USER           = config.get('CLUSTER','DB_USER')
DB_PASSWORD       = config.get('CLUSTER','DB_PASSWORD')
DB_PORT           = config.get('CLUSTER','DB_PORT')

DB_NAME           = config.get('DATABASE','DB_NAME')

IAM_ROLE        = config.get('IAM_ROLE','ARN')

LOG_DATA        = config.get('S3','LOG_DATA')
LOG_JSONPATH        = config.get('S3','LOG_JSONPATH')
SONG_DATA        = config.get('S3','SONG_DATA')


## CHeck S3 contents

In [4]:
s3 = boto3.resource('s3',
                       region_name= PREFERRED_REGION,
                       aws_access_key_id= KEY,
                       aws_secret_access_key= SECRET
                   )

bucket = s3.Bucket('udacity-dend')

In [None]:

#for obj in bucket.objects.all():
#for obj in bucket.objects.filter(Prefix="log_json_path.json"):
for obj in bucket.objects.filter(Prefix="song-data/A/A/A/TRAAAAK128F9318786.json"):
#for obj in bucket.objects.filter(Prefix="log-data/2018/11/2018-11-01-events.json"):
    key = obj.key    
    body = obj.get()['Body'].read() 
    print(body)

## Check number of files / entities
- Number of song files should match staging_songs.count
- NUmber of log entities?? should match staging_events.count

### Count Song Data Files

In [12]:
count = 0 
for i in bucket.objects.filter(Prefix="song-data/"):
  count += 1

#list(map(lambda i: count = count + 1, bucket.objects.filter(Prefix="song-data/")))

#c = collections Counter()

print(count)

385253


In [37]:
count = 0
#for obj in bucket.objects.filter(Prefix="log-data/2018/11/2018-11-01-events.json"):
for obj in bucket.objects.filter(Prefix="log-data/"):
  text = obj.get()['Body'].read().decode('utf-8')  
  df = pd.read_json(text, lines=True)
  count += len(df.index)  
  
print(count)

8056


## Connect to Redshift cluster

In [3]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

## Create Staging Tables

In [49]:
%%sql 
CREATE SCHEMA IF NOT EXISTS staging_sparklify;
SET search_path TO staging_sparklify;

DROP TABLE IF EXISTS staging_events;
CREATE TABLE staging_events 
(
  artist varchar(1000),
  auth varchar,
  firstName varchar,
  gender varchar,
  itemInSession varchar,
  lastName varchar,
  length varchar,
  level varchar,
  location varchar,
  method varchar,
  page varchar,
  registration varchar,
  sessionId varchar,
  song varchar(1000),
  status varchar,
  ts varchar,
  userAgent varchar,
  userId varchar
);

DROP TABLE IF EXISTS staging_songs;
CREATE TABLE staging_songs 
(
  song_id varchar,
  num_songs varchar,
  title varchar(1000), 
  artist_name varchar(1000),
  artist_latitude varchar,
  year varchar,
  duration varchar,
  artist_id varchar,
  artist_longitude varchar,
  artist_location  varchar(1000)
);


 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.
Done.
Done.


[]

## Loading Events / Log

In [51]:
%sql select count(1) from staging_sparklify.staging_events

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
8056


In [62]:
%%sql

copy staging_sparklify.staging_events 
from 's3://udacity-dend/log-data' 
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
json 's3://udacity-dend/log_json_path.json';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [53]:
%%sql 
select *
from staging_sparklify.staging_events 
where page = 'NextSong'
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
Lifehouse,Logged In,Jahiem,M,2,Miles,203.59791,free,"San Antonio-New Braunfels, TX",PUT,NextSong,1540817347796,42,We'll Never Know,200,1541300337796,"""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",43
Tweet,Logged In,Jayden,M,12,Graves,281.80853,paid,"Marinette, WI-MI",PUT,NextSong,1540664184796,128,Always Will (LP Version),200,1541312311796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",25
Silverchair,Logged In,Jayden,M,26,Graves,213.13261,paid,"Marinette, WI-MI",PUT,NextSong,1540664184796,128,The Door,200,1541314272796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",25
OutKast,Logged In,Jayden,M,34,Graves,239.35955,paid,"Marinette, WI-MI",PUT,NextSong,1540664184796,128,Ms. Jackson,200,1541315904796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",25
Anberlin,Logged In,Aleena,F,0,Kirby,258.42893,paid,"Waterloo-Cedar Falls, IA",PUT,NextSong,1541022995796,196,Dismantle. Repair.,200,1541323143796,Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:31.0) Gecko/20100101 Firefox/31.0,44


## Loading Songs

In [41]:
%sql select count(1) from staging_sparklify.staging_songs

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
385252


In [None]:
%sql select * from stl_load_errors

In [50]:
%%sql

copy staging_songs 
from 's3://udacity-dend/song-data' 
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
json 'auto ignorecase';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [46]:
%%sql 
select *
from staging_sparklify.staging_songs
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
SOUQQEA12A8C134B1B,1,High Tide,Richard Souther,37.16793,0,228.5971,ARIG6O41187B988BDD,-95.84502,United States
SOFXNXU12AB018A46E,1,Always On My Mind,Micky Modelle,,0,280.94649,AR6468X1187FB5AA0C,,
SOGYILS12AF72A82AD,1,Coffee Homeground,Kate Bush,,1978,219.0624,AR3DXTG1187FB38776,,"Bexleyheath, Kent, England"
SOJUCAI12A8C135D62,1,New Life,Chuck Loeb,,0,325.25016,ARQY5EG1187FB57063,,
SOUUERM12AB01850E4,1,Frankie & Johnny,Charlie Feathers,34.94652,0,167.96689,ARJIUJH1187B9B84FD,-89.43729999999998,"Slayden, MS"
SOKIODI12AB01839D3,1,The Nth Degree,Stereolab,51.50632,2008,253.51791,AR0TKGM1187B98B40E,-0.12714,London
SOVDHCR12A6701F1B3,1,Millions,Malevolent Creation,,1997,146.28526,AR7UGOA1187B9B2AD0,,
SOUMUUK12AB018AEF6,1,Intermission,Hexstatic,,0,99.36934,ARW3OU61187B98A81E,,"London, England"
SOKEZWF12AB0185C39,1,Heroina Madness,R De Rumba,,2004,315.34975,ARREGHI1187FB47C48,,
SORMAUH12AB0189912,1,Jazz is the move,De Phazz,,2010,208.3522,AR3ZL6A1187B995B37,,


## Understanding how to convert ts into postgresql timestamp

In [108]:
%%sql

select
a.ts,
(a.ts/1000) as seconds_as_float,
TIMESTAMP 'epoch' as epoch_time_0,
(a.ts/1000) * interval '1 second' as time_interval_from_epoch,
TIMESTAMP 'epoch' + (a.ts/1000) * INTERVAL '1 Second ' AS start_time,
a.*
from staging_sparklify.staging_events a
limit 2

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
2 rows affected.


ts,seconds_as_float,epoch_time_0,time_interval_from_epoch,start_time,artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts_1,useragent,userid
1541300540796,1541300540.796,1970-01-01 00:00:00,"17839 days, 3:02:20.796000",2018-11-04 03:02:20.796000,Olivia Ruiz,Logged In,Jahiem,M,3,Miles,254.74567,free,"San Antonio-New Braunfels, TX",PUT,NextSong,1540817347796,42,Cabaret Blanco,200,1541300540796,"""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",43
1541310741796,1541310741.796,1970-01-01 00:00:00,"17839 days, 5:52:21.796000",2018-11-04 05:52:21.796000,,Logged In,Jayden,M,5,Graves,,paid,"Marinette, WI-MI",GET,Home,1540664184796,128,,200,1541310741796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",25


## Sql queries to transform the events and songs into dimension tables

## Load Songs

In [47]:
%%sql

select 
song_id,
title,
artist_id,
year,
duration
from staging_sparklify.staging_songs
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,title,artist_id,year,duration
SOEKAZG12AB018837E,I'll Slap Your Face (Entertainment USA Theme),ARSVTNL1187B992A91,2001,129.85424
SOMZZON12A6701D3B9,My Lady (2003 Digital Remaster),ARKUI581187B9A6856,1997,162.40281
SOTEMHH12A8C1389A7,One Last Time,ARWDPT81187B99C656,2008,156.42077
SONKSNV12A58A7F654,Speed Of Sound,AR8368J1187FB4CFF3,1992,315.01016
SOXRPUH12AB017F769,Exodus: Part I: Moses and Pharaoh,ARXQC081187FB4AD42,0,1047.71873
SOLOOSA12AC4688A3C,Corazon Partio Club Mix Edit,ARQATCR1187FB4D3E6,0,270.0273
SOWUSBD12AB0180D3E,Speed bump,ARJAEUC11F50C4DDD9,0,243.46077
SOSLFMU12AB018FFEA,Murder Academy,ARDOOH01187B991055,1995,303.04608
SOGYIQL12A8C1329FC,No Aloha,AR62BB21187B9AC83D,1993,127.21587
SORJXPY12AB0182839,Bad Seed,AR9EZGO1187B9A401F,0,165.98159


## Load Artists

In [49]:
%%sql 

select 
artist_id,
artist_name as name,
artist_location as location,
artist_latitude as latitude,
artist_longitude as longitude
from staging_sparklify.staging_songs
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


artist_id,name,location,latitude,longitude
ARQVORN11F50C4EFEC,Bedlight For Blue Eyes,,,
AR8JO2B1187B98EBB6,Leftöver Crack,"New York City, NY, USA",,
AR1XD261187B9ACF9B,Nick Cave/Warren Ellis,,,
AR19SOA1187B98F6E6,Bob Neuwirth,New York,40.71455,-74.00712
ARZN98V1187B990D1D,THERION,"Stockholm, Sweden",59.33217,18.06243


## Load songplays

In [57]:
%%sql

select 
'songplay_id' as songplay_id,
TIMESTAMP 'epoch' + (e.ts/1000) * INTERVAL '1 Second ' as start_time,
e.userid as user_id,
e.level,
s.song_id,
e.song as song_title,
s.artist_id,
e.artist as artist_name,
e.sessionid as session_id,
e.location,
e.userAgent as user_agent,
e.length as stream_dureation
from staging_sparklify.staging_events e
left join staging_sparklify.staging_songs s on e.song = s.title and e.artist = s.artist_name
where page = 'NextSong'
and s.artist_id is null
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


songplay_id,start_time,user_id,level,song_id,song_title,artist_id,artist_name,session_id,location,user_agent,stream_dureation
songplay_id,2018-11-16 21:14:34.796000,49,paid,,EG GLEÃÂIST SO HVÃÂRT JÃÂLAKVÃÂLD,,Mpiri,648,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0,269.92281
songplay_id,2018-11-24 12:15:28.796000,80,paid,,Los Salieris De Charly,,LeÃÂ³n Gieco,903,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",361.27302
songplay_id,2018-11-24 07:00:22.796000,80,paid,,You'll Never Find Another Love Like Mine (Album Version),,Michael BublÃÂ©,893,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",244.21832
songplay_id,2018-11-08 09:00:55.796000,80,paid,,Get Me Bodied,,BeyoncÃÂ©,342,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",374.59546
songplay_id,2018-11-11 13:52:23.796000,88,free,,Get Me Bodied,,BeyoncÃÂ©,441,"Sacramento--Roseville--Arden-Arcade, CA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",359.54893


## Load Time Dimension


In [13]:
%%sql

select distinct
TIMESTAMP 'epoch' + (e.ts/1000) * INTERVAL '1 Second ' as start_time,
extract(hour from start_time) as hour,
extract(day from start_time) as day,
extract(week from start_time) as week,
extract(month from start_time) as month,
extract(year from start_time) as year,
extract(dayofweek from start_time) as day_of_week,
to_char(start_time, 'Day') as day_name,
day_of_week in (0,6) as weekday
from staging_sparklify.staging_events e
where page = 'NextSong'
order by song
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


start_time,hour,day,week,month,year,day_of_week,day_name,weekday
2018-11-24 14:23:35.796000,14,24,47,11,2018,6,Saturday,True
2018-11-23 21:02:33.796000,21,23,47,11,2018,5,Friday,False
2018-11-20 21:45:04.796000,21,20,47,11,2018,2,Tuesday,False
2018-11-16 20:48:11.796000,20,16,46,11,2018,5,Friday,False
2018-11-06 20:31:44.796000,20,6,45,11,2018,2,Tuesday,False
2018-11-15 22:45:20.796000,22,15,46,11,2018,4,Thursday,False
2018-11-20 19:19:30.796000,19,20,47,11,2018,2,Tuesday,False
2018-11-07 17:10:39.796000,17,7,45,11,2018,3,Wednesday,False
2018-11-30 05:15:12.796000,5,30,48,11,2018,5,Friday,False
2018-11-21 09:37:57.796000,9,21,47,11,2018,3,Wednesday,False


## Load User Dimension

In [16]:
%%sql 
select distinct
userid as user_id,
firstname as first_name,
lastname as last_name,
gender,
level
from staging_sparklify.staging_events e
where page = 'NextSong'
order by song
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


user_id,first_name,last_name,gender,level
83,Stefany,White,F,free
67,Colm,Santana,M,free
51,Maia,Burke,F,free
25,Jayden,Graves,M,paid
60,Devin,Larson,M,free
28,Brantley,West,M,free
10,Sylvie,Cruz,F,free
86,Aiden,Hess,M,free
39,Walter,Frye,M,free
62,Connar,Moreno,M,free


## Issues
It looks like the files being read from 's3://udacity-dend/log-data' have a different encoding and I have records that do not match artst / song

In [64]:
%%sql

select *
from staging_sparklify.staging_songs
where title in ('Get Me Bodied',  'Los Salieris De Charly')
or artist_name = 'Mpiri'
limit 20

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
4 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
SOFSGBJ12A8AE4645B,1,Get Me Bodied,Beyoncé,,2006,374.59546,AR65K7A1187FB4DAA4,,
SOPCLSY12A8C13E413,1,Get Me Bodied,Beyoncé,,2006,359.54893,AR65K7A1187FB4DAA4,,
SOJATVB12A3F1EA77A,1,EG GLEÐIST SO HVØRT JÓLAKVØLD,Mpiri,,0,269.92281,ARTYXEZ1187FB54560,,
SOHTEDD12A6D4F8215,1,Los Salieris De Charly,León Gieco,,1992,361.27302,AR2S6UD1187B9B944F,,
