# ETL Draft
This notebook is to scrap / test code towards the actual ETL

In [1]:
import boto3
import configparser
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import json
%load_ext sql

## Read Configs

In [45]:
config = configparser.ConfigParser()
config.read_file(open('aws.cfg'))
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
HOST              = config.get('CLUSTER','HOST')
DB_NAME           = config.get('CLUSTER','DB_NAME')
DB_USER           = config.get('CLUSTER','DB_USER')
DB_PASSWORD       = config.get('CLUSTER','DB_PASSWORD')
DB_PORT           = config.get('CLUSTER','DB_PORT')

IAM_ROLE        = config.get('IAM_ROLE','ARN')

LOG_DATA        = config.get('S3','LOG_DATA')
LOG_JSONPATH     = config.get('S3','LOG_JSONPATH')
SONG_DATA        = config.get('S3','SONG_DATA')
BUCKET_REGION    = config.get('S3','BUCKET_REGION')


## CHeck S3 contents

In [3]:
s3 = boto3.resource('s3',
                       region_name= BUCKET_REGION,
                       aws_access_key_id= KEY,
                       aws_secret_access_key= SECRET
                   )

bucket = s3.Bucket('udacity-dend')

### For testing purposes I'm only using A/A/A
If I upload the entire song dataset, it takes 100 minutes

In [4]:
for obj in bucket.objects.filter(Prefix="song-data/A/A/A/"):
#for obj in bucket.objects.filter(Prefix="log-data/2018/11/2018-11-01-events.json"):
    key = obj.key    
    print(key)

song-data/A/A/A/TRAAAAK128F9318786.json
song-data/A/A/A/TRAAAAV128F421A322.json
song-data/A/A/A/TRAAABD128F429CF47.json
song-data/A/A/A/TRAAACN128F9355673.json
song-data/A/A/A/TRAAAEA128F935A30D.json
song-data/A/A/A/TRAAAED128E0783FAB.json
song-data/A/A/A/TRAAAEM128F93347B9.json
song-data/A/A/A/TRAAAEW128F42930C0.json
song-data/A/A/A/TRAAAFD128F92F423A.json
song-data/A/A/A/TRAAAGR128F425B14B.json
song-data/A/A/A/TRAAAHD128F42635A5.json
song-data/A/A/A/TRAAAHJ128F931194C.json
song-data/A/A/A/TRAAAHZ128E0799171.json
song-data/A/A/A/TRAAAIR128F1480971.json
song-data/A/A/A/TRAAAJN128F428E437.json
song-data/A/A/A/TRAAAND12903CD1F1B.json
song-data/A/A/A/TRAAANK128F428B515.json
song-data/A/A/A/TRAAAOF128F429C156.json
song-data/A/A/A/TRAAAPK128E0786D96.json
song-data/A/A/A/TRAAAQN128F9353BA0.json
song-data/A/A/A/TRAAAQO12903CD8E1C.json
song-data/A/A/A/TRAAAUC128F428716F.json
song-data/A/A/A/TRAAAUR128F428B1FA.json
song-data/A/A/A/TRAAAYL128F4271A5B.json


In [None]:

#for obj in bucket.objects.all():
#for obj in bucket.objects.filter(Prefix="log_json_path.json"):
for obj in bucket.objects.filter(Prefix="song-data/A/A/A/TRAAAAK128F9318786.json"):
#for obj in bucket.objects.filter(Prefix="log-data/2018/11/2018-11-01-events.json"):
    key = obj.key    
    body = obj.get()['Body'].read() 
    print(body)

## Check number of files / entities
- Number of song files should match staging_songs.count
- NUmber of log entities?? should match staging_events.count

### Count Song Data Files

In [12]:
count = 0 
for i in bucket.objects.filter(Prefix="song-data/"):
  count += 1

#list(map(lambda i: count = count + 1, bucket.objects.filter(Prefix="song-data/")))

#c = collections Counter()

print(count)

385253


In [37]:
count = 0
#for obj in bucket.objects.filter(Prefix="log-data/2018/11/2018-11-01-events.json"):
for obj in bucket.objects.filter(Prefix="log-data/"):
  text = obj.get()['Body'].read().decode('utf-8')  
  df = pd.read_json(text, lines=True)
  count += len(df.index)  
  
print(count)

8056


## Connect to Redshift cluster

In [46]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

## Create Staging Tables

In [43]:
%%sql 


DROP TABLE IF EXISTS staging_events;
CREATE TABLE staging_events 
(
  artist varchar(1000),
  auth varchar,
  firstName varchar,
  gender varchar,
  itemInSession varchar,
  lastName varchar,
  length varchar,
  level varchar,
  location varchar,
  method varchar,
  page varchar,
  registration varchar,
  sessionId varchar,
  song varchar(1000),
  status varchar,
  ts varchar,
  userAgent varchar,
  userId varchar
);

DROP TABLE IF EXISTS staging_songs;
CREATE TABLE staging_songs 
(
  song_id varchar,
  num_songs varchar,
  title varchar(1000), 
  artist_name varchar(1000),
  artist_latitude varchar,
  year varchar,
  duration varchar,
  artist_id varchar,
  artist_longitude varchar,
  artist_location  varchar(1000)
);


 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.


[]

## Creating DataWarehouse tables

In [44]:
%%sql 

drop table if exists users;
create table if not exists users
(
    user_id int not null primary key sortkey,
    first_name varchar not null,
    last_name varchar not null,
    gender varchar not null,
    level varchar not null
) diststyle ALL;

drop table if exists time;
create table if not exists time
(
  start_time timestamp without time zone not null primary key sortkey,
  hour int not null,
  day int not null,
  week int not null,
  month int not null,
  year int not null,
  day_of_week int not null,
  day_of_week_name varchar not null,
  is_weekend bool not null
) diststyle ALL;

drop table if exists songplays;
create table if not exists songplays
(
    songplay_id int IDENTITY(0,1) primary key,
    start_time timestamp without time zone not null sortkey,
    user_id int not null,
    level varchar not null,
    song_id varchar distkey,
    song_title varchar(1000) not null,
    artist_id varchar,
    artist_name varchar(1000) not null,
    session_id int  not null,
    location varchar(1000) not null,
    user_agent varchar  not null,
    stream_duration decimal 
) diststyle KEY;

drop table if exists songs;
create table if not exists songs
(
    song_id varchar not null primary key distkey,
    title varchar(1000) not null sortkey,
    artist_id varchar not null,
    year int not null,
    duration decimal not null
) diststyle KEY;

drop table if exists artists;
create table if not exists artists
(
    artist_id varchar not null primary key,
    name varchar(1000) not null sortkey,
    location varchar(1000) null,
    latitude decimal null,
    longitude decimal null
) diststyle ALL;

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.


[]

In [38]:
%%sql 


 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.


[]

## Loading Events / Log from S3

In [11]:
%sql select count(1) from staging_events

 * postgresql://dwhuser:***@dwhcluster.cxxnfuxeuzhw.us-east-1.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
8056


In [47]:
%%sql

copy staging_events 
from 's3://udacity-dend/log-data' 
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
region 'us-west-2'
json 's3://udacity-dend/log_json_path.json';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [48]:
%%sql 
select *
from staging_events 
where page = 'NextSong'
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid
Amy Winehouse,Logged In,Ayla,F,0,Johnson,231.52281,free,"Santa Rosa, CA",PUT,NextSong,1540880381796,223,Stronger Than Me,200,1541550480796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.77.4 (KHTML, like Gecko) Version/7.0.5 Safari/537.77.4""",63
Cage The Elephant,Logged In,Kate,F,101,Harrell,175.12444,paid,"Lansing-East Lansing, MI",PUT,NextSong,1540472624796,293,Ain't No Rest For The Wicked (Original Version),200,1541551774796,"""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.94 Safari/537.36""",97
Yeah Yeah Yeahs,Logged In,Kaylee,F,3,Summers,220.96934,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,181,Heads Will Roll,200,1541554963796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36""",8
Juan Carlos Baglietto,Logged In,Wyatt,M,4,Scott,285.64853,free,"Eureka-Arcata-Fortuna, CA",PUT,NextSong,1540872073796,8,Era En Abril,200,1541560364796,Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko,9
Jagged Edge featuring Run of Run DMC,Logged In,Adler,M,1,Barrera,248.37179,free,"New York-Newark-Jersey City, NY-NJ-PA",PUT,NextSong,1540835983796,301,Let's Get Married,200,1541577508796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.78.2 (KHTML, like Gecko) Version/7.0.6 Safari/537.78.2""",100


## Loading Songs from S3

In [49]:
%sql select count(1) from staging_songs

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
0


In [None]:
%sql select * from stl_load_errors

In [50]:
%%sql

copy staging_songs 
from 's3://udacity-dend/song-data/' 
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
region 'us-west-2'
json 'auto ignorecase';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

In [51]:
%%sql 
select *
from staging_songs
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
(psycopg2.OperationalError) SSL SYSCALL error: EOF detected

[SQL: select *
from staging_songs
limit 10]
(Background on this error at: https://sqlalche.me/e/14/e3q8)


## Understanding how to convert ts into postgresql timestamp

In [108]:
%%sql

select
a.ts,
(a.ts/1000) as seconds_as_float,
TIMESTAMP 'epoch' as epoch_time_0,
(a.ts/1000) * interval '1 second' as time_interval_from_epoch,
TIMESTAMP 'epoch' + (a.ts/1000) * INTERVAL '1 Second ' AS start_time,
a.*
from staging_sparklify.staging_events a
limit 2

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
2 rows affected.


ts,seconds_as_float,epoch_time_0,time_interval_from_epoch,start_time,artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts_1,useragent,userid
1541300540796,1541300540.796,1970-01-01 00:00:00,"17839 days, 3:02:20.796000",2018-11-04 03:02:20.796000,Olivia Ruiz,Logged In,Jahiem,M,3,Miles,254.74567,free,"San Antonio-New Braunfels, TX",PUT,NextSong,1540817347796,42,Cabaret Blanco,200,1541300540796,"""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",43
1541310741796,1541310741.796,1970-01-01 00:00:00,"17839 days, 5:52:21.796000",2018-11-04 05:52:21.796000,,Logged In,Jayden,M,5,Graves,,paid,"Marinette, WI-MI",GET,Home,1540664184796,128,,200,1541310741796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",25


## Sql queries to transform the events and songs into dimension tables

## Load Songs

In [None]:
%%sql

SELECT xxxx,x,x,,,xx,
           ROW_NUMBER() OVER(PARTITION BY userid ORDER BY ts DESC) AS rank
    FROM staging_songs
            WHERE zzz != NULL

In [13]:
%%sql

select 
    title,
    artist_id,
    count(1)
from staging_songs
group by title,
    artist_id
having count(1)> 1


 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


title,artist_id,count


In [11]:
%%sql

select 
song_id,
title,
artist_id,
year::int,
duration
from staging_songs
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,title,artist_id,year,duration
SODZYPO12A8C13A91E,Burn My Body (Album Version),AR1C2IX1187B99BF74,0,177.99791
SOTAZDY12AB0187616,Drillbit,ARZKCQM1257509D107,0,374.62159
SOIGHOD12A8C13B5A1,Indian Angel,ARY589G1187B9A9F4E,2004,171.57179
SOFRDWL12A58A7CEF7,Hit Da Scene,AR9Q9YC1187FB5609B,0,252.94322
SORRNOC12AB017F52B,The Last Beat Of My Heart (b-side),ARSZ7L31187FB4E610,2004,337.81506
SONQPZK12AB0182D84,Double Wide,ARKYKXP11F50C47A6A,0,160.20853
SOIGICF12A8C141BC5,Game & Watch,AREWD471187FB49873,2004,580.54485
SOBRKGM12A8C139EF6,Welcome to the Pleasuredome,ARXQBR11187B98A2CC,1985,821.05424
SOHOZBI12A8C132E3C,Smash It Up,AR0MWD61187B9B2B12,2000,195.39546
SOCIWDW12A8C13D406,Soul Deep,ARMJAGH1187FB546F3,1969,148.03546


In [35]:
%%sql
insert into songs
(song_id,title,artist_id,year,duration)
select 
song_id,
title,
artist_id,
year::int,
duration
from staging_sparklify.staging_songs
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


[]

## Load Artists

In [49]:
%%sql 

select 
artist_id,
artist_name as name,
artist_location as location,
artist_latitude as latitude,
artist_longitude as longitude
from staging_sparklify.staging_songs
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


artist_id,name,location,latitude,longitude
ARQVORN11F50C4EFEC,Bedlight For Blue Eyes,,,
AR8JO2B1187B98EBB6,Leftöver Crack,"New York City, NY, USA",,
AR1XD261187B9ACF9B,Nick Cave/Warren Ellis,,,
AR19SOA1187B98F6E6,Bob Neuwirth,New York,40.71455,-74.00712
ARZN98V1187B990D1D,THERION,"Stockholm, Sweden",59.33217,18.06243


In [39]:
%%sql 

insert into artists
(artist_id, name, location, latitude, longitude)
select 
artist_id,
artist_name as name,
artist_location as location,
artist_latitude::decimal as latitude,
artist_longitude::decimal as longitude
from staging_sparklify.staging_songs
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


[]

## Load songplays

In [57]:
%%sql

select 
'songplay_id' as songplay_id,
TIMESTAMP 'epoch' + (e.ts/1000) * INTERVAL '1 Second ' as start_time,
e.userid as user_id,
e.level,
s.song_id,
e.song as song_title,
s.artist_id,
e.artist as artist_name,
e.sessionid as session_id,
e.location,
e.userAgent as user_agent,
e.length as stream_dureation
from staging_sparklify.staging_events e
left join staging_sparklify.staging_songs s on e.song = s.title and e.artist = s.artist_name
where page = 'NextSong'
and s.artist_id is null
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


songplay_id,start_time,user_id,level,song_id,song_title,artist_id,artist_name,session_id,location,user_agent,stream_dureation
songplay_id,2018-11-16 21:14:34.796000,49,paid,,EG GLEÃÂIST SO HVÃÂRT JÃÂLAKVÃÂLD,,Mpiri,648,"San Francisco-Oakland-Hayward, CA",Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0,269.92281
songplay_id,2018-11-24 12:15:28.796000,80,paid,,Los Salieris De Charly,,LeÃÂ³n Gieco,903,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",361.27302
songplay_id,2018-11-24 07:00:22.796000,80,paid,,You'll Never Find Another Love Like Mine (Album Version),,Michael BublÃÂ©,893,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",244.21832
songplay_id,2018-11-08 09:00:55.796000,80,paid,,Get Me Bodied,,BeyoncÃÂ©,342,"Portland-South Portland, ME","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",374.59546
songplay_id,2018-11-11 13:52:23.796000,88,free,,Get Me Bodied,,BeyoncÃÂ©,441,"Sacramento--Roseville--Arden-Arcade, CA","""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",359.54893


In [41]:
%%sql

insert into songplays
(start_time, user_id, level, song_id, song_title, 
artist_id, artist_name, session_id, location, user_agent, stream_duration)
select 
TIMESTAMP 'epoch' + (e.ts/1000) * INTERVAL '1 Second ' as start_time,
e.userid::int as user_id,
e.level,
s.song_id,
e.song as song_title,
s.artist_id,
e.artist as artist_name,
e.sessionid::int as session_id,
e.location,
e.userAgent as user_agent,
e.length::decimal as stream_dureation
from staging_sparklify.staging_events e
left join staging_sparklify.staging_songs s on e.song = s.title and e.artist = s.artist_name
where page = 'NextSong'
and s.artist_id is null
limit 5

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
5 rows affected.


[]

## Load Time Dimension


In [None]:
# TIMESTAMP 'epoch' + (e.ts/1000) * INTERVAL '1 Second ' as start_time,

In [42]:
%%sql
with time_relevant_records as (
    select TIMESTAMP 'epoch' + (ts/1000) * INTERVAL '1 Second ' as start_time,    
    extract(hour from start_time) as hour,
    extract(day from start_time) as day,
    extract(week from start_time) as week,
    extract(month from start_time) as month,
    extract(year from start_time) as year,
    TO_TIMESTAMP(year || '-' || month || '-' || day || ' ' || hour || ':00:00', 'YYYY-MM-DD HH24:MI:SS') as timestamp_date
    from staging_events 
    where page = 'NextSong'
)
select distinct
    year * 1000000
    + month * 10000
    + day * 100
    + hour 
    as time_key,
    timestamp_date,
    extract(year from start_time) as year,
    extract(month from start_time) as month,
    extract(day from start_time) as day,
    extract(hour from start_time) as hour,    
    extract(week from start_time) as week,    
    extract(dayofweek from start_time) as day_of_week,
    to_char(start_time, 'Day') as day_of_week_name,
    day_of_week in (0,6) as is_weekend
from time_relevant_records
limit 14

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
14 rows affected.


time_key,timestamp_date,year,month,day,hour,week,day_of_week,day_of_week_name,is_weekend
2018110714,2018-11-07 14:00:00+00:00,2018,11,7,14,45,3,Wednesday,False
2018110718,2018-11-07 18:00:00+00:00,2018,11,7,18,45,3,Wednesday,False
2018110516,2018-11-05 16:00:00+00:00,2018,11,5,16,45,1,Monday,False
2018110402,2018-11-04 02:00:00+00:00,2018,11,4,2,44,0,Sunday,True
2018110412,2018-11-04 12:00:00+00:00,2018,11,4,12,44,0,Sunday,True
2018110619,2018-11-06 19:00:00+00:00,2018,11,6,19,45,2,Tuesday,False
2018110720,2018-11-07 20:00:00+00:00,2018,11,7,20,45,3,Wednesday,False
2018111511,2018-11-15 11:00:00+00:00,2018,11,15,11,46,4,Thursday,False
2018111516,2018-11-15 16:00:00+00:00,2018,11,15,16,46,4,Thursday,False
2018111519,2018-11-15 19:00:00+00:00,2018,11,15,19,46,4,Thursday,False


In [27]:
%%sql
with time_relevant_records as 

select distinct
extract(hour from start_time) as hour,
extract(day from start_time) as day,
extract(week from start_time) as week,
extract(month from start_time) as month,
extract(year from start_time) as year,
+ year * 1000000
+ month * 10000
+ day * 100
+ hour 
as time_key,
extract(dayofweek from start_time) as day_of_week,
to_char(start_time, 'Day') as day_of_week_name,
day_of_week in (0,6) as is_weekend
from staging_events e
where page = 'NextSong'
order by song
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
(psycopg2.errors.UndefinedColumn) column "start_time" does not exist in e

[SQL: select distinct
extract(hour from start_time) as hour,
extract(day from start_time) as day,
extract(week from start_time) as week,
extract(month from start_time) as month,
extract(year from start_time) as year,
+ year * 1000000
+ month * 10000
+ day * 100
+ hour 
as time_key,
extract(dayofweek from start_time) as day_of_week,
to_char(start_time, 'Day') as day_of_week_name,
day_of_week in (0,6) as is_weekend
from staging_events e
where page = 'NextSong'
order by song
limit 10]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [22]:
%%sql 

insert into time 
(start_time, hour, day, week, month, year, day_of_week, day_of_week_name, is_weekend)
select distinct
    TIMESTAMP 'epoch' + (e.ts/1000) * INTERVAL '1 Second ' as start_time,
    extract(hour from start_time) as hour,
    extract(day from start_time) as day,
    extract(week from start_time) as week,
    extract(month from start_time) as month,
    extract(year from start_time) as year,
    extract(dayofweek from start_time) as day_of_week,
    to_char(start_time, 'Day') as day_of_week_name,
    day_of_week in (0,6) as is_weekend
from staging_events e
where page = 'NextSong'
order by start_time

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
6813 rows affected.


[]

In [23]:
%%sql

select * 
from time
limit 14

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
14 rows affected.


start_time,hour,day,week,month,year,day_of_week,day_of_week_name,is_weekend
2018-11-01 21:01:46.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:05:52.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:08:16.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:11:13.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:17:33.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:24:53.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:28:54.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:42:00.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:52:05.796000,21,1,44,11,2018,4,Thursday,False
2018-11-01 21:55:25.796000,21,1,44,11,2018,4,Thursday,False


## Load User Dimension

In [16]:
%%sql 
select distinct
userid as user_id,
firstname as first_name,
lastname as last_name,
gender,
level
from staging_sparklify.staging_events e
where page = 'NextSong'
order by song
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


user_id,first_name,last_name,gender,level
83,Stefany,White,F,free
67,Colm,Santana,M,free
51,Maia,Burke,F,free
25,Jayden,Graves,M,paid
60,Devin,Larson,M,free
28,Brantley,West,M,free
10,Sylvie,Cruz,F,free
86,Aiden,Hess,M,free
39,Walter,Frye,M,free
62,Connar,Moreno,M,free


In [43]:
%%sql 

insert into users
(user_id, first_name, last_name, gender, level)
select distinct
userid::int as user_id,
firstname as first_name,
lastname as last_name,
gender,
level
from staging_sparklify.staging_events e
where page = 'NextSong'
order by song
limit 10

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


[]

## Issues
It looks like the files being read from 's3://udacity-dend/log-data' have a different encoding and I have records that do not match artst / song

In [64]:
%%sql

select *
from staging_sparklify.staging_songs
where title in ('Get Me Bodied',  'Los Salieris De Charly')
or artist_name = 'Mpiri'
limit 20

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
4 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location
SOFSGBJ12A8AE4645B,1,Get Me Bodied,Beyoncé,,2006,374.59546,AR65K7A1187FB4DAA4,,
SOPCLSY12A8C13E413,1,Get Me Bodied,Beyoncé,,2006,359.54893,AR65K7A1187FB4DAA4,,
SOJATVB12A3F1EA77A,1,EG GLEÐIST SO HVØRT JÓLAKVØLD,Mpiri,,0,269.92281,ARTYXEZ1187FB54560,,
SOHTEDD12A6D4F8215,1,Los Salieris De Charly,León Gieco,,1992,361.27302,AR2S6UD1187B9B944F,,


## Tests

In [47]:
%%sql 
select * from songs
limit 10;

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,title,artist_id,year,duration
SOQPWCR12A6D4FB2A3,A Poor Recipe For Civic Cohesion,AR73AIO1187B9AD57B,2005,118
SOHKNRJ12A6701D1F8,Drop of Rain,AR10USD1187B99F3F1,0,189
SOFSOCN12A8C143F5D,Face the Ashes,ARXR32B1187FB57099,2007,209
SOIGHOD12A8C13B5A1,Indian Angel,ARY589G1187B9A9F4E,2004,171
SONQPZK12AB0182D84,Double Wide,ARKYKXP11F50C47A6A,0,160
SOBLFFE12AF72AA5BA,Scream,ARJNIUY12298900C91,2009,213
SOOVHYF12A8C134892,I'll Be Waiting,ARCLYBR1187FB53913,1989,304
SOHOZBI12A8C132E3C,Smash It Up,AR0MWD61187B9B2B12,2000,195
SOABWAP12A8C13F82A,Take Time,AR5LMPY1187FB573FE,1978,258
SORRNOC12AB017F52B,The Last Beat Of My Heart (b-side),ARSZ7L31187FB4E610,2004,337
