# ETL Draft
This notebook is a scratchpad for prototyping and testing code that will go into the actual ETL.

In [40]:
import boto3
import configparser
import matplotlib.pyplot as plt
import pandas as pd
from time import time
import json
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## Read Configs

In [41]:
# --- AWS credentials and region (aws.cfg) ---
# Each config file is opened in a `with` block so its handle is closed as soon
# as the parser has consumed it.
config = configparser.ConfigParser()
with open('aws.cfg') as aws_cfg_file:
    config.read_file(aws_cfg_file)
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')
PREFERRED_REGION       = config.get('AWS','PREFERRED_REGION')

# --- Redshift cluster, IAM role and S3 locations (dwh.cfg) ---
# `config` is rebound to a fresh parser, so after this cell it only holds the
# dwh.cfg settings.
config = configparser.ConfigParser()
with open('dwh.cfg') as dwh_cfg_file:
    config.read_file(dwh_cfg_file)
HOST              = config.get('CLUSTER','HOST')
DB_USER           = config.get('CLUSTER','DB_USER')
DB_PASSWORD       = config.get('CLUSTER','DB_PASSWORD')
DB_PORT           = config.get('CLUSTER','DB_PORT')

# The database name is taken from the [DATABASE] section (not [CLUSTER]).
DB_NAME           = config.get('DATABASE','DB_NAME')

IAM_ROLE        = config.get('IAM_ROLE','ARN')

LOG_DATA        = config.get('S3','LOG_DATA')
LOG_JSONPATH        = config.get('S3','LOG_JSONPATH')
SONG_DATA        = config.get('S3','SONG_DATA')


## Check S3 contents

In [42]:
# Open an S3 resource handle with the credentials and region read from aws.cfg.
s3 = boto3.resource(
    's3',
    region_name=PREFERRED_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)
bucket = s3.Bucket('udacity-dend')

# Prefix of the object(s) to dump. Other selections that were tried here:
#   bucket.objects.all()
#   Prefix="log_json_path.json"
#   Prefix="log-data/2018/11/2018-11-01-events.json"
sample_prefix = "song-data/A/A/A/TRAAAAK128F9318786.json"

# Download every matching object and print its raw bytes.
for obj in bucket.objects.filter(Prefix=sample_prefix):
    key = obj.key
    body = obj.get()['Body'].read()
    print(body)

b'{"song_id": "SOBLFFE12AF72AA5BA", "num_songs": 1, "title": "Scream", "artist_name": "Adelitas Way", "artist_latitude": null, "year": 2009, "duration": 213.9424, "artist_id": "ARJNIUY12298900C91", "artist_longitude": null, "artist_location": ""}'


## Check number of files / entities
- Number of song files should match staging_songs.count
- Number of log entities (to be confirmed) should match staging_events.count

## Connect to Redshift cluster

In [52]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

## Create Staging Tables

In [49]:
%%sql
CREATE SCHEMA IF NOT EXISTS staging_sparklify;
SET search_path TO staging_sparklify;

-- Landing table for the raw event-log records. Every field is kept as text
-- at this stage; artist and song get a wider 1000-character limit.
DROP TABLE IF EXISTS staging_events;
CREATE TABLE staging_events (
    artist          VARCHAR(1000),
    auth            VARCHAR,
    firstName       VARCHAR,
    gender          VARCHAR,
    itemInSession   VARCHAR,
    lastName        VARCHAR,
    length          VARCHAR,
    level           VARCHAR,
    location        VARCHAR,
    method          VARCHAR,
    page            VARCHAR,
    registration    VARCHAR,
    sessionId       VARCHAR,
    song            VARCHAR(1000),
    status          VARCHAR,
    ts              VARCHAR,
    userAgent       VARCHAR,
    userId          VARCHAR
);

-- Landing table for the raw song metadata records, also stored as text;
-- title, artist_name and artist_location get a 1000-character limit.
DROP TABLE IF EXISTS staging_songs;
CREATE TABLE staging_songs (
    song_id            VARCHAR,
    num_songs          VARCHAR,
    title              VARCHAR(1000),
    artist_name        VARCHAR(1000),
    artist_latitude    VARCHAR,
    year               VARCHAR,
    duration           VARCHAR,
    artist_id          VARCHAR,
    artist_longitude   VARCHAR,
    artist_location    VARCHAR(1000)
);


 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.
Done.
Done.


[]

## Loading Events / Log

In [60]:
%sql select * from staging_sparklify.staging_events

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts,useragent,userid


In [62]:
%%sql

COPY staging_sparklify.staging_events
FROM 's3://udacity-dend/log-data'
IAM_ROLE 'arn:aws:iam::991791500823:role/dwhRole'
-- The JSONPaths file maps fields of each log record onto the table columns.
JSON 's3://udacity-dend/log_json_path.json';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

## Loading Songs

In [54]:
%sql select count(1) from staging_sparklify.staging_songs

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
385252


In [None]:
%sql select * from stl_load_errors

In [50]:
%%sql

copy staging_sparklify.staging_songs 
from 's3://udacity-dend/song-data' 
-- The target table is schema-qualified so this load does not depend on a
-- search_path set by another cell in the same session.
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
-- 'auto ignorecase' matches JSON field names to column names without regard to case.
json 'auto ignorecase';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

## Understanding how to convert ts into postgresql timestamp

In [108]:
%%sql

SELECT
    ev.ts,
    (ev.ts / 1000) AS seconds_as_float,
    TIMESTAMP 'epoch' AS epoch_time_0,
    (ev.ts / 1000) * INTERVAL '1 second' AS time_interval_from_epoch,
    -- Epoch start plus the elapsed seconds yields the event timestamp
    -- (ts appears to hold milliseconds since the epoch -- TODO confirm).
    TIMESTAMP 'epoch' + (ev.ts / 1000) * INTERVAL '1 Second ' AS start_time,
    ev.*
FROM staging_sparklify.staging_events AS ev
LIMIT 2

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
2 rows affected.


ts,seconds_as_float,epoch_time_0,time_interval_from_epoch,start_time,artist,auth,firstname,gender,iteminsession,lastname,length,level,location,method,page,registration,sessionid,song,status,ts_1,useragent,userid
1541300540796,1541300540.796,1970-01-01 00:00:00,"17839 days, 3:02:20.796000",2018-11-04 03:02:20.796000,Olivia Ruiz,Logged In,Jahiem,M,3,Miles,254.74567,free,"San Antonio-New Braunfels, TX",PUT,NextSong,1540817347796,42,Cabaret Blanco,200,1541300540796,"""Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",43
1541310741796,1541310741.796,1970-01-01 00:00:00,"17839 days, 5:52:21.796000",2018-11-04 05:52:21.796000,,Logged In,Jayden,M,5,Graves,,paid,"Marinette, WI-MI",GET,Home,1540664184796,128,,200,1541310741796,"""Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36""",25


In [None]:
# TODO: candidate SQL filter (presumably for the staging_events queries -- confirm):
#   where page = 'NextSong'