# ETL Draft
This notebook is a scratchpad for prototyping and testing code on the way to the actual ETL.

In [21]:
import configparser
import json
import os
from time import time
from urllib.parse import quote

import boto3
import matplotlib.pyplot as plt
import pandas as pd

%load_ext sql

## Read Configs

In [23]:
# AWS credentials and the preferred region are read from aws.cfg.
# The file is opened in a `with` block so the handle is closed as soon as it
# has been parsed instead of being left open for the life of the kernel.
config = configparser.ConfigParser()
with open('aws.cfg') as aws_cfg:
    config.read_file(aws_cfg)
KEY              = config.get('AWS', 'KEY')
SECRET           = config.get('AWS', 'SECRET')
PREFERRED_REGION = config.get('AWS', 'PREFERRED_REGION')

# Cluster, database, IAM role and S3 settings are read from dwh.cfg.
# `config` is rebound to a fresh parser, so after this cell it holds only the
# dwh.cfg contents.
config = configparser.ConfigParser()
with open('dwh.cfg') as dwh_cfg:
    config.read_file(dwh_cfg)
HOST         = config.get('CLUSTER', 'HOST')
DB_USER      = config.get('CLUSTER', 'DB_USER')
DB_PASSWORD  = config.get('CLUSTER', 'DB_PASSWORD')
DB_PORT      = config.get('CLUSTER', 'DB_PORT')

# DB_NAME is taken from [DATABASE]. A previous read of the same key from
# [CLUSTER] was overwritten by this one before it could be used, so only the
# effective assignment is kept.
DB_NAME      = config.get('DATABASE', 'DB_NAME')

IAM_ROLE     = config.get('IAM_ROLE', 'ARN')

LOG_DATA     = config.get('S3', 'LOG_DATA')
LOG_JSONPATH = config.get('S3', 'LOG_JSONPATH')
SONG_DATA    = config.get('S3', 'SONG_DATA')


## Check S3 contents

In [31]:
# Create an S3 resource handle using the key, secret and region loaded from
# aws.cfg, then point at the bucket that holds the source data.
s3 = boto3.resource(
    's3',
    region_name=PREFERRED_REGION,
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)
bucket = s3.Bucket('udacity-dend')

# Print the raw bytes of every object matching one prefix.
# Alternative listings kept from earlier experiments (swap into the call below):
#   bucket.objects.all()
#   bucket.objects.filter(Prefix="log_json_path.json")
#   bucket.objects.filter(Prefix="log-data/2018/11/2018-11-01-events.json")
matching_objects = bucket.objects.filter(
    Prefix="song-data/A/A/A/TRAAAAK128F9318786.json"
)
for obj in matching_objects:
    key = obj.key
    response = obj.get()
    body = response['Body'].read()
    print(body)

b'{"song_id": "SOBLFFE12AF72AA5BA", "num_songs": 1, "title": "Scream", "artist_name": "Adelitas Way", "artist_latitude": null, "year": 2009, "duration": 213.9424, "artist_id": "ARJNIUY12298900C91", "artist_longitude": null, "artist_location": ""}'


## Connect to Redshift cluster

In [24]:
import os 
conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

## Create Staging Tables

In [37]:
%%sql
CREATE SCHEMA IF NOT EXISTS staging_sparklify;
SET search_path TO staging_sparklify;

-- staging_events is rebuilt from scratch on every run. Each column is kept
-- as text at this stage, and the artist column is widened to 1000 characters.
DROP TABLE IF EXISTS staging_events;
CREATE TABLE staging_events (
    artist         VARCHAR(1000),
    auth           VARCHAR,
    firstName      VARCHAR,
    gender         VARCHAR,
    itemInSession  VARCHAR,
    lastName       VARCHAR,
    length         VARCHAR,
    level          VARCHAR,
    location       VARCHAR,
    method         VARCHAR,
    page           VARCHAR,
    registration   VARCHAR,
    sessionId      VARCHAR,
    song           VARCHAR,
    status         VARCHAR,
    ts             VARCHAR,
    userAgent      VARCHAR,
    userId         VARCHAR
);

-- staging_songs is rebuilt the same way, again with text-only columns.
-- artist_name and artist_location are widened to 1000 characters. The
-- stl_load_errors output further down in this notebook shows both columns
-- overflowing a length of 256.
DROP TABLE IF EXISTS staging_songs;
CREATE TABLE staging_songs (
    song_id           VARCHAR,
    num_songs         VARCHAR,
    title             VARCHAR,
    artist_name       VARCHAR(1000),
    artist_latitude   VARCHAR,
    year              VARCHAR,
    duration          VARCHAR,
    artist_id         VARCHAR,
    artist_longitude  VARCHAR,
    artist_location   VARCHAR(1000)
);


 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.
Done.
Done.
Done.
Done.


[]

## Loading Events / Log

In [None]:
%sql select * from staging_events

In [29]:
%%sql

copy staging_events 
from 's3://udacity-dend/log-data' 
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
json 's3://udacity-dend/log_json_path.json';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
Done.


[]

## Loading Songs

In [38]:
%sql select * from staging_songs

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
0 rows affected.


song_id,num_songs,title,artist_name,artist_latitude,year,duration,artist_id,artist_longitude,artist_location


In [36]:
%sql select * from stl_load_errors

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
4 rows affected.


userid,slice,tbl,starttime,session,query,filename,line_number,colname,type,col_length,position,raw_line,raw_field_value,err_code,err_reason,is_partial,start_offset
100,3,111136,2022-04-28 15:47:52.876114,1073898190,489,s3://udacity-dend/song-data/A/Y/F/TRAYFUW128F428F618.json,1,artist_location,varchar,256,0,"{""song_id"": ""SORMAXQ12A8C139224"", ""num_songs"": 1, ""title"": ""Landmines"", ""artist_name"": ""St. Vincent"", ""artist_latitude"": 19.40904, ""year"": 2007, ""duration"": 307.53914, ""artist_id"": ""AR0JBXL1187FB52810"", ""artist_longitude"": -99.14977, ""artist_location"": ""ORDER &#039;ACTOR&#039; ON INSOUND: <a href=\\""http://www.insound.com/search/searchmain.jsp?query=st.+vincent+actor\\"" target=\\""_blank\\"" rel=\\""nofollow\\"" onmousedown='UntrustedLink.bootstrap($(this), \\""\\"", event)'>http://www.insound.com/search/searchmain.jsp?query=st.+vincent+actor</a>""}",,1204,String length exceeds DDL length,0,0
100,0,111136,2022-04-28 15:47:52.876114,1073898190,489,s3://udacity-dend/song-data/C/A/R/TRCARJQ128F425A389.json,1,artist_name,varchar,256,0,"{""song_id"": ""SOLAUEC12A8AE476BB"", ""num_songs"": 1, ""title"": ""Medication"", ""artist_name"": ""Spiritualized;Jason;Jason - Dulcimer/;Kate Radley - Vox continental/Farfisa/Tones/Drones/Tremeloes/;Sean Cook - Wha-monica/;Mark Refoy;Jon Mattock;Icon Hunt;Stuart Gordon;Balanescu Quartet;Rico;Rico - Tam Tam/;Bammie;Tim Sanders;Roddy Lorimar;Steve Sidwell;Chris Sharrack;Caroline Crawley;Marilyn McFarlane;Helen White"", ""artist_latitude"": null, ""year"": 0, ""duration"": 498.72934, ""artist_id"": ""ARVHQMD1269FB25AE7"", ""artist_longitude"": null, ""artist_location"": """"}",,1204,String length exceeds DDL length,0,0
100,4,111136,2022-04-28 15:47:52.876114,1073898190,489,s3://udacity-dend/song-data/C/W/V/TRCWVDW128F425A38A.json,1,artist_name,varchar,256,0,"{""song_id"": ""SOORTJE12A8AE476BD"", ""num_songs"": 1, ""title"": ""Electric Phase"", ""artist_name"": ""Spiritualized;Jason;Jason - Dulcimer/;Kate Radley - Vox continental/Farfisa/Tones/Drones/Tremeloes/;Sean Cook - Wha-monica/;Mark Refoy;Jon Mattock;Icon Hunt;Stuart Gordon;Balanescu Quartet;Rico;Rico - Tam Tam/;Bammie;Tim Sanders;Roddy Lorimar;Steve Sidwell;Chris Sharrack;Caroline Crawley;Marilyn McFarlane;Helen White"", ""artist_latitude"": null, ""year"": 0, ""duration"": 93.75302, ""artist_id"": ""ARVHQMD1269FB25AE7"", ""artist_longitude"": null, ""artist_location"": """"}",,1204,String length exceeds DDL length,0,0
100,7,111136,2022-04-28 15:47:52.876114,1073898190,489,s3://udacity-dend/song-data/B/K/O/TRBKOTN128F425A38C.json,1,artist_name,varchar,256,0,"{""song_id"": ""SOCCKQH12A8AE476C3"", ""num_songs"": 1, ""title"": ""Born Never Asked"", ""artist_name"": ""Spiritualized;Jason;Jason - Dulcimer/;Kate Radley - Vox continental/Farfisa/Tones/Drones/Tremeloes/;Sean Cook - Wha-monica/;Mark Refoy;Jon Mattock;Icon Hunt;Stuart Gordon;Balanescu Quartet;Rico;Rico - Tam Tam/;Bammie;Tim Sanders;Roddy Lorimar;Steve Sidwell;Chris Sharrack;Caroline Crawley;Marilyn McFarlane;Helen White"", ""artist_latitude"": null, ""year"": 0, ""duration"": 125.20444, ""artist_id"": ""ARVHQMD1269FB25AE7"", ""artist_longitude"": null, ""artist_location"": """"}",,1204,String length exceeds DDL length,0,0


In [39]:
%%sql

copy staging_songs 
from 's3://udacity-dend/song-data' 
iam_role 'arn:aws:iam::991791500823:role/dwhRole'
json 'auto ignorecase';

 * postgresql://dwhuser:***@dwhcluster.cuvu38ujek21.us-west-2.redshift.amazonaws.com:5439/dwh
