# Exploratory Data Analysis (EDA): understand the input data

In [1]:
import boto3
import configparser

In [2]:
config = configparser.ConfigParser()
config.read('dwh.cfg')

# AWS
KEY                    = config.get('AWS','KEY')
SECRET                 = config.get('AWS','SECRET')

In [3]:
s3c = boto3.client('s3',
                    region_name='eu-west-2', 
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
              )
s3r = boto3.resource('s3',
                    region_name='eu-west-2', 
                    aws_access_key_id=KEY,
                    aws_secret_access_key=SECRET
              )
udacity_bucket = s3r.Bucket("udacity-dend")

### Look into input files for the `song_data`

$\rightarrow$ Contains JSON files with song metadata (artist_id, artist_latitude, ..., year)

In [4]:
for i, obj in enumerate(udacity_bucket.objects.filter(Prefix='song_data')):
    print(obj)
    if i > 3:
        break

s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAK128F9318786.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAV128F421A322.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAABD128F429CF47.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAACN128F9355673.json')


In [5]:
# print the content of the first file 
# boto3 s3 resource has no get_object
print(s3c.get_object(Bucket='udacity-dend', Key='song_data/A/A/A/TRAAAAK128F9318786.json')\
      ['Body'].read().decode('utf-8')[:500])

{"artist_id":"ARJNIUY12298900C91","artist_latitude":null,"artist_location":"","artist_longitude":null,"artist_name":"Adelitas Way","duration":213.9424,"num_songs":1,"song_id":"SOBLFFE12AF72AA5BA","title":"Scream","year":2009}


### Look into input files for the `log_data`

$\rightarrow$ Contain JSON files with user activity, ie songs played (with artist, authorisation status, ...., user_id)

In [6]:
# list Log Data 
# boto3 s3 client has no attribute 'Bucket'
song_data_bucket = s3r.Bucket("udacity-dend")
for i, obj in enumerate(udacity_bucket.objects.filter(Prefix='log_data')):
    print(obj)
    if i > 5:
        break

s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-04-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-05-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-06-events.json')


In [7]:
# print the content of the first file
print(s3c.get_object(Bucket='udacity-dend', Key='log_data/2018/11/2018-11-01-events.json')\
      ['Body'].read().decode('utf-8')[:500])

{"artist":null,"auth":"Logged In","firstName":"Walter","gender":"M","itemInSession":0,"lastName":"Frye","length":null,"level":"free","location":"San Francisco-Oakland-Hayward, CA","method":"GET","page":"Home","registration":1540919166796.0,"sessionId":38,"song":null,"status":200,"ts":1541105830796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"39"}
{"artist":null,"auth":"Logged In","firstNam


$\rightarrow$ error in load of `staging_events`:

1. check in AWS Redshift Query editor `select * from stl_load_errors`
2. error "Invalid timestamp format or value [YYYY-MM-DD HH24:MI:SS]"
3. From sample data above: "ts":1541105830796 is not a timestamp 
    -> might use BIGINT
    -> or adapt LOAD SQL -> chose this, compare sql_queries.py

### Look into input file `log_json_path.json`

$\rightarrow$ Contains column headers

In [8]:
# log_json_path.json
# inspired by https://www.slsmk.com/use-boto3-to-open-an-aws-s3-file-directly/
print(s3c.get_object(Bucket='udacity-dend', Key='log_json_path.json')['Body'].read().decode('utf-8'))

{
    "jsonpaths": [
        "$['artist']",
        "$['auth']",
        "$['firstName']",
        "$['gender']",
        "$['itemInSession']",
        "$['lastName']",
        "$['length']",
        "$['level']",
        "$['location']",
        "$['method']",
        "$['page']",
        "$['registration']",
        "$['sessionId']",
        "$['song']",
        "$['status']",
        "$['ts']",
        "$['userAgent']",
        "$['userId']"
    ]
}


In [9]:
s3c

<botocore.client.S3 at 0x7f1af7487550>

Get an impression of the song_data size

In [10]:
#inspired by https://newbedev.com/how-to-find-size-of-a-folder-inside-an-s3-bucket:
top_level_folders = dict()
num_files = 0
for key in s3c.list_objects(Bucket='udacity-dend')['Contents']:

    folder = key['Key'].split('/')[0]
    if folder == 'song-data':
        if num_files < 5:
            print("Key %s in folder %s. %d bytes" % (key['Key'], folder, key['Size']))
        num_files += 1
print(num_files)

Key song-data/ in folder song-data. 0 bytes
Key song-data/A/A/A/TRAAAAK128F9318786.json in folder song-data. 244 bytes
Key song-data/A/A/A/TRAAAAV128F421A322.json in folder song-data. 303 bytes
Key song-data/A/A/A/TRAAABD128F429CF47.json in folder song-data. 268 bytes
Key song-data/A/A/A/TRAAACN128F9355673.json in folder song-data. 262 bytes
905


# Develop the print statements in etl.py

In [11]:
# Develop print statement in etl.py
from sql_queries import copy_table_queries, insert_table_queries
for query in insert_table_queries:
    print(query.strip().split(' ')[2])

songplays
users
songs
artists
time
