# First look at the datasets
Using the AWS Python SDK (boto3)

In [1]:
import os
import pandas as pd
import boto3
import json

## Make sure you have an AWS secret and access key

- Create a new IAM user in your AWS account
- Give it the `AdministratorAccess` policy from the `Attach existing policies directly` tab
- Take note of the access key and secret 
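- Edit the file `test_dwh.cfg` in the same folder as this notebook and fill in the `[AWS]` section as shown below (the notebook also reads `SONG_DATA` and `LOG_DATA` from an `[S3]` section of the same file):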
<font color='red'>
<BR>
[AWS]<BR>
KEY= YOUR_AWS_KEY<BR>
SECRET= YOUR_AWS_SECRET<BR>
</font>

### Load AWS Params from a file

In [5]:
import configparser

# Parse test_dwh.cfg from the current working directory.
# read_file() (rather than read()) is kept on purpose: it raises if the file
# is missing instead of silently producing an empty config. The context
# manager guarantees the file handle is closed once parsing is done.
config = configparser.ConfigParser()
with open(os.path.join(os.getcwd(), 'test_dwh.cfg')) as cfg_file:
    config.read_file(cfg_file)

In [6]:
# Pull the access key pair out of the [AWS] section of the parsed config.
KEY = config.get('AWS', 'KEY')
SECRET = config.get('AWS', 'SECRET')

# Resource-level S3 handle, authenticated with the key pair read above.
s3 = boto3.resource(
    's3',
    region_name="us-east-1",
    aws_access_key_id=KEY,
    aws_secret_access_key=SECRET,
)

### Check out the sample data sources on S3

#### Song Dataset

In [7]:
# Look up the song dataset location stored under [S3] in the config,
# then echo it as the cell output.
SONG_DATA_LINK = config.get(
    'S3',
    'SONG_DATA',
)
SONG_DATA_LINK

"'s3://udacity-dend/song_data'"

In [8]:
# Sanity check: show the first five object summaries under the song_data
# prefix to confirm we are pointed at the expected bucket.
sampleDbBucket = s3.Bucket('udacity-dend')

first_song_objects = sampleDbBucket.objects.filter(Prefix='song_data').limit(count=5)
for summary in first_song_objects:
    print(summary)

s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAK128F9318786.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAV128F421A322.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAABD128F429CF47.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAACN128F9355673.json')


In [63]:
# Total objects and total size in song dataset

total_object = 0
total_size = 0
# The loop variable is deliberately not named `object`: that would rebind the
# builtin for every cell executed afterwards in this kernel.
for summary in sampleDbBucket.objects.filter(Prefix='song_data'):
    total_size += summary.size
    total_object += 1

print("Total object is \t", total_object)
# Sizes are summed as returned by `.size` and divided by 1024**3 for display.
print("Total size (gb) is \t", total_size/(1024**3))

Total object is 	 14897
Total size (gb) is 	 0.003461006097495556


In [67]:
# Fetch a single song file from the bucket so it can be inspected locally.
# `outPutName` is the local file name, written relative to the working directory.
key = 'song_data/A/A/A/TRAAAAK128F9318786.json'
outPutName = 'TRAAAAK128F9318786.json'

sampleDbBucket.download_file(Key=key, Filename=outPutName)

In [76]:
# Load the downloaded song file into a DataFrame; lines=True parses it as
# one JSON record per line.
song_file_path = os.path.join(os.getcwd(), outPutName)
df = pd.read_json(song_file_path, lines=True)

In [77]:
# Preview the first rows (head() defaults to 5).
df.head()

Unnamed: 0,artist_id,artist_latitude,artist_location,artist_longitude,artist_name,duration,num_songs,song_id,title,year
0,ARJNIUY12298900C91,,,,Adelitas Way,213.9424,1,SOBLFFE12AF72AA5BA,Scream,2009


#### Log Dataset

In [78]:
# Look up the log dataset location stored under [S3] in the config,
# then echo it as the cell output.
LOG_DATA_LINK = config.get(
    'S3',
    'LOG_DATA',
)
LOG_DATA_LINK

"'s3://udacity-dend/log_data'"

In [80]:
# Sanity check: show the first five object summaries under the log_data
# prefix to confirm we are pointed at the expected bucket.
sampleDbBucket = s3.Bucket('udacity-dend')

first_log_objects = sampleDbBucket.objects.filter(Prefix='log_data').limit(count=5)
for summary in first_log_objects:
    print(summary)

s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-03-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-04-events.json')


In [82]:
# Total objects and total size in log dataset

total_object = 0
total_size = 0
# The loop variable is deliberately not named `object`: that would rebind the
# builtin for every cell executed afterwards in this kernel.
for summary in sampleDbBucket.objects.filter(Prefix='log_data'):
    total_size += summary.size
    total_object += 1

print("Total object is \t", total_object)
# Sizes are summed as returned by `.size` and divided by 1024**3 for display.
print("Total size (gb) is \t", total_size/(1024**3))

Total object is 	 31
Total size (gb) is 	 0.003497673198580742


In [83]:
# Fetch a single file from the log dataset so it can be inspected locally.
# `outPutName` is the local file name, written relative to the working directory.
key = 'log_data/2018/11/2018-11-01-events.json'
outPutName = '2018-11-01-events.json'

sampleDbBucket.download_file(Key=key, Filename=outPutName)

In [84]:
# Load the downloaded log file into a DataFrame (lines=True parses it as one
# JSON record per line) and preview the first rows; head() defaults to 5.
log_file_path = os.path.join(os.getcwd(), outPutName)
df = pd.read_json(log_file_path, lines=True)
df.head()

Unnamed: 0,artist,auth,firstName,gender,itemInSession,lastName,length,level,location,method,page,registration,sessionId,song,status,ts,userAgent,userId
0,,Logged In,Walter,M,0,Frye,,free,"San Francisco-Oakland-Hayward, CA",GET,Home,1540919166796,38,,200,1541105830796,"""Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4...",39
1,,Logged In,Kaylee,F,0,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Home,1540344794796,139,,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
2,Des'ree,Logged In,Kaylee,F,1,Summers,246.30812,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,You Gotta Be,200,1541106106796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
3,,Logged In,Kaylee,F,2,Summers,,free,"Phoenix-Mesa-Scottsdale, AZ",GET,Upgrade,1540344794796,139,,200,1541106132796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8
4,Mr Oizo,Logged In,Kaylee,F,3,Summers,144.03873,free,"Phoenix-Mesa-Scottsdale, AZ",PUT,NextSong,1540344794796,139,Flat 55,200,1541106352796,"""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",8


In [85]:
# Fetch log_json_path.json from the bucket root into the working directory
# so its contents can be examined locally.
key = 'log_json_path.json'
outPutName = 'log_json_path.json'

sampleDbBucket.download_file(Key=key, Filename=outPutName)