In [None]:
import json
import boto3
import requests
import pandas as pd
from datetime import datetime

## Get files from the bucket
**NOTE:** None of the steps below will work unless you have set your AWS credentials in the poetry shell:
- `export AWS_PROFILE=<your profile name>`

In [None]:
# Name of the S3 bucket this notebook reads from
BUCKET_NAME = 'my-mvp-pipeline-bucket'

# boto3 resource handle for S3, and the Bucket object the cells below list and read from
s3 = boto3.resource('s3')
bucket = s3.Bucket(BUCKET_NAME)

In [None]:
# Collect the key of every object stored under the 'events/' prefix
# (each file is a batched output from the kinesis firehose)
events_objects = bucket.objects.filter(Prefix='events/')
file_keys = [summary.key for summary in events_objects]
# Show the first 5 keys in the list
file_keys[:5]

In [None]:
# Read a selected file into memory

# Create the file as an object
file = s3.Object(bucket.name, file_keys[0])
# Read (download) the object and decode its bytes into one string
json_line = file.get()['Body'].read().decode().strip()
# In the pipeline .put_record() method we added a \n so that we can now split on it to separate the events.
# Blank lines are skipped, so an empty file or a stray newline yields no event
# instead of making json.loads('') raise a JSONDecodeError.
json_list = [json.loads(line) for line in json_line.split('\n') if line.strip()]
# Show the first two events
json_list[:2]

In [None]:
# Turn the JSONs into a dataframe so we can work with them (nested keys are joined with '_')
df = pd.json_normalize(json_list, sep='_')
# Convert the timestamp (given in seconds, see unit='s') to a human-readable datetime.
# Plain column assignment replaces the column with the new datetime values, whereas
# `df.loc[:, col] = ...` tries to write into the existing column in place, which newer
# pandas versions deprecate when the new values have an incompatible dtype.
df['utc_timestamp'] = pd.to_datetime(df['utc_timestamp'], unit='s')

In [None]:
# Preview the first rows instead of rendering the whole dataframe
df.head()

## Working with dates
It is even better to load the data one date at a time, so you don't have to load everything at once.

In [None]:
# The parsing below expects keys shaped like 'events/<YYYY>/<MM>/<DD>/...':
# path segments 1-3 are joined with '-' and parsed as a date.
available_dates = {
    datetime.strptime('-'.join(key.split('/')[1:4]), '%Y-%m-%d').date()
    for key in file_keys
}
# Print in chronological order as 'YYYY-MM-DD' strings (the format load_date() takes),
# since a raw set prints in arbitrary order
print('Available dates:', ', '.join(d.isoformat() for d in sorted(available_dates)))

In [None]:
def load_file(file_path):
    """Download one object from the bucket and return its text content.

    Args:
        file_path: Key of the object inside ``bucket``.

    Returns:
        The object's body decoded to ``str``, with leading and trailing
        whitespace stripped.
    """
    s3_object = s3.Object(bucket.name, file_path)
    raw_bytes = s3_object.get()['Body'].read()
    text = raw_bytes.decode()
    return text.strip()

def load_date(date_str: str) -> list:
    """Load every event stored under one date folder of the bucket.

    Args:
        date_str: Date to load, formatted as 'YYYY-MM-DD'.

    Returns:
        A list with one parsed JSON value per non-blank line, across all files
        whose key starts with 'events/YYYY/MM/DD/'. Empty if no file matches.

    Raises:
        ValueError: If ``date_str`` cannot be parsed as a 'YYYY-MM-DD' date, or if
            a non-blank line is not valid JSON (json.JSONDecodeError).
    """
    # Parsing validates the input before it is used to build the S3 prefix
    day = datetime.strptime(date_str, '%Y-%m-%d').date()
    # isoformat() gives zero-padded 'YYYY-MM-DD'; swapping the separators yields the key layout
    prefix = f"events/{day.isoformat().replace('-', '/')}/"
    events = []
    for summary in bucket.objects.filter(Prefix=prefix):
        # Skip blank lines so an empty file or a stray newline doesn't make json.loads('') raise
        events.extend(
            json.loads(line)
            for line in load_file(summary.key).split('\n')
            if line.strip()
        )
    return events

In [None]:
# Date to load, as a 'YYYY-MM-DD' string (load_date parses it into a date object)
TARGET_DATE = '2020-10-29'
jl = load_date(TARGET_DATE)

In [None]:
# Flatten the events loaded for the date into a dataframe (nested keys are joined with '_')
df = pd.json_normalize(jl, sep='_')
# Convert the timestamp to a human-readable datetime, as is done for the single-file
# dataframe earlier in the notebook. Guarded because a date with no events yields a
# dataframe without columns.
if 'utc_timestamp' in df.columns:
    df['utc_timestamp'] = pd.to_datetime(df['utc_timestamp'], unit='s')

In [None]:
# Preview the first rows instead of rendering the whole dataframe
df.head()