In [1]:
import io
import json
import os
from pathlib import Path

import boto3
import pandas as pd
# Let DataFrame previews render up to 100 rows and 100 columns.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, 100)

from tqdm.notebook import tqdm

In [2]:
# S3 credentials are read from the environment so that secrets never have to
# be typed into (and saved with) the notebook. When a variable is not set the
# value stays an empty string, exactly as before.
S3_ACCESS_KEY_ID = os.environ.get("S3_ACCESS_KEY_ID", "")
S3_SECRET_ACCESS_KEY = os.environ.get("S3_SECRET_ACCESS_KEY", "")

# Local root directory under which the generated parquet files are written.
DATA_PATH = Path("/usr/src/app/data/")

In [3]:
# Shared S3 resource used by every helper below.
# Blank credentials are passed as None rather than "": an explicit empty
# string would be used as the actual key, whereas None lets boto3 fall back to
# its default credential chain (environment, shared config, instance role).
s3 = boto3.resource(
    "s3",
    aws_access_key_id=S3_ACCESS_KEY_ID or None,
    aws_secret_access_key=S3_SECRET_ACCESS_KEY or None,
)


def read_json_from_s3(bucket_name: str, key: str) -> dict:
    """Download an S3 object and parse its content as JSON.

    Args:
        bucket_name: Name of the bucket that holds the object.
        key: Key of the object inside the bucket.

    Returns:
        The decoded JSON document. It is annotated as ``dict`` because the
        result is handed to helpers in this notebook that take a ``dict``
        (it is the output of ``json.loads``, not a DataFrame).
    """
    body = s3.Object(bucket_name=bucket_name, key=key).get()["Body"]
    # The payload is read fully into memory and decoded as UTF-8 text.
    return json.loads(body.read().decode("utf-8"))


def read_parquet_from_s3(bucket_name: str, key: str) -> pd.DataFrame:
    """Download a parquet object from S3 and load it into a DataFrame.

    Args:
        bucket_name: Name of the bucket that holds the object.
        key: Key of the parquet object inside the bucket.

    Returns:
        The DataFrame produced by ``pd.read_parquet`` from the object's bytes.
    """
    s3_object = s3.Object(bucket_name=bucket_name, key=key)
    raw_bytes = s3_object.get()["Body"].read()
    # Wrap the bytes in an in-memory buffer so pandas can read them like a file.
    buffer = io.BytesIO(raw_bytes)
    return pd.read_parquet(buffer)

def get_general_game_features(game: dict) -> dict:
    """Extract the game-level fields shared by every row derived from a game.

    Args:
        game: Parsed game document.

    Returns:
        A dict with the keys ``game_id``, ``game_date``, ``away_team_id`` and
        ``home_team_id``. Any value that is absent from ``game`` is ``None``.
    """
    # `or {}` also covers a team entry that is present but null, which a plain
    # `.get(key, {})` default would not (the default only applies to a missing
    # key).
    away_team = game.get("awayTeam") or {}
    home_team = game.get("homeTeam") or {}
    return {
        "game_id": game.get("id"),
        "game_date": game.get("gameDate"),
        "away_team_id": away_team.get("id"),
        "home_team_id": home_team.get("id"),
    }

def get_df_players(game: dict) -> pd.DataFrame:
    """Build one row per roster spot of a game.

    Each row combines the general game features with the player's fields.

    Args:
        game: Parsed game document; players are read from ``rosterSpots``.

    Returns:
        A DataFrame with the general game feature columns followed by
        ``player_id``, ``team_id``, ``season``, ``first_name``, ``last_name``,
        ``sweater_number``, ``position_code`` and ``headshot``. Missing values
        are ``None``. When the game has no roster spots the frame is empty but
        still carries these columns.
    """
    # Game-level values are identical for every player: compute them once.
    game_features = get_general_game_features(game=game)
    season = game.get("season")

    records = []
    # `or []` tolerates a missing or null roster instead of raising TypeError.
    for player in game.get("rosterSpots") or []:
        # Names are nested objects; guard against a missing or null entry so
        # one incomplete player does not abort the whole game.
        first_name = player.get("firstName") or {}
        last_name = player.get("lastName") or {}
        records.append(
            {
                **game_features,
                "player_id": player.get("playerId"),
                "team_id": player.get("teamId"),
                "season": season,
                "first_name": first_name.get("default"),
                "last_name": last_name.get("default"),
                "sweater_number": player.get("sweaterNumber"),
                "position_code": player.get("positionCode"),
                "headshot": player.get("headshot"),
            }
        )

    # Explicit columns keep the schema stable even when there are no records;
    # the order matches the key order of the record dicts above.
    columns = [
        *game_features,
        "player_id",
        "team_id",
        "season",
        "first_name",
        "last_name",
        "sweater_number",
        "position_code",
        "headshot",
    ]
    return pd.DataFrame(records, columns=columns)

In [4]:
BUCKET_NAME = "frozen-facts-center-raw"
KEY = "games/2020/regular/2020020003.json"

# Sanity check on a single game before processing the whole bucket.
game = read_json_from_s3(bucket_name=BUCKET_NAME, key=KEY)
df_players = get_df_players(game=game)

# Preview only the first rows instead of rendering the entire roster.
df_players.head(10)

Unnamed: 0,game_id,game_date,away_team_id,home_team_id,player_id,team_id,season,first_name,last_name,sweater_number,position_code,headshot
0,2020020003,2021-01-13,8,10,8466138,10,20202021,Joe,Thornton,97,C,https://assets.nhle.com/mugs/nhl/20202021/TOR/...
1,2020020003,2021-01-13,8,10,8469455,10,20202021,Jason,Spezza,19,C,https://assets.nhle.com/mugs/nhl/20202021/TOR/...
2,2020020003,2021-01-13,8,10,8470642,8,20202021,Shea,Weber,6,D,https://assets.nhle.com/mugs/nhl/20202021/MTL/...
3,2020020003,2021-01-13,8,10,8471679,8,20202021,Carey,Price,31,G,https://assets.nhle.com/mugs/nhl/20202021/MTL/...
4,2020020003,2021-01-13,8,10,8473507,8,20202021,Jeff,Petry,26,D,https://assets.nhle.com/mugs/nhl/20202021/MTL/...
5,2020020003,2021-01-13,8,10,8474038,8,20202021,Paul,Byron,41,L,https://assets.nhle.com/mugs/nhl/20202021/MTL/...
6,2020020003,2021-01-13,8,10,8474162,10,20202021,Jake,Muzzin,8,D,https://assets.nhle.com/mugs/nhl/20202021/TOR/...
7,2020020003,2021-01-13,8,10,8474190,10,20202021,Wayne,Simmonds,24,R,https://assets.nhle.com/mugs/nhl/20202021/TOR/...
8,2020020003,2021-01-13,8,10,8474567,10,20202021,Zach,Bogosian,22,D,https://assets.nhle.com/mugs/nhl/20202021/TOR/...
9,2020020003,2021-01-13,8,10,8474596,8,20202021,Jake,Allen,34,G,https://assets.nhle.com/mugs/nhl/20202021/MTL/...


In [5]:
bucket_raw = s3.Bucket("frozen-facts-center-raw")
bucket_base = s3.Bucket("frozen-facts-center-base")

# List the raw bucket a single time and keep only the JSON objects (other keys,
# such as folder placeholders, are skipped). The progress bar total then
# matches the number of files that are actually processed, and the bucket is
# not listed a second time just to count its objects.
raw_json_objects = [
    obj for obj in bucket_raw.objects.all() if obj.key.endswith(".json")
]

for obj_raw in tqdm(raw_json_objects):

    # read data
    game = read_json_from_s3(bucket_name=bucket_raw.name, key=obj_raw.key)
    df = get_df_players(game=game)

    # save data into PARQUET file
    parquet_file_key = obj_raw.key.replace("games/", "players/").replace(".json", ".parquet")
    # NOTE: the direct upload to the base bucket is currently commented out;
    # files are written locally under DATA_PATH instead.
    # df.to_parquet(path=f"s3://{bucket_base.name}/{parquet_file_key}", index=False)

    file_path = DATA_PATH / parquet_file_key
    file_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path=file_path)

  0%|          | 0/5153 [00:00<?, ?it/s]