In [1]:
import io
import json
import os
from pathlib import Path

import boto3
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

from tqdm.notebook import tqdm

In [2]:
# Credentials are read from the environment so that secrets never have to be
# typed into (or committed with) the notebook. The fallback is the same empty
# string the notebook used before, so behavior is unchanged when the variables
# are not set.
# NOTE(review): empty strings are presumably treated by boto3 as "no explicit
# credentials", letting its default credential chain apply — confirm.
S3_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
S3_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

# Local root directory for the generated parquet files. Override it with the
# DATA_PATH environment variable when running outside the default location.
DATA_PATH = Path(os.environ.get("DATA_PATH", "/usr/src/app/data/"))

In [3]:
# Module-level S3 resource handle; the read helpers and the bucket lookups
# in this notebook all go through this single object.
s3 = boto3.resource(
    service_name="s3",
    aws_access_key_id=S3_ACCESS_KEY_ID,
    aws_secret_access_key=S3_SECRET_ACCESS_KEY,
)


def read_json_from_s3(bucket_name: str, key: str) -> dict:
    """Download one S3 object and parse its content as JSON.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket that holds the object.
    key : str
        Key of the object inside the bucket.

    Returns
    -------
    dict
        The value produced by ``json.loads`` for the object body. It is not a
        DataFrame (the previous annotation was wrong); callers in this
        notebook pass it on as the ``raw_game`` dict.
        NOTE(review): a payload whose top level is not a JSON object would
        come back as a list/scalar instead — confirm the raw files are
        always objects.

    Raises
    ------
    UnicodeDecodeError
        If the body is not valid UTF-8.
    json.JSONDecodeError
        If the decoded body is not valid JSON.
    """
    # Uses the module-level `s3` resource; the whole body is read into memory.
    payload = s3.Object(bucket_name=bucket_name, key=key).get()["Body"].read()
    return json.loads(payload.decode("utf-8"))


def read_parquet_from_s3(bucket_name: str, key: str) -> pd.DataFrame:
    """Download one S3 object and load it as a parquet-backed DataFrame.

    The object body is read fully into memory and handed to
    ``pd.read_parquet`` through an in-memory bytes buffer.
    """
    response = s3.Object(bucket_name=bucket_name, key=key).get()
    raw_bytes = response["Body"].read()
    buffer = io.BytesIO(raw_bytes)
    return pd.read_parquet(buffer)

def get_df_players(raw_game: dict) -> pd.DataFrame:
    """Build a one-row-per-roster-entry DataFrame from a raw game payload.

    Parameters
    ----------
    raw_game : dict
        Parsed game JSON. Only the ``season`` and ``rosterSpots`` keys are
        read; each roster entry is read through ``playerId``, ``teamId``,
        ``firstName``/``lastName`` (their ``default`` value),
        ``sweaterNumber``, ``positionCode`` and ``headshot``.

    Returns
    -------
    pd.DataFrame
        Columns ``player_id``, ``team_id``, ``season``, ``first_name``,
        ``last_name``, ``sweater_number``, ``position_code``, ``headshot``.
        Missing fields become null values. A payload without roster entries
        yields an empty frame that still carries these columns, so every
        file written from the result has the same schema.
    """
    columns = [
        "player_id",
        "team_id",
        "season",
        "first_name",
        "last_name",
        "sweater_number",
        "position_code",
        "headshot",
    ]

    season = raw_game.get("season")
    # `or []` covers both a missing key and an explicit null roster.
    roster = raw_game.get("rosterSpots") or []

    records = [
        {
            "player_id": player.get("playerId"),
            "team_id": player.get("teamId"),
            "season": season,
            # The name objects may be absent or null; fall back to an empty
            # dict so a single incomplete entry does not abort the whole game.
            "first_name": (player.get("firstName") or {}).get("default"),
            "last_name": (player.get("lastName") or {}).get("default"),
            "sweater_number": player.get("sweaterNumber"),
            "position_code": player.get("positionCode"),
            "headshot": player.get("headshot"),
        }
        for player in roster
    ]

    return pd.DataFrame(records, columns=columns)

In [4]:
# Run the helpers on a single game file and display the resulting frame.
BUCKET_NAME = "frozen-facts-center-raw"
KEY = "games/2020/regular/2020020001.json"

raw_game = read_json_from_s3(BUCKET_NAME, KEY)
df_players = get_df_players(raw_game)

df_players

Unnamed: 0,player_id,team_id,season,first_name,last_name,sweater_number,position_code,headshot
0,8470880,4,20202021,Brian,Elliott,37,G,https://assets.nhle.com/mugs/nhl/20202021/PHI/...
1,8471215,5,20202021,Evgeni,Malkin,71,C,https://assets.nhle.com/mugs/nhl/20202021/PIT/...
2,8471675,5,20202021,Sidney,Crosby,87,C,https://assets.nhle.com/mugs/nhl/20202021/PIT/...
3,8471724,5,20202021,Kris,Letang,58,D,https://assets.nhle.com/mugs/nhl/20202021/PIT/...
4,8473512,4,20202021,Claude,Giroux,28,R,https://assets.nhle.com/mugs/nhl/20202021/PHI/...
5,8474027,4,20202021,Justin,Braun,61,D,https://assets.nhle.com/mugs/nhl/20202021/PHI/...
6,8474037,4,20202021,James,van Riemsdyk,25,L,https://assets.nhle.com/mugs/nhl/20202021/PHI/...
7,8474098,5,20202021,Colton,Sceviour,7,C,https://assets.nhle.com/mugs/nhl/20202021/PIT/...
8,8474161,4,20202021,Jakub,Voracek,93,R,https://assets.nhle.com/mugs/nhl/20202021/PHI/...
9,8475208,5,20202021,Brian,Dumoulin,8,D,https://assets.nhle.com/mugs/nhl/20202021/PIT/...


In [11]:
bucket_raw = s3.Bucket("frozen-facts-center-raw")
bucket_base = s3.Bucket("frozen-facts-center-base")

# List the raw bucket exactly once. Previously `objects.all()` was enumerated a
# second time just to size the progress bar, which repeated the whole listing.
# Non-JSON keys (e.g. folder placeholders) are filtered out here so that the
# progress-bar total matches the number of files actually converted.
raw_game_objects = [
    obj for obj in bucket_raw.objects.all() if obj.key.endswith(".json")
]

for obj_raw in tqdm(raw_game_objects):

    # read data
    raw_game = read_json_from_s3(bucket_name=bucket_raw.name, key=obj_raw.key)
    df = get_df_players(raw_game=raw_game)

    # mirror the raw key layout: games/<...>.json -> players/<...>.parquet
    parquet_file_key = obj_raw.key.replace("games/", "players/").replace(".json", ".parquet")

    # Alternative target: write straight to the base bucket instead of local disk.
    # df.to_parquet(path=f"s3://{bucket_base.name}/{parquet_file_key}", index=False)

    # save data into a local PARQUET file, creating the directory tree on demand
    file_path = DATA_PATH / parquet_file_key
    file_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path=file_path)

  0%|          | 0/4485 [00:00<?, ?it/s]