## In this notebook

- Generate players data in three different versions.
    - v1 - players data extracted from play-by-play data; all columns
    - v2 - players data extracted from play-by-play data + players game log data; all columns
    - v3 - players data extracted from play-by-play data + players game log data; subset of columns

In [1]:
import io
import json
import os
from pathlib import Path
from typing import List

import boto3
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import requests

from tqdm.notebook import tqdm

In [2]:
# AWS credentials used to create the S3 resource. They are read from the
# environment so that real secrets never have to be typed into (and saved
# with) the notebook; an unset variable falls back to the empty string.
S3_ACCESS_KEY_ID = os.environ.get("S3_ACCESS_KEY_ID", "")
S3_SECRET_ACCESS_KEY = os.environ.get("S3_SECRET_ACCESS_KEY", "")

# Local root directory under which the generated parquet files are stored.
DATA_PATH = Path("/usr/src/app/data/")

In [3]:
# Module-level S3 resource handle; the S3 helper functions and the bucket
# loops in this notebook all go through it.
s3 = boto3.resource(
    service_name="s3",
    aws_access_key_id=S3_ACCESS_KEY_ID,
    aws_secret_access_key=S3_SECRET_ACCESS_KEY,
)


def read_json_from_s3(bucket_name: str, key: str) -> dict:
    """Download an S3 object and parse its body as UTF-8 encoded JSON.

    Args:
        bucket_name: Name of the bucket that holds the object.
        key: Key of the object inside the bucket.

    Returns:
        The value produced by ``json.loads`` for the object's content.
        Callers in this notebook pass it on as a ``dict`` describing one
        game; strictly speaking the type is whatever the top-level JSON
        value of the object is.
    """
    body = s3.Object(bucket_name=bucket_name, key=key).get()["Body"]
    return json.loads(body.read().decode("utf-8"))


def read_parquet_from_s3(bucket_name: str, key: str) -> pd.DataFrame:
    """Download an S3 object and load its bytes as a parquet file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        key: Key of the object inside the bucket.

    Returns:
        The DataFrame that ``pd.read_parquet`` builds from the object's
        content.
    """
    s3_object = s3.Object(bucket_name=bucket_name, key=key)
    raw_bytes = s3_object.get()["Body"].read()

    # pandas needs a file-like object, so wrap the downloaded bytes in memory.
    buffer = io.BytesIO(raw_bytes)
    return pd.read_parquet(buffer)

def get_general_game_features(game: dict) -> dict:
    """Pick the game-level fields that are shared by every row of a game.

    Args:
        game: Parsed game document.

    Returns:
        A dict with the keys ``game_id``, ``game_date``, ``away_team_id``
        and ``home_team_id``. A value is ``None`` when the corresponding
        entry is absent from ``game``.
    """
    # A missing team entry is replaced by an empty dict so the id lookup
    # simply yields None.
    away_team = game.get("awayTeam", {})
    home_team = game.get("homeTeam", {})

    return dict(
        game_id=game.get("id"),
        game_date=game.get("gameDate"),
        away_team_id=away_team.get("id"),
        home_team_id=home_team.get("id"),
    )

def get_game_log_features(
    player_id: int,
    game_id: int,
    season: str,
    season_type: str,
    timeout: float = 30.0,
) -> dict:
    """Fetch a player's game log and extract the stats of one game.

    The game log is requested from
    ``https://api-web.nhle.com/v1/player/{player_id}/game-log/{season}/{season_type}``
    and the entry of its ``"gameLog"`` list whose ``"gameId"`` equals
    ``game_id`` is used.

    Args:
        player_id: Player identifier placed into the request URL.
        game_id: Identifier of the game whose log entry is wanted.
        season: Season segment of the request URL.
        season_type: Season-type segment of the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        A dict of per-game stats. Every value is ``None`` when the response
        status is not OK, when no entry matches ``game_id``, or when the
        matching entry lacks that stat.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(
        url=f"https://api-web.nhle.com/v1/player/{player_id}/game-log/{season}/{season_type}",
        timeout=timeout,
    )

    game_log = {}

    if response.ok:
        # Tolerate a missing or null "gameLog" and entries without "gameId"
        # instead of failing on them.
        entries = response.json().get("gameLog") or []
        game_log = next(
            (entry for entry in entries if entry.get("gameId") == game_id),
            {},
        )

    return {
        "goals": game_log.get("goals"),
        "assists": game_log.get("assists"),
        "points": game_log.get("points"),
        "pim": game_log.get("pim"),
        "toi": game_log.get("toi"),

        # goaltender stats
        "games_started": game_log.get("gamesStarted"),
        "shots_against": game_log.get("shotsAgainst"),
        "goals_against": game_log.get("goalsAgainst"),
        "save_pctg": game_log.get("savePctg"),
        "shutouts": game_log.get("shutouts"),

        # skater stats
        "plus_minus": game_log.get("plusMinus"),
        "power_play_goals": game_log.get("powerPlayGoals"),
        "power_play_points": game_log.get("powerPlayPoints"),
        "game_winning_goals": game_log.get("gameWinningGoals"),
        "ot_goals": game_log.get("otGoals"),
        "shots": game_log.get("shots"),
        "shifts": game_log.get("shifts"),
        "shorthanded_goals": game_log.get("shorthandedGoals"),
        "shorthanded_points": game_log.get("shorthandedPoints"),
    }

def get_df_players(game: dict) -> pd.DataFrame:
    """Build one row per roster spot of a game.

    Args:
        game: Parsed game document. Its ``"rosterSpots"`` entry is iterated
            to produce the rows.

    Returns:
        A DataFrame in which every row combines the general game features
        with the player's id, team id, season, names, sweater number,
        position code and headshot. The DataFrame is empty when the game has
        no ``"rosterSpots"``; individual fields that are missing become
        ``None``.
    """
    # Identical for every player of the game, so compute them only once.
    general_features = get_general_game_features(game=game)
    season = game.get("season")

    # Game-log enrichment is currently disabled. To re-enable it, compute
    #     game_id = game.get("id")
    #     season_start_year = int(game_id / 1e+6)
    # and add the following to each row below:
    #     **get_game_log_features(
    #         player_id=player.get("playerId"),
    #         game_id=game.get("id"),
    #         season=f"{season_start_year}{season_start_year + 1}",
    #         season_type=str(game_id)[5],
    #     )

    rows = [
        {
            **general_features,
            "player_id": player.get("playerId"),
            "team_id": player.get("teamId"),
            "season": season,
            # Names are nested dicts; fall back to None when one is missing.
            "first_name": (player.get("firstName") or {}).get("default"),
            "last_name": (player.get("lastName") or {}).get("default"),
            "sweater_number": player.get("sweaterNumber"),
            "position_code": player.get("positionCode"),
            "headshot": player.get("headshot"),
        }
        # A game without roster information yields no rows instead of a
        # TypeError.
        for player in game.get("rosterSpots") or []
    ]

    return pd.DataFrame(rows)

In [4]:
# Bucket and object key of a single game file; they are only referenced by
# the commented-out lines below, which run the helpers on that one game.
BUCKET_NAME = "frozen-facts-center-raw"
KEY = "games/2020/regular/2020020003.json"

# game = read_json_from_s3(bucket_name=BUCKET_NAME, key=KEY)
# df_players = get_df_players(game=game)

# df_players

In [5]:
bucket_raw = s3.Bucket("frozen-facts-center-raw")
# bucket_base = s3.Bucket("frozen-facts-center-base")

# List the bucket once and reuse the result; listing it a second time just to
# obtain the progress-bar total would enumerate the whole bucket twice.
objects_raw = list(bucket_raw.objects.all())

for obj_raw in tqdm(objects_raw, total=len(objects_raw)):

    # skip folders (anything that is not a JSON file) before touching the
    # local file system, so no directories are created for them
    if not obj_raw.key.endswith(".json"):
        continue

    # create file path
    parquet_file_key = obj_raw.key.replace("games/", "players/").replace(".json", ".parquet")
    file_path = DATA_PATH / parquet_file_key

    # skip games whose parquet file already exists locally
    if file_path.exists():
        continue

    # read data
    game = read_json_from_s3(bucket_name=bucket_raw.name, key=obj_raw.key)
    df = get_df_players(game=game)

    # save data into PARQUET file
    # df.to_parquet(path=f"s3://{bucket_base.name}/{parquet_file_key}", index=False)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path=file_path)

    # break

  0%|          | 0/5156 [00:00<?, ?it/s]

### v3

In [14]:
# Columns kept in the v3 data set. The list is used below to select this
# subset (in this order) from every v2 parquet file.
cols_v3 = [
    # general game info
    "game_id",
    "game_date",

    # game play-by-play (per-player roster fields)
    "player_id",
    "team_id",
    "season",
    "first_name",
    "last_name",
    "sweater_number",
    "position_code",
    "headshot",

    # game log data
    "pim",
    "toi",
    "shifts",
    "plus_minus",
    "power_play_goals",
    "power_play_points",
    "shorthanded_goals",
    "shorthanded_points",
    "game_winning_goals",
    "ot_goals",
]

In [16]:
# Source (v2) and target (v3) directories under the local data root.
dir_v2 = DATA_PATH / "players-v2"
dir_v3 = DATA_PATH / "players-v3"

files_v2 = list(dir_v2.glob("**/*.parquet"))

for file_path_v2 in tqdm(files_v2, total=len(files_v2)):

    # create new path: same relative location, but below the v3 directory.
    # Working on path components avoids rewriting a "players-v2" substring
    # that happens to appear elsewhere in the path.
    file_path_v3 = dir_v3 / file_path_v2.relative_to(dir_v2)
    file_path_v3.parent.mkdir(parents=True, exist_ok=True)

    # save data
    df = pd.read_parquet(file_path_v2)
    df.loc[:, cols_v3].to_parquet(path=file_path_v3)

  0%|          | 0/5147 [00:00<?, ?it/s]