# Feature exploration

## Setup

In [None]:
import os
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
import numpy as np
import sqlalchemy
from airflow.models import Variable
from catboost import CatBoostRegressor, Pool
from google.cloud import storage
import tempfile

load_dotenv();

In [None]:
# Credentials and host names are read through Airflow's Variable.get.
POSTGRES_USER = Variable.get("POSTGRES_USER")
POSTGRES_PW = Variable.get("POSTGRES_PW")
POSTGRES_HOST = Variable.get("POSTGRES_HOST")
MSF_API_KEY = Variable.get("MSF_API_KEY")
# Use the "postgresql" dialect name: the legacy "postgres" alias is rejected
# by SQLAlchemy 1.4+ (NoSuchModuleError), while "postgresql" works everywhere.
postgres_connection_str = f'postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PW}@{POSTGRES_HOST}/nba'

engine = sqlalchemy.create_engine(postgres_connection_str)

## Lineups

In [None]:
# Load every lineup row (no filter on lineup type) joined to its game.
# `starter` is true when the position text begins with 'Starter'.
# NOTE(review): '%%' is presumably the escaped form of a literal '%' for the
# DB driver's parameter style — confirm before changing it.
# The result column for gm.playedStatus is addressed as "playedstatus"
# (lower case) by the cells below.
with engine.connect() as conn:
    sqlq = """
        select
              gm.season
            , lu.game_id
            , lu.team_id
            , lu.player_id
            , lu.type
            , (lu.position like 'Starter%%') as starter
            , lu.position
            , gm.playedStatus
        from lineups as lu
        join games as gm
            on lu.game_id = gm.id
    """
    lineups = pd.read_sql(sqlq, conn)

In [None]:
# Per (type, season, playedstatus): number of lineup rows with a game_id, and
# number of distinct game_id values.
lineups.groupby(["type", "season", "playedstatus"]).aggregate(
    count=("game_id", "count"),
    unique=("game_id", lambda ids: ids.nunique(dropna=False)),
)

### Compare accuracy of expected vs actual

#### Played

In [None]:
# Per season, the share of completed-game lineup rows found in both lineups
# ("both"), only in the actual lineup ("left_only"), or only in the expected
# lineup ("right_only").
#
# The COMPLETED filter is applied BEFORE the merge. After the outer merge,
# actual-only rows carry NaN in every expected-side column (including
# playedstatus), so filtering afterwards silently dropped all of them.
(
    lineups
    .query("playedstatus == 'COMPLETED'")
    .pipe(
        lambda completed:
            completed
            .query("type == 'actual'")
            .drop(columns=["type", "playedstatus"])
            .merge(
                completed.query("type == 'expected'"),
                how="outer",
                on=["player_id", "game_id", "team_id", "season"],
                indicator=True,
            )
    )
    .pivot_table(
        index=["season"],
        columns=["_merge"],
        values=["player_id"],
        aggfunc="count",
    )
    # Normalise each season's counts so the row sums to 1.
    .apply(lambda row: row / row.sum(), axis=1)
)

#### Starting

In [None]:
# Per season and merge outcome, the share of completed-game lineup rows in
# each starter-prediction category.
#
# The COMPLETED filter is applied BEFORE the merge. After the outer merge,
# actual-only rows carry NaN in every expected-side column (including
# playedstatus), so filtering afterwards silently dropped all of them.
(
    lineups
    .query("playedstatus == 'COMPLETED'")
    .pipe(
        lambda completed:
            completed
            .query("type == 'actual'")
            .drop(columns=["type", "playedstatus"])
            .merge(
                completed.query("type == 'expected'"),
                how="outer",
                on=["player_id", "game_id", "team_id", "season"],
                indicator=True,
            )
    )
    # A player missing from one side of the merge was not a starter there.
    .fillna({"starter_x": False, "starter_y": False})
    .assign(
        # starter_x comes from the actual lineup (left side), starter_y from
        # the expected lineup (right side); the code is actual + 2 * expected.
        starter=lambda merged: (
            (merged["starter_x"].astype(int) + merged["starter_y"].astype(int) * 2)
            .map({0: "True non-starter", 1: "False non-starter", 2: "False starter", 3: "True starter"})
        )
    )
    .pivot_table(
        index=["season", "_merge"],
        columns=["starter"],
        values=["player_id"],
        aggfunc="count",
    )
    # Normalise each row's counts so it sums to 1.
    .apply(lambda row: row / row.sum(), axis=1)
)

## Target variables

Targets: assists, 2pt fg, 3pt fg, ftm, rebounds, steals, blocks, turnovers, fantasy points

### Data quality checks

In [None]:
# Missing game logs: count, per season, the games marked COMPLETED that have
# no row at all in player_gamelogs.
with engine.connect() as conn:
    sqlq = """
        select
            season, count(*)
        from games as gm
        where
            not exists (
                select * from player_gamelogs as gl
                where gl.game_id = gm.id
            )
            and gm.playedStatus = 'COMPLETED'
        group by 1
        order by 1
    """
    missing_games = pd.read_sql(sqlq, conn)

In [None]:
# Display the per-season count of completed games without player game logs.
missing_games

### Player game stats

In [None]:
# Build one row per 'expected' lineup entry, enriched with:
#   * the game's season, start time and played status,
#   * the matching 'actual' lineup entry (left join, so it may be null),
#   * the player's game-log stats (left join, so they may be null),
#   * fanduel_fpts, a weighted sum of those stats.
# Missing stats are coalesced to 0 inside the fanduel_fpts formula, so a row
# with no game log gets fanduel_fpts = 0 rather than null.
# Rows come back ordered by player_id, then starttime.
# NOTE(review): `ast` is read from stats_offense_astpergame — confirm this
# column holds the single-game assist count rather than a running average.
with engine.connect() as conn:
    sqlq = """
        with stats as (
            select
                  gm.season
                , lu.game_id
                , lu.team_id
                , lu.player_id
                , lu.type
                , (lu.position like 'Starter%%')::int as expected_starter
                , lu.position as expected_position
                , (alu.position like 'Starter%%')::int as starter
                , alu.position as actual_position
                , gm.starttime
                , gm.playedstatus
                , gl.stats_fieldgoals_fg2ptmade as fgm_2p
                , gl.stats_fieldgoals_fg3ptmade as fgm_3p
                , gl.stats_freethrows_ftmade as ftm
                , gl.stats_rebounds_reb as trb
                , gl.stats_offense_astpergame as ast
                , gl.stats_defense_stl as stl
                , gl.stats_defense_blk as blk
                , gl.stats_defense_tov as tov
                , gl.stats_miscellaneous_minseconds as secs
            from lineups as lu
            join public.games as gm
                on lu.game_id = gm.id
            left join public.player_gamelogs as gl
                on lu.game_id = gl.game_id
                and lu.team_id = gl.team_id
                and lu.player_id = gl.player_id
            left join public.lineups as alu
                on lu.game_id = alu.game_id
                and lu.team_id = alu.team_id
                and lu.player_id = alu.player_id
                and alu.type = 'actual'
            where
                lu.type = 'expected'
        )
        select
            *
            , coalesce(fgm_2p, 0) * 2
                + coalesce(fgm_3p, 0) * 3
                + coalesce(ftm, 0)
                + coalesce(trb, 0) * 1.2
                + coalesce(ast, 0) * 1.5
                + coalesce(blk, 0) * 3
                + coalesce(stl, 0) * 3
                - coalesce(tov, 0)
                as fanduel_fpts
        from stats
        order by player_id, starttime
    """
    player_game_stats = pd.read_sql(sqlq, conn)

In [None]:
# Summary statistics (DataFrame.describe) for each season / played-status group.
player_game_stats.groupby(["season", "playedstatus"]).apply(lambda x: x.describe())

## Trends in stats

In [None]:
def rolling_avg(df, stats, window):
    """Return lagged rolling means of ``stats`` for every row of ``df``.

    For each player, a rolling mean over the last ``window`` *played* games
    (rows with ``secs > 0``) is computed. The value attached to a row is the
    rolling mean as of the player's most recent played game strictly before
    that row, so a row never sees its own stats. Rows with no earlier full
    window get NaN.

    Rows are processed in the order they appear in ``df``; the function does
    not sort, so each player's rows are assumed to already be chronological.

    Args:
        df: Frame with ``player_id``, ``team_id``, ``game_id``, ``secs`` and
            every column named in ``stats``.
        stats: Names of the columns to average.
        window: Number of played games in each rolling window.

    Returns:
        A frame indexed like ``df`` holding ``player_id``, ``team_id``,
        ``game_id`` and one ``"{stat}_{window}g_avg"`` column per stat.
    """
    key_cols = ["player_id", "team_id", "game_id"]
    stat_cols = list(stats)
    player = df["player_id"]

    # Only games with playing time contribute to the rolling window.
    played = df.loc[df["secs"] > 0]
    played_avgs = (
        played
        .groupby("player_id")[stat_cols]
        .rolling(window=window)
        .mean()
        # groupby-rolling prepends the group key to the index; drop it so the
        # result is labelled by the original row index again.
        .droplevel(0)
    )

    lagged = (
        played_avgs
        # Align on the original index (not on a merge-reset one), so frames
        # with a non-default index stay correctly aligned. Unplayed games
        # become NaN rows here.
        .reindex(df.index)
        # Shift within each player so a game only sees earlier games.
        .groupby(player).shift(1)
        # Carry the last known average across games the player sat out.
        .groupby(player).ffill()
        .add_suffix(f"_{window}g_avg")
    )

    return df[key_cols].join(lagged)

In [None]:
# Stats to trend, averaged over trailing windows of 1, 3, 9 and 27 games.
columns = [
    'fgm_2p', 'fgm_3p', 'ftm', 'trb', 'ast',
    'stl', 'blk', 'tov', 'fanduel_fpts', 'secs',
]
last_game = rolling_avg(player_game_stats, columns, 1)
rolling_3gm = rolling_avg(player_game_stats, columns, 3)
rolling_9gm = rolling_avg(player_game_stats, columns, 9)
rolling_27gm = rolling_avg(player_game_stats, columns, 27)

## Opponent allowed stats

In [None]:
# TODO: not implemented yet. Planned steps:
#   1. team game stats
#   2. rolling average game stats
#   3. joined to player-game data

## Combine Features

In [None]:
# TODO: assert that all dfs being joined have the same number of rows, in the same order

In [None]:
# Attach each trend frame's new columns (those not already present in
# player_game_stats) to the base stats, joining on the row index.
combined = player_game_stats
for trend_frame in (last_game, rolling_3gm, rolling_9gm, rolling_27gm):
    new_cols = [col for col in trend_frame.columns if col not in player_game_stats.columns]
    combined = combined.join(trend_frame[new_cols])

# discard 2016 season as a warm start for trending metrics
combined = combined.loc[combined["season"] != '2016-2017-regular']

## Build model

In [None]:
# Game ids by status; completed games are shuffled in place before splitting.
completed_games = combined.query("playedstatus == 'COMPLETED'")["game_id"].unique()
upcoming_games = combined.query("playedstatus == 'UNPLAYED'")["game_id"].unique()
np.random.shuffle(completed_games)

# First tenth -> eval, second tenth -> test, remainder -> train.
n_completed = len(completed_games)
eval_games, test_games, train_games = np.split(
    completed_games, [n_completed // 10, n_completed // 5]
)

In [None]:
# Model inputs are the columns added on top of the raw player_game_stats
# columns; the label is the game's fanduel_fpts.
feature_cols = [col for col in combined.columns if col not in player_game_stats.columns]

train_df = combined[combined["game_id"].isin(train_games)]
train_pool = Pool(data=train_df[feature_cols], label=train_df["fanduel_fpts"])

eval_df = combined[combined["game_id"].isin(eval_games)]
eval_pool = Pool(data=eval_df[feature_cols], label=eval_df["fanduel_fpts"])

test_df = combined[combined["game_id"].isin(test_games)]
test_pool = Pool(data=test_df[feature_cols], label=test_df["fanduel_fpts"])

# fit() is chained directly; the eval pool is passed as the evaluation set.
model = CatBoostRegressor().fit(train_pool, eval_set=eval_pool, verbose=100)

### Save model

In [None]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a storage bucket and print a confirmation.

    Args:
        bucket_name: Name of the bucket, e.g. "your-bucket-name".
        source_file_name: Path of the file to send, e.g. "local/path/to/file".
        destination_blob_name: Name of the object to create, e.g.
            "storage-object-name".
    """
    target_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(destination_blob_name)
    )
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

In [None]:
# Save the fitted model to a temporary file, then upload that file to the
# "airjordan-models" bucket as "fanduel_fpts_model.cbm".
# NOTE(review): save_model writes to file.name while NamedTemporaryFile still
# holds the file open — confirm this works on the target platform.
with tempfile.NamedTemporaryFile() as file:
    model.save_model(file.name)
    upload_blob("airjordan-models", file.name, "fanduel_fpts_model.cbm")

### Evaluate model

In [None]:
# Score the fitted model on the held-out test pool.
# NOTE(review): presumably this is R^2 — confirm against the CatBoost docs.
model.score(test_pool)

In [None]:
# Attach the model's predictions for the test pool as a "prediction" column.
test_df = test_df.assign(prediction=model.predict(test_pool))

In [None]:
# Scatter plot of actual fantasy points (x) against predictions (y).
test_df.plot(x="fanduel_fpts", y="prediction", kind="scatter")

## Score Upcoming games

In [None]:
# Predict fantasy points for every row belonging to an unplayed game.
assert len(upcoming_games) > 0, "Must have at least one upcoming game"

upcoming_df = combined[combined["game_id"].isin(upcoming_games)]
upcoming_feature_cols = [
    col for col in upcoming_df.columns if col not in player_game_stats.columns
]
upcoming_pool = Pool(data=upcoming_df[upcoming_feature_cols])

upcoming_predictions = model.predict(upcoming_pool)
upcoming_df = upcoming_df.assign(prediction=upcoming_predictions)

## Upload features

In [None]:
# Write the upcoming-game rows to the "dfs_model_features" table, replacing
# any existing table of that name. The frame's index is not written.
# Note that upcoming_df already carries the "prediction" column at this point,
# so predictions are stored alongside the features.
with engine.connect() as conn:
    upcoming_df.to_sql("dfs_model_features", conn, index=False, if_exists="replace")

## Confirm that results match using saved model and features

### Load model

In [None]:
def download_blob(bucket_name, source_blob_name, destination_file_name):
    """Download a storage object to a local file and print a confirmation.

    Args:
        bucket_name: Name of the bucket, e.g. "your-bucket-name".
        source_blob_name: Name of the object to fetch, e.g.
            "storage-object-name".
        destination_file_name: Local path to write to, e.g.
            "local/path/to/file".
    """
    # `Bucket.blob` only builds a client-side handle; unlike `Bucket.get_blob`
    # it fetches nothing from Google Cloud Storage, and nothing more than the
    # handle is needed to download the contents.
    source_blob = (
        storage.Client()
        .bucket(bucket_name)
        .blob(source_blob_name)
    )
    source_blob.download_to_filename(destination_file_name)

    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

In [None]:
# Download the saved model into a temporary file and load it, rebinding
# `model` to the result of load_model.
with tempfile.NamedTemporaryFile() as file:
    download_blob("airjordan-models", "fanduel_fpts_model.cbm", file.name)
    model = CatBoostRegressor().load_model(file.name)

### Download features

In [None]:
# Read back every column of the stored feature table.
with engine.connect() as conn:
    features = pd.read_sql("select * from dfs_model_features", conn)

In [None]:
# Re-score the stored rows with the reloaded model.
# The stored table also contains the previously computed "prediction" column.
# It is a model output, not an input, and it is not one of the
# player_game_stats columns, so it must be excluded explicitly or it would be
# fed to the model as an extra feature.
excluded_cols = set(player_game_stats.columns) | {"prediction"}
pool = Pool(
    data=features[[col for col in features.columns if col not in excluded_cols]],
)

# Overwrite the stored predictions with freshly computed ones.
features = features.assign(prediction=model.predict(pool))

### Compare

In [None]:
# Put both frames in the same row order with a fresh 0..n-1 index so they can
# be compared element by element.
row_order = ["game_id", "team_id", "player_id"]

upcoming_df = upcoming_df.sort_values(by=row_order).reset_index(drop=True)
features = features.sort_values(by=row_order).reset_index(drop=True)

# Boolean frame: True where the in-memory and reloaded values are equal.
matches = upcoming_df.eq(features)

In [None]:
# In-memory rows whose prediction differs from the reloaded one.
upcoming_df[~matches["prediction"]]

In [None]:
# Reloaded rows whose prediction differs from the in-memory one.
features[~matches["prediction"]]