# Feature exploration

## Setup

In [None]:
import os
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
import sqlalchemy

# Run python-dotenv's loader before the POSTGRES_* variables are read from os.environ below.
# NOTE(review): the trailing ";" presumably suppresses the notebook's echo of the return value.
load_dotenv();

In [None]:
# SQLAlchemy engine for the "nba" database; user, password and host are read from the
# POSTGRES_USER / POSTGRES_PW / POSTGRES_HOST environment variables.
# NOTE(review): the credentials are interpolated unescaped, so a password containing
# characters such as "@" or "/" would produce a malformed URL.
engine = sqlalchemy.create_engine(
    # The dialect must be spelled "postgresql": the legacy "postgres" alias was removed in
    # SQLAlchemy 1.4, where it fails with NoSuchModuleError.
    'postgresql+psycopg2://{username}:{password}@{host}/{dbname}'
    .format(
        username=os.environ.get("POSTGRES_USER"),
        password=os.environ.get("POSTGRES_PW"),
        host=os.environ.get("POSTGRES_HOST"),
        dbname="nba",
    )
)

## Target variables

Targets: assists, 2-point field goals made, 3-point field goals made, free throws made, rebounds, steals, blocks, turnovers, and FanDuel fantasy points.

### Data quality checks

In [None]:
# Data-quality check: per season, count the COMPLETED games that have no row at all
# in player_gamelogs.
sqlq = """
        select
            season, count(*)
        from games as gm
        where
            not exists (
                select * from player_gamelogs as gl
                where gl.game_id = gm.id
            )
            and gm.playedStatus = 'COMPLETED'
        group by 1
        order by 1
    """
# The connection is only needed while the query runs.
with engine.connect() as conn:
    missing_games = pd.read_sql(sqlq, conn)

In [None]:
# Show the per-season counts of completed games that have no player game logs.
missing_games

### Targets

In [None]:
# One row per player game log, joined to its game for season and start time, ordered by
# player and then start time.
# fanduel_fpts weights: 2 per 2-pt FG, 3 per 3-pt FG, 1 per free throw, 1.2 per rebound,
# 1.5 per assist, 3 per block, 3 per steal, -1 per turnover; NULL stats count as 0.
# NOTE(review): `ast` is read from stats_offense_astpergame -- confirm that this column
# holds the single-game assist count in a game-log row.
sqlq = """
        with stats as (
            select
                  gm.season
                , gm.starttime
                , gl.game_id
                , gl.team_id
                , gl.player_id
                , gl.stats_fieldgoals_fg2ptmade as fgm_2p
                , gl.stats_fieldgoals_fg3ptmade as fgm_3p
                , gl.stats_freethrows_ftmade as ftm
                , gl.stats_rebounds_reb as trb
                , gl.stats_offense_astpergame as ast
                , gl.stats_defense_stl as stl
                , gl.stats_defense_blk as blk
                , gl.stats_defense_tov as tov
                , gl.stats_miscellaneous_minseconds as secs
            from public.player_gamelogs as gl
            join public.games as gm
                on gl.game_id = gm.id
        )
        select
            *
            , coalesce(fgm_2p, 0) * 2
                + coalesce(fgm_3p, 0) * 3
                + coalesce(ftm, 0)
                + coalesce(trb, 0) * 1.2
                + coalesce(ast, 0) * 1.5
                + coalesce(blk, 0) * 3
                + coalesce(stl, 0) * 3
                - coalesce(tov, 0)
                as fanduel_fpts
        from stats
        order by player_id, starttime
    """
# The connection is only needed while the query runs.
with engine.connect() as conn:
    target = pd.read_sql(sqlq, conn)

In [None]:
# Per-season describe() summary of the targets frame, to eyeball ranges and missing values.
target.groupby(["season"]).apply(lambda x: x.describe())

## Trends in stats

In [None]:
def rolling_avg(df, stats, window):
    """Return each player's trailing ``window``-row mean of ``stats``, excluding the current row.

    Rows are grouped by ``player_id`` and averaged in the order they appear in ``df``,
    so ``df`` must already be sorted chronologically within each player. Every group is
    shifted down by one row before the rolling mean is taken, which keeps a row's own
    stats out of its feature values. A row gets NaN until ``window`` earlier rows exist
    for that player.

    Args:
        df: Frame with ``player_id``, ``team_id``, ``game_id`` and the ``stats`` columns.
        stats: List of column names to average.
        window: Number of previous rows to average over.

    Returns:
        A frame indexed like ``df`` holding ``player_id``, ``team_id``, ``game_id`` and
        one ``<stat>_<window>g_avg`` column per entry of ``stats``.
    """
    avgs = (
        df
        # group_keys=False keeps the original row labels on the result. Without it,
        # pandas >= 2.0 prepends player_id as an index level and the join below no
        # longer lines up with df.
        .groupby("player_id", group_keys=False)[stats]
        # Selecting the stat columns first avoids shifting every column and keeps the
        # grouping column out of the applied frame.
        .apply(lambda games: games.shift(1).rolling(window=window).mean())
        .add_suffix(f"_{window}g_avg")
    )
    return df[["player_id", "team_id", "game_id"]].join(avgs)

In [None]:
# Stats that get trailing-average features; one feature frame per window length.
columns = ['fgm_2p', 'fgm_3p', 'ftm', 'trb', 'ast', 'stl', 'blk', 'tov', 'fanduel_fpts', 'secs']
last_game = rolling_avg(target, columns, 1)
rolling_3gm = rolling_avg(target, columns, 3)
rolling_9gm = rolling_avg(target, columns, 9)
rolling_27gm = rolling_avg(target, columns, 27)

## Opponent allowed stats

In [None]:
# Planned (not yet implemented):
#   1. compute team-level game stats
#   2. take rolling averages of those team stats
#   3. join the result onto the player-game data

## Combine Features

In [None]:
# Start from the targets and attach each rolling-average frame by row index, taking only
# the columns that `target` does not already have (the id columns are skipped).
combined = target
for feature_frame in (last_game, rolling_3gm, rolling_9gm, rolling_27gm):
    new_cols = [col for col in feature_frame.columns if col not in target.columns]
    combined = combined.join(feature_frame[new_cols])

# Exclude every row from the 2016-2017 regular season.
combined = combined.loc[combined["season"] != '2016-2017-regular']

## Build model

In [None]:
from catboost import CatBoostRegressor, Pool

In [None]:
import numpy as np

In [None]:
# Split at the game level so that all rows of one game land in exactly one of the
# eval / test / train sets (first 10% / next 10% / remaining 80% of the shuffled games).
games = combined["game_id"].unique()
# Use a seeded generator: an unseeded shuffle reassigned games on every run, so the
# fitted model and its test score were not reproducible between reruns.
np.random.default_rng(seed=0).shuffle(games)

n_games = len(games)
eval_games = games[:n_games // 10]
test_games = games[n_games // 10:n_games // 5]
train_games = games[n_games // 5:]

In [None]:
# Model inputs are the columns of `combined` that are not in `target`, i.e. only the
# rolling-average features; the label is the FanDuel fantasy-point total.
feature_cols = [col for col in combined.columns if col not in target.columns]

train_df = combined.loc[combined["game_id"].isin(train_games)]
eval_df = combined.loc[combined["game_id"].isin(eval_games)]
test_df = combined.loc[combined["game_id"].isin(test_games)]

train_pool = Pool(data=train_df[feature_cols], label=train_df["fanduel_fpts"])
eval_pool = Pool(data=eval_df[feature_cols], label=eval_df["fanduel_fpts"])
test_pool = Pool(data=test_df[feature_cols], label=test_df["fanduel_fpts"])

# Fit on the training games, passing the eval games as eval_set; log every 100 iterations.
model = CatBoostRegressor()
model = model.fit(train_pool, eval_set=eval_pool, verbose=100)

In [None]:
# Score the fitted model on the held-out test games.
# NOTE(review): for CatBoostRegressor this is presumably R^2 -- confirm against the CatBoost docs.
model.score(test_pool)

In [None]:
# Attach the model's predictions to the test rows. test_df is a filtered selection of
# `combined`, so adding a column in place triggers pandas' SettingWithCopyWarning;
# assign() returns a new frame with the column added instead.
test_df = test_df.assign(prediction=model.predict(test_pool))

In [None]:
# Scatter of actual FanDuel points (x) against predicted points (y) for the test games.
test_df.plot(x="fanduel_fpts", y="prediction", kind="scatter")