In [5]:
import pandas as pd

def get_max_avg_col(col_list):
    col_avg = []
    col_max = []

    for i in range(len(col_list)):
        if i == 0:
            col_avg.append(pd.NA)
            col_max.append(pd.NA)
        else:
            col_avg.append(sum(col_list[:i]) / i)
            col_max.append(max(col_list[:i])) 

    return col_avg, col_max

In [6]:
import numpy as np

def get_col_lstsq(col_list):
    col_list = pd.Series(col_list).interpolate(method='linear').bfill().ffill().tolist()
    A = []
    bias = []
    momentum = []
    assert not any(pd.isna(col_list))
    for i in range(len(col_list)):
        A.append([1, i+1])
        x_lstsq, _, _, _= np.linalg.lstsq(np.array(A), np.array(col_list[:i+1]), rcond=None)
        bias.append(x_lstsq[0])
        momentum.append(x_lstsq[1])
        
    return bias, momentum

In [12]:
def get_game_stats_data_df(season_year, target_team_ids=None, target_game_date=None):
    df = pd.read_sql_table(f"game_stats_{season_year}", "sqlite:///../database/game_stats.db")
    df = df[df['SEASON_ID'] == f'2{season_year[:season_year.index("-")]}']
    if target_team_ids:
        df = df[df['TEAM_ID'].isin(target_team_ids)]
    if target_game_date:
        df = df[df['GAME_DATE'] <= target_game_date]
    df['HOME'] = df['MATCHUP'].apply(lambda x: 'vs.' in x if isinstance(x, str) else False).astype(int)
    features = [
        'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
        'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV',
        'PF', 'PTS', 'PLUS_MINUS'
    ]
    
    i = 0
    for team in df['TEAM_ID'].unique():
        team_df = df[df['TEAM_ID'] == team]
        team_sorted = team_df.sort_values('GAME_DATE')
    
        for col in features:
            col_list = list(team_sorted[col])
            avg_col, max_col = get_max_avg_col(col_list)
            bias_col, mom_col = get_col_lstsq(col_list)
    
            team_sorted[f'{col}_BIAS'] = bias_col
            team_sorted[f'{col}_MOM'] = mom_col
            team_sorted[f'{col}_AVG'] = avg_col
            team_sorted[f'{col}_MAX'] = max_col
    
        if i == 0:
            teams_df = team_sorted.copy()
            
        else:
            teams_df = pd.concat([teams_df, team_sorted], ignore_index=True)
            
        i += 1
    
    if target_game_date:
        teams_df = teams_df[teams_df["GAME_DATE"] == target_game_date]
    teams_df.drop(['FGM', 'FGA','FG3M', 'FG3A', 'FTM','FTA'], axis=1, inplace=True)

    opp_features = [f'{i}_OPP' for i in list(teams_df)]
    home_df = teams_df[teams_df['HOME'] == 1].sort_values('GAME_ID')
    away_df = teams_df[teams_df['HOME'] == 0].sort_values('GAME_ID')
    away_df.columns = opp_features
    common_ids = set(home_df['GAME_ID']) & set(away_df['GAME_ID_OPP'])
    home_df = home_df[home_df['GAME_ID'].isin(common_ids)]
    away_df = away_df[away_df['GAME_ID_OPP'].isin(common_ids)]
    home_df.sort_values('GAME_ID')
    home_df.reset_index(drop=True)
    away_df.sort_values('GAME_ID_OPP')
    away_df.reset_index(drop=True)
    away_df['GAME_ID'] = away_df['GAME_ID_OPP'] 
    merged_df = pd.merge(home_df, away_df, on='GAME_ID')
    merged_df = merged_df.drop([
        'SEASON_ID_OPP',
        'TEAM_ID_OPP',
        'HOME_OPP',
        'MIN_OPP',
        'MATCHUP_OPP',
        'SEASON_ID_OPP',
        'HOME'
    ], axis=1)
    basic_features = [
        'MIN', 'FG_PCT', 'FG3_PCT', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PLUS_MINUS'
    ]
    basic_features_opp = [feature + "_OPP" for feature in basic_features][1:]
    merged_df.drop(basic_features + basic_features_opp, inplace=True, axis=1)
    metadata = [
        "SEASON_ID", "TEAM_ABBREVIATION", "TEAM_NAME", "GAME_ID", "MATCHUP"
    ]
    metadata_opp = ["TEAM_ABBREVIATION_OPP", "TEAM_NAME_OPP", "GAME_ID_OPP"]
    merged_df.drop(metadata + metadata_opp, inplace=True, axis=1)
    merged_df.drop(["REB_BIAS", "REB_MOM", "REB_AVG", "REB_MAX", "REB_BIAS_OPP", "REB_MOM_OPP", "REB_AVG_OPP", "REB_MAX_OPP", "GAME_DATE_OPP", "WL_OPP"], axis=1, inplace=True)
    merged_df.replace({'L': 0, 'W': 1}, inplace=True)
    merged_df.dropna(subset=["TEAM_ID"], inplace=True)
    merged_df.sort_values('GAME_DATE', inplace=True)
    
    return merged_df

In [10]:
get_game_stats_data_df("2023-24", target_team_ids=[1610612742, 1610612760, 1610612753, 1610612749, 1610612757, 1610612758], target_game_date="2024-04-14")

True


  merged_df.replace({'L': 0, 'W': 1}, inplace=True)


Unnamed: 0,TEAM_ID,GAME_DATE,WL,MIN_BIAS,MIN_MOM,MIN_AVG,MIN_MAX,FGM_BIAS,FGM_MOM,FGM_AVG,...,PF_AVG_OPP,PF_MAX_OPP,PTS_BIAS_OPP,PTS_MOM_OPP,PTS_AVG_OPP,PTS_MAX_OPP,PLUS_MINUS_BIAS_OPP,PLUS_MINUS_MOM_OPP,PLUS_MINUS_AVG_OPP,PLUS_MINUS_MAX_OPP
0,1610613000.0,2024-04-14,1,232.592088,0.142761,238.860465,290,38.272654,0.034556,39.767442,...,19.686047,33,118.14702,-0.031293,117.104651,146,3.771345,-0.03613,2.497674,38.0
1,1610613000.0,2024-04-14,1,227.844195,0.221007,237.865169,290,41.019476,0.057447,43.516854,...,18.44186,29,116.578722,0.006178,117.209302,148,0.936755,0.031949,2.939535,50.0
2,1610613000.0,2024-04-14,1,230.524259,0.183606,238.761364,289,40.066139,0.050715,42.340909,...,20.174419,34,105.557605,-0.003791,105.662791,137,-4.408607,-0.090348,-8.027907,30.0
