In [43]:
import os
import pandas as pd
from progressbar import progressbar

In [161]:
pd.set_option('display.max_rows', 500)
pd.options.mode.chained_assignment = None  # default='warn'

In [173]:
def process_box_scores(dir: str="../data/nba_boxscores", to_file=False) -> pd.DataFrame:
    def calculateFinalScore(df: pd.DataFrame)-> tuple[int, int, int, int]:
        '''Return gameid, teams playing, and final score
        
        Params:
            df: boxscore dataframe
        Returns:
            game_id, team1_id, team2_id, team1_score, team2_score
        '''
        game_id = df["GAME_ID"][0]
        team1_id = df["TEAM_ID"][0]
        team2_id = df["TEAM_ID"].iloc[-1]
        team1_score = df.loc[df["TEAM_ID"] == team1_id]["PTS"].sum()
        team2_score = df.loc[df["TEAM_ID"] == team2_id]["PTS"].sum()
        winner = 1 if team1_score > team2_score else 2

        return game_id, team1_id, team2_id, team1_score, team2_score, winner
    
    final_scores = []

    for filename in progressbar(os.listdir(dir)):
        f = os.path.join(dir, filename)
        # checking if it is a file
        if not os.path.isfile(f):
            continue
    
        df = pd.read_csv(f)
        final_scores.append(calculateFinalScore(df))
    
    final_scores_df = pd.DataFrame(final_scores, columns=["GAME_ID", "TEAM1_ID", "TEAM2_ID", "TEAM1_SCORE", "TEAM2_SCORE", "WINNER"])

    if to_file:
        final_scores_df.to_csv("../data_proc/final_scores.csv", index=False)

    return final_scores_df

process_box_scores("../data/nba_boxscores", to_file=True)

100% (15365 of 15365) |##################| Elapsed Time: 0:00:22 Time:  0:00:22


Unnamed: 0,GAME_ID,TEAM1_ID,TEAM2_ID,TEAM1_SCORE,TEAM2_SCORE,WINNER
0,21600885,1610612737,1610612738,114.0,98.0,1
1,21900278,1610612741,1610612757,103.0,107.0,2
2,11200117,1610612756,1610612743,88.0,72.0,1
3,21400829,1610612738,1610612747,111.0,118.0,2
4,41900313,1610612747,1610612743,106.0,114.0,2
...,...,...,...,...,...,...
15360,21800513,1610612752,1610612749,96.0,112.0,2
15361,21700136,1610612737,1610612739,117.0,115.0,1
15362,21400830,1610612755,1610612748,108.0,119.0,2
15363,21600644,1610612758,1610612763,91.0,107.0,2


In [228]:
def game_id_df(dir: str="../data/nba_gamelogs/", join_files=True, update_master=False, to_file=False) -> pd.DataFrame:
    '''Mapping df between NBA Game_ID (ex. 21600885) and betting (ex. "20231111_nba_Miami_Atlanta") IDs 
    
    Params:
        dir: directory of nba gamelogs
        join_files: create/use processed combined gamelogs
        reload_master: update processed combined gamelogs
        to_file: write to file
    '''
    lu = pd.read_csv("../data/lu/nba_team_lu.csv")
    lu.drop(["NBA_team_name", "team_id"], axis=1, inplace=True)
    lu.dropna(inplace=True)
    abbrv_to_sbr = lu.set_index("NBA_team_abbrev").T.to_dict('records')[0]

    size = 0
    if join_files and not update_master and os.path.isfile("gamelogs_master.csv"):
        df = pd.read_csv("gamelogs_master.csv")
        
    else:
        df = pd.DataFrame()
        for filename in progressbar(os.listdir(dir)):
            f = os.path.join(dir, filename)
            # checking if it is a file
            if not os.path.isfile(f):
                continue
    
            df_file = pd.read_csv(f)
            size += len(df_file.index)
            df = pd.concat([df, df_file])

        if join_files:
            df.to_csv("gamelogs_master.csv")
    
    display(size)

    df.drop_duplicates(["TEAM_ID", "GAME_ID"], inplace=True)
    game_id_df = df[["GAME_ID", "GAME_DATE", "MATCHUP"]]
    game_id_df["GAME_DATE"] = game_id_df["GAME_DATE"].apply(lambda x: x.split("T")[0].replace("-", ""))
    game_id_df[["TEAM1", "TEAM2"]] = game_id_df["MATCHUP"].str.split(r' .* ', expand=True)
    game_id_df["TEAM1"] = game_id_df["TEAM1"].map(abbrv_to_sbr)
    game_id_df["TEAM2"] = game_id_df["TEAM2"].map(abbrv_to_sbr)

    game_id_df["SBR_GAME_ID"] = game_id_df["GAME_DATE"] + "_nba_" + game_id_df["TEAM1"] + "_" + game_id_df["TEAM2"]
    game_id_df = game_id_df[["GAME_ID", "SBR_GAME_ID"]]

    if to_file:
        game_id_df.to_csv("game_id_proc.csv", index=False)
    return game_id_df


game_id_df("../data/nba_gamelogs/", to_file=True)

  df = pd.read_csv("gamelogs_master.csv")


0

Unnamed: 0,GAME_ID,SBR_GAME_ID
0,11800001,
9,11800001,
26,11800002,20180928_nba_Charlotte_Boston
28,11800002,20180928_nba_Boston_Charlotte
53,11800003,20180929_nba_Toronto_Portland
...,...,...
330736,41400404,20150611_nba_Golden State_Cleveland
330755,41400405,20150614_nba_Cleveland_Golden State
330757,41400405,20150614_nba_Golden State_Cleveland
330772,41400406,20150616_nba_Cleveland_Golden State


In [219]:
def games_basic_master(boxscore_dir:str="../data/nba_boxscores", 
                            gamelogs_dir:str="../data/nba_gamelogs/", 
                            bets_dir:str="../data/nba_lines_historical",
                            to_file=False) -> pd.DataFrame:
    '''Generates master games dataframe with game id's, final scores, and betting lines

    Params:
        boxscore_dir: directory for boxscores, runs process_box_scores on directory if no csv detected
        gamelogs_dir: directory for gamelogs, runs game_id_df on directory if no csv detected
        bets_dir: directory for betting lines, combines with boxscore and gamelogs
    '''

    if not os.path.isfile("final_scores.csv"):
        finalscore_df = process_box_scores(boxscore_dir)
    else:
        finalscore_df = pd.read_csv("final_scores.csv")

    if not os.path.isfile("game_id_proc.csv"):
        game_id_df = game_id_df(gamelogs_dir)
    else:
        game_id_df = pd.read_csv("game_id_proc.csv")
    
    master_df = pd.DataFrame()

    # process bets

    # moneylines -> probabilities
    
    f = os.path.join(bets_dir, f"nba_historical_moneyline.csv")

    df = pd.read_csv(f)
    df.dropna(inplace=True)
    df["away_prob"] = df["away_line"].apply(lambda x: -100 / x if x < 0 else x / 100)
    df["away_prob"] = 1 - df["away_prob"] / (df["away_prob"] + 1)
    df["home_prob"] = df["home_line"].apply(lambda x: -100 / x if x < 0 else x / 100)
    df["home_prob"] = 1 - df["home_prob"] / (df["home_prob"] + 1)
    df.drop(df[(df["away_prob"] + df["home_prob"] >= 1.4)].index, inplace=True) # 1.3 is profit margin for sports bettings
    df.drop(["sport", "date", "bet_type", "away", "home", "home_line", "away_line", "home_prob"], axis=1, inplace=True)

    bets_master = df

    # spread
    f = os.path.join(bets_dir, f"nba_historical_spread.csv")

    df = pd.read_csv(f)
    df.dropna(inplace=True)
    df.drop(df[(df["away_line"] != -df["home_line"])].index, inplace=True) # home line opposite of away line
    df.drop(["sport", "date", "bet_type", "away", "home", "home_line"], axis=1, inplace=True)
    df.rename(columns={"away_line": "away_spread"}, inplace=True)
    bets_master = bets_master.merge(df, on="game_id", how="outer")

    # total
    f = os.path.join(bets_dir, f"nba_historical_total.csv")

    df = pd.read_csv(f)
    df.dropna(inplace=True)
    df.drop(df[(df["away_line"] != df["home_line"])].index, inplace=True) # home line equal away line
    df.drop(["sport", "date", "bet_type", "away", "home", "home_line"], axis=1, inplace=True)
    df.rename(columns={"away_line": "away_total"}, inplace=True)
    bets_master = bets_master.merge(df, on="game_id", how="outer")

    bets_master.dropna(subset=["away_prob", "away_spread", "away_total"], inplace=True)
    
    # final scores
    master_df = finalscore_df

    # game ids
    master_df = master_df.merge(game_id_df, on="GAME_ID")
    master_df.insert(1, "SBR_GAME_ID", master_df.pop("SBR_GAME_ID"))
    
    # betting lines
    master_df = master_df.merge(bets_master, left_on="SBR_GAME_ID", right_on="game_id")
    master_df.drop("game_id", axis=1, inplace=True)

    if to_file:
        master_df.to_csv("games_basic_master.csv", index=False)

    return master_df
    

games_basic_master(to_file=True)

Unnamed: 0,GAME_ID,SBR_GAME_ID,TEAM1_ID,TEAM2_ID,TEAM1_SCORE,TEAM2_SCORE,WINNER,away_prob,away_spread,away_total
0,21600885,20170227_nba_Atlanta_Boston,1610612737,1610612738,114.0,98.0,1,0.392157,4.5,212.5
1,21900278,20191129_nba_Chicago_Portland,1610612741,1610612757,103.0,107.0,2,0.277778,7.0,227.0
2,41900313,20200922_nba_L.A. Lakers_Denver,1610612747,1610612743,106.0,114.0,2,0.700599,-6.0,211.5
3,22100624,20220112_nba_Cleveland_Utah,1610612739,1610612762,111.0,91.0,1,0.363636,5.5,222.5
4,22000156,20210111_nba_Indiana_Sacramento,1610612754,1610612758,122.0,127.0,2,0.622642,-3.5,229.0
...,...,...,...,...,...,...,...,...,...,...
9645,21800513,20181227_nba_New York_Milwaukee,1610612752,1610612749,96.0,112.0,2,0.106724,14.0,226.5
9646,21700136,20171105_nba_Atlanta_Cleveland,1610612737,1610612739,117.0,115.0,1,0.140056,11.0,220.0
9647,21400830,20150223_nba_Philadelphia_Miami,1610612755,1610612748,108.0,119.0,2,0.148148,11.5,193.5
9648,21600644,20170120_nba_Sacramento_Memphis,1610612758,1610612763,91.0,107.0,2,0.256410,8.0,199.5
