In [43]:
import os
import pandas as pd
from progressbar import progressbar

In [48]:
def process_box_scores(dir: str, to_file=False) -> None:
    def calculateFinalScore(df: pd.DataFrame)-> tuple[int, int, int, int]:
        '''Return gameid, teams playing, and final score
        
        Params:
            df: boxscore dataframe
        Returns:
            game_id, team1_id, team2_id, team1_score, team2_score
        '''
        game_id = df["GAME_ID"][0]
        team1_id = df["TEAM_ID"][0]
        team2_id = df["TEAM_ID"].iloc[-1]
        team1_score = df.loc[df["TEAM_ID"] == team1_id]["PTS"].sum()
        team2_score = df.loc[df["TEAM_ID"] == team2_id]["PTS"].sum()
        winner = 1 if team1_score > team2_score else 2

        return game_id, team1_id, team2_id, team1_score, team2_score, winner
    
    final_scores = []

    for filename in progressbar(os.listdir(dir)):
        f = os.path.join(dir, filename)
        # checking if it is a file
        if not os.path.isfile(f):
            continue
    
        df = pd.read_csv(f)
        final_scores.append(calculateFinalScore(df))
    
    final_scores_df = pd.DataFrame(final_scores, columns=["GAME_ID", "TEAM1_ID", "TEAM2_ID", "TEAM1_SCORE", "TEAM2_SCORE", "WINNER"])

    if to_file:
        final_scores_df.to_csv("../data_proc/final_scores.csv")

    return final_scores_df

process_box_scores("../data/nba_boxscores", to_file=True)

100% (15365 of 15365) |##################| Elapsed Time: 0:00:20 Time:  0:00:20


Unnamed: 0,GAME_ID,TEAM1_ID,TEAM2_ID,TEAM1_SCORE,TEAM2_SCORE,WINNER
0,21600885,1610612737,1610612738,114.0,98.0,1
1,21900278,1610612741,1610612757,103.0,107.0,2
2,11200117,1610612756,1610612743,88.0,72.0,1
3,21400829,1610612738,1610612747,111.0,118.0,2
4,41900313,1610612747,1610612743,106.0,114.0,2
...,...,...,...,...,...,...
15360,21800513,1610612752,1610612749,96.0,112.0,2
15361,21700136,1610612737,1610612739,117.0,115.0,1
15362,21400830,1610612755,1610612748,108.0,119.0,2
15363,21600644,1610612758,1610612763,91.0,107.0,2
