In [1]:
import pandas as pd 
from datetime import date

In [2]:
def combine_team_games(df, keep_method='home'):
    '''Combine a TEAM_ID-GAME_ID unique table into rows by game. Slow.

        Every row of ``df`` is joined to every other row with the same GAME_ID,
        so each game yields one row per ordered (TEAM_A, TEAM_B) pair. Columns
        other than GAME_ID are suffixed with ``_A`` / ``_B``.

        Parameters
        ----------
        df : Input DataFrame. Must contain GAME_ID and TEAM_ID; MATCHUP is
            needed for 'home'/'away' and WL for 'winner'/'loser'.
        keep_method : {'home', 'away', 'winner', 'loser', ``None``}, default 'home'
            Case-insensitive.
            - 'home' : Keep rows where TEAM_A is the home team.
            - 'away' : Keep rows where TEAM_A is the away team.
            - 'winner' : Keep rows where TEAM_A is the winning team.
            - 'loser' : Keep rows where TEAM_A is the losing team.
            - ``None`` : Keep all rows. Will result in an output DataFrame the same
                length as the input DataFrame.

        Returns
        -------
        result : DataFrame

        Raises
        ------
        ValueError
            If ``keep_method`` is not one of the values listed above.
    '''
    # Validate the flag first so a typo fails before the slow self-join.
    method = None
    if keep_method is not None:
        if not isinstance(keep_method, str):
            raise ValueError(f'Invalid keep_method: {keep_method}')
        method = keep_method.lower()
        if method not in ('home', 'away', 'winner', 'loser'):
            raise ValueError(f'Invalid keep_method: {keep_method}')

    # Join every row to all others with the same game ID.
    joined = pd.merge(df, df, suffixes=('_A', '_B'), on=['GAME_ID'])
    # Filter out any row that is joined to itself.
    result = joined[joined.TEAM_ID_A != joined.TEAM_ID_B]

    if method is None:
        # Return all the rows.
        return result

    if method == 'home':
        # Literal match: with the default regex=True the '.' in ' vs. ' would
        # match any character. na=False keeps missing matchups out of the
        # result instead of producing an invalid (NaN-containing) mask.
        keep = result.MATCHUP_A.str.contains(' vs. ', regex=False, na=False)
    elif method == 'away':
        keep = result.MATCHUP_A.str.contains(' @ ', regex=False, na=False)
    elif method == 'winner':
        keep = result.WL_A == 'W'
    else:  # 'loser'
        keep = result.WL_A == 'L'
    return result[keep]

In [3]:
from nba_api.stats.endpoints import leaguegamefinder

gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable="2013-14")
# The first DataFrame of those returned is what we want.
games_by_home_team = combine_team_games(gamefinder.get_data_frames()[0], keep_method='home')

# Build `games` as a fresh frame in one chain instead of mutating a filtered
# slice in place (which triggers SettingWithCopyWarning and makes the cell
# non-idempotent when re-run).
games = (
    # Keep only GAME_IDs starting with '002' (presumably regular-season games - confirm).
    games_by_home_team[games_by_home_team.GAME_ID.str[0:3] == '002']
    .sort_values('GAME_ID')
    # drop=True discards the old index directly rather than adding an 'index'
    # column that then has to be dropped again.
    .reset_index(drop=True)
    .drop(columns=['WL_B', 'MATCHUP_A', 'MATCHUP_B'])
)
# Encode the home team's result as 1 (win) / 0 (loss). Assign the result back:
# a chained `games['WL_A'].replace(..., inplace=True)` does not update `games`
# under pandas copy-on-write. Values other than 'W'/'L' become NaN.
games['WL_A'] = games['WL_A'].map({'W': 1, 'L': 0})
games

Unnamed: 0,SEASON_ID_A,TEAM_ID_A,TEAM_ABBREVIATION_A,TEAM_NAME_A,GAME_ID,GAME_DATE_A,WL_A,MIN_A,PTS_A,FGM_A,...,FT_PCT_B,OREB_B,DREB_B,REB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PLUS_MINUS_B
0,22013,1610612754,IND,Indiana Pacers,0021300001,2013-10-29,1,241,97,34,...,0.600,13,26,39,17,10,6,17,26,-10.0
1,22013,1610612748,MIA,Miami Heat,0021300002,2013-10-29,1,239,107,37,...,0.783,11,30,41,23,11,4,18,27,-12.0
2,22013,1610612747,LAL,Los Angeles Lakers,0021300003,2013-10-29,1,240,116,42,...,0.565,10,30,40,27,11,4,16,21,-13.0
3,22013,1610612739,CLE,Cleveland Cavaliers,0021300004,2013-10-30,1,241,98,35,...,0.792,9,28,37,24,8,8,15,27,-4.0
4,22013,1610612755,PHI,Philadelphia 76ers,0021300005,2013-10-30,1,238,114,43,...,0.769,7,24,31,30,7,0,19,25,-4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,22013,1610612752,NYK,New York Knicks,0021301226,2014-04-16,1,240,95,37,...,0.611,12,36,48,20,9,2,24,20,-3.0
1226,22013,1610612753,ORL,Orlando Magic,0021301227,2014-04-16,0,240,86,32,...,0.727,15,37,52,26,5,2,12,17,15.0
1227,22013,1610612757,POR,Portland Trail Blazers,0021301228,2014-04-16,1,240,110,42,...,0.848,8,34,42,20,4,4,8,20,-6.0
1228,22013,1610612758,SAC,Sacramento Kings,0021301229,2014-04-16,0,240,99,37,...,0.727,7,33,40,16,9,5,11,21,5.0


In [4]:
from nba_api.stats.static import teams

# Team metadata from nba_api's static team list; each entry is looked up by its
# 'abbreviation' key when the per-team stat trackers are built further down.
nba_teams = teams.get_teams()
# nba_teams  (uncomment to inspect the list)

In [33]:
class Team_Cumulative_Stats:
    '''Running per-game averages of one team's box-score stats.

    Usage is two-phase:

    1. Call ``add_game`` once per game the team played, in chronological
       order, to build the history.
    2. Call ``get_next_game`` once per game, in the same order, to read back
       the averages the team had *going into* that game.

    ``self.features`` maps each stat name (suffix stripped, e.g. ``'PTS'``) to a
    list whose element ``k`` is the average over the team's first ``k`` games;
    element 0 is therefore 0 because nothing has been played yet.
    ``self.features['GAME_ID'][k]`` is the ID of the team's ``k``-th game
    (0-based), i.e. the game that the stats at index ``k`` lead into.
    '''

    def __init__(self, team_abbr, columns=None):
        '''
        team_abbr -- abbreviated team name, e.g. 'ATL' for the Atlanta Hawks
        columns -- column names of the games table; defaults to the columns of
                   the notebook-level ``games`` DataFrame
        '''
        self.team = team_abbr
        self.nth_game = 0   # number of games added so far (part 1)
        self.curr_game = 0  # index of the next game to read back (part 2)

        if columns is None:
            columns = games.columns

        # Identifier / label columns that must not be averaged. The names are
        # compared with the '_A' suffix already stripped, so SEASON_ID and
        # GAME_DATE are filtered here and never need deleting afterwards.
        # NOTE(review): MIN is kept as a feature. The original exclusion list
        # named it 'MIN_A', which never matched, and the notebook's output
        # includes MIN_A / MIN_B. Add 'MIN' here if it should be dropped.
        excluded = {'SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME',
                    'GAME_DATE', 'WL'}

        # One tracker follows one team, so stat names are taken from the '_A'
        # columns only; GAME_ID carries no suffix and is handled separately.
        self.features = {col[:-2]: [0]
                         for col in columns
                         if col.endswith('_A') and col[:-2] not in excluded}
        self.features['GAME_ID'] = []

    def add_game(self, game, home):
        '''Fold one played game into the running averages (part 1).

        game -- a row of the games dataframe (anything indexable by column name)
        home -- True if the team represented by this class is home, False if away
        '''
        # In the games dataframe, _A is the home team and _B is the away team.
        suffix = '_A' if home else '_B'
        played = self.nth_game
        for feature, history in self.features.items():
            if feature == 'GAME_ID':
                continue  # IDs are recorded, not averaged
            value = game[feature + suffix]
            if played == 0:
                # First game: there is nothing to average with yet.
                new_val = value
            else:
                # history[-1] is the mean of the previous `played` games.
                new_val = (history[-1] * played + value) / (played + 1)
            history.append(new_val)
        self.features['GAME_ID'].append(game['GAME_ID'])
        self.nth_game += 1

    def get_next_game(self, home):
        '''Return the team's pre-game averages for its next unread game (part 2).

        home -- True if the team is home, False if away

        Returns a dict of one-element lists keyed by ``<stat>_A`` (home) or
        ``<stat>_B`` (away), plus ``'GAME_ID'``. GAME_ID is ``[None]`` when the
        averages are read for a game that has not been added yet.
        '''
        suffix = '_A' if home else '_B'
        idx = self.curr_game
        result = {feature + suffix: [history[idx]]
                  for feature, history in self.features.items()
                  if feature != 'GAME_ID'}
        game_ids = self.features['GAME_ID']
        result['GAME_ID'] = [game_ids[idx] if idx < len(game_ids) else None]
        self.curr_game += 1
        return result
            
# Dictionary with a team's abbreviation as the key and a Team_Cumulative_Stats
# object for that team as the value. nba_teams is defined in an earlier cell.
# NOTE: this rebinds `teams`, which earlier named the nba_api static teams module.
teams = {team['abbreviation']: Team_Cumulative_Stats(team['abbreviation']) for team in nba_teams}

# Part 1: build each team's running averages from the games dataframe.
# In `games`, the _A columns belong to the home team and _B to the away team.
for ind, row in games.iterrows():
    team_a = teams[row.loc['TEAM_ABBREVIATION_A']]
    team_b = teams[row.loc['TEAM_ABBREVIATION_B']]
    team_a.add_game(row, True)
    team_b.add_game(row, False)

# Part 2: gather that data back together, with the two competing teams of each
# game on the same line. training_data maps column name -> list of values and
# becomes the DataFrame once it is fully built. It is initialised before the
# loop so the first row needs no special case.
training_data = {}
for ind, row in games.iterrows():
    a = teams[row.loc['TEAM_ABBREVIATION_A']].get_next_game(True)
    b = teams[row.loc['TEAM_ABBREVIATION_B']].get_next_game(False)
    combined = {**a, **b}
    # Both dicts carry a GAME_ID entry; keep the home team's.
    combined['GAME_ID'] = a['GAME_ID']
    for feature, values in combined.items():
        training_data.setdefault(feature, []).append(values[0])

# Create the dataframe and add the WL_A label column. Rows were appended in the
# same order as `games`, so attach the labels by position rather than by index.
test = pd.DataFrame.from_dict(training_data)
test['WL_A'] = games['WL_A'].to_numpy()

test.head(50)

Unnamed: 0,MIN_A,PTS_A,FGM_A,FGA_A,FG_PCT_A,FG3M_A,FG3A_A,FG3_PCT_A,FTM_A,FTA_A,...,OREB_B,DREB_B,REB_B,AST_B,STL_B,BLK_B,TOV_B,PF_B,PLUS_MINUS_B,WL_A
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,35.0,40.0,26.0,10.0,7.0,18.0,21.0,12.0,1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.0,26.0,39.0,17.0,10.0,6.0,17.0,26.0,-10.0,1


1. Check to make sure that the dataframe I made actually represents what happened
2. Remove useless stats, add useful stats that we discussed last week
3. Make this work for an arbitrary number of games
4. Use the techniques described in [this Math Stack Exchange answer on weighting historical data](https://math.stackexchange.com/questions/684519/what-is-the-most-scientific-way-to-assign-weights-to-historical-data) to weight recent years more heavily