In [75]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

from tqdm import tqdm
from nba_api.stats.static import players, teams
from nba_api.stats.library.parameters import SeasonAll


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_absolute_error, mean_squared_error


pd.options.display.max_columns=200

In [16]:
seasons = ["20{}-{}".format(x, x+1) for x in range(10, 21)]
print(seasons)
def gather_data(seasons = seasons):
    full_player_data = []
    for season in seasons:
        player_gls = pd.read_csv('./data/player_gamelogs_{}.csv'.format(season), dtype={'GAME_ID':'object'})
        player_gls_adv = pd.read_csv('./data/player_advanced_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
        player_gls_scoring = pd.read_csv('./data/player_scoring_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
        player_gls_tracking = pd.read_csv('./data/player_tracking_boxscores_{}.csv'.format(season), dtype={'GAME_ID':'object'})
        
        player_gls = player_gls[['SEASON_YEAR', 'PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID',
       'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM',
       'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK',
       'BLKA', 'PF', 'PFD', 'PTS', 'PLUS_MINUS']]

        player_full = pd.merge(player_gls, player_gls_adv, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
        player_full = pd.merge(player_full, player_gls_scoring, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
        player_full = pd.merge(player_full, player_gls_tracking, how='left', on=['GAME_ID', 'PLAYER_ID'], suffixes=['', '_x'])
    
        # drop unnecessary columns
        player_full = player_full.drop(columns = ['TEAM_ID', 'TEAM_NAME', 'TEAM_ID_x',
                         'TEAM_ABBREVIATION_x','TEAM_CITY','PLAYER_NAME_x','START_POSITION',
                          'COMMENT','MIN_x','TEAM_ID_x', 'TEAM_ABBREVIATION_x','TEAM_CITY_x', 
                          'PLAYER_NAME_x', 'START_POSITION_x', 'COMMENT_x', 'MIN_x', 
                         'TEAM_ID_x', 'TEAM_ABBREVIATION_x', 'TEAM_CITY_x', 'PLAYER_NAME_x', 
                          'START_POSITION_x', 'COMMENT_x', 'MIN_x', 'AST_x', 'FG_PCT_x',
                         'FG_PCT', 'FG3_PCT', 'FT_PCT', 'E_NET_RATING', 'NET_RATING', 
                         'AST_PCT', 'OREB_PCT', 'DREB_PCT', 'REB_PCT', 'EFG_PCT', 'TS_PCT', 
                         'PCT_FGA_2PT', 'PCT_FGA_3PT', 'PCT_PTS_FT', 'CFG_PCT',
                        'UFG_PCT', 'DFG_PCT'], 
                       errors= 'ignore')

        # Convert date to datetime
        player_full['GAME_DATE'] = pd.to_datetime(player_full['GAME_DATE'])
        
        # Convert W/L to a binary 1/0 for win/loss
        player_full['WL'] = (player_full['WL'].str[0] == 'W').astype(int)
        
        # Add a binary home_game feature
        player_full['HOME_GAME'] = 0 
        player_full['HOME_GAME'] = (player_full['MATCHUP'].str[:6] == player_full['TEAM_ABBREVIATION'].str[:] + ' vs').astype(int)
        
        # convert the percentages into raw numbers (because we will be averaging them, we don't want to average percentages)
        # for example, percentage of points from midrange will be points scored from midrange
        player_full['FG2M'] = player_full['FGM'] - player_full['FG3M']
        player_full['FG2A'] = player_full['FGA'] - player_full['FG3A']
        player_full['PTS_2PT'] = (player_full['PTS'] * player_full['PCT_PTS_2PT']).astype('int8')
        player_full['PTS_2PT_MR'] = (player_full['PTS'] * player_full['PCT_PTS_2PT_MR']).astype('int8')
        player_full['PTS_3PT'] = (player_full['PTS'] * player_full['PCT_PTS_3PT']).astype('int8')
        player_full['PTS_FB'] = (player_full['PTS'] * player_full['PCT_PTS_FB']).astype('int8')
        player_full['PTS_OFF_TOV'] = (player_full['PTS'] * player_full['PCT_PTS_OFF_TOV']).astype('int8')
        player_full['PTS_PAINT'] = (player_full['PTS'] * player_full['PCT_PTS_PAINT']).astype('int8')
        player_full['AST_2PM'] = (player_full['FG2M'] * player_full['PCT_AST_2PM']).astype('int8')
        player_full['UAST_2PM'] = (player_full['FG2M'] * player_full['PCT_UAST_2PM']).astype('int8')
        player_full['AST_3PM'] = (player_full['FG3M'] * player_full['PCT_AST_3PM']).astype('int8')
        player_full['UAST_3PM'] = (player_full['FG3M'] * player_full['PCT_UAST_3PM']).astype('int8')



        player_full = player_full.drop(columns = ['PCT_PTS_2PT', 'PCT_PTS_2PT_MR', 'PCT_PTS_3PT', 
                                                  'PCT_PTS_FB', 'PCT_PTS_OFF_TOV', 'PCT_PTS_PAINT',
                                                  'PCT_AST_2PM', 'PCT_UAST_2PM', 'PCT_AST_3PM',
                                                  'PCT_UAST_3PM', 'PCT_AST_FGM', 'PCT_UAST_FGM', 
                                                  'FGM', 'FGA', 'AST_TOV', 'USG_PCT', 'PACE',
                                                  'PACE_PER40', ], errors='ignore')
        
        
        full_player_data.append(player_full)
        
        
    
    player_df = pd.concat(full_player_data)
    
    
    player_df['home_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 1 else row['MATCHUP'][-3:], axis=1)
    player_df['away_team_abbr'] = player_df.apply(lambda row: row['TEAM_ABBREVIATION'] if row['HOME_GAME'] == 0 else row['MATCHUP'][-3:], axis=1)
    
    player_df[['home_team_abbr', 'away_team_abbr']] = player_df[['home_team_abbr', 'away_team_abbr']].replace({'NOH':'NOP',
                                                                                                               'NJN':'BKN'})
    
    # Reorder columns
    player_df = player_df[['SEASON_YEAR', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'PLAYER_ID', 
                           'home_team_abbr', 'away_team_abbr', 'GAME_ID', 
                           'GAME_DATE', 'MATCHUP', 'WL', 'HOME_GAME', 'MIN',
                           'FG3M', 'FG3A', 'FTM', 'FTA', 'OREB', 'DREB', 'REB',
                           'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
                           'PTS', 'PLUS_MINUS', 'E_OFF_RATING', 'E_DEF_RATING',
                           'AST_RATIO', 'TM_TOV_PCT', 'E_USG_PCT',
                           'E_PACE', 'POSS', 'PIE', 'SPD', 'DIST', 'ORBC', 
                           'DRBC', 'RBC', 'TCHS', 'SAST', 'FTAST', 'PASS', 
                           'CFGM', 'CFGA', 'UFGM', 'UFGA', 'DFGM', 'DFGA', 
                           'FG2M', 'FG2A', 'PTS_2PT', 'PTS_2PT_MR', 'PTS_3PT',
                           'PTS_FB', 'PTS_OFF_TOV', 'PTS_PAINT', 'AST_2PM',
                           'UAST_2PM', 'AST_3PM', 'UAST_3PM']]
    
    return player_df

player_df = gather_data(seasons)

['2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21']


In [54]:
test = player_df.loc[player_df['SEASON_YEAR'] == '2010-11']

test = test.loc[test['PLAYER_ID'] == 1938].sort_values('GAME_DATE')
test.iloc[:, 11:].head()
# for player in test['PLAYER_NAME'].unique():
#     player_avgs = 

Unnamed: 0,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,TOV,...,PTS_2PT,PTS_2PT_MR,PTS_3PT,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
24872,31.9,5,9,5,5,1,2,3,4,3,...,2,0,15,4,2,2,0,1,3,2
24493,36.85,2,11,5,5,0,2,2,6,2,...,12,0,6,6,2,12,3,3,1,1
24214,27.93,1,3,5,6,0,2,2,7,3,...,6,2,2,2,2,4,2,0,1,0
23908,32.986667,1,4,5,8,0,3,3,5,4,...,10,1,3,0,0,7,1,4,1,0
23480,39.716667,4,11,2,2,0,3,3,5,4,...,14,9,12,9,4,4,2,4,3,1


In [46]:
t = pd.DataFrame(np.array([4,12,12,20,4,10,6,8, 4, 2, 14, 20]))
t['rolling_avg'] = t[0].rolling(window=4).mean().shift(1)
t['rolling_avg2'] = t[0].shift(1).rolling(window=4).mean()
t

Unnamed: 0,0,rolling_avg,rolling_avg2
0,4,,
1,12,,
2,12,,
3,20,,
4,4,12.0,12.0
5,10,12.0,12.0
6,6,11.5,11.5
7,8,10.0,10.0
8,4,7.0,7.0
9,2,7.0,7.0


In [58]:
def aggregate_player_stats(df = player_df):

    seasons = player_df['SEASON_YEAR'].unique()

    df = df.sort_values('GAME_DATE')

    for season in seasons:
        season_df = df.loc[df['SEASON_YEAR'] == season]
        player_ids = season_df['PLAYER_ID'].unique()
        for p_id in tqdm(player_ids, desc='Progress'):
            player_log = season_df.loc[season_df['PLAYER_ID'] == p_id]

            avg_player_data = player_log.iloc[:, 11:].shift(1).rolling(10, min_periods=5).mean()

            cols = avg_player_data.columns

            df.loc[(df['SEASON_YEAR'] == season) & (df['PLAYER_ID'] == p_id), cols] = avg_player_data

    df.to_csv('player_avg_last10.csv', index=False)
        
    return df
    
df

Progress: 100%|██████████| 452/452 [00:59<00:00,  7.61it/s]
Progress: 100%|██████████| 478/478 [01:02<00:00,  7.69it/s]
Progress: 100%|██████████| 469/469 [01:01<00:00,  7.62it/s]
Progress: 100%|██████████| 482/482 [01:04<00:00,  7.52it/s]
Progress: 100%|██████████| 492/492 [01:10<00:00,  6.94it/s]
Progress: 100%|██████████| 476/476 [01:02<00:00,  7.58it/s]
Progress: 100%|██████████| 486/486 [01:06<00:00,  7.33it/s]
Progress: 100%|██████████| 540/540 [01:12<00:00,  7.49it/s]
Progress: 100%|██████████| 530/530 [01:12<00:00,  7.30it/s]
Progress: 100%|██████████| 529/529 [01:11<00:00,  7.41it/s]
Progress: 100%|██████████| 474/474 [01:03<00:00,  7.50it/s]


Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,...,PTS_2PT,PTS_2PT_MR,PTS_3PT,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
25108,2010-11,BOS,Kevin Garnett,708,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,...,,,,,,,,,,
25122,2010-11,BOS,Shaquille O'Neal,406,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,...,,,,,,,,,,
25121,2010-11,BOS,Paul Pierce,1718,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,...,,,,,,,,,,
25120,2010-11,LAL,Lamar Odom,1885,LAL,HOU,0021000003,2010-10-26,LAL vs. HOU,1,...,,,,,,,,,,
25119,2010-11,MIA,James Jones,2592,BOS,MIA,0021000001,2010-10-26,MIA @ BOS,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2020-21,UTA,Jordan Clarkson,203903,ATL,UTA,0022000338,2021-02-04,UTA @ ATL,1,...,5.7,0.200000,9.300000,2.400000,3.400000,5.400000,0.600000,2.200000,2.100000,1.100000
85,2020-21,ATL,Danilo Gallinari,201568,ATL,UTA,0022000338,2021-02-04,ATL vs. UTA,0,...,3.0,1.444444,4.111111,0.222222,0.666667,1.444444,0.666667,0.666667,1.333333,0.111111
86,2020-21,GSW,Brad Wanamaker,202954,DAL,GSW,0022000339,2021-02-04,GSW @ DAL,1,...,3.1,0.900000,0.900000,1.000000,0.800000,2.100000,1.100000,0.500000,0.200000,0.100000
74,2020-21,DAL,Jalen Brunson,1628973,DAL,GSW,0022000339,2021-02-04,DAL vs. GSW,0,...,5.8,1.000000,3.200000,1.200000,0.900000,4.700000,0.600000,2.500000,0.900000,0.100000


In [63]:
test = player_df.loc[player_df['SEASON_YEAR'] == '2010-11']

test = test.loc[test['PLAYER_ID'] == 1938].sort_values('GAME_DATE')
test.iloc[:, 8:].head(10)

Unnamed: 0,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,...,PTS_2PT,PTS_2PT_MR,PTS_3PT,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
24872,SAS vs. IND,1,1,31.9,5,9,5,5,1,2,...,2,0,15,4,2,2,0,1,3,2
24493,SAS vs. NOH,0,1,36.85,2,11,5,5,0,2,...,12,0,6,6,2,12,3,3,1,1
24214,SAS @ LAC,1,0,27.93,1,3,5,6,0,2,...,6,2,2,2,2,4,2,0,1,0
23908,SAS @ PHX,1,0,32.986667,1,4,5,8,0,3,...,10,1,3,0,0,7,1,4,1,0
23480,SAS vs. HOU,1,1,39.716667,4,11,2,2,0,3,...,14,9,12,9,4,4,2,4,3,1
23102,SAS @ CHA,1,0,33.7,5,11,3,3,2,3,...,8,2,15,0,0,6,2,2,5,0
22792,SAS vs. LAC,1,1,35.705,3,5,7,7,2,2,...,6,0,8,4,4,6,0,3,2,0
22386,SAS vs. PHI,1,1,25.205,3,5,3,3,0,1,...,5,0,9,7,1,5,2,0,2,0
22248,SAS @ OKC,1,0,30.783333,2,5,11,11,2,1,...,3,1,6,0,1,1,1,1,0,2
21733,SAS vs. CHI,1,1,34.58,2,3,4,6,0,4,...,10,2,6,2,2,8,3,2,2,0


In [71]:
test2 = df.loc[df['SEASON_YEAR'] == '2015-16']

test2 = test2.loc[test2['TEAM_ABBREVIATION'] == 'SAS'].sort_values('GAME_DATE')
test2.groupby('PLAYER_NAME')['MIN'].mean().sort_values()

PLAYER_NAME
Matt Bonner           6.059327
Boban Marjanovic      8.214342
Ray McCallum          9.138930
Rasual Butler         9.551419
Andre Miller          9.562538
Kevin Martin         13.709823
Jonathon Simmons     14.266990
Kyle Anderson        15.309940
David West           17.429566
Boris Diaw           18.165929
Manu Ginobili        19.465494
Patty Mills          20.545638
Tim Duncan           24.930357
Danny Green          26.174990
Tony Parker          27.309269
LaMarcus Aldridge    30.540522
Kawhi Leonard        32.893754
Name: MIN, dtype: float64

In [84]:
df

Unnamed: 0,SEASON_YEAR,TEAM_ABBREVIATION,PLAYER_NAME,PLAYER_ID,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,MIN,FG3M,FG3A,FTM,FTA,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,E_OFF_RATING,E_DEF_RATING,AST_RATIO,TM_TOV_PCT,E_USG_PCT,E_PACE,POSS,PIE,SPD,DIST,ORBC,DRBC,RBC,TCHS,SAST,FTAST,PASS,CFGM,CFGA,UFGM,UFGA,DFGM,DFGA,FG2M,FG2A,PTS_2PT,PTS_2PT_MR,PTS_3PT,PTS_FB,PTS_OFF_TOV,PTS_PAINT,AST_2PM,UAST_2PM,AST_3PM,UAST_3PM
25108,2010-11,BOS,Kevin Garnett,708,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25122,2010-11,BOS,Shaquille O'Neal,406,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25121,2010-11,BOS,Paul Pierce,1718,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25120,2010-11,LAL,Lamar Odom,1885,LAL,HOU,0021000003,2010-10-26,LAL vs. HOU,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25119,2010-11,MIA,James Jones,2592,BOS,MIA,0021000001,2010-10-26,MIA @ BOS,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2020-21,UTA,Jordan Clarkson,203903,ATL,UTA,0022000338,2021-02-04,UTA @ ATL,1,0,26.405333,3.200000,9.400000,2.1,2.100000,0.600000,3.600000,4.2,2.0,1.300000,0.900000,0.300000,0.400000,1.400000,2.100000,17.700000,3.500000,119.960000,111.760000,10.370000,7.140000,0.268200,101.229000,55.300000,0.120200,4.396000,2.030000,1.000000,5.100000,6.100000,40.200000,0.2,0.400000,22.200000,2.500000,5.600000,3.700000,8.700000,0.600000,1.000000,3.000000,5.2,5.7,0.200000,9.300000,2.400000,3.400000,5.400000,0.600000,2.200000,2.100000,1.100000
85,2020-21,ATL,Danilo Gallinari,201568,ATL,UTA,0022000338,2021-02-04,ATL vs. UTA,0,1,16.450185,1.444444,3.777778,3.0,3.111111,0.222222,1.777778,2.0,1.0,1.333333,0.111111,0.111111,0.444444,1.333333,1.888889,10.444444,2.111111,128.833333,127.933333,6.877778,12.933333,0.262111,98.663333,34.888889,0.096667,3.906667,1.156667,0.666667,3.222222,3.777778,30.555556,0.0,0.111111,20.777778,0.666667,1.888889,2.333333,4.888889,1.222222,2.111111,1.555556,3.0,3.0,1.444444,4.111111,0.222222,0.666667,1.444444,0.666667,0.666667,1.333333,0.111111
86,2020-21,GSW,Brad Wanamaker,202954,DAL,GSW,0022000339,2021-02-04,GSW @ DAL,1,0,16.988167,0.300000,1.800000,0.7,0.800000,0.100000,1.100000,1.2,2.8,0.800000,0.600000,0.100000,0.500000,1.600000,1.400000,4.800000,-1.200000,103.950000,110.540000,29.540000,7.910000,0.145300,98.559000,35.300000,0.064400,4.286000,1.213000,0.400000,1.900000,2.300000,34.700000,0.3,0.100000,27.800000,1.000000,1.700000,0.800000,2.500000,1.000000,1.100000,1.600000,2.8,3.1,0.900000,0.900000,1.000000,0.800000,2.100000,1.100000,0.500000,0.200000,0.100000
74,2020-21,DAL,Jalen Brunson,1628973,DAL,GSW,0022000339,2021-02-04,DAL vs. GSW,0,1,27.394167,1.100000,2.400000,2.1,2.300000,0.300000,2.700000,3.0,3.3,1.100000,0.400000,0.000000,0.200000,1.800000,3.000000,11.800000,-5.000000,103.130000,114.200000,24.460000,8.060000,0.167500,100.050000,57.300000,0.106800,3.915000,1.908000,1.000000,5.000000,5.900000,42.000000,0.3,0.400000,31.400000,1.700000,3.300000,2.200000,4.400000,1.200000,1.700000,3.200000,5.7,5.8,1.000000,3.200000,1.200000,0.900000,4.700000,0.600000,2.500000,0.900000,0.100000


In [104]:
test2 = df.loc[df['GAME_ID'] == '0021501223']
home = test2.loc[test2['HOME_GAME'] == 1].sort_values('MIN', ascending=False)
away = test2.loc[test2['HOME_GAME'] == 0].sort_values('MIN', ascending=False)

home_cols = home.columns[:11]
away_cols = away.columns[:11]

home_matchup_info = home[home_cols].iloc[0].values.reshape(1, -1)


home_stats = home.iloc[:8, 11:].values.reshape(1, -1)
away_stats = away.iloc[:8, 11:].values.reshape(1, -1)



[[ 3.71883333e+01  3.50000000e+00  8.50000000e+00  2.30000000e+00
   2.60000000e+00  1.00000000e-01  2.80000000e+00  2.90000000e+00
   1.90000000e+00  1.30000000e+00  1.00000000e+00  1.00000000e-01
   5.00000000e-01  2.50000000e+00  2.60000000e+00  1.58000000e+01
  -3.00000000e+00  1.03090000e+02  1.06960000e+02  1.16900000e+01
   8.21000000e+00  1.87000000e-01  9.34550000e+01  7.21000000e+01
   7.56000000e-02  4.18600000e+00  2.58900000e+00  1.90000000e+00
   5.30000000e+00  7.00000000e+00  5.67000000e+01  1.00000000e-01
   1.00000000e-01  4.00000000e+01  7.00000000e-01  2.40000000e+00
   4.30000000e+00  1.00000000e+01  1.30000000e+00  1.80000000e+00
   1.50000000e+00  3.90000000e+00  2.90000000e+00  1.80000000e+00
   1.01000000e+01  7.00000000e-01  2.00000000e+00  1.20000000e+00
   5.00000000e-01  8.00000000e-01  2.60000000e+00  7.00000000e-01
   3.27075000e+01  1.30000000e+00  4.40000000e+00  3.00000000e+00
   3.80000000e+00  3.00000000e-01  2.70000000e+00  3.00000000e+00
   7.40000

In [101]:
home_cols

Index(['SEASON_YEAR', 'TEAM_ABBREVIATION', 'PLAYER_NAME', 'PLAYER_ID',
       'home_team_abbr', 'away_team_abbr', 'GAME_ID', 'GAME_DATE', 'MATCHUP',
       'WL', 'HOME_GAME'],
      dtype='object')

In [115]:
def long_to_wide(df = df):
    matchup_data = np.empty((0, 840))
    
    # All game_ids
    game_ids = df['GAME_ID'].unique()
    
    # 
    new_cols = ['SEASON_YEAR', 'home_team_abbr',
                'away_team_abbr', 'GAME_ID', 'GAME_DATE', 
                'MATCHUP', 'WL', 'HOME_GAME']
    
    for game_id in tqdm(game_ids, desc="Progress"):
        
        # Select all the players boxscores from one game
        game_df = df.loc[df['GAME_ID'] == game_id]
               
        # Select the home team players, sort by minutes
        home_team = game_df.loc[game_df['HOME_GAME'] == 1].sort_values('MIN', ascending=False)
        
        # Select matchup info

        home_matchup_info = home_team[new_cols].iloc[0].values.reshape(1, -1)
        
        # unroll the home players' stats up to player 12 into one long row
        home_stats_row = home_team.iloc[:8, 11:].values.reshape(1, -1)
        
        # if 
        if home_stats_row.shape[1] < 416:
            missing_stats = 416 - home_stats_row.shape[1]
            padding = np.zeros((1, missing_stats))
            home_stats_row = np.append(home_stats_row, padding, axis=1)      
        
        home_stats_row = np.append(home_matchup_info, home_stats_row, axis=1)
        
    
        away_team = game_df.loc[game_df['HOME_GAME'] == 0].sort_values('MIN', ascending=False)
        
#         away_matchup_cols = away_team.columns[:11]
#         away_matchup_info = away_team[away_matchup_cols].iloc[0].values.reshape(1, -1)
        
        away_stats_row = away_team.iloc[:8, 11:].values.reshape(1, -1)

        if away_stats_row.shape[1] < 416:
            missing_stats = 416 - away_stats_row.shape[1]
            padding = np.zeros((1, missing_stats))
            away_stats_row = np.append(away_stats_row, padding, axis=1)        
    
        matchup_row = np.append(home_stats_row, away_stats_row, axis=1)    
        matchup_data = np.append(matchup_data, matchup_row, axis=0)
        

    for i in range(1, 9):
        home_player_cols = ['home_P{}_'.format(i) + x for x in df.columns[11:]]
        away_player_cols = ['away_P{}_'.format(i) + x for x in df.columns[11:]]
        new_cols.extend(home_player_cols)
        new_cols.extend(away_player_cols)
        
    matchup_df = pd.DataFrame(matchup_data, columns = new_cols)
    
    return matchup_df


matchup_df = long_to_wide(df=df)
matchup_df.to_csv("matchup_boxscores_wide.csv", index=False)
matchup_df
    

Progress: 100%|██████████| 12207/12207 [16:58<00:00, 11.98it/s]


Unnamed: 0,SEASON_YEAR,home_team_abbr,away_team_abbr,GAME_ID,GAME_DATE,MATCHUP,WL,HOME_GAME,home_P1_MIN,home_P1_FG3M,home_P1_FG3A,home_P1_FTM,home_P1_FTA,home_P1_OREB,home_P1_DREB,home_P1_REB,home_P1_AST,home_P1_TOV,home_P1_STL,home_P1_BLK,home_P1_BLKA,home_P1_PF,home_P1_PFD,home_P1_PTS,home_P1_PLUS_MINUS,home_P1_E_OFF_RATING,home_P1_E_DEF_RATING,home_P1_AST_RATIO,home_P1_TM_TOV_PCT,home_P1_E_USG_PCT,home_P1_E_PACE,home_P1_POSS,home_P1_PIE,home_P1_SPD,home_P1_DIST,home_P1_ORBC,home_P1_DRBC,home_P1_RBC,home_P1_TCHS,home_P1_SAST,home_P1_FTAST,home_P1_PASS,home_P1_CFGM,home_P1_CFGA,home_P1_UFGM,home_P1_UFGA,home_P1_DFGM,home_P1_DFGA,home_P1_FG2M,home_P1_FG2A,home_P1_PTS_2PT,home_P1_PTS_2PT_MR,home_P1_PTS_3PT,home_P1_PTS_FB,home_P1_PTS_OFF_TOV,home_P1_PTS_PAINT,home_P1_AST_2PM,home_P1_UAST_2PM,home_P1_AST_3PM,home_P1_UAST_3PM,away_P1_MIN,away_P1_FG3M,away_P1_FG3A,away_P1_FTM,away_P1_FTA,away_P1_OREB,away_P1_DREB,away_P1_REB,away_P1_AST,away_P1_TOV,away_P1_STL,away_P1_BLK,away_P1_BLKA,away_P1_PF,away_P1_PFD,away_P1_PTS,away_P1_PLUS_MINUS,away_P1_E_OFF_RATING,away_P1_E_DEF_RATING,away_P1_AST_RATIO,away_P1_TM_TOV_PCT,away_P1_E_USG_PCT,away_P1_E_PACE,away_P1_POSS,away_P1_PIE,away_P1_SPD,away_P1_DIST,away_P1_ORBC,away_P1_DRBC,away_P1_RBC,away_P1_TCHS,away_P1_SAST,away_P1_FTAST,away_P1_PASS,away_P1_CFGM,away_P1_CFGA,away_P1_UFGM,away_P1_UFGA,away_P1_DFGM,away_P1_DFGA,...,home_P8_FTA,home_P8_OREB,home_P8_DREB,home_P8_REB,home_P8_AST,home_P8_TOV,home_P8_STL,home_P8_BLK,home_P8_BLKA,home_P8_PF,home_P8_PFD,home_P8_PTS,home_P8_PLUS_MINUS,home_P8_E_OFF_RATING,home_P8_E_DEF_RATING,home_P8_AST_RATIO,home_P8_TM_TOV_PCT,home_P8_E_USG_PCT,home_P8_E_PACE,home_P8_POSS,home_P8_PIE,home_P8_SPD,home_P8_DIST,home_P8_ORBC,home_P8_DRBC,home_P8_RBC,home_P8_TCHS,home_P8_SAST,home_P8_FTAST,home_P8_PASS,home_P8_CFGM,home_P8_CFGA,home_P8_UFGM,home_P8_UFGA,home_P8_DFGM,home_P8_DFGA,home_P8_FG2M,home_P8_FG2A,home_P8_PTS_2PT,home_P8_PTS_2PT_MR,home_P8_PTS_3PT,home_P8_PTS_FB,home_P8_PTS_OFF_TOV,home_P8_PTS_PAINT,home_P8_AST_2PM,home_P8_UAST_2PM,home_P8_AST_3PM,home_P8_UAST_3PM,away_P8_MIN,away_P8_FG3M,away_P8_FG3A,away_P8_FTM,away_P8_FTA,away_P8_OREB,away_P8_DREB,away_P8_REB,away_P8_AST,away_P8_TOV,away_P8_STL,away_P8_BLK,away_P8_BLKA,away_P8_PF,away_P8_PFD,away_P8_PTS,away_P8_PLUS_MINUS,away_P8_E_OFF_RATING,away_P8_E_DEF_RATING,away_P8_AST_RATIO,away_P8_TM_TOV_PCT,away_P8_E_USG_PCT,away_P8_E_PACE,away_P8_POSS,away_P8_PIE,away_P8_SPD,away_P8_DIST,away_P8_ORBC,away_P8_DRBC,away_P8_RBC,away_P8_TCHS,away_P8_SAST,away_P8_FTAST,away_P8_PASS,away_P8_CFGM,away_P8_CFGA,away_P8_UFGM,away_P8_UFGA,away_P8_DFGM,away_P8_DFGA,away_P8_FG2M,away_P8_FG2A,away_P8_PTS_2PT,away_P8_PTS_2PT_MR,away_P8_PTS_3PT,away_P8_PTS_FB,away_P8_PTS_OFF_TOV,away_P8_PTS_PAINT,away_P8_AST_2PM,away_P8_UAST_2PM,away_P8_AST_3PM,away_P8_UAST_3PM
0,2010-11,BOS,MIA,0021000001,2010-10-26,BOS vs. MIA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2010-11,LAL,HOU,0021000003,2010-10-26,LAL vs. HOU,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2010-11,POR,PHX,0021000002,2010-10-26,POR vs. PHX,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2010-11,DEN,UTA,0021000014,2010-10-27,DEN vs. UTA,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2010-11,TOR,NYK,0021000007,2010-10-27,TOR vs. NYK,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12202,2020-21,ATL,UTA,0022000338,2021-02-04,ATL vs. UTA,0,1,36.061,2.3,5.3,0.6,0.8,0.8,3.5,4.3,4.3,1.3,1.2,0.2,0.6,2.4,0.8,11.9,0.8,111.09,108,23.96,7.41,0.1494,101.766,75.7,0.0687,4.194,2.686,1.5,5.9,7.4,51,0.1,0.2,36.7,1,3.7,3.5,7.3,2.1,3.1,2.2,5.7,4.3,0.7,6.4,1.3,2.1,3.5,1.3,0.8,1.8,0.3,35.5072,1.7,3.5,1.3,1.6,1.9,6.5,8.4,1.6,1.4,0.1,1.4,1,3.5,2.3,19.4,4.6,111.19,103.89,10.2,7.83,0.1958,100.509,74,0.1173,4.243,2.671,4.6,11.9,16.3,65.3,0.5,0.1,47.8,3.4,6.6,4.8,7.6,4,6.9,...,0.6,0.3,2.2,2.5,4.8,1.6,0.3,0.2,0.1,1.7,2,11.4,4.2,121.69,110.58,32.87,10.09,0.1677,101.033,50.5,0.1141,4.288,1.826,0.6,3.8,4.3,51.7,0.8,0.8,40.7,0.8,1.5,3.2,6.2,1.5,1.5,1.1,2,2,0,8.6,1.4,1,2,0.5,0.6,2.1,0.6,16.3397,0,0.1,1.1,1.4,2.1,3.8,5.9,0.6,0.5,0.5,1.1,0.3,2.6,1,5.1,2,117.52,110.02,16.25,8.92,0.1257,100.522,33.8,0.0934,4.198,1.188,4,5.9,9.7,16.8,0.2,0,12.1,1.1,2,0.9,1.6,3,5.6,2,3.5,3.9,0.2,0,0.2,0.7,3.5,1.4,0.5,0,0
12203,2020-21,MEM,HOU,0022000340,2021-02-04,MEM vs. HOU,0,1,29.3755,0.7,2.1,1.1,1.5,1.9,4.4,6.3,2.1,0.5,1.6,0.7,0.8,1.7,1.9,15,-3.3,106.56,108.93,12.88,2.84,0.1996,102.432,61.6,0.1194,4.405,2.283,3.7,7.3,10.7,40.9,0.6,0,26.3,3.5,6.4,3.1,6.3,2.1,3.7,5.9,10.7,11.3,0.8,2,2.2,2.8,10.6,4.8,0.8,0.7,0,27.9746,0.75,2.625,4.375,5.25,0.5,1.5,2,7.375,3.375,1.125,0.5,1.625,1.75,4.125,19.375,-0.125,113.062,110.537,27.1125,14.225,0.295,106.757,62.125,0.11525,4.3725,2.1925,2.25,3.625,5.625,65.375,0.125,0.375,43.375,4.25,8,2.875,6,1.625,2.625,...,3,1.5,6.4,7.9,3.2,2.2,0.9,0.5,0.5,3.7,2.2,10.3,-2.9,98.7,113.19,21.35,14.62,0.2407,105.324,46.7,0.0941,3.668,1.221,2.9,8.2,10.7,42.6,0.2,0.1,31.2,0.4,2,2.5,5.9,2.6,3.5,1.2,3.4,2.2,0,5.6,0.1,1.2,2.2,0.5,0.7,1.8,0.1,21.456,1.5,3.8,0.5,0.5,0.9,2.9,3.8,2,0.9,1.5,0.5,0.7,2,0.7,8,3.8,110.79,101.16,20.32,9.81,0.1568,103.801,45.8,0.1024,4.241,1.463,1.9,4.3,6,25.9,0,0.1,17.4,1,2.6,1.8,4.1,1.6,2.3,1.5,3.4,3,0,4.2,1.3,1.8,3,0.8,0.7,1.5,0
12204,2020-21,LAL,DEN,0022000341,2021-02-04,LAL vs. DEN,1,1,35.0242,3.4,7.4,4.5,6.1,0.5,6.6,7.1,7.6,3.4,1,0.5,0.6,1.8,4.7,26.1,6.7,110.63,100.55,24.18,10.71,0.3107,98.414,71.4,0.1994,3.536,2.198,2.5,9.7,11.5,92.9,1.3,0.8,65.3,2.5,4.9,6.6,13.5,2.3,3.3,5.7,11,11.2,1.5,9.8,3.6,4,9.1,1.9,3.3,1.5,1.7,33.1683,0.4,1.9,4.8,6.9,2.2,6.7,8.9,3.4,1.5,1.3,2.2,0.9,2,4.9,21.8,5.7,112.28,102.01,13.83,5.79,0.2707,101.755,68.5,0.1704,3.659,2.153,3.9,10.3,14.1,60.9,0.4,0.2,38.8,4.7,7.8,3.6,8.7,5.2,7.9,...,1.9,1.9,4.9,6.8,1,0.7,0.6,0.2,0.6,2.4,1.3,10.2,5.6,115.45,107.25,10.97,7.25,0.1842,99.639,44.7,0.1076,3.691,1.356,3.7,4.9,8,26.7,0.2,0,18.7,1.1,2.7,1.7,4.1,1.3,1.9,1.4,3.6,2.5,0.1,5.5,0.3,1.4,2.2,1,0.4,1.9,0,12.6665,0.8,2.3,0.3,0.4,0.3,0.2,0.5,1.9,0.7,0.8,0.1,0,2,1,3.1,2.8,114.1,103.6,37.95,11.34,0.1347,99.417,26.9,0.0178,3.592,0.855,0.1,0.4,0.5,20.4,0.2,0.1,15.8,0.1,0.5,0.9,2.6,0.2,0.3,0.2,0.9,0.4,0.2,2.3,0.3,0.6,0.2,0.1,0.1,0.6,0.2
12205,2020-21,DAL,GSW,0022000339,2021-02-04,DAL vs. GSW,0,1,35.305,1.7,6.1,6.4,7.7,0.5,7,7.5,9.8,3.4,0.9,0.6,0.8,1.7,5.4,26.1,-3.3,109.59,116.84,27.03,9.52,0.3316,101.026,73.1,0.1808,3.521,2.165,1.8,9.3,10.8,77.1,0.7,0.9,50,2.6,5.9,5.3,11.4,2.1,3.6,7.3,13.6,14.1,2.5,4.6,2.8,2.6,11.6,0.6,6.2,0.2,1.4,32.5825,1.1,5,2.4,2.6,1.1,1.8,2.9,1.8,1.4,0.9,0.4,0.3,3.1,2.4,11.9,-2.6,106.37,110.55,11.09,8.64,0.1851,100.186,66.6,0.0407,4.263,2.474,2.1,3.7,5.8,44.4,0.8,0.1,29.3,1.5,3.7,2.5,7.3,0.4,0.8,...,0.75,0.25,2.625,2.875,1.625,0.625,0.375,0.625,0.125,1.875,1.125,4.75,-2.625,94.95,106.762,35.3625,8.0375,0.085375,111.045,34.875,0.11575,4.62125,1.29,1.125,4.5,5.5,32.75,0,0.125,28.875,0.625,0.875,1.125,1.875,1.375,2,1,1.375,2,0,2.25,0.875,0.625,2,0.875,0.125,0.75,0,14.0382,0.8,1.8,1,1.2,0.4,2.5,2.9,1.2,1.1,1,0.5,0,2.4,1.6,5.4,1,106.45,104.63,19.01,22.28,0.1556,103.879,30.2,0.0929,4.449,1.047,0.8,4.2,4.8,20,0.1,0,14.5,0.5,0.7,1.2,2.5,0.7,1.2,1,1.7,1.9,0.2,2.3,1.6,1.2,1.7,0.3,0.6,0.8,0


In [118]:
matchup_df.to_csv('matchup_df.csv')