# NBA Preseason Prediction - Data Setup
Jordan Wegner 

## Purpose  
This notebook pulls and organizes the data used for modeling. It relies on prewritten functions and displays each intermediate table as a visual check that every step is working as intended.  

## Imports 
### Custom Functions 

In [1]:
import get_new_players
import get_new_teams
import get_new_seasons
import get_new_rosters
import get_new_game_ids
import get_new_draft
import get_new_playoff_wins
import get_new_boxscorefourfactors
import averaged_team_data
import lagged_data
import averaged_player_data
import adding_pwins

### Libraries 

In [2]:
import pandas as pd
import numpy as np 

## API Calls and Reading Data 

In [3]:
# Main Script
# Refreshes every raw CSV through its project-local updater module, then loads
# the CSVs that the rest of the notebook works with.

# Single place to point the notebook at the data folder.
DATA_DIR = "/users/jordanwegner/Desktop/nba2/03_data"

# The roster refresh was commented out in the original notebook. Keep it off by
# default, but log it as skipped instead of printing a BEGIN/END pair around
# nothing, which made the log claim the step had run.
UPDATE_ROSTERS = False


def _run_step(script_name, step, *args):
    """Call `step(*args)` between BEGIN/END log lines and return its result.

    If `step` raises, the END line is not printed and the exception propagates.
    """
    print("BEGIN: " + script_name)
    result = step(*args)
    print("END: " + script_name)
    return result


def _read_data(filename):
    """Log and read `filename` from DATA_DIR with pd.read_csv; return the DataFrame."""
    path = DATA_DIR + "/" + filename
    print("READING: " + path)
    return pd.read_csv(path)


_run_step("get_new_players.py", get_new_players.get_new_players)
_run_step("get_new_teams.py", get_new_teams.get_new_teams)
_run_step("get_new_seasons.py", get_new_seasons.get_new_seasons)

# Read only after the updaters above have run, so the frames reflect any refresh.
seasons = _read_data("seasons.csv")
teams = _read_data("teams.csv")

if UPDATE_ROSTERS:
    _run_step("get_new_rosters.py", get_new_rosters.get_new_rosters, seasons, teams)
else:
    print("SKIPPED: get_new_rosters.py")

_run_step("get_new_game_ids.py", get_new_game_ids.get_new_game_ids)
_run_step("get_new_draft.py", get_new_draft.get_new_draft)
_run_step("get_new_playoff_wins.py", get_new_playoff_wins.get_new_playoff_wins)

gids = _read_data("game_ids.csv")

gotten_or_error = ['G0020300778'] # ids to skip
_run_step(
    "get_new_boxscorefourfactors.py",
    get_new_boxscorefourfactors.get_new_boxscorefourfactors,
    gids,
    gotten_or_error,
)

bsff = _read_data("box_score_four_factors.csv")
pwins = _read_data("playoff_wins.csv")
rs = _read_data("rosters.csv")

BEGIN: get_new_players.py
No new players to add. No update required.
END: get_new_players.py
BEGIN: get_new_teams.py
No new teams to add. No update required.
END: get_new_teams.py
BEGIN: get_new_seasons.py
No new seasons to add. No update required.
END: get_new_seasons.py
READING: /users/jordanwegner/Desktop/nba2/03_data/seasons.csv
READING: /users/jordanwegner/Desktop/nba2/03_data/teams.csv
BEGIN: get_new_rosters.py
END: get_new_rosters.py
BEGIN: get_new_game_ids.py
No new Game IDs to add. No update required.
END: get_new_game_ids.py
BEGIN: get_new_draft.py
No new teams to add. No update required.
END: get_new_draft.py
BEGIN: get_new_playoff_wins.py
No new playoff wins to add. No update required.
END: get_new_playoff_wins.py
READING: /users/jordanwegner/Desktop/nba2/03_data/game_ids.csv
BEGIN: get_new_boxscorefourfactors.py
No new games to scrape. No update required.
END: get_new_boxscorefourfactors.py
READING: /users/jordanwegner/Desktop/nba2/03_data/box_score_four_factors.csv
READIN

### Data: `seasons`

In [4]:
# Visual check of `seasons`: (rows, columns) first, then the first five rows.
print(seasons.shape)
seasons.head(n=5)

(24, 3)


Unnamed: 0,id,y1,y2
0,2000-01,2000,2001
1,2001-02,2001,2002
2,2002-03,2002,2003
3,2003-04,2003,2004
4,2004-05,2004,2005


### Data: `teams`

In [5]:
# Visual check of `teams`: (rows, columns) first, then the first five rows.
print(teams.shape)
teams.head(n=5)

(30, 7)


Unnamed: 0,id,full_name,abbreviation,nickname,city,state,year_founded
0,1610612737,Atlanta Hawks,ATL,Hawks,Atlanta,Georgia,1949
1,1610612738,Boston Celtics,BOS,Celtics,Boston,Massachusetts,1946
2,1610612739,Cleveland Cavaliers,CLE,Cavaliers,Cleveland,Ohio,1970
3,1610612740,New Orleans Pelicans,NOP,Pelicans,New Orleans,Louisiana,2002
4,1610612741,Chicago Bulls,CHI,Bulls,Chicago,Illinois,1966


### Data: `gids` 
Game IDs 

In [6]:
# Visual check of `gids`: (rows, columns) first, then the first five rows.
print(gids.shape)
gids.head(n=5)

(55128, 3)


Unnamed: 0,SEASON_YEAR,TEAM_ID,GAME_ID
0,2007-08,1610612737,G0020701222
1,2007-08,1610612764,G0020701218
2,2007-08,1610612746,G0020701226
3,2007-08,1610612759,G0020701223
4,2007-08,1610612745,G0020701226


### Data: `bsff` 
Box Score Four Factors. 

In [7]:
# Visual check of `bsff`: (rows, columns) first, then the first five rows.
print(bsff.shape)
bsff.head(n=5)

(682841, 18)


Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,G0020000362,1610612748,MIA,Miami,1477,Bruce Bowen,Bruce,F,,32.000000:46,0.302,0.396,0.185,0.216,0.319,0.213,0.314,0.387
1,G0020000362,1610612748,MIA,Miami,193,Anthony Mason,Anthony,F,,36.000000:46,0.294,0.373,0.211,0.184,0.425,0.226,0.268,0.346
2,G0020000362,1610612748,MIA,Miami,258,Brian Grant,Brian,C,,34.000000:22,0.348,0.457,0.225,0.206,0.41,0.32,0.29,0.448
3,G0020000362,1610612748,MIA,Miami,224,Eddie Jones,Eddie,G,,40.000000:23,0.345,0.218,0.231,0.184,0.408,0.35,0.246,0.424
4,G0020000362,1610612748,MIA,Miami,896,Tim Hardaway,Tim,G,,28.000000:52,0.288,0.475,0.239,0.233,0.413,0.225,0.302,0.333


### Data: `pwins` 
Playoff Wins 

In [8]:
# Visual check of `pwins`: (rows, columns) first, then the first five rows.
print(pwins.shape)
pwins.head(n=5)

(686, 34)


Unnamed: 0,TEAM_ID,TEAM_CITY,TEAM_NAME,YEAR,GP,WINS,LOSSES,WIN_PCT,CONF_RANK,DIV_RANK,...,OREB,DREB,REB,AST,PF,STL,TOV,BLK,PTS,PTS_RANK
0,1610612737,Atlanta,Hawks,2000-01,82,25,57,0.305,13,7,...,1028,2490,3518,1559,1863,634,1368,387,7459,26
1,1610612737,Atlanta,Hawks,2001-02,82,33,49,0.402,12,6,...,955,2445,3400,1656,1702,667,1275,350,7711,19
2,1610612737,Atlanta,Hawks,2002-03,82,35,47,0.427,11,5,...,937,2558,3495,1679,1783,611,1367,473,7714,18
3,1610612737,Atlanta,Hawks,2003-04,82,28,54,0.341,12,7,...,996,2507,3503,1648,1826,627,1350,408,7611,15
4,1610612737,Atlanta,Hawks,2004-05,82,13,69,0.159,15,5,...,1100,2335,3435,1614,2009,629,1319,344,7605,28


### Data: `rs` 
Rosters 

In [9]:
# Visual check of `rs`: (rows, columns) first, then the first five rows.
print(rs.shape)
rs.head(n=5)

(10841, 17)


Unnamed: 0,TeamID,SEASON,LeagueID,PLAYER,NICKNAME,PLAYER_SLUG,NUM,POSITION,HEIGHT,WEIGHT,BIRTH_DATE,AGE,EXP,SCHOOL,PLAYER_ID,HOW_ACQUIRED,HEIGHT_INCHES
0,1610612737,2000,0,DerMarr Johnson,DerMarr,dermarr-johnson,1.0,F,9-Jun,201.0,5-May-80,21.0,R,Cincinnati,2035,,81
1,1610612737,2000,0,Nazr Mohammed,Nazr,nazr-mohammed,2.0,C,10-Jun,240.0,5-Sep-77,23.0,2,Kentucky,1737,,82
2,1610612737,2000,0,Cal Bowdler,Cal,cal-bowdler,3.0,F,10-Jun,245.0,31-Mar-77,24.0,1,Old Dominion,1898,,82
3,1610612737,2000,0,Chris Crawford,Chris,chris-crawford,4.0,F,9-Jun,235.0,13-May-75,26.0,3,Marquette,1544,,81
4,1610612737,2000,0,Dion Glover,Dion,dion-glover,5.0,G,5-Jun,228.0,22-Oct-78,22.0,1,Georgia Tech,1901,,77


## Lagging and Averaging the Data 

In [10]:
# Season to build features for, typed interactively when this cell runs.
# NOTE(review): input() returns a str (e.g. "2023"), not an int. The value is handed
# unchanged to averaged_player_data in the next cell; whether that helper converts
# it is not visible from this notebook - confirm before changing the type here.
upcoming_season = input() # type the year as digits; it is stored as a string
# example, in July 2023, I would type 2023

 2023


In [11]:
# Build the averaged player table `t1` with the project-local helper.
# Per the helper's own log output, it merges season labels onto the player rows via
# the game-id frame, averages by team, player and season IDs, and adds next-season
# rosters. The implementation itself is not visible in this notebook.
print("BEGIN: averaged_player_data.py")
t1 = averaged_player_data.averaged_player_data(bsff,gids,upcoming_season,seasons,rs)
print("END: averaged_player_data.py")

BEGIN: averaged_player_data.py
player data shape: (682841, 18)
merging seasons to the player data via game_id data frame
dropping duplicates
new shape (should match the old one): (682841, 19)
columns to drop:
['GAME_ID', 'TEAM_ABBREVIATION', 'TEAM_CITY', 'PLAYER_NAME', 'NICKNAME', 'START_POSITION', 'COMMENT']
columns to average:
['MIN', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT', 'OREB_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT']
Formatting MIN column to float
creating ID column
averaging by team, player, and season IDs
adding in next season rosters
END: averaged_player_data.py


In [12]:
# Visual check of `t1`: (rows, columns) first, then the first five rows.
print(t1.shape)
t1.head(n=5)

(13306, 10)


Unnamed: 0,ID,MIN,EFG_PCT,FTA_RATE,TM_TOV_PCT,OREB_PCT,OPP_EFG_PCT,OPP_FTA_RATE,OPP_TOV_PCT,OPP_OREB_PCT
0,1610612737_101107_2005-06,23.317073,0.486924,0.382937,0.160544,0.308405,0.508468,0.422709,0.162481,0.291899
1,1610612737_101107_2006-07,33.515625,0.475203,0.352078,0.171031,0.295438,0.510625,0.378469,0.161891,0.273859
2,1610612737_101107_2007-08,33.304878,0.48575,0.338325,0.152975,0.28005,0.508225,0.2833,0.143688,0.278175
3,1610612737_101107_2008-09,32.730159,0.503902,0.328082,0.138852,0.239197,0.505361,0.269393,0.147197,0.278344
4,1610612737_101107_2009-10,29.682927,0.509654,0.274,0.12321,0.26,0.494864,0.257247,0.147123,0.258037


In [13]:
# Turn the averaged player table into last-season features, stored as `t2`.
# Per the helper's log output, it averages the rows of players who were traded,
# sorts the data, and lags the stat columns. The implementation is project-local
# and not visible in this notebook.
print("BEGIN: lagged_data.py")
t2 = lagged_data.lagged_data(t1)
print("END: lagged_data.py")

BEGIN: lagged_data.py
averaged data shape: (13306, 10)
creating the team, season, and player id
averaging data for those players who were traded
no averaging columns:
['ID', 'TEAM_ID', 'PLAYER_ID', 'SEASON_YEAR']
creating the average
recreating SEASON_YEAR and PLAYER_ID
sorting data
lag columns and PLAYER_ID:
['PLAYER_ID', 'MIN', 'EFG_PCT', 'FTA_RATE', 'TM_TOV_PCT', 'OREB_PCT', 'OPP_EFG_PCT', 'OPP_FTA_RATE', 'OPP_TOV_PCT', 'OPP_OREB_PCT']
no lag columns
Index(['PLAYER_SEASON', 'SEASON_YEAR'], dtype='object')
lagging the data
adding LAST_YEAR_ prefix to column names
adding back the other columns
recreating PLAYER_ID
adding back the previous year team
merging
END: lagged_data.py


In [14]:
# Visual check of `t2`: (rows, columns) first, then the first five rows.
print(t2.shape)
t2.head(n=5)

(11770, 13)


Unnamed: 0,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,PLAYER_SEASON,SEASON_YEAR,PLAYER_ID,TEAM_ID
0,,,,,,,,,,1000_2000-01,2000-01,1000,1610612745
1,28.792683,0.493671,0.29572,0.16439,0.265585,0.487732,0.271976,0.136244,0.271976,1000_2001-02,2001-02,1000,1610612752
2,18.914634,0.456768,0.303915,0.186793,0.265585,0.454012,0.373866,0.156902,0.278415,1000_2002-03,2002-03,1000,1610612752
3,20.560976,0.483354,0.267061,0.165402,0.255366,0.476524,0.376915,0.154976,0.274817,1000_2003-04,2003-04,1000,1610612752
4,23.512195,0.463612,0.405588,0.178038,0.258487,0.45985,0.393775,0.146913,0.297425,1000_2004-05,2004-05,1000,1610612752


In [15]:
# Roll the lagged player rows up to team level, stored as `t3`.
# Per the helper's log output, it builds the ID columns and averages by TEAM_SEASON.
# The implementation is project-local and not visible in this notebook.
print("BEGIN: averaged_team_data.py")
t3 = averaged_team_data.averaged_team_data(t2)
print("END: averaged_team_data.py")

BEGIN: averaged_team_data.py
creating ids and dropping old ones
averaging by TEAM_SEASON
END: averaged_team_data.py


In [16]:
# Visual check of `t3`: (rows, columns) first, then the first five rows.
print(t3.shape)
t3.head(n=5)

(716, 10)


Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT
0,1610612737_2000-01,,,,,,,,,
1,1610612737_2001-02,18.097566,0.461098,0.324734,0.163691,0.266759,0.476202,0.355158,0.153661,0.286407
2,1610612737_2002-03,17.498683,0.463597,0.29443,0.170798,0.251704,0.485525,0.330209,0.153059,0.297139
3,1610612737_2003-04,16.50564,0.466556,0.317508,0.166936,0.281549,0.483793,0.324815,0.145321,0.289475
4,1610612737_2004-05,13.491047,0.433552,0.334024,0.167767,0.246573,0.492566,0.366926,0.166127,0.280384


## Adding in Unlagged and Unaveraged Variables 

### Height, Weight, and Experience 

In [17]:
print("adding height, weight, and experience")

# Attach the season label to each roster row: rs.SEASON is matched to seasons.y1.
rs2 = rs.merge(seasons, left_on='SEASON', right_on='y1', how='left')
rs2['TEAM_SEASON'] = rs2['TeamID'].astype(str) + '_' + rs2['id']

# Rookies are coded 'R' in EXP; treat them as 0 before casting to float.
rs2['EXP'] = rs2['EXP'].where(rs2['EXP'] != 'R', 0).astype(float)

# Mean height, weight and experience per team-season, joined onto the team table.
rsh = rs2.groupby('TEAM_SEASON', as_index=False)[['HEIGHT_INCHES', 'WEIGHT', 'EXP']].mean()
t4 = t3.merge(rsh, on='TEAM_SEASON', how='left')

adding height, weight, and experience


In [18]:
# Visual check of `t4`: (rows, columns) first, then the first five rows.
print(t4.shape)
t4.head(n=5)

(716, 13)


Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP
0,1610612737_2000-01,,,,,,,,,,79.333333,220.666667,2.583333
1,1610612737_2001-02,18.097566,0.461098,0.324734,0.163691,0.266759,0.476202,0.355158,0.153661,0.286407,80.0,224.615385,3.153846
2,1610612737_2002-03,17.498683,0.463597,0.29443,0.170798,0.251704,0.485525,0.330209,0.153059,0.297139,79.1875,224.5625,4.125
3,1610612737_2003-04,16.50564,0.466556,0.317508,0.166936,0.281549,0.483793,0.324815,0.145321,0.289475,80.133333,227.0,4.0
4,1610612737_2004-05,13.491047,0.433552,0.334024,0.167767,0.246573,0.492566,0.366926,0.166127,0.280384,79.769231,225.923077,5.076923


### Last Season's Win Percentage 

In [19]:
print("creating TEAM_SEASON in the playoff wins data")
print('create next season id')

# YEAR looks like '2000-01'; its first part is the season's starting year.
years = pwins['YEAR'].str.split('-')
start_year = years.str[0].astype(int)

# Label of the FOLLOWING season: next starting year plus the two-digit year after it,
# e.g. '2000-01' -> y1=2001, y2='02' -> '<TEAM_ID>_2001-02'.
y1 = (start_year + 1).tolist()
y2 = (start_year + 2).astype(str).str[2:4].tolist()

# These columns are added to `pwins` itself, and the same frame is used again by
# later cells, so the in-place additions and the re-sort are kept.
pwins['y1'] = y1
pwins['y2'] = y2
pwins['TEAM_NEXT_SEASON'] = (
    pwins['TEAM_ID'].astype(str) + '_' + pwins['y1'].astype(str) + '-' + pwins['y2'].astype(str)
)
pwins = pwins.sort_values('TEAM_NEXT_SEASON')

# Joining on the next-season label makes each row's WIN_PCT the previous season's value.
t5 = (
    t4.merge(
        pwins[['WIN_PCT', 'TEAM_NEXT_SEASON']],
        how='left',
        left_on='TEAM_SEASON',
        right_on='TEAM_NEXT_SEASON',
    )
    .drop(columns='TEAM_NEXT_SEASON')
    .rename(columns={'WIN_PCT': 'LS_WIN_PCT'})
)

creating TEAM_SEASON in the playoff wins data
create next season id


In [20]:
# Visual check of `t5`: (rows, columns) first, then the first five rows.
print(t5.shape)
t5.head(n=5)

(716, 14)


Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP,LS_WIN_PCT
0,1610612737_2000-01,,,,,,,,,,79.333333,220.666667,2.583333,
1,1610612737_2001-02,18.097566,0.461098,0.324734,0.163691,0.266759,0.476202,0.355158,0.153661,0.286407,80.0,224.615385,3.153846,0.305
2,1610612737_2002-03,17.498683,0.463597,0.29443,0.170798,0.251704,0.485525,0.330209,0.153059,0.297139,79.1875,224.5625,4.125,0.402
3,1610612737_2003-04,16.50564,0.466556,0.317508,0.166936,0.281549,0.483793,0.324815,0.145321,0.289475,80.133333,227.0,4.0,0.427
4,1610612737_2004-05,13.491047,0.433552,0.334024,0.167767,0.246573,0.492566,0.366926,0.166127,0.280384,79.769231,225.923077,5.076923,0.341


## Adding the Target Variable: Playoff Wins 

In [21]:
# Attach the target variable with the project-local helper, stored as `t6`.
# Per the helper's log output, it creates TEAM_SEASON in the playoff-wins data and
# merges it with the lagged data set. The implementation is not visible here.
# Note: `pwins` is not the frame as read from disk - the "Last Season's Win
# Percentage" cell added y1, y2 and TEAM_NEXT_SEASON to it and re-sorted it.
print("BEGIN: adding_pwins.py")
t6 = adding_pwins.adding_pwins(t5,pwins)
print("END: adding_pwins.py")

BEGIN: adding_pwins.py
creating TEAM_SEASON in the playoff wins data
merging with the lagged data set
END: adding_pwins.py


In [22]:
# Visual check of `t6`: (rows, columns) first, then the first five rows.
print(t6.shape)
t6.head(n=5)

(716, 15)


Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP,LS_WIN_PCT,PO_WINS
0,1610612737_2000-01,,,,,,,,,,79.333333,220.666667,2.583333,,0.0
1,1610612737_2001-02,18.097566,0.461098,0.324734,0.163691,0.266759,0.476202,0.355158,0.153661,0.286407,80.0,224.615385,3.153846,0.305,0.0
2,1610612737_2002-03,17.498683,0.463597,0.29443,0.170798,0.251704,0.485525,0.330209,0.153059,0.297139,79.1875,224.5625,4.125,0.402,0.0
3,1610612737_2003-04,16.50564,0.466556,0.317508,0.166936,0.281549,0.483793,0.324815,0.145321,0.289475,80.133333,227.0,4.0,0.427,0.0
4,1610612737_2004-05,13.491047,0.433552,0.334024,0.167767,0.246573,0.492566,0.366926,0.166127,0.280384,79.769231,225.923077,5.076923,0.341,0.0


## Adding the Team Name for Checking Purposes 

In [23]:
print('adding in team name just to check stuff')
t7 = t6.copy()

# TEAM_SEASON is '<TEAM_ID>_<season>'; keep the part before the underscore (as a string).
ids = t7['TEAM_SEASON'].str.split('_')
t7['TEAM_ID'] = ids.str[0]

# One (TEAM_ID, TEAM_NAME) pair per row, both as strings so the IDs match t7['TEAM_ID'].
pw = pwins[['TEAM_ID', 'TEAM_NAME']].astype(str).drop_duplicates()

# Drop the names listed in old_teams, plus the 'Hornets' name for team 1610612740,
# so those IDs do not match more than one name in the merge below.
old_teams = ['Bobcats','SuperSonics']
is_old_name = pw['TEAM_NAME'].isin(old_teams)
is_hornets_1610612740 = (pw['TEAM_NAME'] == 'Hornets') & (pw['TEAM_ID'] == '1610612740')
pw = pw[~(is_old_name | is_hornets_1610612740)]

t7 = t7.merge(pw, on='TEAM_ID', how='left')

adding in team name just to check stuff


## Finalizing and Checking 

In [24]:
print('dropping the teams first records')
# Keep only rows that have a lagged-minutes value; rows without one are removed.
t8 = t7[t7['LS_MIN'].notna()]

dropping the teams first records


In [25]:
# Visual check of `t8`: (rows, columns) first, then the first five rows.
print(t8.shape)
t8.head(n=5)

(687, 17)


Unnamed: 0,TEAM_SEASON,LS_MIN,LS_EFG_PCT,LS_FTA_RATE,LS_TM_TOV_PCT,LS_OREB_PCT,LS_OPP_EFG_PCT,LS_OPP_FTA_RATE,LS_OPP_TOV_PCT,LS_OPP_OREB_PCT,HEIGHT_INCHES,WEIGHT,EXP,LS_WIN_PCT,PO_WINS,TEAM_ID,TEAM_NAME
1,1610612737_2001-02,18.097566,0.461098,0.324734,0.163691,0.266759,0.476202,0.355158,0.153661,0.286407,80.0,224.615385,3.153846,0.305,0.0,1610612737,Hawks
2,1610612737_2002-03,17.498683,0.463597,0.29443,0.170798,0.251704,0.485525,0.330209,0.153059,0.297139,79.1875,224.5625,4.125,0.402,0.0,1610612737,Hawks
3,1610612737_2003-04,16.50564,0.466556,0.317508,0.166936,0.281549,0.483793,0.324815,0.145321,0.289475,80.133333,227.0,4.0,0.427,0.0,1610612737,Hawks
4,1610612737_2004-05,13.491047,0.433552,0.334024,0.167767,0.246573,0.492566,0.366926,0.166127,0.280384,79.769231,225.923077,5.076923,0.341,0.0,1610612737,Hawks
5,1610612737_2005-06,21.489895,0.463531,0.327702,0.170385,0.274347,0.510406,0.36932,0.144927,0.280918,78.846154,223.615385,1.923077,0.159,0.0,1610612737,Hawks


In [35]:
# Work on a copy so the modeling table `t8` is left untouched.
check1 = t8.copy()

# TEAM_SEASON is '<TEAM_ID>_<season>'; the season label is the second piece.
ids = t8['TEAM_SEASON'].str.split('_')
sn = ids.str[1].tolist()
check1['sn'] = sn

# The season label is '<start>-<end>'; keep the starting year (still a string).
sn1 = check1['sn'].str.split('-')
sn2 = sn1.str[0].tolist()
check1['yr'] = sn2

# Order by starting year; yr holds strings, so this is a string sort.
check1 = check1.sort_values('yr')

In [36]:
# Keep only team-seasons with more than 15 playoff wins, and only the columns to eyeball.
# NOTE(review): presumably this is meant to list champions; if some seasons' champions
# needed fewer than 16 wins, `> 15` silently leaves them out - confirm the threshold.
check1 = check1.loc[check1['PO_WINS'].gt(15), ['TEAM_NAME', 'PO_WINS', 'yr']]
check1

Unnamed: 0,TEAM_NAME,PO_WINS,yr
528,Spurs,16.0,2002
673,Pistons,16.0,2003
530,Spurs,16.0,2004
267,Heat,16.0,2005
532,Spurs,16.0,2006
31,Celtics,16.0,2007
246,Lakers,16.0,2008
247,Lakers,16.0,2009
128,Mavericks,16.0,2010
273,Heat,16.0,2011


In [None]:
# Export of the final modeling table.
# The original cell printed "WRITING: ..." while the to_csv call was commented out,
# so the log claimed a write that never happened. The switch below keeps the write
# disabled by default and makes the log say what was actually done.
WRITE_MODELING_DATA = False  # set to True to (re)write modeling_data.csv
MODELING_DATA_PATH = '/users/jordanwegner/Desktop/nba2/03_data/modeling_data.csv'

if WRITE_MODELING_DATA:
    print("WRITING: " + MODELING_DATA_PATH)
    # index=False: write the columns only, without the DataFrame index.
    t8.to_csv(MODELING_DATA_PATH, index=False)
else:
    print("SKIPPED WRITING: " + MODELING_DATA_PATH)