In [1]:
import pandas as pd
import numpy as np

In [8]:
# Load the injury log from the zipped CSV; the 'Date' column is parsed
# into datetimes so the .dt accessors used later work.
injuries = pd.read_csv(
    'datasets/injuries_2010-2020.csv.zip',
    parse_dates=['Date'],
)

In [9]:
# Keep only the rows that name a relinquished player (rows where
# 'Relinquished' is missing are dropped).
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignments performed on `injuries` in later cells cannot raise
# SettingWithCopyWarning or be applied to a temporary.
injuries = injuries.dropna(subset=['Relinquished']).copy()

In [10]:
# Display the filtered frame (pandas truncates the rendering to head/tail rows).
injuries

Unnamed: 0,Date,Team,Acquired,Relinquished,Notes
0,2010-10-03,Bulls,,Carlos Boozer,fractured bone in right pinky finger (out inde...
1,2010-10-06,Pistons,,Jonas Jerebko,torn right Achilles tendon (out indefinitely)
2,2010-10-06,Pistons,,Terrico White,broken fifth metatarsal in right foot (out ind...
3,2010-10-08,Blazers,,Jeff Ayres,torn ACL in right knee (out indefinitely)
4,2010-10-08,Nets,,Troy Murphy,strained lower back (out indefinitely)
...,...,...,...,...,...
27097,2020-09-22,Celtics,,Romeo Langford,surgery on right wrist (out for season)
27098,2020-09-23,Heat,,Gabe Vincent,sore right knee (DTD)
27099,2020-09-30,Heat,,Bam Adebayo,strained left shoulder (DTD)
27101,2020-10-02,Heat,,Bam Adebayo,strained neck (DTD)


In [11]:
# Twenty most frequent free-text notes, most common first.
injuries['Notes'].value_counts().head(20)

placed on IL                              3389
sprained left ankle (DNP)                  362
rest (DTD)                                 286
placed on IL with illness                  274
placed on IL with sprained left ankle      230
illness (DTD)                              228
placed on IL with sprained right ankle     206
sprained left ankle (DTD)                  188
sprained right ankle (DNP)                 175
placed on IL for rest                      175
sprained right ankle (DTD)                 166
sore left knee (DNP)                       144
placed on IL with sore left knee           128
rest (DNP)                                 119
placed on IL with sore right knee          119
sore right knee (DNP)                      109
sore left knee (DTD)                       109
illness (DNP)                              100
sore right knee (DTD)                       97
concussion (DNP)                            96
Name: Notes, dtype: int64

In [12]:
# Flag notes that mention a season-ending or an indefinite absence.
# - regex=False: the patterns are plain phrases, so match them literally.
# - na=False: a missing note counts as "not flagged"; without it str.contains
#   yields NaN for such rows and the column becomes object dtype instead of
#   bool, which makes the later per-player sums unreliable.
injuries['out_for_season'] = injuries['Notes'].str.contains(
    'out for season', regex=False, na=False)
injuries['out_indefinitely'] = injuries['Notes'].str.contains(
    'out indefinitely', regex=False, na=False)

# Calendar year and month of each entry.
injuries['year'] = injuries['Date'].dt.year
injuries['month'] = injuries['Date'].dt.month

In [13]:
# Re-label the calendar year as a season-ending year: entries dated
# January-June keep their year, entries dated July-December are assigned to
# the following year.
# The value is derived from 'Date' rather than from the existing 'year'
# column so that re-running this cell does not shift the year again.
# NOTE(review): with this rule, entries dated Jul-Dec 2020 are labelled 2021;
# confirm that this is the intended season for those rows.
injuries['year'] = np.where(injuries['Date'].dt.month <= 6,
                            injuries['Date'].dt.year,
                            injuries['Date'].dt.year + 1)

In [14]:
# Count, per player and year, how many entries were flagged "out for season"
# and "out indefinitely".
# The player name is stripped of surrounding whitespace BEFORE grouping:
# stripping after the groupby would leave names that differ only by
# whitespace in separate groups, producing duplicate (player, year) rows.
injuries_by_year = (
    injuries
    .groupby([injuries['Relinquished'].str.strip().rename('player'), 'year'])
    .agg(out_for_season=('out_for_season', 'sum'),
         out_indefinitely=('out_indefinitely', 'sum'))
    .reset_index()
)


In [15]:
# Order rows by player, then chronologically within each player.
injuries_by_year = injuries_by_year.sort_values(by=['player', 'year'])

In [16]:
# Collapse the per-year count into a 0/1 indicator: 1 when the player had at
# least one "out for season" entry that year, 0 otherwise.
injuries_by_year['out_for_season'] = (
    injuries_by_year['out_for_season'].ge(1).astype(int)
)

In [25]:
# Distinct years and distinct player names present in the aggregated table.
years = injuries_by_year['year'].unique()
players = injuries_by_year['player'].unique()

In [38]:
# Build the full player x year grid: one row for every combination, with the
# whole sequence of years listed under each player in turn.
players_list = np.repeat(players, len(years))
years_list = [yr for _ in players for yr in years]
joined_df = pd.DataFrame(
    {
        'player': players_list,
        'year': years_list,
    }
)

In [41]:
# Number of player rows per year, largest first.
injuries_by_year['year'].value_counts()

2018    466
2017    416
2016    401
2015    401
2011    392
2013    382
2020    379
2019    372
2012    366
2014    365
2021    137
Name: year, dtype: int64

In [43]:
# Left-join the observed counts onto the complete player x year grid.
# Combinations with no recorded entry come back as NaN and are set to 0;
# the result is ordered by player and year.
injuries_by_year = (
    joined_df
    .merge(injuries_by_year, how='left', on=['player', 'year'])
    .fillna(0)
    .sort_values(['player', 'year'])
)

In [45]:
# Running totals of both indicators within each player, following the current
# row order of injuries_by_year; the columns get an 'acum_' prefix.
acum_injuries = (
    injuries_by_year
    .groupby(['player'])[['out_for_season', 'out_indefinitely']]
    .cumsum()
    .rename(columns={'out_for_season': 'acum_out_for_season',
                     'out_indefinitely': 'acum_out_indefinitely'})
)

In [46]:
# Display the cumulative-sum frame (indexed like injuries_by_year).
acum_injuries

Unnamed: 0,acum_out_for_season,acum_out_indefinitely
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,0.0
...,...,...
12696,0.0,0.0
12697,0.0,0.0
12695,0.0,0.0
12704,0.0,0.0


In [47]:
# Attach the cumulative columns to the per-year table by matching row index.
full_injuries = injuries_by_year.merge(
    acum_injuries,
    how='left',
    left_index=True,
    right_index=True,
)

In [49]:
# Spot-check: show every row whose player name contains 'Anthony Davis'.
full_injuries.loc[full_injuries['player'].str.contains('Anthony Davis')]

Unnamed: 0,player,year,out_for_season,out_indefinitely,acum_out_for_season,acum_out_indefinitely
696,Anthony Davis,2011,0.0,0.0,0.0,0.0
697,Anthony Davis,2012,0.0,0.0,0.0,0.0
698,Anthony Davis,2013,0.0,1.0,0.0,1.0
699,Anthony Davis,2014,0.0,1.0,0.0,2.0
700,Anthony Davis,2015,0.0,0.0,0.0,2.0
701,Anthony Davis,2016,1.0,0.0,1.0,2.0
694,Anthony Davis,2017,1.0,0.0,2.0,2.0
695,Anthony Davis,2018,0.0,0.0,2.0,2.0
693,Anthony Davis,2019,1.0,1.0,3.0,3.0
702,Anthony Davis,2020,0.0,0.0,3.0,3.0


In [50]:
# Write the final table to disk; the row index is not written.
full_injuries.to_csv(
    'datasets/full_injuries.csv',
    index=False,
)