In [1]:
import pandas as pd
import numpy as np
import re
import datetime

In [2]:
# Load the injury/player dataset from a CSV file located relative to the
# notebook's working directory.
df = pd.read_csv('df_injury_player_data.csv')

In [3]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,p_id2,start_year,season_days_injured,total_days_injured,season_minutes_played,season_games_played,season_matches_in_squad,total_minutes_played,total_games_played,dob,height_cm,weight_kg,nationality,work_rate,pace,physic,fifa_rating,position
0,aaronconnolly,2019,13,161,1312.0,24,28,2148.0,41,2000-01-28,175.333333,75.666667,Republic of Ireland,Medium/Low,72.333333,58.000000,63.000000,Forward
1,aaronconnolly,2020,71,161,836.0,17,28,2148.0,41,2000-01-28,175.333333,75.666667,Republic of Ireland,Medium/Low,72.333333,58.000000,63.000000,Forward
2,aaroncresswell,2016,95,226,2247.0,26,27,13368.0,149,1989-12-15,171.666667,66.000000,England,High/Medium,74.333333,67.000000,75.333333,Defender
3,aaroncresswell,2018,87,226,1680.0,20,27,13368.0,149,1989-12-15,171.666667,66.000000,England,High/Medium,74.333333,67.000000,75.333333,Defender
4,aaroncresswell,2019,35,226,2870.0,31,31,13368.0,149,1989-12-15,171.666667,66.000000,England,High/Medium,74.333333,67.000000,75.333333,Defender
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296,yoshinorimuto,2019,46,210,288.0,8,17,915.0,25,1992-07-15,178.500000,72.000000,Japan,High/High,77.500000,58.500000,74.333333,Forward
1297,yvesbissouma,2018,124,124,1880.0,28,34,6477.0,86,1996-08-30,182.000000,80.000000,Mali,High/Medium,73.750000,67.000000,73.000000,Midfielder
1298,zacksteffen,2020,30,307,95.0,1,35,95.0,1,1995-04-02,191.000000,84.800000,United States,Medium/Medium,,,72.000000,Goalkeeper
1299,zlatanibrahimovic,2016,210,579,2577.0,28,28,2699.0,33,1981-10-03,195.000000,95.000000,Sweden,Medium/Low,62.833333,81.166667,86.666667,Forward


In [4]:
# Replace the current index with a fresh 0..n-1 range, discarding the old one.
df = df.reset_index(drop=True)

In [5]:
# Age at the beginning of the season, taken as start_year minus the birth year
# (the first four characters of the dob string); month and day are ignored.
birth_year = df['dob'].str.slice(stop=4).astype(int)
df['age'] = df['start_year'] - birth_year

In [6]:
# Running total of all earlier rows (used for cumulative minutes and games played)
def calculate_cumulative(x):
    """Return, for each row of ``x``, the sum of the values in the rows before it.

    The series is shifted down by one position so a row never contributes to
    its own total, then an expanding sum is taken. The first row has no
    earlier rows and is therefore NaN.
    """
    earlier_values = x.shift(1)
    growing_window = earlier_values.expanding()
    return growing_window.sum()

In [7]:
# Minutes per game over all of a player's earlier rows: build per-player
# running totals of minutes and games (each excluding the current row), then
# take their ratio.
df['cumulative_minutes_played'] = df.groupby('p_id2')['season_minutes_played'].transform(calculate_cumulative)
df['cumulative_games_played'] = df.groupby('p_id2')['season_games_played'].transform(calculate_cumulative)
df['minutes_per_game_prev_seasons'] = df['cumulative_minutes_played'].div(df['cumulative_games_played'])

In [8]:
# Running average of all earlier rows of a Series
def calculate_average(x):
    """Return, for each row of ``x``, the mean of the values in the rows before it.

    The series is shifted down by one position so a row never contributes to
    its own average, then an expanding mean is taken. The first row has no
    earlier rows and is therefore NaN.
    """
    earlier_values = x.shift(1)
    growing_window = earlier_values.expanding()
    return growing_window.mean()

In [9]:
# Per-player average of season_days_injured over the player's earlier rows
# (the current row is excluded by calculate_average).
df['avg_days_injured_prev_seasons'] = df.groupby('p_id2')['season_days_injured'].transform(calculate_average)


In [10]:
# Per-player average of season_games_played over the player's earlier rows
# (the current row is excluded by calculate_average).
df['avg_games_per_season_prev_seasons'] = df.groupby('p_id2')['season_games_played'].transform(calculate_average)


In [11]:
# Body Mass Index (BMI): weight in kg divided by the square of height in metres.
height_m = df['height_cm'] / 100
df['bmi'] = df['weight_kg'] / height_m ** 2


In [12]:
# Convert work rate to a numeric score: every '/'-separated level is looked up
# (Low = 1, Medium = 1.5, High = 2) and the looked-up values are summed.
work_rate_map = {'Low': 1, 'Medium': 1.5, 'High': 2}
df['work_rate_numeric'] = df['work_rate'].apply(
    lambda rate: sum(work_rate_map[level] for level in rate.split('/'))
)


In [13]:
# Encode position as an integer code:
# Goalkeeper = 0, Defender = 1, Forward = 2, Midfielder = 3.
# Values that are not keys of the mapping (including NaN) become NaN.
pos_order = {
    position: code
    for code, position in enumerate(['Goalkeeper', 'Defender', 'Forward', 'Midfielder'])
}
df['position_numeric'] = df['position'].map(pos_order)


In [15]:
# Cut-off compared (with >=) against season_days_injured when flagging a
# significant injury in the previous season.
injury_threshold = 120

In [16]:
# Flag whether the player's previous row (within the same p_id2 group) reached
# injury_threshold days injured. injury_threshold is defined in an earlier cell.
# Rows with no previous row for that player stay NaN instead of being set to 0.
prev_season_injury = df.groupby('p_id2')['season_days_injured'].shift(1)
reached_threshold = prev_season_injury.ge(injury_threshold).astype(int)
df['significant_injury_prev_season'] = reached_threshold.where(prev_season_injury.notna())
# Display only: astype returns a new Series and the result is not assigned,
# so the column stored in df keeps its float dtype.
df['significant_injury_prev_season'].astype('category')

  output = repr(obj)


0       NaN
1       0.0
2       NaN
3       0.0
4       0.0
       ... 
1296    0.0
1297    NaN
1298    NaN
1299    NaN
1300    1.0
Name: significant_injury_prev_season, Length: 1301, dtype: category
Categories (2, float64): [0.0, 1.0]

In [17]:
# Calculate cumulative days injured and days injured in the previous season.
# The running total is built from the per-season figure (season_days_injured).
# total_days_injured must not be accumulated: in the displayed rows it is the
# same value on every row of a player, so summing it over earlier rows yields
# multiples of that total and carries information from later seasons.
df['cumulative_days_injured'] = df.groupby('p_id2')['season_days_injured'].apply(calculate_cumulative).reset_index(level=0, drop=True)
# Days injured in the player's previous row; NaN when there is none.
df['season_days_injured_prev_season'] = df.groupby('p_id2')['season_days_injured'].shift(1)


In [18]:
# Turn positive and negative infinity (e.g. the result of dividing by a zero
# denominator) into NaN so they are treated as missing values.
df = df.replace([np.inf, -np.inf], np.nan)

In [19]:
# List the distinct values of the position column (NaN included if present).
df['position'].unique()

array(['Forward', 'Defender', 'Midfielder', 'Goalkeeper', nan],
      dtype=object)

In [20]:
# Count missing values per column.
df.isna().sum()

p_id2                                  0
start_year                             0
season_days_injured                    0
total_days_injured                     0
season_minutes_played                  0
season_games_played                    0
season_matches_in_squad                0
total_minutes_played                   0
total_games_played                     0
dob                                    0
height_cm                              0
weight_kg                              0
nationality                            0
work_rate                              0
pace                                  95
physic                                95
fifa_rating                            0
position                               2
age                                    0
cumulative_minutes_played            604
cumulative_games_played              604
minutes_per_game_prev_seasons        616
avg_days_injured_prev_seasons        604
avg_games_per_season_prev_seasons    604
bmi             

In [21]:
# Write the engineered dataset to disk without the index column.
df.to_csv('dataset.csv', index=False)