In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
raw_df_match_details = pd.read_csv('../data/raw/match_details.csv')
raw_df_player_attr = pd.read_csv('../data/raw/player_attributes.csv')

In [3]:
# raw_df_match_details.info()

In [4]:
# raw_df_player_attr.info()

In [5]:
raw_df_match_details = (raw_df_match_details
 .assign(date=pd.to_datetime(raw_df_match_details['date']),
         result_match=lambda df: (df['result_match'] == 'H').astype(int))
 )

In [6]:
raw_df_match_details['date'] = pd.to_datetime(raw_df_match_details['date'])
raw_df_player_attr['date'] = pd.to_datetime(raw_df_player_attr['date'])
raw_df_match_details.sort_values(by=['season', 'date'],inplace=True)
# raw_df_match_details.drop('stage', axis=1, inplace=True)

In [7]:
from playerstats import player_stats

players_cols = ['{}_player_{}'.format(team, i) for team in ['home', 'away'] for i in range(1, 12)]

player_stats_dict_series = raw_df_match_details.apply(
    lambda row: player_stats.calculate_player_stat(
        match_row=row,
        df_matches=raw_df_match_details,
        df_player_attr=raw_df_player_attr,
        players=players_cols
    ),
    axis=1
)

new_player_stats_df = pd.json_normalize(player_stats_dict_series)

df = pd.merge(raw_df_match_details, new_player_stats_df, how='left', on='match_api_id')
df.drop(players_cols, axis=1, inplace=True)
matching_columns = [col for col in df.columns if 'player_rating' in col]
df.dropna(subset=matching_columns, inplace=True)
matching_columns_filtered = [c for c in df.columns if c != "date" and c != "season"]
df[matching_columns_filtered] = df[matching_columns_filtered].astype('int16')

In [23]:
df_=df.copy()

In [24]:
home_df = df_[['match_api_id', 'season', 'stage', 'date', 'home_team', 'home_team_goal', 'away_team_goal',
             'result_match', 'home_shoton', 'home_possession'] +
             [col for col in df_.columns if 'player_rating_home_player_' in col]]
home_df = home_df.rename(columns={
    'home_team': 'team',
    'home_team_goal': 'team_goal',
    'away_team_goal': 'opponent_goal',
    'home_shoton': 'team_shoton',
    'home_possession': 'team_possession'
})
home_df['is_home'] = 1

away_df = df_[['match_api_id', 'season', 'stage', 'date', 'away_team', 'home_team', 'home_team_goal', 'away_team_goal',
             'result_match', 'away_shoton', 'away_possession'] +
             [col for col in df_.columns if 'player_rating_away_player_' in col]]
away_df = away_df.rename(columns={
    'away_team': 'team',
    'away_team_goal': 'team_goal',
    'home_team_goal': 'opponent_goal',
    'away_shoton': 'team_shoton',
    'away_possession': 'team_possession'
})
away_df['is_home'] = 0

team_df = pd.concat([home_df, away_df], ignore_index=True)
team_df = team_df.sort_values(by=['team', 'date']).reset_index(drop=True)
team_df.head()

Unnamed: 0,match_api_id,season,stage,date,team,team_goal,opponent_goal,result_match,team_shoton,team_possession,...,player_rating_away_player_2,player_rating_away_player_3,player_rating_away_player_4,player_rating_away_player_5,player_rating_away_player_6,player_rating_away_player_7,player_rating_away_player_8,player_rating_away_player_9,player_rating_away_player_10,player_rating_away_player_11
0,3221,2009/2010,1,2009-08-15,8191,0,2,1,0,0,...,66.0,66.0,66.0,66.0,66.0,56.0,68.0,69.0,61.0,70.0
1,3229,2009/2010,2,2009-08-19,8191,1,0,1,10,52,...,,,,,,,,,,
2,3241,2009/2010,3,2009-08-23,8191,1,0,1,2,36,...,,,,,,,,,,
3,3252,2009/2010,4,2009-08-29,8191,0,3,1,5,61,...,68.0,68.0,68.0,68.0,68.0,70.0,56.0,66.0,69.0,61.0
4,3253,2009/2010,5,2009-09-12,8191,0,4,1,2,36,...,68.0,68.0,68.0,68.0,68.0,71.0,59.0,68.0,70.0,64.0


In [25]:
features_to_shift = ['team_shoton', 'team_possession'] + [col for col in team_df.columns if 'player_rating_' in col]
team_df_shifted = team_df.groupby('team')[features_to_shift].shift(1)

home_prev = team_df[team_df['is_home'] == 1][['match_api_id']].copy()
home_prev = home_prev.merge(team_df_shifted, left_index=True, right_index=True)
home_prev = home_prev.rename(columns=lambda x: 'home_prev_' + x if x != 'match_api_id' else x)

away_prev = team_df[team_df['is_home'] == 0][['match_api_id']].copy()
away_prev = away_prev.merge(team_df_shifted, left_index=True, right_index=True)
away_prev = away_prev.rename(columns=lambda x: 'away_prev_' + x if x != 'match_api_id' else x)

df_ = df_.merge(home_prev, on='match_api_id', how='left')

df_ = df_.merge(away_prev, on='match_api_id', how='left')

original_home_features = ['home_shoton', 'home_possession'] + [
    col for col in df_.columns if col.startswith('player_rating_home_player_')
]

original_away_features = ['away_shoton', 'away_possession'] + [
    col for col in df_.columns if col.startswith('player_rating_away_player_')
]

df_final = df_.drop(columns=original_home_features + original_away_features)

In [26]:
df_final.head()

Unnamed: 0,match_api_id,season,stage,date,away_team,home_team,home_team_goal,away_team_goal,result_match,home_prev_team_shoton,...,away_prev_player_rating_away_player_2,away_prev_player_rating_away_player_3,away_prev_player_rating_away_player_4,away_prev_player_rating_away_player_5,away_prev_player_rating_away_player_6,away_prev_player_rating_away_player_7,away_prev_player_rating_away_player_8,away_prev_player_rating_away_player_9,away_prev_player_rating_away_player_10,away_prev_player_rating_away_player_11
0,30291,2008/2009,1,2008-08-16,8659,9825,1,0,1,,...,,,,,,,,,,
1,30292,2008/2009,1,2008-08-16,8650,8472,0,1,0,,...,,,,,,,,,,
2,30293,2008/2009,1,2008-08-16,8528,8654,2,1,1,,...,,,,,,,,,,
3,30295,2008/2009,1,2008-08-16,8655,8668,2,3,0,,...,,,,,,,,,,
4,30296,2008/2009,1,2008-08-16,8586,8549,2,1,1,,...,,,,,,,,,,
