In [2]:
import pandas as pd
import numpy as np
import glob
import xml.etree.ElementTree as etree

# Load Data

In [3]:
filelocation = '/Users/matthiashugli/Dropbox/bucket/super-league-stats/*.xml'

In [86]:
all_events = []
id_iterator = 0
for filename in glob.glob(filelocation):
    prstree = etree.parse(filename)
    root = prstree.getroot()

    event_items = []
    event_subitems = []
    header = ['game_id', 'start', 'end', 'player', 'x_pos', 'y_pos', 'team', 'action', 'half']

    for r in root.iter('ALL_INSTANCES'):
        for inst in r.iter('instance'):
        
            start = inst.find('start').text
            end = inst.find('end').text
            player = inst.find('code').text

            if inst.find('pos_x') != None:
                x_pos = inst.find('pos_x').text
                y_pos = inst.find('pos_y').text
            
            event_items = [start, end, player, x_pos, y_pos]
            
            for label in inst.iter('label'):
                labels = label.find('text').text
                event_items.append(labels)

            all_events.append(event_items)
            event_items.insert(0, id_iterator)
    id_iterator += 1

df = pd.DataFrame(all_events, columns=header)

In [114]:
data_types_dict = {'start': float, 'end': float, 'x_pos': float, 'y_pos': float}
df = df.astype(data_types_dict)
df.head()

Unnamed: 0,game_id,start,end,player,x_pos,y_pos,team,action,half
0,0,0.0,14.26,17. Filip Stojilkovic,52.5,34.0,Sion,Passes accurate,1st half
1,0,0.0,19.61,32. Loris Benito Souto,39.1,59.9,Sion,Passes accurate,1st half
2,0,0.0,17.18,14. Anto Grgic,39.0,39.9,Sion,Passes accurate,1st half
3,0,4.01,24.01,52. Wesley,45.7,63.2,Sion,Passes accurate,1st half
4,0,4.26,5.26,Start,45.7,63.2,,,


In [115]:
# Create fixtures table with team one and team two. There is no clear distinction of Home and away team.

team_one = df.groupby(['game_id', 'team']).count().reset_index()[['game_id', 'team']] \
            .sort_values(['game_id', 'team']) \
                .drop_duplicates(subset='game_id', keep='last')

team_two = df.groupby(['game_id', 'team']).count().reset_index()[['game_id', 'team']] \
            .sort_values(['game_id', 'team']) \
                .drop_duplicates(subset='game_id', keep='first')

fixtures = team_one.merge(team_two, how='inner', left_on='game_id', right_on='game_id')
fixtures.head()

Unnamed: 0,game_id,team_x,team_y
0,0,Sion,Luzern
1,1,Servette,Luzern
2,2,Lugano,Grasshopper
3,3,St. Gallen,Lausanne Sport
4,4,Young Boys,Servette


In [116]:
# Action variables
shot_outcome_actions = ['Wide shot', 'Shot on target', 'Shot into the bar/post', 'Goals']
goal_situations = ['Assists', 'Goals', 'Free-kick attacks', 'penalty attack']
gk_actions = ['Wide shot (Goalkeepers)', 'Shot on target (saved)', 'Goals conceded']
passing_action = ['Passes accurate', 'Passes (inaccurate)', 'Assists']

In [120]:
shots = df[df['action'] == 'Shots'].reset_index()

action_before_shots = df.reset_index()
action_before_shots.rename(columns={'index': 'id'}, inplace=True)

for i in range(1, 5):
    action_before_shots['action-' + str(i)] = action_before_shots.id.shift(i)
    action_before_shots['action+' + str(i)] = action_before_shots.id.shift(-i)

melt_actions = pd.melt(action_before_shots, id_vars=action_before_shots.iloc[:,:9], value_vars=action_before_shots.iloc[:,10:])
melt_actions = melt_actions[melt_actions['action'] == 'Shots']
melt_actions.dropna(subset=['value'], inplace=True)
melt_actions.merge(action_before_shots, left_on='value', right_on='id')
melt_actions['value'] = melt_actions['value'].astype('int64')
merge_actions = melt_actions.merge(action_before_shots, left_on='value', right_on='id').reset_index()

merge_actions['time_diff'] = (merge_actions['start_y'] - merge_actions['start_x'])
merge_actions['field_progression'] = merge_actions['x_pos_x'] - merge_actions['x_pos_y']

last_action = merge_actions[(merge_actions['time_diff'] <= 0) & \
                         (~merge_actions['action_y'].isin(shot_outcome_actions)) & \
                         (merge_actions['team_x'] == merge_actions['team_y'])] \
        .sort_values(by=['id_x', 'time_diff']).drop_duplicates(subset=['id_x'], keep='last')

gk_action = merge_actions[merge_actions['action_y'].isin(['Wide shot (Goalkeepers)', 'Shot on target (saved)'])].sort_values(by=['id_x'])

shot_creation = shots.merge(last_action[['id_x', 'x_pos_y', 'y_pos_y', 'action_y', 'time_diff']], how='left', left_on='index', right_on='id_x')
shot_creation = shot_creation.merge(gk_action[['id_x', 'team_y', 'action_y', 'player_y']], how='left', left_on='index', right_on='id_x') \
    .drop(columns=['index', 'half', 'id_x_x', 'id_x_y', 'team_y']) \
        .rename(columns={'team': 'attacking_team', 'x_pos_y': 'x_pos_ass', 'y_pos_y': 'y_pos_ass', \
            'player_y': 'goalkeeper', 'action_y_x': 'action', 'action_y_y': 'outcome_gk', 'action': 'outcome'})

In [121]:
goals = df[df['action'] == 'Goals'].reset_index()

action_before_goals = df.reset_index()
action_before_goals.rename(columns={'index': 'id'}, inplace=True)

for i in range(1, 5):
    action_before_goals['action-' + str(i)] = action_before_goals.id.shift(i)
    action_before_goals['action+' + str(i)] = action_before_goals.id.shift(-i)

melt_goals = pd.melt(action_before_goals, id_vars=action_before_goals.iloc[:,:9], value_vars=action_before_goals.iloc[:,10:])
melt_goals = melt_goals[melt_goals['action'] == 'Goals']
melt_goals.dropna(subset=['value'], inplace=True)
melt_goals.merge(action_before_goals, left_on='value', right_on='id')
melt_goals['value'] = melt_goals['value'].astype('int64')
merge_goals = melt_goals.merge(action_before_goals, left_on='value', right_on='id').reset_index()

merge_goals['time_diff'] = np.where(merge_goals['start_y'] - merge_goals['start_x'] > 0, \
                                    (merge_goals['start_y'] - merge_goals['start_x']) * -1, \
                                    merge_goals['start_y'] - merge_goals['start_x'])
merge_goals['field_progression'] = merge_goals['x_pos_x'] - merge_goals['x_pos_y']

last_action = merge_goals[(merge_goals['action_y'].isin(goal_situations)) & \
                         (merge_goals['team_x'] == merge_goals['team_y'])] \
        .sort_values(by=['id_x', 'time_diff'], ascending=True).drop_duplicates(subset=['id_x'], keep='last')

gk_action = merge_goals[merge_goals['action_y'] == 'Goals conceded'].sort_values(by=['id_x'])

goal_creation = goals.merge(last_action[['id_x', 'x_pos_y', 'y_pos_y', 'action_y', 'time_diff']], how='left', left_on='index', right_on='id_x')
goal_creation = goal_creation.merge(gk_action[['id_x', 'team_y', 'action_y', 'player_y']], how='left', left_on='index', right_on='id_x') \
    .drop(columns=['index', 'half', 'id_x_x', 'id_x_y', 'team_y']) \
        .rename(columns={'team': 'attacking_team', 'x_pos_y': 'x_pos_ass', 'y_pos_y': 'y_pos_ass', \
            'player_y': 'goalkeeper', 'action_y_x': 'action', 'action_y_y': 'outcome_gk', 'action': 'outcome'})

In [144]:
chance_creation = pd.concat([shot_creation, goal_creation]).reset_index()
chance_creation.count()

index             886
game_id           886
start             886
end               886
player            886
x_pos             886
y_pos             886
attacking_team    886
outcome           886
x_pos_ass         865
y_pos_ass         865
action            865
time_diff         865
outcome_gk        674
goalkeeper        674
dtype: int64

In [160]:
chance_creation = pd.concat([shot_creation, goal_creation]).reset_index()
passing_deviation = chance_creation['time_diff'].mean() + (chance_creation['time_diff'].std() * -3)
columns = ['x_pos_ass', 'y_pos_ass', 'action']
for col in columns:    
    chance_creation[col] = np.where((chance_creation['action'].isin(passing_action)) & (chance_creation['time_diff'] < passing_deviation), \
        np.nan, \
        chance_creation[col])

chance_creation = chance_creation.merge(fixtures, how='left', left_on='game_id', right_on='game_id').reset_index()
chance_creation['defending_team'] = np.where(chance_creation['attacking_team'] == chance_creation['team_x'], chance_creation['team_y'], chance_creation['team_x'])

In [163]:
#chance_creation.drop(columns=['level_0', 'index', 'time_diff', 'team_x', 'team_y']).tail()
#chance_creation['defending_team'] = np.where(chance_creation['attacking_team'] == 'team_x', '0', '1')
chance_creation[chance_creation['game_id'] == 25]

Unnamed: 0,level_0,index,game_id,start,end,player,x_pos,y_pos,attacking_team,outcome,x_pos_ass,y_pos_ass,action,time_diff,outcome_gk,goalkeeper,team_x,team_y,defending_team
577,577,577,25,132.79,152.79,5. Jan Alexandre Bamert,99.8,26.8,Sion,Shots,105.0,0.0,Passes accurate,-1.36,Wide shot (Goalkeepers),1. Lawrence Ati Zigi,St. Gallen,Sion,St. Gallen
578,578,578,25,628.94,648.94,11. Kwadwo Duah,80.7,20.8,St. Gallen,Shots,56.2,41.1,Passes accurate,-3.82,Shot on target (saved),18. Kevin Fickentscher,St. Gallen,Sion,Sion
579,579,579,25,668.4,688.4,11. Kwadwo Duah,93.0,22.3,St. Gallen,Shots,105.0,0.0,Passes accurate,-1.25,Shot on target (saved),18. Kevin Fickentscher,St. Gallen,Sion,Sion
580,580,580,25,1251.67,1271.67,45. Alexandre Tounde Dimitri Jankewitz,79.3,37.3,St. Gallen,Shots,74.0,37.7,Passes accurate,-1.72,,,St. Gallen,Sion,Sion
581,581,581,25,1416.01,1436.01,8. Baltazar Costa Rodrigues de Oliveira,92.5,50.3,Sion,Shots,75.4,60.1,Passes accurate,-1.58,,,St. Gallen,Sion,St. Gallen
582,582,582,25,1672.77,1692.77,7. Fabian Schubert,98.5,23.5,St. Gallen,Shots,104.4,6.3,Passes accurate,-1.22,Wide shot (Goalkeepers),18. Kevin Fickentscher,St. Gallen,Sion,Sion
583,583,583,25,1726.24,1746.24,8. Jordi Quintilla,87.4,34.3,St. Gallen,Shots,92.5,15.6,Passes accurate,-2.37,Wide shot (Goalkeepers),18. Kevin Fickentscher,St. Gallen,Sion,Sion
584,584,584,25,1803.83,1823.83,52. Wesley,80.3,26.9,Sion,Shots,76.2,28.1,Interceptions,-2.0,Shot on target (saved),1. Lawrence Ati Zigi,St. Gallen,Sion,St. Gallen
585,585,585,25,1959.29,1979.29,9. Jeremy Bruno Guillemenot,92.2,38.1,St. Gallen,Shots,95.6,50.6,Passes accurate,-0.66,Shot on target (saved),18. Kevin Fickentscher,St. Gallen,Sion,Sion
586,586,586,25,1961.42,1981.42,45. Alexandre Tounde Dimitri Jankewitz,98.8,31.9,St. Gallen,Shots,100.7,33.7,Passes accurate,-0.51,Shot on target (saved),18. Kevin Fickentscher,St. Gallen,Sion,Sion


In [122]:

passing = chance_creation[chance_creation['action'].isin(['Passes accurate', 'Passes (inaccurate)', 'Assists'])]
passing_deviation = passing['time_diff'].mean() + (passing['time_diff'].std() * -3)
passing = passing[passing['time_diff'] >= passing_deviation]
passing.to_csv('/Users/matthiashugli/Dropbox/bucket/super-league-stats/chance_creation_passes.csv')

In [95]:
passing_deviation = passing['time_diff'].mean() + (passing['time_diff'].std() * -3)
columns = ['x_pos_ass', 'y_pos_ass', 'action']
for col in columns:    
    chance_creation[col] = np.where((chance_creation['action'].isin(passing_action)) & (chance_creation['time_diff'] < passing_deviation), \
        np.nan, \
        chance_creation[col])