In [7]:
import pandas as pd
import numpy as np
import glob
import xml.etree.ElementTree as etree

# Load Data

In [8]:
filelocation = '/Users/matthiashugli/Dropbox/bucket/super-league-stats/*.xml'

In [9]:
all_events = []
id_iterator = 0
for filename in glob.glob(filelocation):
    prstree = etree.parse(filename)
    root = prstree.getroot()

    event_items = []
    event_subitems = []
    header = ['game_id', 'start', 'end', 'player', 'x_pos', 'y_pos', 'team', 'action', 'half']

    for r in root.iter('ALL_INSTANCES'):
        for inst in r.iter('instance'):
        
            start = inst.find('start').text
            end = inst.find('end').text
            player = inst.find('code').text

            if inst.find('pos_x') != None:
                x_pos = inst.find('pos_x').text
                y_pos = inst.find('pos_y').text
            
            event_items = [start, end, player, x_pos, y_pos]
            
            for label in inst.iter('label'):
                labels = label.find('text').text
                event_items.append(labels)

            all_events.append(event_items)
            event_items.insert(0, id_iterator)
    id_iterator += 1

df = pd.DataFrame(all_events, columns=header)

In [10]:
data_types_dict = {'start': float, 'end': float, 'x_pos': float, 'y_pos': float}
df = df.astype(data_types_dict)
df.head()

Unnamed: 0,game_id,start,end,player,x_pos,y_pos,team,action,half
0,0,0.0,14.26,17. Filip Stojilkovic,52.5,34.0,Sion,Passes accurate,1st half
1,0,0.0,19.61,32. Loris Benito Souto,39.1,59.9,Sion,Passes accurate,1st half
2,0,0.0,17.18,14. Anto Grgic,39.0,39.9,Sion,Passes accurate,1st half
3,0,4.01,24.01,52. Wesley,45.7,63.2,Sion,Passes accurate,1st half
4,0,4.26,5.26,Start,45.7,63.2,,,


In [11]:
# Create fixtures table with team one and team two. There is no clear distinction of Home and away team.

team_one = df.groupby(['game_id', 'team']).count().reset_index()[['game_id', 'team']] \
            .sort_values(['game_id', 'team']) \
                .drop_duplicates(subset='game_id', keep='last')

team_two = df.groupby(['game_id', 'team']).count().reset_index()[['game_id', 'team']] \
            .sort_values(['game_id', 'team']) \
                .drop_duplicates(subset='game_id', keep='first')

fixtures = team_one.merge(team_two, how='inner', left_on='game_id', right_on='game_id')
fixtures.head()

Unnamed: 0,game_id,team_x,team_y
0,0,Sion,Luzern
1,1,Servette,Luzern
2,2,Lugano,Grasshopper
3,3,St. Gallen,Lausanne Sport
4,4,Young Boys,Servette


In [12]:
# Action variables
shot_outcome_actions = ['Wide shot', 'Shot on target', 'Shot into the bar/post', 'Goals']
goal_situations = ['Assists', 'Goals', 'Free-kick attacks', 'penalty attack', 'Own goal']
gk_actions = ['Wide shot (Goalkeepers)', 'Shot on target (saved)', 'Goals conceded']
passing_action = ['Passes accurate', 'Passes (inaccurate)', 'Assists']

In [13]:
shots = df[df['action'] == 'Shots'].reset_index()

action_before_shots = df.reset_index()
action_before_shots.rename(columns={'index': 'id'}, inplace=True)

for i in range(1, 5):
    action_before_shots['action-' + str(i)] = action_before_shots.id.shift(i)
    action_before_shots['action+' + str(i)] = action_before_shots.id.shift(-i)

melt_actions = pd.melt(action_before_shots, id_vars=action_before_shots.iloc[:,:9], value_vars=action_before_shots.iloc[:,10:])
melt_actions = melt_actions[melt_actions['action'] == 'Shots']
melt_actions.dropna(subset=['value'], inplace=True)
melt_actions.merge(action_before_shots, left_on='value', right_on='id')
melt_actions['value'] = melt_actions['value'].astype('int64')
merge_actions = melt_actions.merge(action_before_shots, left_on='value', right_on='id').reset_index()

merge_actions['time_diff'] = (merge_actions['start_y'] - merge_actions['start_x'])
merge_actions['field_progression'] = merge_actions['x_pos_x'] - merge_actions['x_pos_y']

last_action = merge_actions[(merge_actions['time_diff'] <= 0) & \
                         (~merge_actions['action_y'].isin(shot_outcome_actions)) & \
                         (merge_actions['team_x'] == merge_actions['team_y'])] \
        .sort_values(by=['id_x', 'time_diff']).drop_duplicates(subset=['id_x'], keep='last')

gk_action = merge_actions[merge_actions['action_y'].isin(['Wide shot (Goalkeepers)', 'Shot on target (saved)'])].sort_values(by=['id_x'])

shot_creation = shots.merge(last_action[['id_x', 'x_pos_y', 'y_pos_y', 'action_y', 'time_diff']], how='left', left_on='index', right_on='id_x')
shot_creation = shot_creation.merge(gk_action[['id_x', 'team_y', 'action_y', 'player_y']], how='left', left_on='index', right_on='id_x') \
    .drop(columns=['index', 'half', 'id_x_x', 'id_x_y', 'team_y']) \
        .rename(columns={'team': 'attacking_team', 'x_pos_y': 'x_pos_ass', 'y_pos_y': 'y_pos_ass', \
            'player_y': 'goalkeeper', 'action_y_x': 'action', 'action_y_y': 'outcome_gk', 'action': 'outcome'})

In [14]:
goals = df[df['action'] == 'Goals'].reset_index()

action_before_goals = df.reset_index()
action_before_goals.rename(columns={'index': 'id'}, inplace=True)

for i in range(1, 5):
    action_before_goals['action-' + str(i)] = action_before_goals.id.shift(i)
    action_before_goals['action+' + str(i)] = action_before_goals.id.shift(-i)

melt_goals = pd.melt(action_before_goals, id_vars=action_before_goals.iloc[:,:9], value_vars=action_before_goals.iloc[:,10:])
melt_goals = melt_goals[melt_goals['action'] == 'Goals']
melt_goals.dropna(subset=['value'], inplace=True)
melt_goals.merge(action_before_goals, left_on='value', right_on='id')
melt_goals['value'] = melt_goals['value'].astype('int64')
merge_goals = melt_goals.merge(action_before_goals, left_on='value', right_on='id').reset_index()

merge_goals['time_diff'] = np.where(merge_goals['start_y'] - merge_goals['start_x'] > 0, \
                                    (merge_goals['start_y'] - merge_goals['start_x']) * -1, \
                                    merge_goals['start_y'] - merge_goals['start_x'])
merge_goals['field_progression'] = merge_goals['x_pos_x'] - merge_goals['x_pos_y']

last_action = merge_goals[(merge_goals['action_y'].isin(goal_situations)) & \
                         (merge_goals['team_x'] == merge_goals['team_y'])] \
        .sort_values(by=['id_x', 'time_diff'], ascending=True).drop_duplicates(subset=['id_x'], keep='last')

gk_action = merge_goals[merge_goals['action_y'] == 'Goals conceded'].sort_values(by=['id_x'])

goal_creation = goals.merge(last_action[['id_x', 'x_pos_y', 'y_pos_y', 'action_y', 'time_diff']], how='left', left_on='index', right_on='id_x')
goal_creation = goal_creation.merge(gk_action[['id_x', 'team_y', 'action_y', 'player_y']], how='left', left_on='index', right_on='id_x') \
    .drop(columns=['index', 'half', 'id_x_x', 'id_x_y', 'team_y']) \
        .rename(columns={'team': 'attacking_team', 'x_pos_y': 'x_pos_ass', 'y_pos_y': 'y_pos_ass', \
            'player_y': 'goalkeeper', 'action_y_x': 'action', 'action_y_y': 'outcome_gk', 'action': 'outcome'})

In [15]:
chance_creation = pd.concat([shot_creation, goal_creation]).reset_index()
passing_deviation = chance_creation['time_diff'].mean() + (chance_creation['time_diff'].std() * -3)
columns = ['x_pos_ass', 'y_pos_ass', 'action']
for col in columns:    
    chance_creation[col] = np.where((chance_creation['action'].isin(passing_action)) & (chance_creation['time_diff'] < passing_deviation), \
        np.nan, \
        chance_creation[col])

chance_creation = chance_creation.merge(fixtures, how='left', left_on='game_id', right_on='game_id').reset_index()
chance_creation['defending_team'] = np.where(chance_creation['attacking_team'] == chance_creation['team_x'], chance_creation['team_y'], chance_creation['team_x'])
chance_creation.drop(columns=['level_0', 'index', 'time_diff', 'team_x', 'team_y'], inplace=True)

In [16]:
chance_creation.to_csv('/Users/matthiashugli/Dropbox/bucket/super-league-stats/chance_creation_2022.csv', index=False)
chance_creation.head()

Unnamed: 0,game_id,start,end,player,x_pos,y_pos,attacking_team,outcome,x_pos_ass,y_pos_ass,action,outcome_gk,goalkeeper,defending_team
0,0,911.64,931.64,30. Ardon Jashari,78.5,46.0,Luzern,Shots,74.3,47.1,Interceptions,Shot on target (saved),18. Kevin Fickentscher,Sion
1,0,1386.99,1406.99,15. Marvin Schulz,95.4,49.0,Luzern,Shots,89.2,49.3,Picking-ups,,,Sion
2,0,1741.1,1761.1,19. Filip Ugrinic,90.5,32.3,Luzern,Shots,92.1,41.6,Passes accurate,Wide shot (Goalkeepers),18. Kevin Fickentscher,Sion
3,0,1789.1,1809.1,9. Dejan Sorgic,82.8,27.7,Luzern,Shots,82.8,27.7,Picking-ups,Wide shot (Goalkeepers),18. Kevin Fickentscher,Sion
4,0,2047.36,2067.36,10. Samuele Campo,79.1,31.9,Luzern,Shots,74.5,20.0,Picking-ups,Shot on target (saved),18. Kevin Fickentscher,Sion


In [17]:
passing = chance_creation[chance_creation['action'].isin(['Passes accurate', 'Passes (inaccurate)', 'Assists'])]
passing_deviation = passing['time_diff'].mean() + (passing['time_diff'].std() * -3)
passing = passing[passing['time_diff'] >= passing_deviation]
passing.to_csv('/Users/matthiashugli/Dropbox/bucket/super-league-stats/chance_creation_passes.csv')

KeyError: 'time_diff'

In [227]:
#df[(df['action'] == 'Goals') & (df['player'] == '41. Noah Loosli')]
#chance_creation[(chance_creation['outcome'] == 'Goals') & (chance_creation['player'] == '41. Noah Loosli')]
# df[df.start == 1192.13]
df[26500:26515]

Unnamed: 0,game_id,start,end,player,x_pos,y_pos,team,action,half
26500,20,1147.44,1167.44,28. Gaetano Berardi,29.0,60.3,Sion,Passes accurate,1st half
26501,20,1151.52,1171.52,32. Loris Benito Souto,47.1,66.8,Sion,Passes accurate,1st half
26502,20,1152.54,1172.54,14. Anto Grgic,37.6,52.5,Sion,Passes accurate,1st half
26503,20,1154.47,1174.47,28. Gaetano Berardi,28.1,60.9,Sion,Passes accurate,1st half
26504,20,1157.87,1177.87,18. Kevin Fickentscher,7.6,38.7,Sion,Passes accurate,1st half
26505,20,1160.64,1180.64,97. Dimitri Kevin Cavare,40.8,3.4,Sion,Passes (inaccurate),1st half
26506,20,1161.75,1181.75,21. Ulisses Garcia,53.7,61.4,Young Boys,Picking-ups,1st half
26507,20,1162.89,1182.89,21. Ulisses Garcia,54.3,57.3,Young Boys,Passes accurate,1st half
26508,20,1165.39,1185.39,30. Sandro Mike Lauper,60.1,44.9,Young Boys,Passes accurate,1st half
26509,20,1170.48,1190.48,11. Edimilson Fernandes,92.1,59.0,Young Boys,Passes (inaccurate),1st half


In [234]:
#goal_creation[goal_creation['action'] == 'Own goal']
#goal_creation.tail(20)
#np.where(chance_creation[chance_creation['action'] == 'Own goal'], 105 - chance_creation['x_pos'], chance_creation['x_pos'])

ValueError: operands could not be broadcast together with shapes (4,14) (886,) (886,) 