In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.distributions.empirical_distribution import ECDF
from datetime import datetime
import unicodedata
from unidecode import unidecode
import re
import copy
import pickle


# Scoring Dynamics in Professional Soccer

In this writeup I will clean and aggregate data from EPL datasets from 2015-2022. 

## Reading in Data

We first read in 3 datasets.
+ fifa player ratings: Dataset of player ratings for all players in FIFA video games from FIFA 15 to FIFA 22
+ EPL Games: A dataframe of games and the starting lineups
+ EPL Events: A dataframe of events for each game in EPL Games

In [2]:
#show all columns when viewing df
pd.set_option('display.max_columns', None)

In [3]:
#read in fifa rating data
fifa_player_ratings = pd.read_csv("kaggle_data/fifa_ratings/male_players.csv")
#read in game summary data
EPL_games = pd.read_csv("kaggle_data/EPL_games_events/matches.csv")
#read in event data
EPL_events = pd.read_csv("kaggle_data/EPL_games_events/events.csv")


  fifa_player_ratings = pd.read_csv("/Users/kmason/Desktop/Folders/sports_gambling_exploration/soccer/data/fifa_ratings/male_players.csv")
  EPL_games = pd.read_csv("/Users/kmason/Desktop/Folders/sports_gambling_exploration/soccer/data/EPL_games_events/matches.csv")


## Adding FIFA Year to EPL Games
Becasue we need to match each player to their FIFA rating, we care about what FIFA version was available when a game was played. As such, we change each date in the EPL games dataframe to a FIFA year. 

In [4]:
#get fifa year for data
EPL_games['fifa_year'] = EPL_games['year'] + 1 - 2000
month_int = {"January":1,
       "February":2,
       "March":3,
       "April":4,
       "May":5,
       "June":6,
       "July":7,
       "August":8,
       "September":9,
       "October":10,
       "November":11,
       "December":12}

date = EPL_games['date']
n = len(date)
month = []
day = []
year = []
for j in range(n):
    d = date.iloc[j]
    d = d.split(",")
    d = d[1].split()
    month.append(month_int[d[0]])
    day.append(int(d[1]))
    y = EPL_games['year'][j]
    if month_int[d[0]] >= 8:
        year.append(y)
    else:
        year.append(y+1)

EPL_games['month'] = month
EPL_games['day'] = day 
EPL_games['year'] = year

#remove games that occur before 2015
EPL_games = EPL_games[(EPL_games['fifa_year'] >= 16)]
    

## Adding FIFA Year to EPL Events
We add the year data to the EPL events dataframe by joining with EPL games

In [5]:
# join events with games
A = EPL_games[['year', 'id']]
EPL_events = EPL_events.join(A.set_index('id'), on='id', lsuffix='_event', rsuffix='_game')


## Evaluating Player Offensive and Defensive Ratings
We begin by evaluating the offensive and defensive abilities of each player on the field. Offensive positions include roles such as striker, winger, and attacking midfielder. Defensive positions encompass roles such as central defensive midfielder, left wing-back, and similar positions. A player's offensive rating is determined by taking the maximum rating across all offensive positions they can play. Their defensive rating is defined in the same manner, considering the maximum rating across all defensive positions.

In [6]:
pos = {}
n = fifa_player_ratings.shape[0]
positions = fifa_player_ratings['player_positions']
for j in range(n):
    p = positions[j]
    p = p.split(",")
    for val in p:
        if val.strip() not in pos:
            pos[val.strip()] = 0


In [7]:
#separate offensive, defensive, and goalkeeping columns 
attacking = set(['st',"lw","rw","cf",'lam','cam','ram'])
defending = set(['cb','rb','lb','cdm','rwb','lwb','ldm','rdm'])
goalkeeping = set(['gk'])
mix = set(['lm','rm','cm'])

In [8]:
# Initialize the player_attacking and player_defending columns to 0
fifa_player_ratings['player_attacking'] = 0
fifa_player_ratings['player_defending'] = 0
fifa_player_ratings['player_goalkeeping'] = 0
#get attacking and defending stats of each player
attacking_stats = fifa_player_ratings[list(attacking)]
defending_stats = fifa_player_ratings[list(defending)]
goalkeeping_stats = fifa_player_ratings[list(goalkeeping)]


def parse_stats(s:str):
    if "+" in s:
        s = s.split("+")
        return int(s[0])
        #return sum([int(j) for j in s])
    elif "-" in s:
        s = s.split("-")
        s = [int(j) for j in s]
        return int(s[0])
        #return s[1] - s[0]
    else:
        return int(s)
    



# Iterate over the rows and update the attacking and defending columns
for j in range(n):
    #parse each stat
    player_attacking = attacking_stats.iloc[j]
    player_attacking = max([parse_stats(stat) for stat in player_attacking])
    player_defending = defending_stats.iloc[j]
    player_defending = max([parse_stats(stat) for stat in player_defending])
    player_goalkeeping = goalkeeping_stats.iloc[j]
    player_goalkeeping = max([parse_stats(str(stat)) for stat in player_goalkeeping])
    
    fifa_player_ratings.loc[j, 'player_attacking'] = player_attacking
    fifa_player_ratings.loc[j, 'player_defending'] = player_defending
    fifa_player_ratings.loc[j, 'player_goalkeeping'] = player_goalkeeping




## Filtering FIFA Ratings
We remove all  FIFA data corresponding to players who are not in the EPL.

In [9]:
#filter to only include premier league teams
all_fifa_player_ratings = fifa_player_ratings.copy()
fifa_player_ratings = fifa_player_ratings[(fifa_player_ratings['league_name'] == "Premier League") | (fifa_player_ratings['league_name'] == "Championship")]
#filter to only include player name, team name, jersey number, player ratings
fifa_player_ratings = fifa_player_ratings[['fifa_version','fifa_update','player_id','short_name','long_name','club_name','club_jersey_number','player_attacking','player_defending','player_goalkeeping']]

## Saving Player Ratings to Dictionaries
Becasue the names in the FIFA dictionary are not matched to names in the EPL games datast,we must create dictionaries that store player ratings. They are detailed below
+ Player Ratings: Key is player ID, Value is offensive, defensive, and goalie ratings
+ First Player to ID: Key Player first name, team, and number: Value FIFA player ID
+ Last Player to ID: Key Player last name, team, and number: Value FIFA player ID
+


In [10]:
#Dict2: (player_id) -> Dict3: (version) -> fifa stats 
#convert to a dictionary
player_ratings = {}
m = all_fifa_player_ratings.shape[0]
all_name_to_id = {}
for j in range(m):
    #get player info 
    player = all_fifa_player_ratings.iloc[j]
    #get player id
    player_id = player['player_id']
    #get first and last name 
    name = player['long_name']
    #remove accented chars
    name = unidecode(name)
    #split by space 
    name = name.split()
    #get list of first and last name 
    if len(name) > 1:
        last_name = name[-1].upper()
        first_name = name[0].upper()
    else:
        last_name = name[0].upper()
        first_name = name[0].upper()
    name = tuple((first_name,last_name))
    if name not in all_name_to_id:
        all_name_to_id[name] = set([player_id])
    else:
        all_name_to_id[name].add(player_id)

    #add player stats to the dictionary 
    if player_id not in player_ratings:
        player_ratings[player_id] = {player['fifa_version']:[player['player_attacking'],player['player_defending'],player['player_goalkeeping'],player['fifa_update']]}
    elif player['fifa_version'] not in player_ratings[player_id]:
        player_ratings[player_id][player['fifa_version']] = [player['player_attacking'],player['player_defending'],player['player_goalkeeping'],player['fifa_update']]
    elif player['fifa_update'] > player_ratings[player_id][player['fifa_version']][3]:
        player_ratings[player_id][player['fifa_version']] [player['player_attacking'],player['player_defending'],player['player_goalkeeping'],player['fifa_update']]

    


In [11]:
#names that have no matches that need to get added manually
all_name_to_id[tuple(('JOSH', 'SILVA'))] = set([231445])
all_name_to_id[tuple(('SAMIR', 'SAMIR'))] = set([215651])
all_name_to_id[tuple(('MATHIAS', 'JORGENSEN'))] = set([183491])
all_name_to_id[tuple(('MAT', 'RYAN'))] = set([213407])
all_name_to_id[tuple(('CARLOS', 'VINICIUS'))] = set([244621	])
all_name_to_id[tuple(('RHYS', 'WILLIAMS'))] = set([247601])
all_name_to_id[tuple(('RAPHINHA', 'RAPHINHA'))] = set([233419])
all_name_to_id[tuple(('LUKE', 'THOMAS'))] = set([254470])
all_name_to_id[tuple(('PEPE', 'REINA'))] = set([24630])
all_name_to_id[tuple(('PHILLIP', 'BILING'))] = set([220714	])
all_name_to_id[tuple(('FABRI', 'FABRI'))] = set([177723	])
all_name_to_id[tuple(('LASSE', 'SORENSON'))] = set([240826])
all_name_to_id[tuple(('CONNOR', 'ROBERTS'))] = set([225147])
all_name_to_id[tuple(('FOUSSEYNI', 'DIABATE'))] = set([236944])
all_name_to_id[tuple(('BADOU', "N'DIAYE"))] = set([221000])
all_name_to_id[tuple(('JOAO', 'MARIO'))] = set([212814])
all_name_to_id[tuple(('BIRAM', 'KIYAL'))] = set([193550])
all_name_to_id[tuple(('THOMAS', 'CARROLL'))] = set([202491])
all_name_to_id[tuple(('DIDIER', "N'DONG"))] = set([218359])
all_name_to_id[tuple(('TOM', 'DAVIES'))] = set([230005])
all_name_to_id[tuple(('ALEXANDRE', 'PATO'))] = set([180175])
all_name_to_id[tuple(('SANDRO', 'SANDRO'))] = set([190782])
all_name_to_id[tuple(('GULLERMO', 'VARELA'))] = set([219914])
all_name_to_id[tuple(('JOAO', 'TEIXEIRA'))] = set([210377])
all_name_to_id[tuple(('TYLER', 'ROBERTS'))] = set([228815])
all_name_to_id[tuple(('JAKE', 'CLARK-SALTER'))] = set([230774])
all_name_to_id[tuple(('BEN', 'WHITE'))] = set([231936])
all_name_to_id[tuple(('SAM', 'GALLAGHER'))] = set([213905])
all_name_to_id[tuple(('JOAO', 'PEDRO'))] = set([252042])
all_name_to_id[tuple(('JOE', 'RILEY'))] = set([233114])
all_name_to_id[tuple(('BORJA', 'BASTON'))] = set([194996])
all_name_to_id[tuple(('DANIEL', "N'LUNDULU"))] = set([236321])
all_name_to_id[tuple(('JON', 'ROWE'))] = set([266500])
all_name_to_id[tuple(('TARIQUE', 'FOSU-HENRY'))] = set([216483])
all_name_to_id[tuple(('ANDREW', 'ELEFTHERIOU'))] = set([198485])


In [12]:
#Dict1: (last name, team, number) -> player id 
n = fifa_player_ratings.shape[0]
first_player_to_id = {}
last_player_to_id = {}
for j in range(n):
    #get player info
    player = fifa_player_ratings.iloc[j]
    #get player id
    id = player['player_id']
    #get club name 
    team = player['club_name']
    #get number 
    number = player['club_jersey_number']
    #get name 
    name = player['short_name']
    #remove accented chars
    name = unidecode(name)
    #split by space 
    name = name.split()
    #get last name and make uppercase 
    if len(name) > 1:
        last_name = name[-1].upper()
        first_name = name[0].upper()
    else:
        last_name = name[0].upper()
        first_name = name[0].upper()
    #key is last name, team, and jersey number 
    first_key = tuple((first_name,team,number))
    last_key = tuple((last_name,team,number))
    #add to dictionary 
    if first_key not in first_player_to_id:
        first_player_to_id[first_key] = set([id])
    else:
        first_player_to_id[first_key].add(id)
    
    if last_key not in last_player_to_id:
        last_player_to_id[last_key] = set([id])
    else:
        last_player_to_id[last_key].add(id)
   

    


## Reading in Shot Data
We now read in shot data for all EPL matches between 2015 and 2022 from WorldfootballR. Note that since this data is not matched to EPL games, we must match team names and game IDs. 

In [13]:
shot_data = pd.read_csv("kaggle_data/EPL_shots.csv")
shot_data['goal'] = shot_data['result'] == "Goal"
shot_data = shot_data[['date','home_team','away_team','h_a','minute','x_g','goal','situation','player']]

  shot_data = pd.read_csv("/Users/kmason/Desktop/Folders/sports_gambling_exploration/soccer/data/EPL_shots.csv")


In [14]:
#function to convert date to list 
def convert_time(datetime_str):
    # Parse the datetime string
    dt = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S')
    # Extract year, month, and day and return as a tuple
    return [dt.year, dt.month, dt.day]
#get game data
date = shot_data['date']
#get number of shots 
n = len(date)


#save month of game 
month = []
#save day of game 
day = []
#save year of game 
year = []
#save team  
team = []
for j in range(n):
    #get date
    d = date.iloc[j]
    #convert date to list
    time_list = convert_time(d)
    year.append(time_list[0])
    month.append(time_list[1])
    day.append(time_list[2])
    
shot_data['month'] = month
shot_data['day'] = day
shot_data['year'] = year

In [15]:
#get names in EPL games 
team1 = EPL_games['home']
#get number of teams 
n = len(team1)
#get teams in shot data 
team2 = shot_data['home_team']
#get number of teams 
m = len(team2)

#track whihc teams are found (i.e matched)
D = {}
found = {}
no_found = {}
for j in range(n):
    #get team name 
    t1 = team1.iloc[j]
    #add team to dictionary 
    if t1 not in D:
        D[t1] = 0
for j in range(m):
    t2 = team2.iloc[j]
    if t2 not in D:
        no_found[t2] = 0
    else:
        D[t2] += 1
        found[t2] = 0
        



In [16]:
#manually add team anmes that don't match 
match = {a:a for a in found}
#hard code non matches
match['Leicester'] = 'Leicester City'
match['Stoke'] = "Stoke City"
match['West Ham'] = "West Ham United"
match['Swansea'] = "Swansea City"
match['Hull'] = "Hull City"
match['Brighton'] = "Brighton & Hove Albion"
match['Bournemouth'] = "AFC Bournemouth"
match['Norwich'] = "Norwich City"
match['Tottenham'] = "Tottenham Hotspur"
match['Cardiff'] = "Cardiff City"
match['Leeds'] = "Leeds United"
match['Huddersfield'] = "Huddersfield Town"

In [17]:
#join shots with games
n = shot_data.shape[0]
home = []
away = []
neither = []
#add home/away team info for shot 
for j in range(n):
    t = shot_data.iloc[j]['home_team'] 
    if t in match:
        home.append(match[t])
    else:
        home.append(t)
    a = shot_data.iloc[j]['away_team'] 
    if a in match:
        away.append(match[a])
    else:
        away.append(a)
shot_data['home_team'] = home
shot_data['away_team'] = away 
    

## Matching Shots to Games
We now match dates and teams in the shots dataset to game IDs in EPL games

In [18]:
#join shots with games 
E = EPL_games[['id','home','away','month','day','year']]
# Merge shot_data with EPL_games on 'home' (home_team), 'away' (away_team), 'month', 'day', and 'year'
shots_with_id = pd.merge(E, shot_data, left_on=['home', 'away', 'month', 'day', 'year'], 
                       right_on=['home_team', 'away_team', 'month', 'day', 'year'], 
                       how='inner')

In [19]:
#filter out columns 
shots_with_id = shots_with_id[['id','minute','home','away','h_a','x_g','goal','situation','player']]

In [20]:
#filter out eevnts that aren't substitutions, hald ends, ejections, or goals 
EPL_events = EPL_events[EPL_events['Event'].str.startswith('Substitution') | EPL_events['Event'].str.startswith('Second Half ends') | 
                        EPL_events['Event'].str.startswith('First Half ends') |
                        EPL_events['Event'].str.startswith('Second yellow card') |EPL_events['Event'].str.contains('red card')|
                          EPL_events['Event'].str.contains('RED CARD') | EPL_events['Event'].str.contains('Attempt')|
                          EPL_events['Event'].str.contains('Goal')]
EPL_events = EPL_events[~EPL_events['Event'].str.startswith('VAR')]
#remove events with no year
EPL_events = EPL_events.dropna(subset=['year'])

In [21]:
def parse_lineup_info(game_info,home_number_list,home_name_list,away_number_list,away_name_list):
    #assert that lists are of equl length
    assert len(home_number_list) == len(home_name_list)
    assert len(away_number_list) == len(away_name_list)
    #columns of game info that correspond to starting names and numbers 
    home_num = game_info[home_number_list]
    home_name = game_info[home_name_list]
    n = len(home_num)
    #get dictionary for home team and tuples for starting lineup
    home_name_to_number = {unidecode(home_name.iloc[j]):home_num.iloc[j] for j in range(n) if np.isnan(home_num.iloc[j]) == False}
    home_number_to_name = {home_num.iloc[j]:unidecode(home_name.iloc[j]) for j in range(n) if np.isnan(home_num.iloc[j]) == False}

    away_num = game_info[away_number_list]
    away_name = game_info[away_name_list]
    n = len(away_num)
    #get dictionary for home team and tuples for starting lineup
    away_name_to_number = {unidecode(away_name.iloc[j]):away_num.iloc[j] for j in range(n) if np.isnan(away_num.iloc[j]) == False}
    away_number_to_name = {away_num.iloc[j]:unidecode(away_name.iloc[j]) for j in range(n) if np.isnan(away_num.iloc[j]) == False}


    return home_name_to_number, home_number_to_name, away_name_to_number,away_number_to_name

def find_player_id(first_key,last_key,first_name_to_id_dict,last_name_to_id_dict):
    #get player id(s) given jersey and team info as sets, first name takes priority
    try:
        first_player_id = first_name_to_id_dict[first_key]
    except:
        first_player_id = set()
    try:
        last_player_id = last_name_to_id_dict[last_key]
    except:
        last_player_id = set()
    if len(last_player_id) == 1:
        player_id = list(last_player_id)[0]
    elif len(first_player_id) == 1:
        player_id = list(first_player_id)[0]
    else:
        cand = first_player_id.intersection(last_player_id)
        if len(cand) == 1:
            player_id = list(cand)[0]
        else:
            player_id = None
    return player_id

def search_name_in_table(first,last,table):
    name = first.lower() + "-" + last.lower()
    table_name = table[(table['player_url'].str.contains(name))]['player_id']
    player_id = set([v for v in table_name])
    if len(player_id)  == 1:
        return list(player_id)[0]
    else:
        player_id = None
    if player_id is None:
        table_name = table[(table['player_url'].str.contains(first.lower())) & (table['player_url'].str.contains(last.lower()))]['player_id']
        player_id = set([v for v in table_name])
        if len(player_id)  == 1:
            return list(player_id)[0]
        else:
            return None



def lineup_to_fifa(first_name_to_id_dict,last_name_to_id_dict,all_name_to_id_dict,player_table,player_rating_dict,lineup_names,lineup_numbers,fifa_year, team_name):
    assert len(lineup_names) == len(lineup_numbers)
    n = len(lineup_names)
    goalkeeping = []
    offense = []
    defense = []
    no_found = {}
    for j in range(n):
        first_key = tuple((lineup_names[j][0],team_name,lineup_numbers[j]))
        last_key = tuple((lineup_names[j][1],team_name,lineup_numbers[j]))
        player_id = find_player_id(first_key,last_key,first_name_to_id_dict,last_name_to_id_dict)
        if player_id is None:
            player_id = find_player_id(last_key,first_key,first_name_to_id_dict,last_name_to_id_dict)
        if player_id is None:
            name = tuple((first_key[0],last_key[0]))
            if name in all_name_to_id_dict:
                cand = all_name_to_id_dict[name]
                if len(cand) == 1:
                    player_id = list(cand)[0]
        if player_id is None:
            player_id = search_name_in_table(first_key[0],last_key[0],player_table)
        if player_id is None:
            print("player not found")
            no_found.update({tuple((lineup_names[j][0],lineup_names[j][1],team_name,lineup_numbers[j])):1})
            continue
        #print(player_id)
        #get stats
        if fifa_year in player_rating_dict[player_id]:
            stats = player_rating_dict[player_id][fifa_year]
        else:
            #prefer using an earluer year vs a later year 
            years = list(player_rating_dict[player_id].keys())
            years_diff = [abs(v + 0.5 -fifa_year) for v in player_rating_dict[player_id]]
            #get index order from smallest to largest
            closest_year = years[sorted(range(len(years)), key=lambda x: years_diff[x])[0]]
            #get stats ffrom closest year
            stats = player_rating_dict[player_id][closest_year]
        offense.append(stats[0])
        defense.append(stats[1])
        goalkeeping.append(stats[2])

    gk = goalkeeping.index(max(goalkeeping))
    goalkeeping = max(goalkeeping)
    offense.pop(gk)
    defense.pop(gk)

    return [[goalkeeping],offense,defense],no_found
    

In [22]:
#convert times in shots and events to minutes 
def convert_time(s):
    if s == "-":
        return 0
    s = s.split("'")
    s = int(s[0])
    return s

def convert_time_with_extra_time(s):
    if s == "-":
        return 0
    s = s.replace("'","")
    if "+" in s:
        s = s.split("+")
        return int(s[0]) + int(s[1])
    if "-" in s:
        s = s.split("-")
        return int(s[0]) - int(s[1])
    return int(s)


In [23]:
#parse strings to edit events dataframe
#Subsitituion, Team. Player A replaces Player B
#Goal/Attempt: (Team) in string

def parse_substitution(s:str,home_dict, away_dict, home_team_name,away_team_name):
    home_team_name_ = home_team_name.upper().split()
    home_team_name_ = home_team_name.replace("&","and")
    away_team_name_ = away_team_name.replace("&","and")
    home_team_name_list = home_team_name_.upper().split()
    away_team_name_list= away_team_name_.upper().split()


    home_team_names = set([home_team_name_.upper() ] + [  v  for v in home_team_name_list])
    away_team_names = set([  away_team_name_.upper() ] + [v  for v in away_team_name_list])
    # Step 1: Find the overlap
    overlap = home_team_names.intersection(away_team_names)

    # Step 2: Remove the overlap from both sets
    home_team_names = home_team_names - overlap
    away_team_names = away_team_names - overlap
    #Subsitituion, Team. Player A replaces Player B
    s = s.strip(".")
    s = s.replace(",","")
    
    s = s.split()
    s = [unidecode(v.strip(".").upper()) for v in s]
    #make a dictionary for s 
    s_dict = {v:0 for v in s}

    team = None
    for word in s:
        if word in home_team_names:
            D = home_dict
            team = home_team_name
        if word in away_team_names:
            D = away_dict
            team = away_team_name
    #figure out what team we're even talking about 
    if team is None:
        counter_home = set([])
        counter_away = set([])
        home_keys = home_dict.keys()
        for key in home_keys:
            if key in s_dict:
                counter_home.add(home_dict[key])
        away_keys = away_dict.keys()
        for key in away_keys:
            if key in s_dict:
                counter_away.add(away_dict[key])
        counter_home = len(counter_home)
        counter_away = len(counter_away)
        if counter_home  < 1 and counter_away < 1:
            #print("team not found ")
            return None
        elif counter_home >= counter_away:
            D = home_dict
            team = home_team_name
        else:
            D = away_dict
            team = away_team_name
    
    #now find players
    players = [team]
    all_players = [team]
    p = []
    n = len(s)
    for j in range(n):
        if s[j] in D:
            p.append(D[s[j]])
        elif len(p) > 0:
            all_players.append(p[-1])
            p = []
    if len(p) > 0:
        all_players.append(p[-1])
    for v in all_players:
        if v in players:
            continue
        else:
            players.append(v)
    return players


In [24]:
def parse_shot(s:str,home_dict,away_dict,home_team_name, away_team_name):
    home_team_name_ = home_team_name.upper().split()
    home_team_name_ = home_team_name.replace("&","and")
    away_team_name_ = away_team_name.replace("&","and")
    home_team_name_list = home_team_name_.upper().split()
    away_team_name_list= away_team_name_.upper().split()

    home_team_names = set(["(" + home_team_name_.upper() + ")"] + [ "(" + v + ")" for v in home_team_name_list])
    away_team_names = set([ "(" + away_team_name_.upper() + ")"] + ["(" + v + ")" for v in away_team_name_list])
    # Step 1: Find the overlap
    overlap = home_team_names.intersection(away_team_names)

    # Step 2: Remove the overlap from both sets
    home_team_names = home_team_names - overlap
    away_team_names = away_team_names - overlap
    #Subsitituion, Team. Player A replaces Player B
    s = s.strip(".")
    s = s.replace(",","")

    #get team name 
    try: 
        match =  match = "(" + re.search(r'\((.*?)\)', s).group(1).upper()  + ")"
    except:
        match = "()"
    
    s = s.split()
    s = [unidecode(v.strip(".").upper()) for v in s]
    #make a dictionary for s 
    s_dict = {v:0 for v in s}
    #figure out what team we're even talking about 
    if match in home_team_names:
        D = home_dict
        team = home_team_name
        return team 
    elif match in away_team_names:
        D = away_dict
        team = away_team_name
        return team 
    else:
        counter_home = set([])
        counter_away = set([])
        home_keys = home_dict.keys()
        for key in home_keys:
            if key in s_dict:
                counter_home.add(home_dict[key])
        away_keys = away_dict.keys()
        for key in away_keys:
            if key in s_dict:
                counter_away.add(away_dict[key])
        counter_home = len(counter_home)
        counter_away = len(counter_away)
        if counter_home  < 1 and counter_away < 1:
            #print("team not found ")
            return None
        elif counter_home >= counter_away:
            D = home_dict
            team = home_team_name
        else:
            D = away_dict
            team = away_team_name
        
    return team 

    

In [25]:
def parse_ejection(s:str,home_dict,away_dict,home_team_name,away_team_name):
    home_team_name_ = home_team_name.upper().split()
    home_team_name_ = home_team_name.replace("&","and")
    away_team_name_ = away_team_name.replace("&","and")
    home_team_name_list = home_team_name_.upper().split()
    away_team_name_list= away_team_name_.upper().split()

    home_team_names = set(["(" + home_team_name_.upper() + ")"] + [ "(" + v + ")" for v in home_team_name_list])
    away_team_names = set([ "(" + away_team_name_.upper() + ")"] + ["(" + v + ")" for v in away_team_name_list])
    # Step 1: Find the overlap
    overlap = home_team_names.intersection(away_team_names)

    # Step 2: Remove the overlap from both sets
    home_team_names = home_team_names - overlap
    away_team_names = away_team_names - overlap
    #Subsitituion, Team. Player A replaces Player B
    s = s.strip(".")
    s = s.replace(",","")

    #get team name 
    try: 
        match_ = re.search(r'\((.*?)\)', s).group(1).upper()
        match =  match = "(" + match_  + ")"
    except:
        match = "()"
        match_ = "()"

    s = s.split()
    s = [unidecode(v.strip(".").upper()) for v in s]
    #make a dictionary for s 
    s_dict = {v:0 for v in s}
    #figure out what team we're even talking about 
    if match in home_team_names:
        D = home_dict
        team = home_team_name
    elif match in away_team_names:
        D = away_dict
        team = away_team_name
    else:
        counter_home = set([])
        counter_away = set([])
        home_keys = home_dict.keys()
        for key in home_keys:
            if key in s_dict:
                counter_home.add(home_dict[key])
        away_keys = away_dict.keys()
        for key in away_keys:
            if key in s_dict:
                counter_away.add(away_dict[key])
        counter_home = len(counter_home)
        counter_away = len(counter_away)
        if counter_home  < 1 and counter_away < 1:
            #print("team not found ")
            return None
        elif counter_home >= counter_away:
            D = home_dict
            team = home_team_name
        else:
            D = away_dict
            team = away_team_name
    ##now find players
    players = [team]
    player = {}
    n = len(s)
    for j in range(n):
        if s[j] in D:
            if D[s[j]] in player:
                continue
            else:
                player[D[s[j]]] = 0
                players.append(D[s[j]])
    return players


In [26]:
def edit_name_dictionary(D_,equivalent_names):
    D = {}
    #add equivalent names to D
    for name in equivalent_names:
        if name in D_:
            D_[equivalent_names[name]] = D_[name]
    #add unique key word identifiers to dictionary 
    for key in D_:
        name = unidecode(key)
        name = name.upper()
        name = name.split()
        unique = len(name) == 1
        for part in name:
            if part in D:
                D[part].add(tuple((D_[key],unique)))
            else:
                D[part] = set([tuple((D_[key],unique))])
    #remove words with multiple players attached
    rem_key = []
    for key in D:
        if len(D[key]) > 1:
            vals = [v for v in D[key] if v[1] == True]
            if len(vals) != 1:
                rem_key.append(key)
            else:
                D[key] = vals[0]
        else:
            D[key] = list(D[key])[0]
    for key in rem_key:
        del D[key]
    for key in D:
        D[key] = D[key][0]
        
    return D

    
    

In [27]:

n = EPL_games.shape[0]
#get columns corresponding to team names and numbers
home_num = ['home_starting_' + str(j) + "_num" for j in range(1,15)] + ['home_bench_' + str(j) + "_num" for j in range(1,11)]
home_name = ['home_starting_' + str(j) for j in range(1,15)] + ['home_bench_' + str(j) for j in range(1,11)]
away_num = ['away_starting_' + str(j) + "_num" for j in range(1,15)] + ['away_bench_' + str(j) + "_num" for j in range(1,11)]
away_name = ['away_starting_' + str(j) for j in range(1,15)] + ['away_bench_' + str(j) for j in range(1,11)]
#manually add equivalent names 
equivalent_names = {'Rodri': 'RODRIGO',
                        'Mathias Jorgensen':'ZANKA',
                        'Phillip Biling':'BILLING',
                        'Javier Hernandez':'CHICHARITO',
                        'Ahmed Elmohamady': 'AHMED EL MOHAMADY'}

game_events = {}
check = []
mistake_events = {}
counter = [0,0,0,0,0,0]
bruh = 0
H1_xg = []
H2_xg = []
for j in range(n):
    #get game info
    game_info = EPL_games.iloc[j]
    #get id of game
    id = game_info['id']
    home_formation = game_info['home_formation']
    away_formation = game_info['away_formation']
    #get shot data for this game (shots are in order by team and then minute)
    shots = shots_with_id[shots_with_id['id'] == id]
    #get events data
    events = EPL_events[EPL_events['id'] == id]
    if events.shape[0] == 0:
        continue
    #get year 
    y = events.iloc[0]['year']
    #add extra time event 
    events.loc[-1] = [-1,"45'","First Half: Extra Time Begins",y]
    #add extra time event second half 
    events.loc[-2] = [-2,"90'","Second Half: Extra Time Begins",y]
    # shifting index
    events.index = events.index + 2  
    #order by time
    events['pseudo_time'] = events['Time'].apply(convert_time)
    events['index'] = events.index
    events = events.sort_values(by=['pseudo_time','index'])
    #remove duplicate rows 
    events= events.drop_duplicates(subset=['Event', 'Time'])
    #if we can't find shots, skip game 
    if shots.shape[0] == 0:
        counter[0] += 1
        continue
    #remove duplicate shots 
    shots = shots.drop_duplicates(subset=["minute"	,"home", "away",	"h_a",	"x_g",	"goal",	"situation","player"])

    #get fifa year
    fifa_year = game_info['fifa_year']
    #get home team 
    home_team_name = game_info['home']
    away_team_name = game_info['away']

    #get dictionary mapping name to number 
    home_name_to_number, home_number_to_name, away_name_to_number, away_number_to_name = parse_lineup_info(game_info,home_num,home_name, away_num, away_name)
    #edit names to get unique identifiers for each player 
    home_dict = edit_name_dictionary(home_name_to_number,equivalent_names)
    away_dict = edit_name_dictionary(away_name_to_number,equivalent_names)



    #get starting lineup numbers
    home_starting_lineup = game_info[['home_starting_' + str(j) + "_num" for j in range(1,15)]]
    #get numbers and names 
    home_starting_lineup = set([v for v in home_starting_lineup if pd.isna(v) == False])
    

    #get starting lineup numbers
    away_starting_lineup = game_info[['away_starting_' + str(j) + "_num" for j in range(1,15)]]
    away_starting_lineup = set([v for v in away_starting_lineup if pd.isna(v) == False])

    
    #make sure that each team has 11 people
    if (len(away_starting_lineup) != 11) or (len(home_starting_lineup) != 11):
        counter[1] += 1
        continue
    #make sure that shots and goals are somehat close 
    #get shot and goal data
    events_ = events[(events['Event'].str.contains("Attempt")) ]
    events_own =  events[ (events['Event'].str.contains("Own Goal"))]
    #get number of these events 
    reported_shots = events_.shape[0] - events_own.shape[0]
    #get the number of actual shots 
    num_shots = shots.shape[0]
    #make sure that there are at least more reported shots than actual shots 
    if reported_shots > num_shots:
        #counter[2] += 1
        continue


    events_ = events[(events['Event'].str.contains("Goal!")) ]
    events_own =  events[ (events['Event'].str.contains("Own Goal"))]
    #get number of these events 
    reported_shots = events_.shape[0] #+ events_own.shape[0]
    #get the number of actual shots 
    num_shots = shots[shots['goal'] == True].shape[0]
    #make sure that there are at least more reported shots than actual shots 
    if reported_shots != num_shots:
        counter[2] += 1
        continue

    events_ = events[(events['Event'].str.startswith("First Half ends")) | (events['Event'].str.startswith("Second Half ends")) ]
    if events_.shape[0]!=2:
        counter[2] += 1
        continue



    #iterate over events 
    #add game to dictionary 
    game_events[id] = {'year':fifa_year,
                       'home_team_name':home_team_name,
                       'away_team_name':away_team_name,
                       'home_team_roster':copy.deepcopy(home_number_to_name),
                       'away_team_roster':copy.deepcopy(away_number_to_name),
                       'home_formation':home_formation,
                       'away_formation':away_formation,
                       'data':[]}
    m = events.shape[0]
    half = 1
    extra_time = 0
    first_half_length = 45
    prev_game_state_end = 0
    score = {home_team_name:0,
             away_team_name:0}
    xg = {home_team_name:0,
             away_team_name:0}
    shots_dict = {home_team_name:[tuple((shots.iloc[v]['x_g'],shots.iloc[v]['goal'])) for v in range(shots.shape[0]) if shots.iloc[v]['h_a'] == "h" and shots.iloc[v]['goal'] == False],
             away_team_name:[tuple((shots.iloc[v]['x_g'],shots.iloc[v]['goal'])) for v in range(shots.shape[0]) if shots.iloc[v]['h_a'] == "a" and shots.iloc[v]['goal'] == False ]}
    goals_dict = {home_team_name:[tuple((shots.iloc[v]['x_g'],shots.iloc[v]['goal'])) for v in range(shots.shape[0]) if shots.iloc[v]['h_a'] == "h" and shots.iloc[v]['goal'] == True],
             away_team_name:[tuple((shots.iloc[v]['x_g'],shots.iloc[v]['goal'])) for v in range(shots.shape[0]) if shots.iloc[v]['h_a'] == "a" and shots.iloc[v]['goal'] == True]}
    lineups = {home_team_name: home_starting_lineup,
               away_team_name: away_starting_lineup}
    #get curretn game state
    curr_game_state = {'change_type':None,
               'xg':copy.deepcopy(xg),
              'time_start': 0,
              'time_end':0,
              'half':1,
              'extra_time':0,
              'score':copy.deepcopy(score),
              'lineups':copy.deepcopy(lineups)}
    #need to compute original game state before updates 
    for k in range(m):
        #get event 
        E = events.iloc[k]['Event'].upper()
        #compute event time 
        event_time = convert_time_with_extra_time(events.iloc[k]['Time'])
        #account for the fact that if the second half started then the first half had some extra time 
        event_time = event_time + first_half_length - 45
        #indicator for if the game state changes 
        game_state_change = False 
        #if a player gets substituted
        if 'SUBSTITUTION' in E:
            substitutes = parse_substitution(E,home_dict,away_dict,home_team_name,away_team_name)
            if substitutes is None or len(substitutes)!= 3:
                continue
            sub_team = substitutes[0]
            player_replacing = substitutes[1]
            player_replaced = substitutes[2]
            #remove substituted player
            lineups[sub_team].remove(player_replaced)
            #add substitute
            lineups[sub_team].add(player_replacing)
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Substitution",[sub_team,player_replacing,player_replaced]))
            

        #if there is a goal 
        #complete
        elif "OWN GOAL" in E:
            shot_team = parse_shot(E,home_dict,away_dict,home_team_name,away_team_name)
            score[shot_team] -= 1
            game_state_change = True 
            change_type = tuple(("Own Goal",shot_team))

        elif 'GOAL!' in E:
            shot_team = parse_shot(E,home_dict,away_dict,home_team_name,away_team_name)
            #if own goal just subtract from team that scored
            #look at team shots 
            s = goals_dict[shot_team]
            #find goal 
            xg[shot_team] += s[0][0]
            del s[0]
            goals_dict[shot_team] = s
            #update score of team 
            score[shot_team] += 1
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Goal",shot_team))
                

        #complete 
        elif 'ATTEMPT' in E:
            shot_team = parse_shot(E,home_dict,away_dict,home_team_name,away_team_name)
            #get next shot 
            s = shots_dict[shot_team]
            if len(s) > 0:
                shot = s[0]
                xg[shot_team] += shot[0]
                #remove shot added 
                del s[0]
                shots_dict[shot_team] = s
        
        
        #if a player gets ejected 
        elif 'SECOND YELLOW CARD' in E or  'RED CARD' in E:
            player_ejected = parse_ejection(E,home_dict,away_dict,home_team_name,away_team_name)
            if player_ejected is None or len(player_ejected) != 2:
                continue
            else:
                #remove ejected player from team 
                lineups[player_ejected[0]].remove(player_ejected[1])
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Ejection",player_ejected))

        #if extra time starts
        elif 'First Half: Extra Time Begins'.upper() in E:
            extra_time = 1
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Extra Time",0))
        elif 'Second Half: Extra Time Begins'.upper() in E:
            extra_time = 1
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Extra Time",0))
        #if the first half ends 
        elif 'First Half ends'.upper() in E: #some games no have so gotta add
            #bruh += 1
            half = 2
            extra_time = 0
            first_half_length = event_time
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Half End",0))
        elif 'Second half ends'.upper() in E:
            #end game (update game state and add to dictionary)
            #game state changes 
            game_state_change = True 
            change_type = tuple(("Game End",0))
        
        #if game state changes, update all parameters before going to next event
        if game_state_change == True:
            #repsponse is expected goals: record this 
            #team stats and fatigue stay the same
            #half changes, score differential changes

            #xg anf time_elapsed are the only things that can chang ein a game state 
            curr_game_state['xg'] = copy.deepcopy(xg)
            curr_game_state['time_end'] = event_time 
            curr_game_state['change_type'] = change_type
            #add game state to list
            game_events[id]['data'].append(copy.deepcopy(curr_game_state))
            
            #update all game state parameters
            #reset xg scored 
            xg = {home_team_name:0,
             away_team_name:0}
            #update half/extra time 
            curr_game_state['score'] = copy.deepcopy(score)
            curr_game_state['half'] = half
            curr_game_state['extra_time'] = extra_time
            curr_game_state['time_start'] = event_time
            curr_game_state['lineups'] = copy.deepcopy(lineups)
            curr_game_state['change_type'] = None
            #update last game time 
            prev_game_state_end = event_time

    vals = game_events[id]['data']
    for val in vals:
        if val['half'] == 1:
            H1_xg.append(val['xg'][home_team_name])
            H1_xg.append(val['xg'][away_team_name])
        else:
            H2_xg.append(val['xg'][home_team_name])
            H2_xg.append(val['xg'][away_team_name])
    
    home_roster = []
    away_roster = []
    



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events.loc[-1] = [-1,"45'","First Half: Extra Time Begins",y]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events.loc[-2] = [-2,"90'","Second Half: Extra Time Begins",y]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events['pseudo_time'] = events['Time'].apply(convert_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cav

In [30]:
#save dictionaries as pikl files
with open('data/all_name_to_id.pkl', 'wb') as f:
    pickle.dump(all_name_to_id, f)
with open('data/first_name_to_id.pkl', 'wb') as f:
    pickle.dump(first_player_to_id, f)
with open('data/last_name_to_id.pkl', 'wb') as f:
    pickle.dump(last_player_to_id, f)

with open('data/game_events.pkl', 'wb') as f:
    pickle.dump(game_events, f)

with open('data/player_ratings.pkl', 'wb') as f:
    pickle.dump(player_ratings, f)

all_fifa_player_ratings.to_pickle('data/all_fifa_player_ratings.pkl')


