In [1]:
import requests
import json
import pandas as pd
import time
import datetime
from bs4 import BeautifulSoup
import numpy as np

# Pause, in seconds, passed to time.sleep() after each scraped page in the loops below.
SLEEPING_TIME = 3


In [2]:
# Download the league overview page. Its embedded <script> tags are parsed by the
# following cells to get the best attacking players and the defending aptitude of each team.
base_url = 'https://understat.com/league/EPL/'
url = base_url

res = requests.get(url)
# Stop right here on an HTTP error (4xx/5xx) instead of failing later with an
# unrelated IndexError when the expected <script> tags are missing from the page.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')
scripts = soup.find_all('script')


In [3]:
"""
    params:
        - best_players_df: the best attacking players overall this season
        - scripts[3]
"""
# scripts[3] of the league page carries the player statistics as an escaped JSON
# string located between the first "('" and the first "')" of the script text.
strings = scripts[3].string
start = strings.index("('") + len("('")
end = strings.index("')")
# Resolve the backslash escape sequences before handing the text to the JSON parser.
json_data = strings[start:end].encode('utf8').decode('unicode_escape')

data = json.loads(json_data)

# One column per key of the first record; every record must provide all of these keys.
columns = data[0].keys()
dc = {c: [player[c] for player in data] for c in columns}
best_players_df = pd.DataFrame(dc)
best_players_df.head(3)

Unnamed: 0,id,player_name,games,time,goals,xG,assists,xA,shots,key_passes,yellow_cards,red_cards,position,team_title,npg,npxG,xGChain,xGBuildup
0,1250,Mohamed Salah,21,1814,16,16.222767740488052,9,6.507743250578642,84,41,1,0,F S,Liverpool,14,13.939261242747309,23.11178502440453,7.717494197189808
1,6854,Diogo Jota,22,1667,12,13.742104824632406,1,4.0528733022511005,62,27,1,0,F S,Liverpool,12,13.742104824632406,20.036065958440304,5.775763585232198
2,453,Son Heung-Min,18,1589,9,7.443591669201851,3,3.3822450675070286,45,33,0,0,F M,Tottenham,9,7.443591669201851,13.139947384595873,4.664681572467089


## Best Attacking Players Overall

Players are ranked by their attacking rating (`1.2 * xG + xA`).

In [4]:
# Cast the scraped columns to numeric dtypes in a single pass.
best_players_df = best_players_df.astype(
    {'xG': 'float64', 'xA': 'float64', 'goals': 'int64', 'assists': 'int64', 'shots': 'int64'}
)
# Attacking rating: expected goals weighted by 1.2, plus expected assists.
best_players_df['attacking_rating'] = best_players_df['xG'] * 1.2 + best_players_df['xA']

columns = ['id', 'player_name', 'position', 'team_title', 'games', 'time', 'attacking_rating', 'xG', 'xA', 'goals', 'assists']
ranked_players = best_players_df[columns].sort_values(by='attacking_rating', ascending=False, ignore_index=True)
ranked_players.head(25)

Unnamed: 0,id,player_name,position,team_title,games,time,attacking_rating,xG,xA,goals,assists
0,1250,Mohamed Salah,F S,Liverpool,21,1814,25.975065,16.222768,6.507743,16,9
1,6854,Diogo Jota,F S,Liverpool,22,1667,20.543399,13.742105,4.052873,12,1
2,1776,Jarrod Bowen,F M S,West Ham,24,2052,16.208287,9.553089,4.74458,7,7
3,531,Michail Antonio,F S,West Ham,23,2022,16.065247,9.544686,4.611624,8,6
4,838,Sadio Mané,F S,Liverpool,20,1713,15.989597,10.503095,3.385884,8,1
5,647,Harry Kane,F M S,Tottenham,20,1699,14.894368,8.961495,4.140574,5,2
6,2371,Cristiano Ronaldo,F S,Manchester United,19,1483,14.371936,10.508529,1.761701,8,3
7,618,Raheem Sterling,F M S,Manchester City,20,1322,14.034463,8.963165,3.278665,7,1
8,453,Son Heung-Min,F M,Tottenham,18,1589,12.314555,7.443592,3.382245,9,3
9,9040,Conor Gallagher,M,Crystal Palace,20,1762,11.822847,6.687234,3.798166,7,3


## Choose the most suitable attacking players by looking at the worst defending teams

Find the teams that face opponents with a weak defence in their next `NUMBER_OF_FUTURE_MATCHES` matches. Then pick the best players of those teams by looking at their xGoals + xAssists over the past `NUMBER_OF_MATCHES` games.

Penalise a team for conceding at home: every expected goal against (xGA) at home is multiplied by 1.3.
Boost a team for scoring away: every expected goal (xG) away is multiplied by 1.3.


In [5]:
"""
    params: 
        - matches_df: contains xGA for each team
"""
# scripts[2] of the league page holds one entry per team id, each with a `title`
# and a per-match `history`; the payload sits between the first "('" and "')".
strings = scripts[2].string
start = strings.index("('") + len("('")
end = strings.index("')")
json_data = strings[start:end].encode('utf8').decode('unicode_escape')

data = json.loads(json_data)
columns = ['h_a', 'xG', 'xGA', 'npxG', 'npxGA', 'scored','missed', 'xpts', 'result']

team_list = [team_info['title'] for team_info in data.values()]

# Flatten to one (team title, match record) pair per played match, in page order.
history_rows = [
    (team_info['title'], match)
    for team_info in data.values()
    for match in team_info['history']
]

# Column-wise layout: the team name first, then the selected metrics of each match.
dc = {'team': [title for title, _ in history_rows]}
for c in columns:
    dc[c] = [match[c] for _, match in history_rows]

matches_df = pd.DataFrame(dc)


In [6]:
## number of games to look the xgoals against for each team
NUMBER_OF_GAMES = 8
# Weight applied to xGA conceded at home (defending rating) and to xG created
# away (attacking rating).
VENUE_WEIGHT = 1.3
print(f'Looking back at {NUMBER_OF_GAMES} games.')

# Split every per-match metric into a home part and an away part using boolean
# masks instead of row-wise apply() calls.
is_home = matches_df['h_a'] == 'h'
is_away = matches_df['h_a'] == 'a'
matches_df['matches_home'] = is_home.astype('int64')
matches_df['matches_away'] = is_away.astype('int64')
matches_df['attack_home'] = matches_df['xG'].where(is_home, 0)
matches_df['attack_away'] = matches_df['xG'].where(is_away, 0)
matches_df['defence_home'] = matches_df['xGA'].where(is_home, 0)
matches_df['defence_away'] = matches_df['xGA'].where(is_away, 0)

# Keep only the last NUMBER_OF_GAMES rows of each team (rows are in the order the
# page listed them) and total the metrics per team.
summed_columns = [
    'xG', 'xGA', 'xpts',
    'attack_home', 'attack_away',
    'defence_home', 'defence_away',
    'matches_home', 'matches_away',
]
recent_matches = matches_df.groupby('team').tail(NUMBER_OF_GAMES)
defence_df = recent_matches.groupby('team', as_index=False)[summed_columns].sum()

# Per-match averages by venue. A team without any home (or away) match in the
# window divides 0 by 0 here, which yields NaN for that average and its rating.
defence_df['avg_home_att'] = defence_df['attack_home'] / defence_df['matches_home']
defence_df['avg_away_att'] = defence_df['attack_away'] / defence_df['matches_away']
defence_df['avg_home_def'] = defence_df['defence_home'] / defence_df['matches_home']
defence_df['avg_away_def'] = defence_df['defence_away'] / defence_df['matches_away']

# Higher defending_rating = more expected goals conceded, i.e. a weaker defence.
defence_df['defending_rating'] = defence_df['avg_home_def'] * VENUE_WEIGHT + defence_df['avg_away_def']
defence_df['attacking_rating'] = defence_df['avg_away_att'] * VENUE_WEIGHT + defence_df['avg_home_att']
defence_df = defence_df.sort_values(by='defending_rating', ascending=False, ignore_index=True)
defence_df

Looking back at 8 games.


Unnamed: 0,team,xG,xGA,xpts,attack_home,attack_away,defence_home,defence_away,matches_home,matches_away,avg_home_att,avg_away_att,avg_home_def,avg_away_def,defending_rating,attacking_rating
0,Leicester,11.73562,19.001774,7.3318,6.333702,5.401918,8.683244,10.31853,4.0,4.0,1.583426,1.350479,2.170811,2.579632,5.401687,3.339049
1,Leeds,12.659638,16.927333,9.4079,5.90482,6.754818,7.009203,9.91813,4.0,4.0,1.476205,1.688705,1.752301,2.479533,4.757523,3.671521
2,Norwich,4.8036,15.294037,4.4722,2.940907,1.862693,8.706027,6.58801,5.0,3.0,0.588181,0.620898,1.741205,2.196003,4.45957,1.395348
3,Watford,6.626202,14.27339,6.5419,3.298979,3.327223,9.735802,4.537588,4.0,4.0,0.824745,0.831806,2.43395,1.134397,4.298533,1.906092
4,Everton,7.906445,15.251995,6.7459,4.638723,3.267722,6.262115,8.98988,4.0,4.0,1.159681,0.816931,1.565529,2.24747,4.282657,2.22169
5,Newcastle United,8.749402,14.453916,8.52,6.872028,1.877374,8.327166,6.12675,5.0,3.0,1.374406,0.625791,1.665433,2.04225,4.207313,2.187934
6,Southampton,8.082361,14.215916,6.989,2.948323,5.134038,5.059706,9.15621,3.0,5.0,0.982774,1.026808,1.686569,1.831242,4.023781,2.317624
7,Aston Villa,9.752987,12.64822,10.0785,5.651349,4.101638,7.694034,4.954186,4.0,4.0,1.412837,1.02541,1.923509,1.238546,3.739108,2.74587
8,West Ham,13.338845,12.583821,11.674,7.689663,5.649182,4.304091,8.27973,4.0,4.0,1.922416,1.412296,1.076023,2.069933,3.468762,3.7584
9,Brentford,6.126578,12.443614,6.875,3.563731,2.562847,4.433741,8.009873,4.0,4.0,0.890933,0.640712,1.108435,2.002468,3.443434,1.723858


## Get player data from team page & future matches for each team

To restrict the search space, we only consider players whose season total of xG + xA (their `xTotal`) exceeds a minimum threshold.

In [7]:
"""
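    params:
        - team_data: season statistics of the players of one team
        - df: players from all teams that pass the xTotal threshold
        - scripts[1]: contains a list of matches of a team (future and past for the whole season)
            - isResult: whether the match has already been played
        - scripts[3]: contains the player statistics of a team
        - xTotal: xG + xA of a player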
"""
team_rosters = {}
teams = [
    'Aston Villa', 'Everton', 'Southampton', 'Leicester', 'Crystal Palace',
    'Norwich', 'Chelsea', 'West Ham', 'Tottenham', 'Arsenal',
    'Newcastle United', 'Liverpool', 'Manchester City', 'Manchester United',
    'Watford', 'Burnley', 'Brighton', 'Wolverhampton Wanderers', 'Brentford', 'Leeds',
]
teams.sort()

# Number of future matches for which we want to see the defence of the next teams
NUMBER_OF_FUTURE_MATCHES = 6
# Season xG + xA a player has to exceed to be put on the shortlist. This constant
# is the single source of truth for the shortlist filter at the end of the loop.
xTotal = 4
# Reference time used to decide whether a fixture is still upcoming
now = datetime.datetime.now()


def _extract_json(script):
    """Return the decoded JSON payload embedded in a <script> tag.

    The payload is the text between the first "('" and the first "')" of the
    script's string; its backslash escape sequences are resolved before parsing.

    Raises:
        ValueError: if either delimiter is missing from the script text.
    """
    text = script.string
    payload = text[text.index("('") + 2:text.index("')")]
    return json.loads(payload.encode('utf8').decode('unicode_escape'))


future_matches_dc = {
    'team':[],
    'teams_against':[],
    'home_or_away':[],
    'when': [],
    'defence_of_team_against':[],
    'attack_of_team_against':[],
    'defending_rating':[],
    'attacking_rating':[],
}

# Shortlisted players of every team; concatenated once after the loop because
# DataFrame.append no longer exists in pandas >= 2.0.
shortlisted_players = []

for team in teams:
    team_name_url = team.replace(' ', '%20')
    url = f'https://understat.com/team/{team_name_url}/2021'
    print(url)

    res = requests.get(url)
    # Surface HTTP errors directly instead of an IndexError on scripts[...] below.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    scripts = soup.find_all('script')

    # --- upcoming fixtures of this team (scripts[1]) ---
    row_team = defence_df[defence_df['team'] == team]
    total_matches = 0
    for match in _extract_json(scripts[1]):
        kickoff = datetime.datetime.strptime(match['datetime'], '%Y-%m-%d %H:%M:%S')
        # Only fixtures that lie in the future and have no result yet are of interest.
        if kickoff <= now or match['isResult']:
            continue

        if match['h']['title'] == team:
            team_against = match['a']['title']
            home_or_away = 'Home'
            own_venue, opponent_venue = 'home', 'away'
        else:
            team_against = match['h']['title']
            home_or_away = 'Away'
            own_venue, opponent_venue = 'away', 'home'

        row_against = defence_df[defence_df['team'] == team_against]

        future_matches_dc['team'].append(team)
        future_matches_dc['teams_against'].append(team_against)
        future_matches_dc['home_or_away'].append(home_or_away)
        future_matches_dc['when'].append(match['datetime'])
        # The opponent is rated by its form at the venue it will play at (away when
        # this team is at home, and vice versa); this team by its own venue form.
        future_matches_dc['defence_of_team_against'].append(row_against[f'avg_{opponent_venue}_def'].iloc[0])
        future_matches_dc['attack_of_team_against'].append(row_against[f'avg_{opponent_venue}_att'].iloc[0])
        future_matches_dc['attacking_rating'].append(row_team[f'avg_{own_venue}_att'].iloc[0])
        future_matches_dc['defending_rating'].append(row_team[f'avg_{own_venue}_def'].iloc[0])

        total_matches += 1
        if total_matches >= NUMBER_OF_FUTURE_MATCHES:
            break

    # --- player statistics of this team (scripts[3]) ---
    players = _extract_json(scripts[3])
    columns = players[0].keys()
    team_data = pd.DataFrame({c: [player[c] for player in players] for c in columns})
    team_rosters[team] = team_data
    team_data['team'] = team
    team_data['xG'] = team_data['xG'].astype(float)
    team_data['xA'] = team_data['xA'].astype(float)
    team_data['xTotal'] = team_data['xG'] + team_data['xA']
    shortlisted_players.append(
        team_data.loc[
            team_data['xTotal'] > xTotal,
            ['id', 'player_name', 'team', 'xG', 'xA', 'xTotal', 'goals', 'assists'],
        ]
    )
    time.sleep(SLEEPING_TIME)

future_matches_df = pd.DataFrame(future_matches_dc)
df = pd.concat(shortlisted_players).reset_index(drop=True)
print(f'Total players to receive: {df.shape[0]}')

https://understat.com/team/Arsenal/2021
https://understat.com/team/Aston%20Villa/2021
https://understat.com/team/Brentford/2021
https://understat.com/team/Brighton/2021
https://understat.com/team/Burnley/2021
https://understat.com/team/Chelsea/2021
https://understat.com/team/Crystal%20Palace/2021
https://understat.com/team/Everton/2021
https://understat.com/team/Leeds/2021
https://understat.com/team/Leicester/2021
https://understat.com/team/Liverpool/2021
https://understat.com/team/Manchester%20City/2021
https://understat.com/team/Manchester%20United/2021
https://understat.com/team/Newcastle%20United/2021
https://understat.com/team/Norwich/2021
https://understat.com/team/Southampton/2021
https://understat.com/team/Tottenham/2021
https://understat.com/team/Watford/2021
https://understat.com/team/West%20Ham/2021
https://understat.com/team/Wolverhampton%20Wanderers/2021
Total players to receive: 91


## Show Teams that have future matches against teams with bad defence

In [8]:
def _summarise_fixtures(fixtures):
    """Collapse the upcoming fixtures of one team into a single summary row."""
    return pd.Series(
        {
            'teams_against': list(fixtures.teams_against),
            'home_or_away': list(fixtures.home_or_away),
            'when': list(fixtures.when),
            'sum_of_defence_against': sum(fixtures.defence_of_team_against),
            'sum_of_attack_against': sum(fixtures.attack_of_team_against),
            'defending_rating': sum(fixtures.defending_rating),
            'attacking_rating': sum(fixtures.attacking_rating),
        }
    )


# One row per team, ordered so that the teams whose upcoming opponents have the
# highest summed defence figure come first.
future_matches_df = (
    pd.DataFrame(future_matches_dc)
    .groupby('team', as_index=False)
    .apply(_summarise_fixtures)
    .sort_values(by='sum_of_defence_against', ignore_index=True, ascending=False)
)

future_matches_df

Unnamed: 0,team,teams_against,home_or_away,when,sum_of_defence_against,sum_of_attack_against,defending_rating,attacking_rating
0,Arsenal,"[Chelsea, Brentford, Liverpool, Watford, Leice...","[Away, Home, Home, Away, Home, Away]","[2022-02-12 15:00:00, 2022-02-19 15:00:00, 202...",11.429749,7.718565,4.760245,12.124724
1,Wolverhampton Wanderers,"[Tottenham, Leicester, West Ham, Crystal Palac...","[Away, Home, Away, Home, Away, Home]","[2022-02-13 14:00:00, 2022-02-20 16:30:00, 202...",10.598435,9.102423,8.474008,4.207385
2,Brentford,"[Crystal Palace, Arsenal, Newcastle United, No...","[Home, Away, Home, Away, Home, Away]","[2022-02-12 15:00:00, 2022-02-19 15:00:00, 202...",10.038967,5.813391,9.33271,4.594934
3,Everton,"[Leeds, Southampton, Manchester City, Tottenha...","[Home, Away, Home, Away, Home, Away]","[2022-02-12 15:00:00, 2022-02-19 15:00:00, 202...",9.722595,8.461564,11.438996,5.929834
4,Chelsea,"[Arsenal, Crystal Palace, Leicester, Burnley, ...","[Home, Away, Home, Away, Home, Away]","[2022-02-12 15:00:00, 2022-02-19 15:00:00, 202...",9.655981,7.5886,4.482455,8.404628
5,Leeds,"[Everton, Manchester United, Tottenham, Leices...","[Away, Home, Home, Away, Home, Away]","[2022-02-12 15:00:00, 2022-02-20 14:00:00, 202...",9.628051,8.354201,12.6955,9.494729
6,Leicester,"[West Ham, Wolverhampton Wanderers, Chelsea, L...","[Home, Away, Away, Home, Away, Home]","[2022-02-13 16:30:00, 2022-02-20 16:30:00, 202...",9.625741,7.953003,14.25133,8.801715
7,Southampton,"[Manchester United, Everton, Norwich, Aston Vi...","[Away, Home, Home, Away, Home, Away]","[2022-02-12 12:30:00, 2022-02-19 15:00:00, 202...",9.407158,5.823044,10.553432,6.028746
8,Brighton,"[Watford, Manchester United, Burnley, Aston Vi...","[Away, Away, Home, Home, Away, Home]","[2022-02-12 15:00:00, 2022-02-15 20:15:00, 202...",9.119857,7.164403,6.709532,7.520858
9,Tottenham,"[Wolverhampton Wanderers, Manchester City, Lee...","[Home, Away, Away, Home, Away, Home]","[2022-02-13 14:00:00, 2022-02-19 17:30:00, 202...",9.08205,8.220239,6.131476,14.018556


## Fetch player data

In [None]:
## number of matches to look the form
NUMBER_OF_MATCHES = 5

# Per-match fields taken from each player's page, and the subset cast to float.
columns = ['time', 'xG', 'xA', 'goals', 'assists', 'key_passes', 'shots', 'h_team', 'a_team', 'date']
numeric_columns = ['time', 'xG', 'xA', 'goals', 'assists', 'key_passes', 'shots']

# One frame per player; concatenated once after the loop because
# DataFrame.append no longer exists in pandas >= 2.0.
player_frames = []
for player_id, player_name, team in df[['id', 'player_name', 'team']].itertuples(index=False, name=None):
    print(player_id, player_name, team)

    url = f'https://understat.com/player/{player_id}'
    print(url)

    res = requests.get(url)
    # Surface HTTP errors directly instead of an IndexError on scripts[4] below.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    scripts = soup.find_all('script')

    # scripts[4] embeds the player's match list between the first "('" and "')".
    strings = scripts[4].string
    start = strings.index("('") + 2
    end = strings.index("')")
    json_data = strings[start:end].encode('utf8').decode('unicode_escape')
    data = json.loads(json_data)

    # Keep the first NUMBER_OF_MATCHES entries of the list.
    # NOTE(review): this assumes the page lists the most recent match first - confirm.
    recent_matches = data[:NUMBER_OF_MATCHES]
    player_data = pd.DataFrame({c: [match[c] for match in recent_matches] for c in columns})
    player_data = player_data.astype({c: float for c in numeric_columns})
    player_data['player_id'] = player_id
    player_data['player_name'] = player_name
    player_data['team'] = team
    player_data['NUMBER_OF_MATCHES'] = NUMBER_OF_MATCHES
    # The opponent is whichever side of the fixture is not the player's team.
    player_data['team_played_against'] = np.where(
        player_data['h_team'] != team, player_data['h_team'], player_data['a_team']
    )

    player_frames.append(player_data)
    time.sleep(SLEEPING_TIME)

# pd.concat rejects an empty list, so fall back to an empty frame when no player was shortlisted.
player_data_all = pd.concat(player_frames) if player_frames else pd.DataFrame()


7230 Emile Smith-Rowe Arsenal
https://understat.com/player/7230
7322 Bukayo Saka Arsenal
https://understat.com/player/7322
318 Pierre-Emerick Aubameyang Arsenal
https://understat.com/player/318
2517 Martin Odegaard Arsenal
https://understat.com/player/2517
7752 Gabriel Martinelli Arsenal
https://understat.com/player/7752
3277 Alexandre Lacazette Arsenal
https://understat.com/player/3277
8865 Ollie Watkins Aston Villa
https://understat.com/player/8865
8941 Jacob Ramsey Aston Villa
https://understat.com/player/8941
2203 Emiliano Buendía Aston Villa
https://understat.com/player/2203
998 Ivan Toney Brentford
https://understat.com/player/998
6552 Bryan Mbeumo Brentford
https://understat.com/player/6552
1078 Sergi Canos Brentford
https://understat.com/player/1078
7083 Christian Nørgaard Brentford
https://understat.com/player/7083
3621 Neal Maupay Brighton
https://understat.com/player/3621
7698 Leandro Trossard Brighton
https://understat.com/player/7698
8379 Alexis Mac Allister Brighton
https

In [None]:
def _summarise_player(matches):
    """Collapse the fetched matches of one player into a single summary row."""
    return pd.Series(
        {
            'total_minutes_played': sum(matches['time']),
            'avg_minutes_played': np.mean(matches['time']),
            'xG': sum(matches['xG']),
            'xA': sum(matches['xA']),
            'key_passes': sum(matches['key_passes']),
            'shots': sum(matches['shots']),
            'goals': sum(matches['goals']),
            'assists': sum(matches['assists']),
            'team_played_against': list(matches['team_played_against']),
            'date_played': list(matches['date']),
            'oldest_match': min(matches['date']),
            'latest_match': max(matches['date']),
        }
    )


player_keys = ['player_id', 'player_name', 'team', 'NUMBER_OF_MATCHES']
agg = player_data_all.groupby(player_keys, as_index=False).apply(_summarise_player)

# Turn the latest match date (a 'YYYY-MM-DD' string) into a datetime and count
# the whole days elapsed since then.
agg['latest_match'] = agg['latest_match'].map(lambda day: datetime.datetime.strptime(day, '%Y-%m-%d'))
today = datetime.datetime.today()
agg['days_since_last_match'] = [(today - played).days for played in agg['latest_match']]
agg['xTotal'] = agg['xG'] + agg['xA']
# Most recent match date first.
agg['date_played'] = agg['date_played'].map(lambda dates: sorted(dates, reverse=True))
agg = agg.sort_values(by='xTotal', ascending=False)
agg.head(20)


## Final Decision Making

In [None]:
# Display only (the result is not assigned): teams ordered from the lowest to the
# highest summed attack figure of their upcoming opponents.
future_matches_df.sort_values(by='sum_of_attack_against', ascending=True)

Get the best `PLAYERS_TO_SHOW` players for each of the first `TEAMS_SELECTED` teams whose upcoming opponents have the worst combined defences. Exclude players who have not played in the last `DAYS_NOT_PLAYED` days.

In [None]:
PLAYERS_TO_SHOW = 3
TEAMS_SELECTED = 20
DAYS_NOT_PLAYED = 15

# Teams whose upcoming opponents have the highest summed defence figure. The
# ranking is made explicit so this cell does not depend on a previous sort.
selected_teams = list(
    future_matches_df
    .sort_values(by='sum_of_defence_against', ascending=False)
    .head(TEAMS_SELECTED)['team']
)

# Players of those teams whose latest match is less than DAYS_NOT_PLAYED days old.
recently_active = agg.loc[
    (agg['days_since_last_match'] < DAYS_NOT_PLAYED) & (agg['team'].isin(selected_teams))
]

# Rank by xTotal first and then take head(): the first PLAYERS_TO_SHOW rows of
# each team are its best players. tail() on a descending ranking would return
# the weakest ones instead.
selected_players_for_squad = (
    recently_active
    .sort_values(by='xTotal', ascending=False)
    .groupby('team')
    .head(PLAYERS_TO_SHOW)
    .drop(['team_played_against', 'date_played'], axis=1)
    .reset_index(drop=True)
)

# 'team' is the only column the two frames share; name it explicitly as the join key.
daten = selected_players_for_squad.merge(
    future_matches_df[['team', 'sum_of_defence_against', 'attacking_rating']],
    on='team',
)
daten[['player_name', 'team', 'xTotal', 'xG', 'xA', 'goals', 'assists', 'sum_of_defence_against', 'attacking_rating', 'oldest_match', 'latest_match' ,'avg_minutes_played']].sort_values(by='xTotal', ascending=False)