In [None]:
import requests
import json
import pandas as pd
import time
import datetime
from bs4 import BeautifulSoup
import numpy as np

# Pause, in seconds, passed to time.sleep() after each team/player page request
# in the scraping loops further down.
SLEEPING_TIME = 3


In [None]:
# Download the EPL league overview page from understat. The <script> tags
# collected here carry the embedded JSON that the following cells parse:
# scripts[2] (per-team match history) and scripts[3] (per-player season totals).
base_url = 'https://understat.com/league/EPL/'
url = base_url

res = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page and hitting an
# unrelated IndexError/AttributeError when the scripts are indexed later.
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')
scripts = soup.find_all('script')


In [None]:
# Parse the per-player season totals embedded in scripts[3] of the league page.
#   input : scripts[3] -- <script> tag whose text contains ('<escaped JSON>')
#   output: best_players_df -- one row per player, one column per JSON key
players_script = scripts[3]
script_text = players_script.string

# The payload is the text between the first "('" and the first "')".
payload_start = script_text.index("('") + 2
payload_end = script_text.index("')")
json_data = script_text[payload_start:payload_end].encode('utf8').decode('unicode_escape')

data = json.loads(json_data)

# The column set comes from the first record; every record must provide all of
# these keys (a missing key raises KeyError).
columns = data[0].keys()
dc = {c: [player[c] for player in data] for c in columns}

best_players_df = pd.DataFrame(dc)
best_players_df.head(3)

## Best Attacking Players Overall

Players are ranked by their attacking rating, computed below as `1.2 * xG + xA`.

In [None]:
# The scraped values are cast to numeric dtypes before any arithmetic/sorting.
for float_col in ('xG', 'xA'):
    best_players_df[float_col] = best_players_df[float_col].astype(float)
for int_col in ('goals', 'assists', 'shots'):
    best_players_df[int_col] = best_players_df[int_col].astype('int64')

# Attacking rating: expected goals weighted 1.2x plus expected assists.
# Computed column-wise instead of with a row-by-row apply.
best_players_df['attacking_rating'] = best_players_df['xG'] * 1.2 + best_players_df['xA']

# Show the 25 highest-rated players.
columns = ['id', 'player_name', 'position', 'team_title', 'games', 'time', 'attacking_rating', 'xG', 'xA', 'goals', 'assists']
best_players_df[columns].sort_values(by='attacking_rating', ascending=False, ignore_index=True).head(25)

## Choose the most suitable attacking players by looking at the worst defending teams

Find the teams that face opponents with a weak defence in their next `NUMBER_OF_FUTURE_MATCHES` matches. Then pick each team's best players by looking at their xGoals + xAssists over the past `NUMBER_OF_MATCHES` games.

Penalise a team for conceding at home: its average xG conceded at home is multiplied by 1.3.
Reward a team for scoring away: its average xG created away is multiplied by 1.3.


In [None]:
# Flatten the per-team match history embedded in scripts[2] of the league page.
#   output: matches_df -- one row per (team, match), including xG and xGA
#           team_list  -- team names in the order they appear in the payload
teams_script = scripts[2]
script_text = teams_script.string

# The payload is the text between the first "('" and the first "')".
payload_start = script_text.index("('") + 2
payload_end = script_text.index("')")
json_data = script_text[payload_start:payload_end].encode('utf8').decode('unicode_escape')

data = json.loads(json_data)
columns = ['h_a', 'xG', 'xGA', 'npxG', 'npxGA', 'scored','missed', 'xpts', 'result']

team_list = [team_entry['title'] for team_entry in data.values()]

# Pair every history entry with the name of the team it belongs to.
team_matches = [
    (team_entry['title'], match)
    for team_entry in data.values()
    for match in team_entry['history']
]

dc = {'team': [team_name for team_name, _ in team_matches]}
for c in columns:
    dc[c] = [match[c] for _, match in team_matches]

matches_df = pd.DataFrame(dc)


In [None]:
## number of games per team whose xG / xGA are summed into the form window
NUMBER_OF_GAMES = 8
print(f'Looking back at {NUMBER_OF_GAMES} games.')

# Split every match into home/away indicator, attack (xG) and defence (xGA)
# columns. The side that does not apply contributes 0, so plain sums work below.
played_home = matches_df['h_a'] == 'h'
played_away = matches_df['h_a'] == 'a'
matches_df['matches_home'] = played_home.astype(int)
matches_df['matches_away'] = played_away.astype(int)
matches_df['attack_home'] = matches_df['xG'].where(played_home, 0)
matches_df['attack_away'] = matches_df['xG'].where(played_away, 0)
matches_df['defence_home'] = matches_df['xGA'].where(played_home, 0)
matches_df['defence_away'] = matches_df['xGA'].where(played_away, 0)

# Keep the last NUMBER_OF_GAMES rows of each team (assumed to be its most
# recent games -- TODO confirm the history is in chronological order) and
# total them per team.
summed_columns = [
    'xG', 'xGA', 'xpts',
    'attack_home', 'attack_away',
    'defence_home', 'defence_away',
    'matches_home', 'matches_away',
]
defence_df = (
    matches_df
    .groupby('team')
    .tail(NUMBER_OF_GAMES)
    .groupby('team', as_index=False)[summed_columns]
    .sum()
)

# Per-match averages. A team with no home (or no away) game inside the window
# gets NaN here because the division is 0 / 0.
defence_df['avg_home_att'] = defence_df['attack_home'] / defence_df['matches_home']
defence_df['avg_away_att'] = defence_df['attack_away'] / defence_df['matches_away']
defence_df['avg_home_def'] = defence_df['defence_home'] / defence_df['matches_home']
defence_df['avg_away_def'] = defence_df['defence_away'] / defence_df['matches_away']

# xG conceded at home and xG created away are both weighted 1.3x.
defence_df['defending_rating'] = defence_df['avg_home_def'] * 1.3 + defence_df['avg_away_def']
defence_df['attacking_rating'] = defence_df['avg_away_att'] * 1.3 + defence_df['avg_home_att']

# Highest defending_rating (most xG conceded) first.
defence_df = defence_df.sort_values(by='defending_rating', ascending=False, ignore_index=True)
defence_df

## Get player data from team page & future matches for each team

To restrict the search space, we only consider players whose season total `xTotal = xG + xA` exceeds a threshold.

In [None]:
# Scrape every team's page and collect its upcoming fixtures and its players.
#   reads : defence_df (per-team averages), SLEEPING_TIME
#   builds: future_matches_dc / future_matches_df -- one row per upcoming fixture,
#               with the home/away-specific averages of both sides
#           team_rosters -- team name -> full player table of that team
#           df -- shortlist of players whose season xG + xA exceeds xTotal
#   page layout relied on:
#           scripts[1] -- the team's matches (future and past for the whole
#                         season); 'isResult' tells whether a match was played
#           scripts[3] -- the team's players
team_rosters = {}
# These names must match the `team` values in defence_df, otherwise the
# .iloc[0] lookups in the loop raise IndexError.
teams = ['Aston Villa', 'Everton', 'Southampton', 'Leicester', 'Crystal Palace', 'Norwich', 'Chelsea', 'West Ham', 'Tottenham', 'Arsenal', 'Newcastle United', 'Liverpool', 'Manchester City', 'Manchester United', 'Watford', 'Burnley', 'Brighton', 'Wolverhampton Wanderers', 'Brentford', 'Leeds']
teams.sort()

# Number of future matches for which we want to see the defence of the next teams
NUMBER_OF_FUTURE_MATCHES = 6
# Threshold of x goals + x assists that a player needs to exceed over the whole
# season to be listed in our shortlist. The filter below used a literal 4 while
# this constant (declared as 5) was never read; the constant now drives the
# filter and carries the value that was actually applied.
xTotal = 4
# we need this variable to compare the current datetime to decide if a match is next or not
now = datetime.datetime.now()

future_matches_dc = {
    'team':[], 
    'teams_against':[], 
    'home_or_away':[], 
    'when': [], 
    'defence_of_team_against':[], 
    'attack_of_team_against':[],
    'defending_rating':[],
    'attacking_rating':[],
}

shortlist_columns = ['id', 'player_name', 'team', 'xG', 'xA', 'xTotal', 'goals', 'assists']
# One filtered frame per team, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0).
shortlisted_frames = []

for team in teams:
    team_name_url = team.replace(' ', '%20')
    # The season (2021) is hard-coded in the URL.
    base_url = f'https://understat.com/team/{team_name_url}/2021'
    url = base_url
    print(url)

    res = requests.get(url)
    # Stop on an HTTP error rather than failing later on a missing <script>.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    scripts = soup.find_all('script')

    # --- upcoming fixtures -------------------------------------------------
    script_data = scripts[1]
    strings = script_data.string
    start = strings.index("('")+2
    end = strings.index("')")
    json_data = strings[start:end]
    json_data = json_data.encode('utf8').decode('unicode_escape')
    data = json.loads(json_data)

    total_matches = 0
    for match in data:
        kickoff = datetime.datetime.strptime(match['datetime'], '%Y-%m-%d %H:%M:%S')
        # Only matches that lie in the future and have no result yet count.
        if kickoff <= now or match['isResult']:
            continue

        if match['h']['title'] == team:
            team_against = match['a']['title']
            home_or_away = 'Home'
            own_side, opponent_side = 'home', 'away'
        else:
            team_against = match['h']['title']
            home_or_away = 'Away'
            own_side, opponent_side = 'away', 'home'

        # Look everything up before appending so that a failed lookup cannot
        # leave the lists in future_matches_dc with different lengths.
        row_against = defence_df[defence_df['team'] == team_against]
        row_team = defence_df[defence_df['team'] == team]
        opponent_defence = row_against[f'avg_{opponent_side}_def'].iloc[0]
        opponent_attack = row_against[f'avg_{opponent_side}_att'].iloc[0]
        own_attack = row_team[f'avg_{own_side}_att'].iloc[0]
        own_defence = row_team[f'avg_{own_side}_def'].iloc[0]

        future_matches_dc['team'].append(team)
        future_matches_dc['teams_against'].append(team_against)
        future_matches_dc['home_or_away'].append(home_or_away)
        future_matches_dc['when'].append(match['datetime'])
        future_matches_dc['defence_of_team_against'].append(opponent_defence)
        future_matches_dc['attack_of_team_against'].append(opponent_attack)
        future_matches_dc['attacking_rating'].append(own_attack)
        future_matches_dc['defending_rating'].append(own_defence)

        total_matches += 1
        if total_matches >= NUMBER_OF_FUTURE_MATCHES:
            break

    # --- players of the team -----------------------------------------------
    script_data = scripts[3]
    strings = script_data.string
    start = strings.index("('")+2
    end = strings.index("')")
    json_data = strings[start:end]
    json_data = json_data.encode('utf8').decode('unicode_escape')
    data = json.loads(json_data)

    columns = data[0].keys()
    dc = {c: [player[c] for player in data] for c in columns}

    team_data = pd.DataFrame(dc)
    team_rosters[team] = team_data
    team_data['team'] = team
    team_data['xG'] = team_data['xG'].astype(float)
    team_data['xA'] = team_data['xA'].astype(float)
    team_data['xTotal'] = team_data['xG'] + team_data['xA']
    shortlisted_frames.append(team_data.loc[team_data['xTotal'] > xTotal, shortlist_columns])
    time.sleep(SLEEPING_TIME)

future_matches_df = pd.DataFrame(future_matches_dc)
df = pd.concat(shortlisted_frames).reset_index(drop=True)
print(f'Total players to receive: {df.shape[0]}')

## Show Teams that have future matches against teams with bad defence

In [None]:
def _summarise_fixtures(fixtures):
    """Collapse one team's upcoming fixtures into a single summary row."""
    return pd.Series(
        {
            'teams_against': list(fixtures['teams_against']),
            'home_or_away': list(fixtures['home_or_away']),
            'when': list(fixtures['when']),
            'sum_of_defence_against': sum(fixtures['defence_of_team_against']),
            'sum_of_attack_against': sum(fixtures['attack_of_team_against']),
            'defending_rating': sum(fixtures['defending_rating']),
            'attacking_rating': sum(fixtures['attacking_rating']),
        }
    )


# Rebuild the per-fixture table from the raw dict so this cell can be re-run,
# then reduce it to one row per team, largest sum_of_defence_against first.
future_matches_df = pd.DataFrame(future_matches_dc)
fixtures_per_team = future_matches_df.groupby('team', as_index=False).apply(_summarise_fixtures)
future_matches_df = fixtures_per_team.sort_values(
    by='sum_of_defence_against', ignore_index=True, ascending=False
)

future_matches_df

## Fetch player data

In [None]:
## number of matches taken from each player's page to judge the current form
NUMBER_OF_MATCHES = 5

# Fields copied from every match record; the numeric ones are cast to float.
columns = ['time', 'xG', 'xA', 'goals', 'assists', 'key_passes', 'shots', 'h_team', 'a_team', 'date']
numeric_columns = ['time', 'xG', 'xA', 'goals', 'assists', 'key_passes', 'shots']

# One frame per shortlisted player, concatenated once after the loop
# (DataFrame.append was removed in pandas 2.0).
player_frames = []
for player_id, player_name, team in df[['id', 'player_name', 'team']].itertuples(index=False, name=None):
    print(player_id, player_name, team)

    base_url = f'https://understat.com/player/{player_id}'
    url = base_url
    print(url)

    res = requests.get(url)
    # Stop on an HTTP error rather than failing later on a missing <script>.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')
    scripts = soup.find_all('script')
    # scripts[4] is the tag this notebook reads the player's match list from.
    script_data = scripts[4]
    strings = script_data.string
    start = strings.index("('")+2
    end = strings.index("')")
    json_data = strings[start:end]
    json_data = json_data.encode('utf8').decode('unicode_escape')
    data = json.loads(json_data)

    # Take the first NUMBER_OF_MATCHES records (assumed to be the most recent
    # matches -- TODO confirm the page lists matches newest first).
    recent_matches = data[:NUMBER_OF_MATCHES]
    dc = {c: [match[c] for match in recent_matches] for c in columns}

    player_data = pd.DataFrame(dc)
    player_data['player_id'] = player_id
    player_data['player_name'] = player_name
    player_data['team'] = team
    player_data['NUMBER_OF_MATCHES'] = NUMBER_OF_MATCHES
    player_data = player_data.astype({c: float for c in numeric_columns})
    # The opponent is whichever side is not the player's team. Done column-wise
    # so a player with no match records does not break the assignment.
    player_data['team_played_against'] = player_data['h_team'].where(
        player_data['h_team'] != team, player_data['a_team']
    )

    player_frames.append(player_data)
    time.sleep(SLEEPING_TIME)

# pd.concat raises on an empty list, so fall back to an empty frame.
player_data_all = pd.concat(player_frames) if player_frames else pd.DataFrame()


In [None]:
def _summarise_form(player_matches):
    """Collapse one player's recent matches into a single form row."""
    minutes = player_matches['time']
    match_dates = player_matches['date']
    return pd.Series(
        {
            'total_minutes_played': sum(minutes),
            'avg_minutes_played': np.mean(minutes),
            'xG': sum(player_matches['xG']),
            'xA': sum(player_matches['xA']),
            'key_passes': sum(player_matches['key_passes']),
            'shots': sum(player_matches['shots']),
            'goals': sum(player_matches['goals']),
            'assists': sum(player_matches['assists']),
            'team_played_against': list(player_matches['team_played_against']),
            'date_played': list(match_dates),
            'oldest_match': min(match_dates),
            'latest_match': max(match_dates),
        }
    )


# One row per player, summing the scraped matches.
player_keys = ['player_id', 'player_name', 'team', 'NUMBER_OF_MATCHES']
agg = player_data_all.groupby(player_keys, as_index=False).apply(_summarise_form)

agg['xTotal'] = agg['xG'] + agg['xA']
# Newest match date first in the per-player list.
agg['date_played'] = agg['date_played'].apply(lambda dates: sorted(dates, reverse=True))

# Days elapsed between the player's latest match and today.
agg['latest_match'] = agg['latest_match'].apply(
    lambda played_on: datetime.datetime.strptime(played_on, '%Y-%m-%d')
)
today = datetime.datetime.today()
agg['days_since_last_match'] = agg['latest_match'].apply(lambda played_on: (today - played_on).days)

# Best recent form (xG + xA) first.
agg = agg.sort_values(by='xTotal', ascending=False)
agg.head(20)


## Final Decision Making

In [None]:
# Display the per-team summary of upcoming fixtures again.
future_matches_df

Get the best `PLAYERS_TO_SHOW` (3) players for each of the first `TEAMS_SELECTED` teams (currently 20, i.e. every team) that play against the worst combined defences. Exclude players who have not played in the past `DAYS_NOT_PLAYED` (15) days.

In [None]:
# Players kept per team, number of teams taken from the top of
# future_matches_df, and maximum age (in days) of a player's latest match.
PLAYERS_TO_SHOW = 3
TEAMS_SELECTED = 20
DAYS_NOT_PLAYED = 15

selected_teams = list(future_matches_df['team'].head(TEAMS_SELECTED))

recently_active = agg['days_since_last_match'] < DAYS_NOT_PLAYED
from_selected_team = agg['team'].isin(selected_teams)

# Sort explicitly and take the FIRST rows of each team: with xTotal descending,
# head() yields the best players (tail() returned each team's weakest ones).
selected_players_for_squad = (
    agg.loc[recently_active & from_selected_team]
    .sort_values(by='xTotal', ascending=False)
    .groupby('team')
    .head(PLAYERS_TO_SHOW)
    .drop(['team_played_against', 'date_played'], axis=1)
    .reset_index(drop=True)
)

# Attach each team's fixture outlook; 'team' is the only shared column.
daten = selected_players_for_squad.merge(
    future_matches_df[['team', 'sum_of_defence_against', 'attacking_rating']],
    on='team',
)
daten[['player_name', 'team', 'xTotal', 'xG', 'xA', 'goals', 'assists', 'sum_of_defence_against', 'attacking_rating', 'oldest_match', 'latest_match' ,'avg_minutes_played']].sort_values(by='xTotal', ascending=False)