In [None]:
import requests
import json
import pandas as pd
import time
import datetime
from bs4 import BeautifulSoup

In [None]:
# Download the Understat EPL league page and collect its <script> tags.
# Later cells index into `scripts` and decode the quoted, escaped JSON text
# that those tags contain.
base_url = 'https://understat.com/league/EPL/'
url = base_url

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# IndexError when a later cell indexes into `scripts`.
res = requests.get(url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'html.parser')
scripts = soup.find_all('script')


In [None]:
# Decode the player table embedded in the fourth <script> tag of the page.
script_text = scripts[3].string

# The payload is the text between the first "('" and the first "')".
payload_start = script_text.index("('") + 2
payload_end = script_text.index("')")
escaped_payload = script_text[payload_start:payload_end]

# Resolve the backslash escape sequences so the payload parses as JSON.
data = json.loads(escaped_payload.encode('utf8').decode('unicode_escape'))

# One column per key of the first record, one row per record.
players = pd.DataFrame(
    {field: [record[field] for record in data] for field in data[0].keys()}
)
players.head(3)

## Best Attacking Players Overall

Ranked by attacking rating, defined as `1.2 * xG + xA`.

In [None]:
# Cast the columns used in arithmetic/sorting to numeric dtypes in a single
# vectorised astype() call rather than a Python-level lambda per value.
players = players.astype(
    {'xG': float, 'xA': float, 'goals': int, 'assists': int, 'shots': int}
)

# Attacking rating = 1.2 * xG + xA, computed column-wise (no row-wise apply).
players['attacking_rating'] = players['xG'] * 1.2 + players['xA']

# Show the top 25 players by attacking rating with the most relevant columns.
columns = ['id', 'player_name', 'position', 'team_title', 'games', 'time', 'attacking_rating', 'xG', 'xA', 'goals', 'assists']
players[columns].sort_values(by='attacking_rating', ascending=False, ignore_index=True).head(25)

## Choose the most suitable attacking players by looking at the worst defending teams

Get the 5 worst defending teams over the past X games and find their opponents. Show the best players of the opposing team. "Best players" are those with the best average rating over the past 6 games (a proxy for their current form).


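Penalise a team for conceding at home: every expected goal conceded (xGA) at home is multiplied by 1.2. **Note:** the code below currently applies the 1.2 weight when `h_a == 'a'` (away matches); confirm which venue is intended.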

In [None]:
# Decode the per-team match histories embedded in the third <script> tag.
script_text = scripts[2].string
payload_start = script_text.index("('") + 2
payload_end = script_text.index("')")
escaped_payload = script_text[payload_start:payload_end]

# Resolve the backslash escape sequences so the payload parses as JSON.
data = json.loads(escaped_payload.encode('utf8').decode('unicode_escape'))
columns = ['h_a', 'xG', 'xGA', 'npxG', 'npxGA', 'scored','missed', 'xpts', 'result']

# Team titles in the order the payload lists them.
team_list = [team['title'] for team in data.values()]

# Flatten to one (team title, match record) pair per entry in each history.
team_matches = [
    (team['title'], match)
    for team in data.values()
    for match in team['history']
]

# Column-oriented layout: the team title first, then each selected field.
dc = {'team': [title for title, _ in team_matches]}
dc.update({c: [match[c] for _, match in team_matches] for c in columns})

matches_df = pd.DataFrame(dc)


In [None]:
# Number of most recent matches per team included in the aggregation.
NUMBER_OF_GAMES = 8
# Multiplier applied to xGA in the weighted defending rating.
XGA_WEIGHT = 1.2
print(f'Looking back at {NUMBER_OF_GAMES} games.')

is_home = matches_df['h_a'].eq('h')
is_away = matches_df['h_a'].eq('a')

# NOTE(review): the markdown above this cell says xGA conceded at HOME is
# weighted by 1.2, but this code has always weighted AWAY matches
# (h_a == 'a'). The existing behaviour is kept; confirm which is intended.
matches_df['defending_rating'] = matches_df['xGA'].mask(is_away, matches_df['xGA'] * XGA_WEIGHT)

# Per-match indicator and xGA columns split by venue, so that summing them
# yields home/away match counts and home/away xGA totals.
matches_df['matches_home'] = is_home.astype(int)
matches_df['matches_away'] = is_away.astype(int)
matches_df['defence_home'] = matches_df['xGA'].where(is_home, 0)
matches_df['defence_away'] = matches_df['xGA'].where(is_away, 0)

summed_columns = [
    'xG', 'xGA', 'xpts', 'defending_rating',
    'defence_home', 'defence_away', 'matches_home', 'matches_away',
]

# tail() keeps the last NUMBER_OF_GAMES rows per team in their existing order
# (presumably chronological -- verify against the Understat payload).
# A column-selected groupby().sum() replaces groupby().apply() with Python
# sum(): it is vectorised and keeps the match counts as integers.
defence_df = (
    matches_df
    .groupby('team').tail(NUMBER_OF_GAMES)
    .groupby('team', as_index=False)[summed_columns]
    .sum()
    .sort_values('defending_rating', ascending=False, ignore_index=True)
)

# defence_df itself is already ordered by defending_rating (highest first);
# evaluate it in its own cell to see that ranking. Only the last expression
# of a cell is rendered, so this cell shows the away-xGA ranking.
defence_df.sort_values(by='defence_away', ascending=False, ignore_index=True)

Find the best defending teams (lowest xGA) over the same look-back window of `NUMBER_OF_GAMES` matches.

In [None]:
# Lowest xGA first; the row labels of defence_df are kept unchanged.
defence_df.sort_values('xGA')

## Get player data from team page

To restrict the search space, we only consider players with more than 4 combined expected goals and expected assists (xG + xA) over the whole season.

In [None]:
# Scrape every team page for the season and shortlist the players whose
# season xG + xA exceeds MIN_SEASON_XTOTAL.
SEASON = 2021
MIN_SEASON_XTOTAL = 4
REQUEST_TIMEOUT_SECONDS = 30
REQUEST_PAUSE_SECONDS = 2  # pause between requests to go easy on the site

team_rosters = {}
teams = ['Aston Villa', 'Everton', 'Southampton', 'Leicester', 'Crystal Palace', 'Norwich', 'Chelsea', 'West Ham', 'Tottenham', 'Arsenal', 'Newcastle United', 'Liverpool', 'Manchester City', 'Manchester United', 'Watford', 'Burnley', 'Brighton', 'Wolverhampton Wanderers', 'Brentford', 'Leeds']
teams.sort()

shortlist_columns = ['id', 'player_name', 'team', 'xG', 'xA', 'xTotal', 'goals', 'assists']
shortlisted_frames = []

for team in teams:
    base_url = f'https://understat.com/team/{team}/{SEASON}'
    url = base_url
    print(url)

    # Fail fast on HTTP errors instead of parsing an error page.
    res = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    res.raise_for_status()

    # Use loop-local names so the league-level `scripts` from the first cells
    # is not overwritten and those cells stay re-runnable.
    team_scripts = BeautifulSoup(res.content, 'html.parser').find_all('script')
    script_text = team_scripts[3].string
    payload_start = script_text.index("('") + 2
    payload_end = script_text.index("')")
    escaped_payload = script_text[payload_start:payload_end]
    roster = json.loads(escaped_payload.encode('utf8').decode('unicode_escape'))

    # One column per key of the first record, one row per player.
    team_data = pd.DataFrame(
        {field: [record[field] for record in roster] for field in roster[0].keys()}
    )
    team_data['team'] = team
    team_data['xG'] = team_data['xG'].astype(float)
    team_data['xA'] = team_data['xA'].astype(float)
    team_data['xTotal'] = team_data['xG'] + team_data['xA']
    team_rosters[team] = team_data

    shortlisted_frames.append(
        team_data.loc[team_data['xTotal'] > MIN_SEASON_XTOTAL, shortlist_columns]
    )
    time.sleep(REQUEST_PAUSE_SECONDS)

# DataFrame.append was removed in pandas 2.0; collect the per-team frames and
# concatenate once (this also avoids re-copying the frame on every iteration).
df = pd.concat(shortlisted_frames, ignore_index=True)

In [None]:
# Display the shortlisted players (those with season xTotal = xG + xA above 4).
df

## Fetch player data

In [None]:
# For every shortlisted player, download their player page and keep the first
# NUMBER_OF_MATCHES entries of their match list.
# NOTE(review): this assumes the list is ordered newest-first, so the slice is
# the most recent matches -- confirm against the Understat payload.
NUMBER_OF_MATCHES = 5
REQUEST_TIMEOUT_SECONDS = 30
REQUEST_PAUSE_SECONDS = 2  # pause between requests to go easy on the site

match_columns = ['xG', 'xA', 'goals', 'assists', 'key_passes', 'shots', 'h_team', 'a_team', 'date']
numeric_columns = ['xG', 'xA', 'goals', 'assists', 'key_passes', 'shots']

player_frames = []
# Iterate over the values directly; df.at[i, ...] with a positional counter
# only works when df has a clean 0..n-1 index.
for player_id, player_name in zip(df['id'], df['player_name']):
    print(player_id, player_name)

    base_url = f'https://understat.com/player/{player_id}'
    url = base_url
    print(url)

    # Fail fast on HTTP errors instead of parsing an error page.
    res = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    res.raise_for_status()

    # Use loop-local names so the league-level `scripts` from the first cells
    # is not overwritten and those cells stay re-runnable.
    player_scripts = BeautifulSoup(res.content, 'html.parser').find_all('script')
    script_text = player_scripts[4].string
    payload_start = script_text.index("('") + 2
    payload_end = script_text.index("')")
    escaped_payload = script_text[payload_start:payload_end]
    player_matches = json.loads(escaped_payload.encode('utf8').decode('unicode_escape'))

    recent_matches = player_matches[:NUMBER_OF_MATCHES]
    player_data = pd.DataFrame(
        {c: [match[c] for match in recent_matches] for c in match_columns}
    )
    player_data['player_id'] = player_id
    player_data['player_name'] = player_name
    player_data = player_data.astype({c: float for c in numeric_columns})

    player_frames.append(player_data)
    time.sleep(REQUEST_PAUSE_SECONDS)

# DataFrame.append was removed in pandas 2.0; concatenate once at the end.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# players were shortlisted (matching what the append-based loop produced).
player_data_all = (
    pd.concat(player_frames, ignore_index=True) if player_frames else pd.DataFrame()
)

In [None]:
# Total each player's recent-match numbers and rank by combined xG + xA.
summed_columns = ['xG', 'xA', 'key_passes', 'shots', 'goals', 'assists']

# A column-selected groupby().sum() replaces groupby().apply() with Python
# sum(): it is vectorised and does not pass the grouping columns to a
# user function (which recent pandas versions warn about).
agg = (
    player_data_all
    .groupby(['player_id', 'player_name'], as_index=False)[summed_columns]
    .sum()
)
agg['xTotal'] = agg['xG'] + agg['xA']
agg.sort_values(by='xTotal', ascending=False).head(30)
