## Which opponent is the most difficult to score points against? (xG analysis)

In [22]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [23]:
# Global variables
# Gameweek cap: the fixture helpers further down only scan the first `gameweek` gameweek columns
gameweek = 28

## Collect Team data
##### Attacking (for) and defensive (against) stats for each team

In [24]:
# Specify the path to the files
# Each glob call returns the list of CSV paths in one folder: one list for the
# attacking (for) stats and one for the defensive (against) stats
attack = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Attacking\*.csv')
defense = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Defensive\*.csv')

# Pull the gameweek number out of a file name that contains 'GW_<digits>'
def extract_week_number(filename):
    """Return the integer that follows the first ``GW_`` in *filename*.

    Returns ``None`` when *filename* contains no ``GW_<digits>`` pattern.
    """
    found = re.search(r'GW_(\d+)', filename)
    if not found:
        return None
    return int(found.group(1))

# Read one gameweek CSV and stamp every row with the week parsed from its file name
def _read_with_week(path):
    return pd.read_csv(path).assign(Week=extract_week_number(path))


# Stack all attacking files into one table
att_weekly_data = pd.concat([_read_with_week(path) for path in attack], ignore_index=True)

# Stack all defensive files into one table
def_weekly_data = pd.concat([_read_with_week(path) for path in defense], ignore_index=True)

# Drop the first three characters of every defensive team label
# (presumably a 'vs ' prefix in the source files - confirm against the CSVs)
def_weekly_data['Team'] = def_weekly_data['Team'].str.slice(3)

# Columns kept for the analysis
columns_new = [
    'Team', 'Week', 'Playing TimeMP', 'Possession',
    'PerformanceGls', 'PerformanceAst', 'ExpectedxG', 'ExpectedxAG',
    'Per 90 MinutesGls', 'Per 90 MinutesAst', 'Per 90 MinutesxG', 'Per 90 MinutesxAG',
]

# Attacking (for) stats, ordered by gameweek
attacking_data = att_weekly_data[columns_new].sort_values(by='Week')

# Defensive (against) stats, ordered by gameweek
defensive_data = def_weekly_data[columns_new].sort_values(by='Week')

In [25]:
attacking_data

Unnamed: 0,Team,Week,Playing TimeMP,Possession,PerformanceGls,PerformanceAst,ExpectedxG,ExpectedxAG,Per 90 MinutesGls,Per 90 MinutesAst,Per 90 MinutesxG,Per 90 MinutesxAG
399,Wolves,8,1,44.4,1,1,0.8,0.8,1.25,1.00,0.92,0.72
397,Tottenham,8,1,61.6,3,3,1.9,1.6,2.00,1.62,2.08,1.48
396,Southampton,8,1,55.5,2,1,2.1,1.5,0.75,0.50,1.29,0.99
395,Nott'ham Forest,8,1,41.3,1,0,1.5,1.4,1.00,0.50,1.21,0.86
394,Newcastle Utd,8,1,49.5,0,0,2.0,1.7,1.00,0.87,1.55,1.20
...,...,...,...,...,...,...,...,...,...,...,...,...
377,Tottenham,28,2,56.8,2,1,2.5,1.5,1.79,1.41,1.63,1.27
378,West Ham,28,2,47.0,1,1,1.5,0.8,1.07,0.62,1.26,0.88
379,Wolves,28,2,47.8,3,3,1.1,1.0,1.38,1.17,1.09,0.90
370,Leicester City,28,2,45.9,0,0,1.2,0.9,0.86,0.69,0.89,0.70


## Overall team performance
##### Average xG minus average xG against

In [26]:
# Every club present in the attacking data (reused by later cells)
teams = attacking_data['Team'].unique()

# One [team, mean xG for, mean xG against] entry per club, each mean rounded to 3 d.p.
averages = [
    [
        club,
        attacking_data.loc[attacking_data['Team'] == club, 'ExpectedxG'].mean().round(3),
        defensive_data.loc[defensive_data['Team'] == club, 'ExpectedxG'].mean().round(3),
    ]
    for club in teams
]

# Alphabetical by team name
data = sorted(averages)

# Tabulate and add xG minus xGA
averages_data = pd.DataFrame(data, columns=['Team', 'xG', 'xGA'])
averages_data['Difference'] = averages_data['xG'].sub(averages_data['xGA'])

# Display with the largest difference first (descending); the sorted copy is not stored
averages_data.sort_values(by='Difference', ascending=False)

Unnamed: 0,Team,xG,xGA,Difference
11,Liverpool,2.438,0.952,1.486
0,Arsenal,1.51,0.833,0.677
2,Bournemouth,1.943,1.419,0.524
6,Crystal Palace,1.714,1.19,0.524
14,Newcastle Utd,1.71,1.195,0.515
5,Chelsea,1.99,1.51,0.48
12,Manchester City,1.79,1.505,0.285
8,Fulham,1.381,1.186,0.195
4,Brighton,1.562,1.414,0.148
15,Nott'ham Forest,1.352,1.267,0.085


## Separate home and away data

- home_attack: All gameweeks a Team plays at home. The metrics are all FOR the team. How many performance goals and assists have been scored at home. How many expected goals and assists they have at home.
- away_attack: All gameweeks a Team plays away. The metrics are all FOR the team. How many performance goals and assists have been scored away. How many expected goals and assists they have away.
- home_defense: All gameweeks a Team plays at home. The metrics are all AGAINST the team. How many performance goals and assists have been conceded at home. How many expected goals and assists they conceded at home.
- away_defense: All gameweeks a Team plays away. The metrics are all AGAINST the team. How many performance goals and assists have been conceded away. How many expected goals and assists they conceded away.

In [27]:
# Collect fixture list
# The code below expects a 'Team' column first, followed by one column per gameweek
# whose cells carry a '(H)' or '(A)' marker; the first two characters of each
# gameweek column label are stripped later to obtain the week number
fixtures = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Schedule\Fixtures_alt_names.csv')

# Create function to collect homedata
def team_home_data(team, fixtures, gameweek):
    """Collect the home fixtures of *team* from the fixture table.

    Parameters:
        team: value to match against the 'Team' column of *fixtures*.
        fixtures: DataFrame with a 'Team' column; columns 1..gameweek are
            scanned as gameweek columns.
        gameweek: number of gameweek columns (after the first column) to scan.

    Returns:
        A list of ``[column_label, cell_text]`` pairs, one for every scanned
        cell that contains '(H)'. Empty when the team is not in the table.
    """
    home_data = []

    # Only the first `gameweek` columns after the leading column are considered
    gameweek_columns = fixtures.columns[1:gameweek + 1]

    # Look only at the rows belonging to the requested team
    for _, row in fixtures[fixtures['Team'] == team].iterrows():
        for col in gameweek_columns:
            cell = row[col]
            # Non-string cells (e.g. NaN for a blank gameweek) cannot be a fixture;
            # skipping them avoids a TypeError from the substring test
            if isinstance(cell, str) and '(H)' in cell:
                home_data.append([col, cell])

    return home_data

# Create function to collect awaydata
def team_away_data(team, fixtures, gameweek):
    """Collect the away fixtures of *team* from the fixture table.

    Parameters:
        team: value to match against the 'Team' column of *fixtures*.
        fixtures: DataFrame with a 'Team' column; columns 1..gameweek are
            scanned as gameweek columns.
        gameweek: number of gameweek columns (after the first column) to scan.

    Returns:
        A list of ``[column_label, cell_text]`` pairs, one for every scanned
        cell that contains '(A)'. Empty when the team is not in the table.
    """
    away_data = []

    # Only the first `gameweek` columns after the leading column are considered
    gameweek_columns = fixtures.columns[1:gameweek + 1]

    # Look only at the rows belonging to the requested team
    for _, row in fixtures[fixtures['Team'] == team].iterrows():
        for col in gameweek_columns:
            cell = row[col]
            # Non-string cells (e.g. NaN for a blank gameweek) cannot be a fixture;
            # skipping them avoids a TypeError from the substring test
            if isinstance(cell, str) and '(A)' in cell:
                away_data.append([col, cell])

    # Return the collected away data
    return away_data

# Turn [gameweek label, team, opponent] rows into a table with an integer 'Week'
def _fixture_frame(rows):
    frame = pd.DataFrame(rows, columns=['Week', 'Team', 'Opponent'])
    # Gameweek labels lose their first two characters before the integer conversion
    frame['Week'] = frame['Week'].str.slice(2).astype(int)
    return frame


# Home fixtures of every club
home_games = []
for team in teams:
    home_games.extend(
        [week_label, team, opponent]
        for week_label, opponent in team_home_data(team, fixtures, gameweek)
    )
home = _fixture_frame(home_games)

# Away fixtures of every club
away_games = []
for team in teams:
    away_games.extend(
        [week_label, team, opponent]
        for week_label, opponent in team_away_data(team, fixtures, gameweek)
    )
away = _fixture_frame(away_games)

# Stat columns carried into the home/away tables
cols = [
    'Team', 'Week', 'Possession',
    'PerformanceGls', 'PerformanceAst', 'ExpectedxG', 'ExpectedxAG',
    'Per 90 MinutesGls', 'Per 90 MinutesAst', 'Per 90 MinutesxG', 'Per 90 MinutesxAG',
]

# Trimmed attacking (for) and defensive (against) tables
attacking = attacking_data[cols]
defensive = defensive_data[cols]

# Attach the stats to each fixture by gameweek and team
merge_keys = ['Week', 'Team']
home_attack = pd.merge(home, attacking, on=merge_keys)
home_defense = pd.merge(home, defensive, on=merge_keys)
away_attack = pd.merge(away, attacking, on=merge_keys)
away_defense = pd.merge(away, defensive, on=merge_keys)

In [28]:
home_attack

Unnamed: 0,Week,Team,Opponent,Possession,PerformanceGls,PerformanceAst,ExpectedxG,ExpectedxAG,Per 90 MinutesGls,Per 90 MinutesAst,Per 90 MinutesxG,Per 90 MinutesxAG
0,8,Wolves,MCI (H),44.4,1,1,0.8,0.8,1.25,1.00,0.92,0.72
1,10,Wolves,CRY (H),46.1,2,2,1.5,1.4,1.40,1.10,1.02,0.85
2,11,Wolves,SOU (H),44.5,2,2,1.3,1.2,1.45,1.18,1.04,0.88
3,13,Wolves,BOU (H),45.5,2,2,0.5,0.3,1.69,1.46,1.02,0.87
4,16,Wolves,IPS (H),46.7,1,1,1.2,1.1,1.50,1.31,1.05,0.89
...,...,...,...,...,...,...,...,...,...,...,...,...
205,18,Arsenal,IPS (H),54.6,1,1,1.4,0.0,1.94,1.59,1.72,1.31
206,21,Arsenal,TOT (H),55.0,1,1,1.4,1.1,1.86,1.43,1.68,1.27
207,22,Arsenal,AVL (H),55.5,2,2,1.3,0.9,1.86,1.45,1.67,1.25
208,24,Arsenal,MCI (H),54.8,5,4,1.0,0.9,1.96,1.50,1.61,1.22


# Option to filter by later gameweek

In [29]:
#home_attack = home_attack[home_attack['Week'] > 10]
#home_defense = home_defense[home_defense['Week'] > 10]
#away_attack = away_attack[away_attack['Week'] > 10]
#away_defense = away_defense[away_defense['Week'] > 10]

## Calculate best performing teams

##### Average all home/away and attack(for)/defense(against) stats
This is by combining 1) actual goals, 2) expected goals and 3) expected assists.

In [30]:
# Per-team average of a single column, rounded to 3 decimal places
def process_group(data, column):
    """Return a DataFrame with one row per 'Team' and the mean of *column*.

    The mean is rounded to 3 decimal places and 'Team' is returned as a
    regular column rather than as the index.
    """
    per_team = data.groupby('Team')[[column]]
    team_means = per_team.mean()
    return team_means.round(3).reset_index()

# Average xG per team for each venue/side combination
(
    best_home_attack,
    best_home_defense,
    best_away_attack,
    best_away_defense,
) = (
    process_group(frame, 'ExpectedxG')
    for frame in (home_attack, home_defense, away_attack, away_defense)
)

In [32]:
best_away_attack

Unnamed: 0,Team,ExpectedxG
0,Arsenal,1.575
1,Aston Villa,0.936
2,Bournemouth,1.918
3,Brentford,1.72
4,Brighton,1.727
5,Chelsea,2.227
6,Crystal Palace,2.01
7,Everton,0.918
8,Fulham,1.42
9,Ipswich Town,0.91


## Standardizing

In [11]:
def standardize_data(data):
    """Min-max scale every column of *data* except 'Team'.

    Despite the name, this uses ``MinMaxScaler`` (default settings), i.e.
    rescaling per column rather than z-score standardization.

    The work is done on a copy, so the DataFrame passed in is not modified.

    Parameters:
        data: DataFrame with a 'Team' column; every other column must be numeric.

    Returns:
        A new DataFrame with the scaled columns followed by 'Team' as the
        last column.
    """
    # Work on a copy: pop() and the column assignment below would otherwise
    # alter the caller's DataFrame
    scaled = data.copy()

    # Step 1: Set the 'Team' column aside (the scaler only accepts numbers)
    team_column = scaled.pop("Team")

    # Step 2: Scale all remaining columns
    scaler = MinMaxScaler()
    numerical_columns = scaled.columns
    scaled[numerical_columns] = scaler.fit_transform(scaled[numerical_columns])

    # Step 3: Re-add the 'Team' column (it ends up as the last column)
    scaled["Team"] = team_column

    return scaled

## Ranking

In [12]:
def rank_data_attack(data):
    """Rank teams by scaled xG and attach a difficulty rating.

    Teams are split into 5 quantile bins on the scaled xG. The higher a
    team's scaled xG, the higher its difficulty; the two lowest bins both
    get difficulty 2, so the ratings used are 2, 3, 4 and 5.

    Parameters:
        data: DataFrame with 'Team' and 'ExpectedxG' columns. It is not modified.

    Returns:
        DataFrame with columns 'Team', 'Difficulty' (ordered categorical),
        'xG_std' (scaled xG rounded to 3 d.p.) and 'xG' (original value),
        sorted by 'xG_std' from highest to lowest.
    """
    # Keep the raw xG so it can be reported next to the scaled value
    original_xG = data['ExpectedxG'].copy()

    # Scale a copy so the caller's DataFrame keeps its original values
    data2 = standardize_data(data.copy())

    # Scaled xG, rounded for display and binning
    data2['xG_std'] = data2['ExpectedxG'].round(3)

    # Reintroduce the original xG (aligned on the index)
    data2['xG'] = original_xG

    # Highest scaled xG first
    data2 = data2.sort_values(by='xG_std', ascending=False)

    # Split into 5 quantile bins; labels=False yields bin numbers 0 (lowest) to 4 (highest)
    bin_codes = pd.qcut(data2["xG_std"], q=5, labels=False)

    # Map bins to difficulty. The lowest bin is folded into 2 to match the original
    # FDR better; mapping the bin numbers directly avoids calling replace() on a
    # categorical column, which pandas has deprecated
    difficulty_by_bin = {0: 2, 1: 2, 2: 3, 3: 4, 4: 5}
    data2["Difficulty"] = bin_codes.map(difficulty_by_bin).astype(
        pd.CategoricalDtype(categories=[2, 3, 4, 5], ordered=True)
    )

    # Create the final dataset as an independent frame so callers can add columns to it
    final = data2[['Team', 'Difficulty', 'xG_std', 'xG']].copy()

    return final


def rank_data_defense(data):
    """Rank teams by scaled xG and attach a difficulty rating (lower xG = harder).

    Teams are split into 5 quantile bins on the scaled xG. The lower a
    team's scaled xG, the higher its difficulty; the two highest bins both
    get difficulty 2, so the ratings used are 5, 4, 3 and 2.

    Parameters:
        data: DataFrame with 'Team' and 'ExpectedxG' columns. It is not modified.

    Returns:
        DataFrame with columns 'Team', 'Difficulty' (ordered categorical),
        'xG_std' (scaled xG rounded to 3 d.p.) and 'xG' (original value),
        sorted by 'xG_std' from highest to lowest.
    """
    # Keep the raw xG so it can be reported next to the scaled value
    original_xG = data['ExpectedxG'].copy()

    # Scale a copy so the caller's DataFrame keeps its original values
    data2 = standardize_data(data.copy())

    # Scaled xG, rounded for display and binning
    data2['xG_std'] = data2['ExpectedxG'].round(3)

    # Reintroduce the original xG (aligned on the index)
    data2['xG'] = original_xG

    # Highest scaled xG first
    data2 = data2.sort_values(by='xG_std', ascending=False)

    # Split into 5 quantile bins; labels=False yields bin numbers 0 (lowest) to 4 (highest)
    bin_codes = pd.qcut(data2["xG_std"], q=5, labels=False)

    # Map bins to difficulty, reversed relative to the attacking ranking. The bin that
    # would be rated 1 is folded into 2 to match the original FDR better; mapping the
    # bin numbers directly avoids calling replace() on a categorical column, which
    # pandas has deprecated
    difficulty_by_bin = {0: 5, 1: 4, 2: 3, 3: 2, 4: 2}
    # Categories are listed in bin order (lowest xG first)
    data2["Difficulty"] = bin_codes.map(difficulty_by_bin).astype(
        pd.CategoricalDtype(categories=[5, 4, 3, 2], ordered=True)
    )

    # Create the final dataset as an independent frame so callers can add columns to it
    final = data2[['Team', 'Difficulty', 'xG_std', 'xG']].copy()

    return final

## Home attackers

In [13]:
# Rank clubs on their average home xG. Note: this rebinds `home_attack` from the
# per-fixture table to the ranking table
home_attack = pd.DataFrame(rank_data_attack(best_home_attack))
# Display only: the sorted copy is not assigned back
home_attack.sort_values(by = 'xG_std', ascending = False)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
12,Manchester City,5,1.0,2.244
11,Liverpool,5,0.913,2.136
1,Aston Villa,5,0.9,2.12
14,Newcastle Utd,4,0.78,1.97
2,Bournemouth,4,0.78,1.97
5,Chelsea,4,0.587,1.73
17,Tottenham,4,0.502,1.625
13,Manchester Utd,4,0.456,1.567
3,Brentford,3,0.387,1.482
15,Nott'ham Forest,3,0.38,1.473


In terms of selection, this means: a defender who plays away (A) to one of these clubs will have the difficulty shown for that club (for example, 4).

## Away attackers

In [14]:
# Rank clubs on their average away xG. Note: this rebinds `away_attack` from the
# per-fixture table to the ranking table
away_attack = pd.DataFrame(rank_data_attack(best_away_attack))
# Display only: the sorted copy is not assigned back
away_attack.sort_values(by = 'xG_std', ascending = False)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
11,Liverpool,5,1.0,2.77
5,Chelsea,5,0.733,2.227
6,Crystal Palace,5,0.626,2.01
2,Bournemouth,5,0.581,1.918
4,Brighton,4,0.487,1.727
3,Brentford,4,0.484,1.72
0,Arsenal,4,0.412,1.575
18,West Ham,4,0.381,1.51
14,Newcastle Utd,3,0.362,1.473
12,Manchester City,3,0.351,1.45


In terms of selection, this means: a defender who plays at home (H) to one of these clubs will have the difficulty shown for that club (for example, 4).

## Home Defence

In [15]:
# Rank clubs on the average xG recorded in their home defensive (against) data
home_def = pd.DataFrame(rank_data_defense(best_home_defense))
# Display only, lowest scaled xG first: the sorted copy is not assigned back
home_def.sort_values(by = 'xG_std', ascending = True)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
0,Arsenal,5,0.0,0.633
11,Liverpool,5,0.072,0.818
15,Nott'ham Forest,5,0.115,0.927
7,Everton,5,0.128,0.96
6,Crystal Palace,4,0.183,1.1
5,Chelsea,4,0.21,1.17
8,Fulham,4,0.239,1.245
4,Brighton,4,0.257,1.29
13,Manchester Utd,3,0.3,1.4
19,Wolves,3,0.303,1.409


In terms of selection, this means: an attacker who plays away (A) to one of these clubs will have the difficulty shown for that club (for example, 4).

## Away defence

In [16]:
# Rank clubs on the average xG recorded in their away defensive (against) data
away_def = pd.DataFrame(rank_data_defense(best_away_defense))
# Display only, lowest scaled xG first: the sorted copy is not assigned back
away_def.sort_values(by = 'xG_std', ascending = True)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
14,Newcastle Utd,5,0.0,0.982
0,Arsenal,5,0.001,0.983
11,Liverpool,5,0.103,1.1
8,Fulham,5,0.12,1.12
7,Everton,4,0.261,1.282
6,Crystal Palace,4,0.268,1.29
2,Bournemouth,4,0.325,1.355
3,Brentford,4,0.329,1.36
12,Manchester City,3,0.429,1.475
4,Brighton,3,0.475,1.527


In terms of selection, this means: an attacker who plays at home (H) to one of these clubs will have the difficulty shown for that club (for example, 4).

# Convert to useable dataframe

In [17]:
# Full team name -> "<3-letter code> (A)", the label used for an away fixture
team_to_code_A = {
    club: f"{code} (A)"
    for club, code in (
        ("Arsenal", "ARS"),
        ("Aston Villa", "AVL"),
        ("Brentford", "BRE"),
        ("Brighton", "BHA"),  # Brighton & Hove Albion
        ("Bournemouth", "BOU"),
        ("Chelsea", "CHE"),
        ("Crystal Palace", "CRY"),
        ("Everton", "EVE"),
        ("Fulham", "FUL"),
        ("Ipswich Town", "IPS"),
        ("Leicester City", "LEI"),
        ("Liverpool", "LIV"),
        ("Manchester City", "MCI"),
        ("Manchester Utd", "MUN"),
        ("Newcastle Utd", "NEW"),
        ("Nott'ham Forest", "NFO"),
        ("Southampton", "SOU"),
        ("Tottenham", "TOT"),
        ("West Ham", "WHU"),
        ("Wolves", "WOL"),
    )
}

# Full team name -> "<3-letter code> (H)", the label used for a home fixture
team_to_code_H = {
    club: f"{code} (H)"
    for club, code in (
        ("Arsenal", "ARS"),
        ("Aston Villa", "AVL"),
        ("Brentford", "BRE"),
        ("Brighton", "BHA"),  # Brighton & Hove Albion
        ("Bournemouth", "BOU"),
        ("Chelsea", "CHE"),
        ("Crystal Palace", "CRY"),
        ("Everton", "EVE"),
        ("Fulham", "FUL"),
        ("Ipswich Town", "IPS"),
        ("Leicester City", "LEI"),
        ("Liverpool", "LIV"),
        ("Manchester City", "MCI"),
        ("Manchester Utd", "MUN"),
        ("Newcastle Utd", "NEW"),
        ("Nott'ham Forest", "NFO"),
        ("Southampton", "SOU"),
        ("Tottenham", "TOT"),
        ("West Ham", "WHU"),
        ("Wolves", "WOL"),
    )
}

In [18]:
# Defensive players away: relabel the home-attack ranking with "<code> (A)" fixture labels
home_attack['Team'] = home_attack['Team'].replace(team_to_code_A)
goalkeepers_A = home_attack.assign(Position='GK')
home_attack['Position'] = 'DEF'
defenders_A = home_attack.copy()

# Defensive players at home: relabel the away-attack ranking with "<code> (H)" fixture labels
away_attack['Team'] = away_attack['Team'].replace(team_to_code_H)
goalkeepers_H = away_attack.assign(Position='GK')
away_attack['Position'] = 'DEF'
defenders_H = away_attack.copy()
defenders_H

Unnamed: 0,Team,Difficulty,xG_std,xG,Position
11,LIV (H),5,1.0,2.77,DEF
5,CHE (H),5,0.733,2.227,DEF
6,CRY (H),5,0.626,2.01,DEF
2,BOU (H),5,0.581,1.918,DEF
4,BHA (H),4,0.487,1.727,DEF
3,BRE (H),4,0.484,1.72,DEF
0,ARS (H),4,0.412,1.575,DEF
18,WHU (H),4,0.381,1.51,DEF
14,NEW (H),3,0.362,1.473,DEF
12,MCI (H),3,0.351,1.45,DEF


In [19]:
# Attacking players away: relabel the home-defence ranking with "<code> (A)" fixture labels
home_def['Team'] = home_def['Team'].replace(team_to_code_A)
midfielders_A = home_def.assign(Position='MID')
home_def['Position'] = 'FWD'
forwards_A = home_def.copy()

# Attacking players at home: relabel the away-defence ranking with "<code> (H)" fixture labels
away_def['Team'] = away_def['Team'].replace(team_to_code_H)
midfielders_H = away_def.assign(Position='MID')
away_def['Position'] = 'FWD'
forwards_H = away_def.copy()
midfielders_H

Unnamed: 0,Team,Difficulty,xG_std,xG,Position
9,IPS (H),2,1.0,2.13,MID
16,SOU (H),2,0.887,2.0,MID
17,TOT (H),2,0.771,1.867,MID
5,CHE (H),2,0.728,1.818,MID
13,MUN (H),2,0.713,1.8,MID
10,LEI (H),2,0.697,1.782,MID
18,WHU (H),2,0.634,1.71,MID
19,WOL (H),2,0.617,1.69,MID
15,NFO (H),3,0.573,1.64,MID
1,AVL (H),3,0.49,1.545,MID


In [20]:
# Stack the eight position/venue tables (original row labels are kept) and
# expose the fixture label column as 'Opponent'
position_tables = [
    goalkeepers_A, goalkeepers_H,
    defenders_A, defenders_H,
    midfielders_A, midfielders_H,
    forwards_A, forwards_H,
]
FD_xG = pd.concat(position_tables).rename(columns={'Team': 'Opponent'})