## Which opponent is the most difficult to score points against? (xG analysis)

In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
# Global variables
gameweek = 16

## Collect Team data
##### Attacking (for) and defensive (against) stats for each team

In [3]:
# Specify the path to the files
attack = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Attacking\*.csv')
defense = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Defensive\*.csv')

# Define a function to extract the week number from the filename
def extract_week_number(filename):
    match = re.search(r'GW_(\d+)', filename)
    return int(match.group(1)) if match else None

# Read each attacking file and add the 'Week' column
att_weekly_data = pd.concat(
    [pd.read_csv(file).assign(Week=extract_week_number(file)) for file in attack],
    ignore_index=True
)

# Read each defensive file and add the 'Week' column
def_weekly_data = pd.concat(
    [pd.read_csv(file).assign(Week=extract_week_number(file)) for file in defense],
    ignore_index=True
)
# Remove 'VS' team
def_weekly_data['Team'] = def_weekly_data['Team'].str[3:]

# Choose columns data 
columns_new = ['Team','Week', 'Playing TimeMP', 'Possession','PerformanceGls','PerformanceAst','ExpectedxG','ExpectedxAG',
               'Per 90 MinutesGls','Per 90 MinutesAst','Per 90 MinutesxG','Per 90 MinutesxAG']

# Attacking data
attacking_data = pd.DataFrame(att_weekly_data[columns_new]).sort_values(by = 'Week')

# # Defensive data
defensive_data = pd.DataFrame(def_weekly_data[columns_new]).sort_values(by = 'Week')

## Overall team performance
##### Average xG minus average xG against

In [4]:
# Blank list to store data 
averages = []

# List of unique teams 
teams = attacking_data['Team'].unique()

# Loop through each team 
for team in teams:
    # Filter data for the current team
    team_attack = attacking_data[attacking_data['Team'] == team]
    team_defense = defensive_data[defensive_data['Team'] == team]
    # Calculate the average xG attacking and xGA defensive for the team
    xG = team_attack['ExpectedxG'].mean().round(3)
    xGA = team_defense['ExpectedxG'].mean().round(3)
    # Append team and it average xG and xGa to the list
    averages.append([team, xG, xGA])

# Sort the data a-z
data = sorted(averages)

# Create a pd dataframe with column names 
averages_data = pd.DataFrame(data, columns = ['Team', 'xG', 'xGA'])

# Create a differences column
averages_data['Difference'] = averages_data['xG'] - averages_data['xGA']

# Sort in ascending order
averages_data.sort_values(by = 'Difference', ascending= False)

Unnamed: 0,Team,xG,xGA,Difference
2,Bournemouth,2.256,1.233,1.023
11,Liverpool,1.867,1.067,0.8
0,Arsenal,1.556,0.778,0.778
14,Newcastle Utd,1.656,1.144,0.512
5,Chelsea,2.233,1.722,0.511
1,Aston Villa,1.756,1.389,0.367
6,Crystal Palace,1.533,1.3,0.233
15,Nott'ham Forest,1.289,1.1,0.189
13,Manchester Utd,1.344,1.267,0.077
12,Manchester City,1.767,1.733,0.034


## Separate home and away data

- home_attack: All gameweeks a Team plays at home. The metrics are all FOR the team. How many performance goals and assists have been scored at home. How many expected goals and assists they have at home.
- away_attack. All gameweeks a Team plays away. The metrics are all FOR the team. How many performance goals and assists have been scored away. How many expected goals and assists they have away. 
- home_defense: All gameweeks a Team plays at home. The metrics are all AGAINST the team. How many performance goals and assists have been conceded at home. How many expected goals and assists they conceded at home.
- away_defense: All gameweeks a Team plays away. The metrics are all all AGAINST the team. How many performance goals and assists have been conceded away. How many expected goals and assists they conceded away.

In [5]:
# Collect fixture list
fixtures = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Schedule\Fixtures_alt_names.csv')

# Create function to collect homedata
def team_home_data(team, fixtures, gameweek):
    # Create a list to store the results
    home_data = []

    # Iterate over each row of the fixtures DataFrame
    for index, row in fixtures.iterrows():
        # Check if the row's team matches the input team
        if row['Team'] == team:
            # Loop through the columns corresponding to gameweeks
            for col in fixtures.columns[1:gameweek + 1]:
                if '(H)' in row[col]:  # Check if it's a home game and add GW and opponent
                    home_data.append([col, row[col]])

    # Return the collected home data
    return home_data

# Create function to collect awaydata
def team_away_data(team, fixtures, gameweek):
    # Create a list to store the results
    away_data = []

    # Iterate over each row of the fixtures DataFrame
    for index, row in fixtures.iterrows():
        # Check if the row's team matches the input team
        if row['Team'] == team:
            # Loop through the columns corresponding to gameweeks
            for col in fixtures.columns[1:gameweek + 1]:
                if '(A)' in row[col]:  # Check if it's an away game and add GW and oppponent
                    away_data.append([col, row[col]])

    # Return the collected home data
    return away_data

# Home data
home_games = []

for team in teams:
    data = team_home_data(team, fixtures, gameweek)  # Fetch home game data for the team
    for game in data:  # Assuming 'data' is a list of games
        # You can extract relevant information from 'game', like opponent, week, etc.
        home_games.append([game[0], team, game[1]])

# Creating DataFrame from the home_games list
home = pd.DataFrame(home_games, columns=['Week', 'Team', 'Opponent'])

# Remove 'GW' from the 'Week' string and convert it to an integer
home['Week'] = home['Week'].str[2:].astype(int)

# Away data
away_games = []

for team in teams:
    data = team_away_data(team, fixtures, gameweek)  # Fetch home game data for the team
    for game in data:  # Assuming 'data' is a list of games
        # You can extract relevant information from 'game', like opponent, week, etc.
        away_games.append([game[0], team, game[1]])

# Creating DataFrame from the home_games list
away = pd.DataFrame(away_games, columns=['Week', 'Team', 'Opponent'])

# Remove 'GW' from the 'Week' string and convert it to an integer
away['Week'] = away['Week'].str[2:].astype(int)

# Define columns
cols = ['Team', 'Week', 'Possession', 'PerformanceGls',
       'PerformanceAst', 'ExpectedxG', 'ExpectedxAG', 'Per 90 MinutesGls',
       'Per 90 MinutesAst', 'Per 90 MinutesxG', 'Per 90 MinutesxAG']

# Get attacking and defensive data
attacking = attacking_data[cols]
defensive = defensive_data[cols]

# Get all data
home_attack = home.merge(attacking, on=['Week', 'Team'])
home_defense = home.merge(defensive, on=['Week', 'Team'])
away_attack = away.merge(attacking, on=['Week', 'Team'] )
away_defense = away.merge(defensive, on = ['Week', 'Team'])

# Option to filter by later gameweek

In [10]:
#home_attack = home_attack[home_attack['Week'] > 12]
#home_defense = home_defense[home_defense['Week'] > 12]
#away_attack = away_attack[away_attack['Week'] > 12]
#away_defense = away_defense[away_defense['Week'] > 12]

## Calculate best performing teams

##### Average all home/away and attack(for)/defense(against) stats
This is by combining 1) actual goals, 2) expected goals and 3) expected assists.

In [11]:
# Function to group by 'Team', calculate mean of 'ExpectedxG', and round to 3 decimal places
def process_group(data, column):
    return data.groupby('Team')[[column]].mean().round(3).reset_index()

# Process each dataset
best_home_attack = process_group(home_attack, 'ExpectedxG')
best_home_defense = process_group(home_defense, 'ExpectedxG')
best_away_attack = process_group(away_attack, 'ExpectedxG')
best_away_defense = process_group(away_defense, 'ExpectedxG')

## Standardizing

In [12]:
def standardize_data(data):
    # Step 1: Drop the 'team' column (or any string-based column)
    team_column = data.pop("Team")  # Temporarily store the column

    # Step 2: Standardize numerical columns
    scaler = MinMaxScaler()
    numerical_columns = data.columns  # Identify all remaining columns
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

    # Step 3: Re-add the 'team' column
    data["Team"] = team_column

    return data

## Ranking

In [13]:
def rank_data_attack(data):
    
    # Save the original ExpectedxG before standardization
    original_xG = data['ExpectedxG'].copy()
    
    # Standardize the data (assuming standardize_data returns a new DataFrame)
    data2 = standardize_data(data)
    
    # Set the standardized xG (assuming ExpectedxG is now standardized)
    data2['xG_std'] = data2['ExpectedxG'].round(3)
    
    # Reintroduce the original xG
    data2['xG'] = original_xG
    
    # Sort values by standardized xG
    data2 = data2.sort_values(by='xG_std', ascending=False)
    
    # Rank teams into 4 quantiles based on standardized xG
    data2["Difficulty"] = pd.qcut(data2["xG_std"], q=5, labels=[1, 2, 3, 4, 5])

    # Replace 1 with 2 to match original FDR better
    data2['Difficulty'] = data2['Difficulty'].replace(1,2)
    
    # Create the final dataset
    final = data2[['Team', 'Difficulty', 'xG_std', 'xG']]
    
    return final


def rank_data_defense(data):
    
    # Save the original ExpectedxG before standardization
    original_xG = data['ExpectedxG'].copy()
    
    # Standardize the data (assuming standardize_data returns a new DataFrame)
    data2 = standardize_data(data)
    
    # Set the standardized xG (assuming ExpectedxG is now standardized)
    data2['xG_std'] = data2['ExpectedxG'].round(3)
    
    # Reintroduce the original xG
    data2['xG'] = original_xG
    
    # Sort values by standardized xG
    data2 = data2.sort_values(by='xG_std', ascending=False)
    
    # Rank teams into 4 quantiles based on standardized xG
    data2["Difficulty"] = pd.qcut(data2["xG_std"], q=5, labels=[5, 4, 3, 2, 1])

    # Replace 1 with 2 to match original FDR better
    data2['Difficulty'] = data2['Difficulty'].replace(1,2)
    
    # Create the final dataset
    final = data2[['Team', 'Difficulty', 'xG_std', 'xG']]
    
    return final

## Home attackers

In [14]:
home_attack = pd.DataFrame(rank_data_attack(best_home_attack))
home_attack.sort_values(by = 'xG_std', ascending = False)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
14,Newcastle Utd,5,1.0,2.95
2,Bournemouth,5,0.913,2.75
1,Aston Villa,5,0.826,2.55
12,Manchester City,5,0.63,2.1
5,Chelsea,4,0.543,1.9
17,Tottenham,4,0.522,1.85
3,Brentford,4,0.5,1.8
0,Arsenal,3,0.435,1.65
15,Nott'ham Forest,3,0.435,1.65
6,Crystal Palace,3,0.391,1.55


What this translates as in terms of selection is: A defender who is away (A) to these clubs with have a difficulty of 4.

## Away attackers

In [11]:
away_attack = pd.DataFrame(rank_data_attack(best_away_attack))
away_attack.sort_values(by = 'xG_std', ascending = False)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
5,Chelsea,5,1.0,2.64
2,Bournemouth,5,0.851,2.325
11,Liverpool,5,0.721,2.05
0,Arsenal,4,0.556,1.7
6,Crystal Palace,4,0.556,1.7
17,Tottenham,4,0.496,1.575
18,West Ham,4,0.423,1.42
12,Manchester City,4,0.385,1.34
8,Fulham,3,0.366,1.3
10,Leicester City,3,0.357,1.28


What this translates as in terms of selection is: A defender who is at home (H)) to these clubs with have a difficulty of 4.

## Home Defence

In [12]:
home_def = pd.DataFrame(rank_data_defense(best_home_defense))
home_def.sort_values(by = 'xG_std', ascending = True)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
0,Arsenal,5,0.0,0.35
7,Everton,5,0.037,0.45
1,Aston Villa,5,0.148,0.75
15,Nott'ham Forest,5,0.181,0.84
13,Manchester Utd,4,0.21,0.92
11,Liverpool,4,0.232,0.98
6,Crystal Palace,4,0.24,1.0
14,Newcastle Utd,4,0.255,1.04
2,Bournemouth,3,0.314,1.2
8,Fulham,3,0.365,1.34


What this translates as in terms of selection is: An attacker who plays away (A)) to these clubs with have a difficulty of 4.

## Away defence

In [13]:
away_def = pd.DataFrame(rank_data_defense(best_away_defense))
away_def.sort_values(by = 'xG_std', ascending = True)

  data2['Difficulty'] = data2['Difficulty'].replace(1,2)


Unnamed: 0,Team,Difficulty,xG_std,xG
19,Wolves,5,0.0,1.075
0,Arsenal,5,0.043,1.12
11,Liverpool,5,0.095,1.175
2,Bournemouth,5,0.19,1.275
14,Newcastle Utd,5,0.19,1.275
7,Everton,4,0.195,1.28
8,Fulham,4,0.238,1.325
15,Nott'ham Forest,4,0.333,1.425
4,Brighton,3,0.405,1.5
6,Crystal Palace,3,0.443,1.54


What this translates as in terms of selection is: An attacker who plays home (H)) to these clubs with have a difficulty of 4.

# Convert to useable dataframe

In [14]:
# Create a dictionary mapping full team names to their 3-letter codes plus (A)
team_to_code_A = {
    "Arsenal": "ARS (A)",
    "Aston Villa": "AVL (A)",
    "Brentford": "BRE (A)",
    "Brighton": "BHA (A)",  # Brighton & Hove Albion
    "Bournemouth": "BOU (A)",
    "Chelsea": "CHE (A)",
    "Crystal Palace": "CRY (A)", # You can choose CRY (A) for Crystal Palace
    "Everton": "EVE (A)",
    "Fulham": "FUL (A)",
    "Ipswich Town": "IPS (A)",
    "Leicester City": "LEI (A)",
    "Liverpool": "LIV (A)",
    "Manchester City": "MCI (A)",
    "Manchester Utd": "MUN (A)",
    "Newcastle Utd": "NEW (A)",
    "Nott'ham Forest": "NFO (A)",
    "Southampton": "SOU (A)",
    "Tottenham": "TOT (A)",
    "West Ham": "WHU (A)",
    "Wolves": "WOL (A)"
}

# Create a dictionary mapping full team names to their 3-letter codes plus (H)
team_to_code_H = {
    "Arsenal": "ARS (H)",
    "Aston Villa": "AVL (H)",
    "Brentford": "BRE (H)",
    "Brighton": "BHA (H)",  # Brighton & Hove Albion
    "Bournemouth": "BOU (H)",
    "Chelsea": "CHE (H)",
    "Crystal Palace": "CRY (H)",
    "Everton": "EVE (H)",
    "Fulham": "FUL (H)",
    "Ipswich Town": "IPS (H)",
    "Leicester City": "LEI (H)",
    "Liverpool": "LIV (H)",
    "Manchester City": "MCI (H)",
    "Manchester Utd": "MUN (H)",
    "Newcastle Utd": "NEW (H)",
    "Nott'ham Forest": "NFO (H)",
    "Southampton": "SOU (H)",
    "Tottenham": "TOT (H)",
    "West Ham": "WHU (H)",
    "Wolves": "WOL (H)"
}

In [15]:
# Convert defensive players away
home_attack['Team'] = home_attack['Team'].replace(team_to_code_A)
home_attack['Position'] = 'GK'
goalkeepers_A = home_attack.copy()
home_attack['Position'] = 'DEF'
defenders_A = home_attack.copy()

# Convert defensive players home
away_attack['Team'] = away_attack['Team'].replace(team_to_code_H)
away_attack['Position'] = 'GK'
goalkeepers_H = away_attack.copy()
away_attack['Position'] = 'DEF'
defenders_H = away_attack.copy()
defenders_H

Unnamed: 0,Team,Difficulty,xG_std,xG,Position
5,CHE (H),5,1.0,2.64,DEF
2,BOU (H),5,0.851,2.325,DEF
11,LIV (H),5,0.721,2.05,DEF
0,ARS (H),4,0.556,1.7,DEF
6,CRY (H),4,0.556,1.7,DEF
17,TOT (H),4,0.496,1.575,DEF
18,WHU (H),4,0.423,1.42,DEF
12,MCI (H),4,0.385,1.34,DEF
8,FUL (H),3,0.366,1.3,DEF
10,LEI (H),3,0.357,1.28,DEF


In [16]:
# Convert attacking players away
home_def['Team'] = home_def['Team'].replace(team_to_code_A)
home_def['Position'] = 'MID'
midfielders_A = home_def.copy()
home_def['Position'] = 'FWD'
forwards_A = home_def.copy()
forwards_A

# Convert attacking players home
away_def['Team'] = away_def['Team'].replace(team_to_code_H)
away_def['Position'] = 'MID'
midfielders_H = away_def.copy()
away_def['Position'] = 'FWD'
forwards_H = away_def.copy()
midfielders_H

Unnamed: 0,Team,Difficulty,xG_std,xG,Position
9,IPS (H),2,1.0,2.125,MID
10,LEI (H),2,0.881,2.0,MID
16,SOU (H),2,0.881,2.0,MID
18,WHU (H),2,0.843,1.96,MID
5,CHE (H),2,0.805,1.92,MID
1,AVL (H),2,0.786,1.9,MID
17,TOT (H),2,0.69,1.8,MID
13,MUN (H),2,0.595,1.7,MID
3,BRE (H),3,0.576,1.68,MID
12,MCI (H),3,0.5,1.6,MID


In [17]:
# Combine into dataframe
FD_xG = pd.concat([goalkeepers_A, goalkeepers_H, defenders_A, defenders_H, midfielders_A, midfielders_H, forwards_A, forwards_H])
# Rename columns
FD_xG.rename(columns = {'Team': 'Opponent'}, inplace = True)