## Which opponent is the most difficult to score points against? (xG analysis)

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# FBref Premier League stats page that the team tables are read from
url = (
    'https://fbref.com/en/comps/9/Premier-League-Stats'
    '#all_stats_squads_standard'
)

In [3]:
# Parse the HTML tables found at `url` (pd.read_html returns a list of
# DataFrames) and keep the second one (index 1).
# NOTE(review): presumably this is the home/away league table, given the
# ('Home', ...) / ('Away', ...) columns read later in this file — confirm
# that the table position on the page is stable.
df = pd.read_html(url)[1]

In [4]:
# Wrap the scraped table in a DataFrame
data = pd.DataFrame(data=df)

# Distinct squad names, in order of first appearance
teams = pd.unique(data[('Unnamed: 1_level_0', 'Squad')])

In [5]:
def extract_team_stats(data, location):
    """
    Build a per-team table of matches played, xG and xGA for one venue.

    For every distinct squad the first matching row of ``data`` is used;
    rows whose squad name is missing are ignored. Per-match averages are
    then added and the table is ordered from highest to lowest average xG.

    Parameters:
        data (pd.DataFrame): Table with two-level columns containing
            ('Unnamed: 1_level_0', 'Squad') and, for the chosen location,
            (location, 'MP'), (location, 'xG') and (location, 'xGA').
        location (str): Top-level column group to read, 'Home' or 'Away'.

    Returns:
        pd.DataFrame: Columns 'Team', 'MP', 'xG', 'xGA', 'Av_xG', 'Av_xGA'
        with a fresh 0..n-1 index, sorted by 'Av_xG' in descending order.

    Raises:
        KeyError: If the squad column or a (location, stat) column is absent.
    """
    squad_col = ('Unnamed: 1_level_0', 'Squad')
    squads = data[squad_col]

    # Select the first row of every named squad in a single pass instead of
    # re-filtering the whole frame once per team.
    first_seen = squads.notna() & ~squads.duplicated(keep='first')
    rows = data.loc[first_seen]

    # to_numpy() drops the source index so the new frame gets a clean one.
    stats = pd.DataFrame({
        'Team': rows[squad_col].to_numpy(),
        'MP': rows[(location, 'MP')].to_numpy(),
        'xG': rows[(location, 'xG')].to_numpy(),
        'xGA': rows[(location, 'xGA')].to_numpy(),
    })

    # Per-match averages, rounded to three decimals
    stats['Av_xG'] = (stats['xG'] / stats['MP']).round(3)
    stats['Av_xGA'] = (stats['xGA'] / stats['MP']).round(3)

    # Strongest attack (highest average xG) first
    return stats.sort_values(by='Av_xG', ascending=False).reset_index(drop=True)

In [6]:
# Build the per-team tables for home and away fixtures
home_data, away_data = (
    extract_team_stats(data, location=venue) for venue in ('Home', 'Away')
)

In [7]:
def standardize_data(data, columns_to_standardize):
    """
    Add Min-Max scaled versions of the chosen columns.

    Each listed column is rescaled with sklearn's MinMaxScaler and stored,
    rounded to three decimals, in a new column named '<column>_std'. The
    input DataFrame is not modified.

    Parameters:
        data (pd.DataFrame): DataFrame with team data.
        columns_to_standardize (list): Names of the columns to rescale.

    Returns:
        pd.DataFrame: Copy of ``data`` with the extra '_std' columns.
    """
    scaled = MinMaxScaler().fit_transform(data[columns_to_standardize])

    result = data.copy()
    # scaled has one column per requested name, in the same order
    for name, column_values in zip(columns_to_standardize, scaled.T):
        result[name + '_std'] = column_values.round(3)

    return result

In [8]:
def rank_data(data, metric, num_quantiles=5):
    """
    Ranks teams on a standardized metric and assigns difficulty scores.

    Teams are split into ``num_quantiles`` equally sized tiers labelled
    1..num_quantiles. For attacking metrics a higher value means a higher
    difficulty; for the xGA metrics ('Av_xGA_std', 'xGA_std', matched
    case-insensitively) the labels are reversed so that a lower value means
    a higher difficulty, and the rows are sorted in descending order.

    If tied metric values make the quantile edges non-unique, ``pd.qcut``
    cannot bin the raw values; in that case the teams are binned by their
    ordinal rank instead, so tied teams may land in neighbouring tiers.

    Parameters:
        data (pd.DataFrame): DataFrame with 'Team' and the metric column.
        metric (str): Column name to rank by (e.g., 'Av_xG_std', 'Av_xGA_std').
        num_quantiles (int): Number of difficulty tiers (default: 5).

    Returns:
        pd.DataFrame: Columns 'Team', ``metric`` and 'Difficulty', sorted by
        the metric. The input DataFrame is not modified.
    """
    ranked = data.copy()

    # Lower xGA means a tougher opponent, so those metrics use reversed labels
    reverse = metric.lower() in ['av_xga_std', 'xga_std']

    # Sort in the appropriate direction
    ranked = ranked.sort_values(by=metric, ascending=not reverse)

    # Create quantile labels
    labels = list(range(1, num_quantiles + 1))
    if reverse:
        labels.reverse()

    # Assign difficulty scores
    try:
        ranked['Difficulty'] = pd.qcut(
            ranked[metric], q=num_quantiles, labels=labels
        )
    except ValueError:
        # Duplicate bin edges caused by ties: fall back to binning unique
        # ordinal ranks, which always yields distinct edges when possible.
        ordinal = ranked[metric].rank(method='first')
        ranked['Difficulty'] = pd.qcut(ordinal, q=num_quantiles, labels=labels)

    return ranked[['Team', metric, 'Difficulty']]

In [9]:
# Min-Max scale the per-match averages for each venue
standardized_home, standardized_away = (
    standardize_data(venue_stats, ['Av_xG', 'Av_xGA'])
    for venue_stats in (home_data, away_data)
)

# Home statistics: attacking and defensive difficulty tiers
home_attack, home_defense = (
    rank_data(standardized_home, metric=column)
    for column in ('Av_xG_std', 'Av_xGA_std')
)

# Away statistics: attacking and defensive difficulty tiers
away_attack, away_defense = (
    rank_data(standardized_away, metric=column)
    for column in ('Av_xG_std', 'Av_xGA_std')
)

home_attack {A defender who plays away (A) to these clubs will have a difficulty of 5.}
away_attack {A defender who plays at home (H) to these clubs will have a difficulty of 5.}
home_defense {An attacker who plays away (A) to these clubs will have a difficulty of 5.}
away_defense {An attacker who plays at home (H) to these clubs will have a difficulty of 5.}

In [10]:
# Full team name -> 3-letter code tagged "(A)" for away fixtures
team_to_code_A = {
    "Arsenal": "ARS (A)",
    "Aston Villa": "AVL (A)",
    "Brentford": "BRE (A)",
    "Brighton": "BHA (A)",
    "Bournemouth": "BOU (A)",
    "Burnley": "BUR (A)",
    "Chelsea": "CHE (A)",
    "Crystal Palace": "CRY (A)",
    "Everton": "EVE (A)",
    "Fulham": "FUL (A)",
    "Leeds United": "LEE (A)",
    "Liverpool": "LIV (A)",
    "Manchester City": "MCI (A)",
    "Manchester Utd": "MUN (A)",
    "Newcastle Utd": "NEW (A)",
    "Nott'ham Forest": "NFO (A)",
    "Sunderland": "SUN (A)",
    "Tottenham": "TOT (A)",
    "West Ham": "WHU (A)",
    "Wolves": "WOL (A)",
}

# Same teams and codes, tagged "(H)" for home fixtures; derived from the
# away map so the two tables cannot drift apart.
team_to_code_H = {
    team: code.replace("(A)", "(H)") for team, code in team_to_code_A.items()
}

In [11]:
def create_position_df(base_df, team_code_map, position, side_label):
    """
    Build a position-specific copy of a ranking table.

    Team names found in ``team_code_map`` are swapped for their mapped
    codes (names without an entry are left as they are), and two constant
    columns, 'Position' and 'Side', are appended. ``base_df`` is not modified.

    Parameters:
        base_df (pd.DataFrame): The starting DataFrame (e.g., home_attack);
            must contain a 'Team' column.
        team_code_map (dict): Mapping of full team names to team codes.
        position (str): Player position label ('GK', 'DEF', 'MID', 'FWD').
        side_label (str): Home/away marker ('H' or 'A').

    Returns:
        pd.DataFrame: Copy of ``base_df`` with 'Team' converted and the
        'Position' and 'Side' columns added.
    """
    tagged = base_df.copy()
    tagged['Team'] = tagged['Team'].replace(team_code_map)

    # Same value on every row for the two label columns
    for column, value in (('Position', position), ('Side', side_label)):
        tagged[column] = value

    return tagged

In [12]:
# Defensive players away: judged against the opponent's home attack
goalkeepers_A, defenders_A = (
    create_position_df(home_attack, team_to_code_A, role, 'A')
    for role in ('GK', 'DEF')
)

# Defensive players at home: judged against the opponent's away attack
goalkeepers_H, defenders_H = (
    create_position_df(away_attack, team_to_code_H, role, 'H')
    for role in ('GK', 'DEF')
)

# Attacking players away: judged against the opponent's home defense
midfielders_A, forwards_A = (
    create_position_df(home_defense, team_to_code_A, role, 'A')
    for role in ('MID', 'FWD')
)

# Attacking players at home: judged against the opponent's away defense
midfielders_H, forwards_H = (
    create_position_df(away_defense, team_to_code_H, role, 'H')
    for role in ('MID', 'FWD')
)

In [13]:
# Stack the eight position/venue tables and relabel 'Team' as 'Opponent'
FD_xG = pd.concat(
    [
        goalkeepers_A, goalkeepers_H,
        defenders_A, defenders_H,
        midfielders_A, midfielders_H,
        forwards_A, forwards_H,
    ]
).rename(columns={'Team': 'Opponent'})

In [16]:
# Write the combined table to CSV. to_csv is called with its defaults, so
# the DataFrame index is written as the first column.
# NOTE(review): the destination is an absolute, user-specific Windows path —
# adjust it before running on another machine.
FD_xG.to_csv(r'C:\Users\thoma\OneDrive\Documents\test.csv')