In [1]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Current gameweek number

In [2]:
# Gameweek cutoff: limits how many fixture columns are scanned for home/away games
# and is embedded in the name of the exported CSV.
gameweek = 14

## Collect Team data
##### Attacking (for) and defensive (against) stats for each team

In [3]:
# Collect the per-gameweek team CSV paths: `attack` holds the attacking (for) files,
# `defense` the defensive (against) files.
# NOTE(review): these are absolute, machine-specific paths; the notebook only runs where
# this directory layout exists. glob returns an empty list (no error) when nothing matches.
attack = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Attacking\*.csv')
defense = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Defensive\*.csv')

# Define a function to extract the week number from the filename
def extract_week_number(filename):
    """Return the gameweek number embedded in ``filename``.

    The first ``GW_<digits>`` token found anywhere in the string is used and its
    digits are returned as an ``int``. ``None`` is returned when the string
    contains no such token.
    """
    week_match = re.search(r'GW_(\d+)', filename)
    if week_match is None:
        return None
    return int(week_match.group(1))

# Attacking files: read each CSV, stamp it with the gameweek parsed from its filename,
# then stack everything into one frame
att_frames = [
    pd.read_csv(path).assign(Week=extract_week_number(path))
    for path in attack
]
att_weekly_data = pd.concat(att_frames, ignore_index=True)

# Defensive files: same treatment
def_frames = [
    pd.read_csv(path).assign(Week=extract_week_number(path))
    for path in defense
]
def_weekly_data = pd.concat(def_frames, ignore_index=True)

# Drop the first three characters of every defensive 'Team' value so the names line up
# with the attacking data (presumably a 'vs ' prefix; confirm against the raw files)
def_weekly_data['Team'] = def_weekly_data['Team'].str.slice(start=3)

# Columns carried forward into the analysis
columns_new = [
    'Team',
    'Week',
    'Playing TimeMP',
    'Possession',
    'PerformanceGls',
    'PerformanceAst',
    'ExpectedxG',
    'ExpectedxAG',
    'Per 90 MinutesGls',
    'Per 90 MinutesAst',
    'Per 90 MinutesxG',
    'Per 90 MinutesxAG',
]

# Attacking (for) stats, ordered by gameweek
attacking_data = att_weekly_data[columns_new].sort_values(by='Week')

# Defensive (against) stats, ordered by gameweek
defensive_data = def_weekly_data[columns_new].sort_values(by='Week')

## Overall team performance
##### Average xG minus average xG against

In [4]:
# Every club that appears in the attacking data (reused by later cells)
teams = attacking_data['Team'].unique()

# One [team, mean xG for, mean xG against] record per club, means rounded to 3 decimals
averages = [
    [
        club,
        attacking_data.loc[attacking_data['Team'] == club, 'ExpectedxG'].mean().round(3),
        defensive_data.loc[defensive_data['Team'] == club, 'ExpectedxG'].mean().round(3),
    ]
    for club in teams
]

# Build the table with the clubs in alphabetical order
averages_data = pd.DataFrame(sorted(averages), columns=['Team', 'xG', 'xGA'])

# Net expected goals: xG created minus xG conceded
averages_data['Difference'] = averages_data['xG'] - averages_data['xGA']

# Show the best net xG first (descending); the sorted view is displayed, not stored
averages_data.sort_values(by='Difference', ascending=False)

Unnamed: 0,Team,xG,xGA,Difference
11,Liverpool,2.114,1.2,0.914
2,Bournemouth,1.657,0.986,0.671
0,Arsenal,1.571,0.971,0.6
5,Chelsea,2.157,1.586,0.571
1,Aston Villa,1.871,1.486,0.385
8,Fulham,1.243,0.957,0.286
17,Tottenham,1.4,1.214,0.186
12,Manchester City,1.929,1.743,0.186
15,Nott'ham Forest,1.3,1.129,0.171
6,Crystal Palace,1.471,1.329,0.142


## Separate home and away data

- home_attack: All gameweeks in which a team plays at home. The metrics are all FOR the team: how many actual goals and assists it scored at home, and how many expected goals and assists it had at home.
- away_attack: All gameweeks in which a team plays away. The metrics are all FOR the team: how many actual goals and assists it scored away, and how many expected goals and assists it had away.
- home_defense: All gameweeks in which a team plays at home. The metrics are all AGAINST the team: how many actual goals and assists it conceded at home, and how many expected goals and assists it conceded at home.
- away_defense: All gameweeks in which a team plays away. The metrics are all AGAINST the team: how many actual goals and assists it conceded away, and how many expected goals and assists it conceded away.

In [5]:
# Load the fixture list. Later code reads a 'Team' column and treats the columns after the
# first one as gameweek columns whose cells contain '(H)' or '(A)'.
# NOTE(review): presumably one row per team and columns named 'GW1', 'GW2', ...; confirm against the file.
fixtures = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Schedule\Fixtures_alt_names.csv')

# Create function to collect homedata
def team_home_data(team, fixtures, gameweek):
    """Collect the home fixtures of ``team`` within the first ``gameweek`` gameweek columns.

    Parameters
    ----------
    team
        Value matched against the ``'Team'`` column of ``fixtures``.
    fixtures : pandas.DataFrame
        Fixture table with a ``'Team'`` column. The columns at positions
        ``1`` to ``gameweek`` (inclusive) are scanned as gameweek columns.
    gameweek : int
        Number of gameweek columns to scan.

    Returns
    -------
    list
        ``[column name, cell value]`` pairs, one for every scanned cell whose text
        contains ``'(H)'``, in row order and then column order. Empty when the team
        is not present or has no home game in range.
    """
    gameweek_cols = fixtures.columns[1:gameweek + 1]
    team_rows = fixtures[fixtures['Team'] == team]

    home_data = []
    for _, row in team_rows.iterrows():
        for col in gameweek_cols:
            fixture = row[col]
            # A blank gameweek is read by pandas as NaN (a float); `'(H)' in nan` would
            # raise TypeError, so anything that is not text is skipped.
            if isinstance(fixture, str) and '(H)' in fixture:
                home_data.append([col, fixture])

    return home_data

# Create function to collect awaydata
def team_away_data(team, fixtures, gameweek):
    """Collect the away fixtures of ``team`` within the first ``gameweek`` gameweek columns.

    Parameters
    ----------
    team
        Value matched against the ``'Team'`` column of ``fixtures``.
    fixtures : pandas.DataFrame
        Fixture table with a ``'Team'`` column. The columns at positions
        ``1`` to ``gameweek`` (inclusive) are scanned as gameweek columns.
    gameweek : int
        Number of gameweek columns to scan.

    Returns
    -------
    list
        ``[column name, cell value]`` pairs, one for every scanned cell whose text
        contains ``'(A)'``, in row order and then column order. Empty when the team
        is not present or has no away game in range.
    """
    gameweek_cols = fixtures.columns[1:gameweek + 1]
    team_rows = fixtures[fixtures['Team'] == team]

    away_data = []
    for _, row in team_rows.iterrows():
        for col in gameweek_cols:
            fixture = row[col]
            # A blank gameweek is read by pandas as NaN (a float); `'(A)' in nan` would
            # raise TypeError, so anything that is not text is skipped.
            if isinstance(fixture, str) and '(A)' in fixture:
                away_data.append([col, fixture])

    return away_data

# Home fixtures: one [gameweek label, team, fixture cell] record per home game in range
home_games = [
    [gw_label, club, fixture]
    for club in teams
    for gw_label, fixture in team_home_data(club, fixtures, gameweek)
]
home = pd.DataFrame(home_games, columns=['Week', 'Team', 'Opponent'])

# Drop the first two characters of the gameweek label (the 'GW' prefix) and store the week as an integer
home['Week'] = home['Week'].str[2:].astype(int)

# Away fixtures: same shape, built from the away games
away_games = [
    [gw_label, club, fixture]
    for club in teams
    for gw_label, fixture in team_away_data(club, fixtures, gameweek)
]
away = pd.DataFrame(away_games, columns=['Week', 'Team', 'Opponent'])

# Same conversion of the gameweek label to an integer week
away['Week'] = away['Week'].str[2:].astype(int)

# Stat columns to attach to each fixture
cols = [
    'Team',
    'Week',
    'Possession',
    'PerformanceGls',
    'PerformanceAst',
    'ExpectedxG',
    'ExpectedxAG',
    'Per 90 MinutesGls',
    'Per 90 MinutesAst',
    'Per 90 MinutesxG',
    'Per 90 MinutesxAG',
]

# Narrow the for/against tables to those columns
attacking = attacking_data[cols]
defensive = defensive_data[cols]

# Attach the stats to each fixture by (Week, Team); default inner join, so fixtures
# without a matching stats row are dropped
home_attack = pd.merge(home, attacking, on=['Week', 'Team'])
home_defense = pd.merge(home, defensive, on=['Week', 'Team'])
away_attack = pd.merge(away, attacking, on=['Week', 'Team'])
away_defense = pd.merge(away, defensive, on=['Week', 'Team'])

## Calculate best performing teams

##### Average all home/away and attack(for)/defense(against) stats
The intent is to combine 1) actual goals, 2) expected goals and 3) expected assists; the cell below currently averages expected goals (xG) only.

In [6]:
# For each of the four home/away x for/against tables: average ExpectedxG per team,
# turn 'Team' back into a column, and round the mean to 3 decimal places.
best_home_attack, best_home_defense, best_away_attack, best_away_defence = (
    split.groupby('Team')[['ExpectedxG']]
    .mean()
    .reset_index()
    .round({'ExpectedxG': 3})
    for split in (home_attack, home_defense, away_attack, away_defense)
)

## Standardizing

In [7]:
def standardize_data(data):
    """Return a min-max scaled copy of ``data``; the input frame is left untouched.

    Every column except ``'Team'`` is passed through a default
    ``MinMaxScaler`` (each column rescaled independently). The ``'Team'``
    column is re-attached as the last column of the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Team'`` column; all other columns must be numeric.

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled columns followed by ``'Team'``.
    """
    # Work on a new frame. The previous version popped 'Team' from the caller's frame and
    # overwrote its values in place, so re-running a ranking cell scaled already-scaled data.
    scaled = data.drop(columns="Team")

    # Scale every remaining column
    numerical_columns = scaled.columns
    scaled[numerical_columns] = MinMaxScaler().fit_transform(scaled[numerical_columns])

    # Re-attach the team labels (index-aligned with the input)
    scaled["Team"] = data["Team"]

    return scaled

In [8]:
# Preview the per-team mean home xG before it is scaled and ranked
best_home_attack

Unnamed: 0,Team,ExpectedxG
0,Arsenal,1.433
1,Aston Villa,2.533
2,Bournemouth,1.375
3,Brentford,2.667
4,Brighton,1.7
5,Chelsea,1.567
6,Crystal Palace,1.333
7,Everton,1.1
8,Fulham,1.175
9,Ipswich Town,1.075


## Ranking

In [9]:
def rank_data_attack(data):
    """Rank teams by attacking xG into five difficulty bands (5 = highest xG).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'Team'`` and ``'ExpectedxG'`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Team'``, ``'Difficulty'``, ``'xG_std'`` (scaled xG rounded to
        3 decimals) and ``'xG'`` (the original value), sorted by ``'xG_std'``
        descending.

    Notes
    -----
    ``pd.qcut`` is called with its default duplicate handling, so it raises
    ``ValueError`` when the quantile bin edges are not unique.
    """
    # Keep the raw xG so it can be reported next to the scaled value
    original_xG = data['ExpectedxG'].copy()

    # Scale a copy so the caller's frame is never modified, whether or not
    # standardize_data mutates its argument
    ranked = standardize_data(data.copy())

    # Scaled xG, rounded, alongside the original value (index-aligned)
    ranked['xG_std'] = ranked['ExpectedxG'].round(3)
    ranked['xG'] = original_xG

    # Strongest attacks first
    ranked = ranked.sort_values(by='xG_std', ascending=False)

    # Five equal-frequency bins (quintiles): the lowest xG_std bin gets 1, the highest gets 5
    ranked["Difficulty"] = pd.qcut(ranked["xG_std"], q=5, labels=[1, 2, 3, 4, 5])

    return ranked[['Team', 'Difficulty', 'xG_std', 'xG']]



def rank_data_defense(data):
    """Rank teams by xG conceded into five difficulty bands (5 = lowest xG conceded).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``'Team'`` and ``'ExpectedxG'`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Team'``, ``'Difficulty'``, ``'xG_std'`` (scaled xG rounded to
        3 decimals) and ``'xG'`` (the original value), sorted by ``'xG_std'``
        descending.

    Notes
    -----
    ``pd.qcut`` is called with its default duplicate handling, so it raises
    ``ValueError`` when the quantile bin edges are not unique.
    """
    # Keep the raw xG so it can be reported next to the scaled value
    original_xG = data['ExpectedxG'].copy()

    # Scale a copy so the caller's frame is never modified, whether or not
    # standardize_data mutates its argument
    ranked = standardize_data(data.copy())

    # Scaled xG, rounded, alongside the original value (index-aligned)
    ranked['xG_std'] = ranked['ExpectedxG'].round(3)
    ranked['xG'] = original_xG

    # Highest xG conceded first
    ranked = ranked.sort_values(by='xG_std', ascending=False)

    # Five equal-frequency bins (quintiles) with reversed labels: the lowest xG_std bin
    # (fewest expected goals conceded) gets 5, the highest gets 1
    ranked["Difficulty"] = pd.qcut(ranked["xG_std"], q=5, labels=[5, 4, 3, 2, 1])

    return ranked[['Team', 'Difficulty', 'xG_std', 'xG']]

## Home attackers

In [10]:
# Rank home attacks by scaled xG.
# NOTE: this rebinds `home_attack` (until now the merged per-fixture frame) to the ranking table,
# so cells that expect the per-fixture frame must not be re-run after this one.
home_attack = pd.DataFrame(rank_data_attack(best_home_attack))
# Display strongest home attacks first (the sorted view is shown, not stored)
home_attack.sort_values(by = 'xG_std', ascending = False)

Unnamed: 0,Team,Difficulty,xG_std,xG
12,Manchester City,5,1.0,2.733
3,Brentford,5,0.961,2.667
1,Aston Villa,5,0.883,2.533
4,Brighton,4,0.395,1.7
17,Tottenham,4,0.395,1.7
11,Liverpool,4,0.366,1.65
18,West Ham,4,0.337,1.6
14,Newcastle Utd,4,0.322,1.575
5,Chelsea,3,0.317,1.567
15,Nott'ham Forest,3,0.263,1.475


What this means for selection: a defender who plays away (A) to these clubs will have a difficulty of 4 (Brentford, Brighton, Tottenham, ...).

## Away attackers

In [11]:
# Rank away attacks by scaled xG.
# NOTE: this rebinds `away_attack` (until now the merged per-fixture frame) to the ranking table,
# so cells that expect the per-fixture frame must not be re-run after this one.
away_attack = pd.DataFrame(rank_data_attack(best_away_attack))
# Display strongest away attacks first (the sorted view is shown, not stored)
away_attack.sort_values(by = 'xG_std', ascending = False)

Unnamed: 0,Team,Difficulty,xG_std,xG
11,Liverpool,5,1.0,2.733
5,Chelsea,5,0.938,2.6
2,Bournemouth,5,0.672,2.033
0,Arsenal,5,0.504,1.675
6,Crystal Palace,4,0.457,1.575
10,Leicester City,4,0.445,1.55
1,Aston Villa,4,0.363,1.375
8,Fulham,4,0.344,1.333
12,Manchester City,3,0.34,1.325
18,West Ham,3,0.316,1.275


What this means for selection: a defender who is at home (H) to these clubs will have a difficulty of 4 (Wolves, Tottenham, Newcastle, ...).

## Home Defence

In [12]:
# Rank home defences by scaled xG conceded (difficulty labels are reversed: low xG conceded = 5)
home_def = pd.DataFrame(rank_data_defense(best_home_defense))
# Display the tightest home defences first (ascending xG conceded; shown, not stored)
home_def.sort_values(by = 'xG_std', ascending = True)

Unnamed: 0,Team,Difficulty,xG_std,xG
0,Arsenal,5,0.0,0.467
7,Everton,5,0.048,0.6
2,Bournemouth,5,0.112,0.775
6,Crystal Palace,5,0.145,0.867
8,Fulham,4,0.148,0.875
1,Aston Villa,4,0.157,0.9
11,Liverpool,4,0.166,0.925
15,Nott'ham Forest,4,0.175,0.95
13,Manchester Utd,4,0.175,0.95
14,Newcastle Utd,3,0.284,1.25


What this means for selection: an attacker who plays away (A) to these clubs will have a difficulty of 4 (Arsenal, Villa, Man Utd, Liverpool).

## Away defence

In [13]:
# Rank away defences by scaled xG conceded (difficulty labels are reversed: low xG conceded = 5)
away_def = pd.DataFrame(rank_data_defense(best_away_defence))
# Display the tightest away defences first (ascending xG conceded; shown, not stored)
away_def.sort_values(by = 'xG_std', ascending = True)

Unnamed: 0,Team,Difficulty,xG_std,xG
17,Tottenham,5,0.0,0.933
8,Fulham,5,0.091,1.067
19,Wolves,5,0.114,1.1
2,Bournemouth,5,0.228,1.267
4,Brighton,4,0.25,1.3
7,Everton,4,0.25,1.3
14,Newcastle Utd,4,0.25,1.3
0,Arsenal,4,0.284,1.35
15,Nott'ham Forest,3,0.296,1.367
3,Brentford,3,0.421,1.55


What this means for selection: an attacker who plays at home (H) to these clubs will have a difficulty of 4 (Arsenal, Tottenham, Chelsea, Fulham, Everton).

# Convert to useable dataframe

In [14]:
# Full team name -> 3-letter code, shared by both lookups below
TEAM_CODES = {
    "Arsenal": "ARS",
    "Aston Villa": "AVL",
    "Brentford": "BRE",
    "Brighton": "BHA",  # Brighton & Hove Albion
    "Bournemouth": "BOU",
    "Chelsea": "CHE",
    "Crystal Palace": "CRY",
    "Everton": "EVE",
    "Fulham": "FUL",
    "Ipswich Town": "IPS",
    "Leicester City": "LEI",
    "Liverpool": "LIV",
    "Manchester City": "MCI",
    "Manchester Utd": "MUN",
    "Newcastle Utd": "NEW",
    "Nott'ham Forest": "NFO",
    "Southampton": "SOU",
    "Tottenham": "TOT",
    "West Ham": "WHU",
    "Wolves": "WOL",
}

# Full team name -> code with the away marker, e.g. "ARS (A)"
team_to_code_A = {name: f"{code} (A)" for name, code in TEAM_CODES.items()}

# Full team name -> code with the home marker, e.g. "ARS (H)"
team_to_code_H = {name: f"{code} (H)" for name, code in TEAM_CODES.items()}

In [15]:
# Defensive players travelling away: recode the home-attack ranking's team names to
# "<code> (A)" labels and tag the rows with a position
home_attack = home_attack.assign(
    Team=home_attack['Team'].replace(team_to_code_A),
    Position='DEF',
)
goalkeepers_A = home_attack.assign(Position='GK')
defenders_A = home_attack.copy()

# Defensive players at home: recode the away-attack ranking's team names to
# "<code> (H)" labels and tag the rows with a position
away_attack = away_attack.assign(
    Team=away_attack['Team'].replace(team_to_code_H),
    Position='DEF',
)
goalkeepers_H = away_attack.assign(Position='GK')
defenders_H = away_attack.copy()
defenders_H

Unnamed: 0,Team,Difficulty,xG_std,xG,Position
11,LIV (H),5,1.0,2.733,DEF
5,CHE (H),5,0.938,2.6,DEF
2,BOU (H),5,0.672,2.033,DEF
0,ARS (H),5,0.504,1.675,DEF
6,CRY (H),4,0.457,1.575,DEF
10,LEI (H),4,0.445,1.55,DEF
1,AVL (H),4,0.363,1.375,DEF
8,FUL (H),4,0.344,1.333,DEF
12,MCI (H),3,0.34,1.325,DEF
18,WHU (H),3,0.316,1.275,DEF


In [16]:
# Attacking players travelling away: recode the home-defence ranking's team names to
# "<code> (A)" labels and tag the rows with a position
home_def = home_def.assign(
    Team=home_def['Team'].replace(team_to_code_A),
    Position='FWD',
)
midfielders_A = home_def.assign(Position='MID')
forwards_A = home_def.copy()

# Attacking players at home: recode the away-defence ranking's team names to
# "<code> (H)" labels and tag the rows with a position
away_def = away_def.assign(
    Team=away_def['Team'].replace(team_to_code_H),
    Position='FWD',
)
midfielders_H = away_def.assign(Position='MID')
forwards_H = away_def.copy()
midfielders_H

Unnamed: 0,Team,Difficulty,xG_std,xG,Position
9,IPS (H),1,1.0,2.4,MID
13,MUN (H),1,0.705,1.967,MID
18,WHU (H),1,0.693,1.95,MID
16,SOU (H),1,0.682,1.933,MID
1,AVL (H),2,0.676,1.925,MID
5,CHE (H),2,0.523,1.7,MID
12,MCI (H),2,0.506,1.675,MID
6,CRY (H),2,0.506,1.675,MID
11,LIV (H),3,0.432,1.567,MID
10,LEI (H),3,0.421,1.55,MID


In [19]:
# Stack the eight position/venue ranking tables into one long lookup table
# (the original row labels are kept, so index values repeat across tables)
difficulty_xG = pd.concat([goalkeepers_A, goalkeepers_H, defenders_A, defenders_H, midfielders_A, midfielders_H, forwards_A, forwards_H])

# Duplicate the coded team label under 'Opponent'. Plain assignment replaces
# `Series.rename(inplace=True)`, which is documented to return None and only produced
# a usable column by an implementation accident.
difficulty_xG['Opponent'] = difficulty_xG['Team']

In [18]:
# Export the combined difficulty table to a CSV named after the current gameweek.
# The DataFrame index is written as the first (unnamed) column because `index=False` is not passed.
# NOTE(review): absolute, machine-specific output path; the target directory must already exist.
difficulty_xG.to_csv(rf'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Difficulty_ratings\Model\Team_xG_rank\Team_xG_rank_{gameweek}.csv')