In [37]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Current gameweek number

In [38]:
# Current gameweek number. The fixture helpers further down scan the first
# `gameweek` gameweek columns of the fixture list (columns 1..gameweek).
gameweek = 12

## Collect Team data
##### Attacking (for) and defensive (against) stats for each team

In [39]:
# Specify the path to the files
# `attack` lists the per-gameweek CSVs in the Attacking folder and `defense`
# those in the Defensive folder; glob returns them in no guaranteed order.
# NOTE(review): these are absolute paths on one machine. If they match nothing,
# both lists are empty and the pd.concat calls that follow will raise.
attack = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Attacking\*.csv')
defense = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Defensive\*.csv')

# Define a function to extract the week number from the filename
def extract_week_number(filename):
    """Return the gameweek number embedded in a path as ``GW_<digits>``.

    The first ``GW_<digits>`` occurrence anywhere in ``filename`` is used.
    Returns the digits as an ``int``, or ``None`` when the pattern is absent.
    """
    found = re.search(r'GW_(\d+)', filename)
    if found is None:
        return None
    return int(found.group(1))

# Load every attacking CSV, tagging its rows with the gameweek parsed from the path
att_weekly_data = pd.concat(
    (pd.read_csv(path).assign(Week=extract_week_number(path)) for path in attack),
    ignore_index=True,
)

# Same for the defensive CSVs
def_weekly_data = pd.concat(
    (pd.read_csv(path).assign(Week=extract_week_number(path)) for path in defense),
    ignore_index=True,
)
# Team names in the defensive files carry a three-character 'VS' prefix; strip it
def_weekly_data['Team'] = def_weekly_data['Team'].str.slice(3)

# Columns kept for the analysis
columns_new = [
    'Team',
    'Week',
    'Playing TimeMP',
    'Possession',
    'PerformanceGls',
    'PerformanceAst',
    'ExpectedxG',
    'ExpectedxAG',
    'Per 90 MinutesGls',
    'Per 90 MinutesAst',
    'Per 90 MinutesxG',
    'Per 90 MinutesxAG',
]

# Attacking (for) stats, ordered by gameweek
attacking_data = att_weekly_data[columns_new].sort_values('Week')

# Defensive (against) stats, ordered by gameweek
defensive_data = def_weekly_data[columns_new].sort_values('Week')

## Overall team performance
##### Average xG minus average xG against

In [40]:
# Teams present in the attacking data, in order of first appearance
teams = attacking_data['Team'].unique()

# One [team, mean xG for, mean xG against] row per team, each mean rounded to 3 d.p.
averages = []
for club in teams:
    xg_for = attacking_data.loc[attacking_data['Team'] == club, 'ExpectedxG'].mean().round(3)
    xg_against = defensive_data.loc[defensive_data['Team'] == club, 'ExpectedxG'].mean().round(3)
    averages.append([club, xg_for, xg_against])

# Alphabetical by team name
data = sorted(averages)

averages_data = pd.DataFrame(data, columns=['Team', 'xG', 'xGA'])

# Net expected goals: created minus conceded
averages_data['Difference'] = averages_data['xG'] - averages_data['xGA']

# Show the teams in descending order of net xG (best first)
averages_data.sort_values('Difference', ascending=False)

Unnamed: 0,Team,xG,xGA,Difference
11,Liverpool,1.88,1.1,0.78
12,Manchester City,2.06,1.56,0.5
14,Newcastle Utd,1.52,1.04,0.48
17,Tottenham,1.8,1.4,0.4
2,Bournemouth,1.68,1.28,0.4
1,Aston Villa,1.9,1.58,0.32
15,Nott'ham Forest,1.28,0.98,0.3
8,Fulham,1.44,1.18,0.26
7,Everton,1.26,1.04,0.22
3,Brentford,1.62,1.54,0.08


## Separate home and away data

- home_attack: All gameweeks in which a team plays at home. The metrics are all FOR the team: how many goals and assists it actually scored at home, and how many expected goals and assists it had at home.
- away_attack: All gameweeks in which a team plays away. The metrics are all FOR the team: how many goals and assists it actually scored away, and how many expected goals and assists it had away.
- home_defense: All gameweeks in which a team plays at home. The metrics are all AGAINST the team: how many goals and assists it actually conceded at home, and how many expected goals and assists it conceded at home.
- away_defense: All gameweeks in which a team plays away. The metrics are all AGAINST the team: how many goals and assists it actually conceded away, and how many expected goals and assists it conceded away.

In [41]:
# Collect fixture list
# The fixture helpers in this cell expect a 'Team' column in first position,
# followed by one column per gameweek whose cells contain '(H)' or '(A)'.
# NOTE(review): absolute path on one machine — confirm before running elsewhere.
fixtures = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Fixtures_alt_names.csv')

# Create function to collect homedata
def team_home_data(team, fixtures, gameweek):
    """Collect the home fixtures of ``team`` up to and including ``gameweek``.

    Parameters
    ----------
    team : str
        Value to match against the ``'Team'`` column of ``fixtures``.
    fixtures : pandas.DataFrame
        Fixture grid: a ``'Team'`` column in first position followed by one
        column per gameweek. A cell containing ``'(H)'`` marks a home game.
    gameweek : int
        Number of gameweek columns to scan (columns ``1..gameweek``).

    Returns
    -------
    list[list]
        One ``[gameweek column name, cell text]`` pair per home game, in
        column order. Empty when the team is absent or has no home games.
    """
    gameweek_cols = fixtures.columns[1:gameweek + 1]
    home_data = []

    for _, row in fixtures[fixtures['Team'] == team].iterrows():
        for col in gameweek_cols:
            fixture = row[col]
            # A blank gameweek is read by pandas as NaN (a float); testing
            # `'(H)' in NaN` raises TypeError, so skip non-string cells.
            if isinstance(fixture, str) and '(H)' in fixture:
                home_data.append([col, fixture])

    return home_data

# Create function to collect awaydata
def team_away_data(team, fixtures, gameweek):
    """Collect the away fixtures of ``team`` up to and including ``gameweek``.

    Parameters
    ----------
    team : str
        Value to match against the ``'Team'`` column of ``fixtures``.
    fixtures : pandas.DataFrame
        Fixture grid: a ``'Team'`` column in first position followed by one
        column per gameweek. A cell containing ``'(A)'`` marks an away game.
    gameweek : int
        Number of gameweek columns to scan (columns ``1..gameweek``).

    Returns
    -------
    list[list]
        One ``[gameweek column name, cell text]`` pair per away game, in
        column order. Empty when the team is absent or has no away games.
    """
    gameweek_cols = fixtures.columns[1:gameweek + 1]
    away_data = []

    for _, row in fixtures[fixtures['Team'] == team].iterrows():
        for col in gameweek_cols:
            fixture = row[col]
            # A blank gameweek is read by pandas as NaN (a float); testing
            # `'(A)' in NaN` raises TypeError, so skip non-string cells.
            if isinstance(fixture, str) and '(A)' in fixture:
                away_data.append([col, fixture])

    return away_data

# Home fixtures: one [gameweek column, team, fixture text] row per home game
home_games = [
    [gw_col, club, fixture]
    for club in teams
    for gw_col, fixture in team_home_data(club, fixtures, gameweek)
]
home = pd.DataFrame(home_games, columns=['Week', 'Team', 'Opponent'])
# Drop the two-character 'GW' prefix from the column name and keep the number
home['Week'] = home['Week'].str.slice(2).astype(int)

# Away fixtures, built the same way
away_games = [
    [gw_col, club, fixture]
    for club in teams
    for gw_col, fixture in team_away_data(club, fixtures, gameweek)
]
away = pd.DataFrame(away_games, columns=['Week', 'Team', 'Opponent'])
away['Week'] = away['Week'].str.slice(2).astype(int)

# Stat columns carried into the home/away tables
cols = [
    'Team',
    'Week',
    'Possession',
    'PerformanceGls',
    'PerformanceAst',
    'ExpectedxG',
    'ExpectedxAG',
    'Per 90 MinutesGls',
    'Per 90 MinutesAst',
    'Per 90 MinutesxG',
    'Per 90 MinutesxAG',
]

attacking = attacking_data[cols]
defensive = defensive_data[cols]

# Attach the weekly stats to each fixture; rows are matched on gameweek and team
# (inner join, so fixtures without a matching stats row are dropped)
merge_keys = ['Week', 'Team']
home_attack = pd.merge(home, attacking, on=merge_keys)
home_defense = pd.merge(home, defensive, on=merge_keys)
away_attack = pd.merge(away, attacking, on=merge_keys)
away_defense = pd.merge(away, defensive, on=merge_keys)

## Calculate best performing teams

##### Average all home/away and attack(for)/defense(against) stats
This is by combining 1) actual goals, 2) expected goals and 3) expected assists.

In [42]:
# Per-team home averages of goals, xG and xAG (metrics FOR the team),
# with goals rounded to 2 decimal places and the expected metrics to 3
best_home_attack = (
    home_attack
    .groupby('Team')[['PerformanceGls', 'ExpectedxG', 'ExpectedxAG']]
    .mean()
    .reset_index()
    .round({'PerformanceGls': 2, 'ExpectedxG': 3, 'ExpectedxAG': 3})
)

# Per-team home averages of goals, xG and xAG (metrics AGAINST the team),
# with goals rounded to 2 decimal places and the expected metrics to 3
best_home_defense = (
    home_defense
    .groupby('Team')[['PerformanceGls', 'ExpectedxG', 'ExpectedxAG']]
    .mean()
    .reset_index()
    .round({'PerformanceGls': 2, 'ExpectedxG': 3, 'ExpectedxAG': 3})
)

# Per-team away averages of goals, xG and xAG (metrics FOR the team)
best_away_attack = away_attack.groupby('Team')[['PerformanceGls','ExpectedxG', 'ExpectedxAG']].agg({
    'PerformanceGls': 'mean',
    'ExpectedxG': 'mean',
    'ExpectedxAG': 'mean'
}).reset_index()

# Goals to 2 decimal places, expected metrics to 3.
# DataFrame.round silently ignores keys that are not columns of the frame, so
# each aggregated column has to be named here exactly for it to be rounded.
best_away_attack = best_away_attack.round({'PerformanceGls': 2, 'ExpectedxG': 3, 'ExpectedxAG': 3})

# Per-team away averages of goals, xG and xAG (metrics AGAINST the team),
# with goals rounded to 2 decimal places and the expected metrics to 3
best_away_defence = (
    away_defense
    .groupby('Team')[['PerformanceGls', 'ExpectedxG', 'ExpectedxAG']]
    .mean()
    .reset_index()
    .round({'PerformanceGls': 2, 'ExpectedxG': 3, 'ExpectedxAG': 3})
)

## Standardizing

In [43]:
def standardize_data(data):
    """Return a min-max scaled copy of ``data``.

    Every column except ``'Team'`` is passed through ``MinMaxScaler`` and
    ``'Team'`` is re-attached as the last column.

    The input frame is left untouched: the work is done on a copy, so calling
    this function (or re-running a cell that calls it) never alters the
    caller's data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Team'`` column; all other columns are scaled.

    Returns
    -------
    pandas.DataFrame
        New frame with the scaled columns followed by ``'Team'``.
    """
    scaled = data.copy()

    # Set the label column aside; the scaler only accepts numeric input
    team_column = scaled.pop("Team")

    numerical_columns = scaled.columns  # everything that is left
    scaled[numerical_columns] = MinMaxScaler().fit_transform(scaled[numerical_columns])

    # Re-attach the label column (ends up last, as before)
    scaled["Team"] = team_column

    return scaled

## Ranking

In [44]:
def _rank_teams(data, labels):
    """Score each team and bucket the scores into ``len(labels)`` quantiles.

    ``data`` is standardized with ``standardize_data``; ``Rank`` is the sum of
    the standardized ``PerformanceGls``, ``ExpectedxG`` and ``ExpectedxAG``
    columns. ``labels`` are assigned from the lowest-``Rank`` quantile to the
    highest. Returns a new frame with columns ``Team``, ``Difficulty``, ``Rank``.
    """
    # Pass a copy so the caller's frame is never modified, whatever
    # standardize_data does with its argument.
    scaled = standardize_data(data.copy())
    scaled['Rank'] = scaled['PerformanceGls'] + scaled['ExpectedxG'] + scaled['ExpectedxAG']
    scaled['Difficulty'] = pd.qcut(scaled['Rank'], q=len(labels), labels=labels)
    return scaled[['Team', 'Difficulty', 'Rank']]


def rank_data_attack(data):
    """Rank attacking output: a higher ``Rank`` means a higher ``Difficulty``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per team with ``Team``, ``PerformanceGls``, ``ExpectedxG`` and
        ``ExpectedxAG`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Team``, ``Difficulty`` (quartile label, 1 = lowest ``Rank``
        quartile, 4 = highest) and ``Rank``, in the row order of ``data``.
    """
    return _rank_teams(data, labels=[1, 2, 3, 4])


def rank_data_defense(data):
    """Rank defensive record: a lower ``Rank`` (less conceded) means a higher ``Difficulty``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per team with ``Team``, ``PerformanceGls``, ``ExpectedxG`` and
        ``ExpectedxAG`` columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Team``, ``Difficulty`` (quartile label, 4 = lowest ``Rank``
        quartile, 1 = highest) and ``Rank``, in the row order of ``data``.
    """
    return _rank_teams(data, labels=[4, 3, 2, 1])


## Home attackers

In [45]:
# Rank home attacks. Note that this rebinds `home_attack` from the per-match
# frame built earlier to the ranking table.
home_attack = rank_data_attack(best_home_attack)
home_attack.sort_values('Difficulty', ascending=False)

Unnamed: 0,Team,Difficulty,Rank
3,Brentford,4,2.897436
4,Brighton,4,1.54508
17,Tottenham,4,2.02314
1,Aston Villa,4,1.786089
12,Manchester City,4,1.901508
0,Arsenal,3,1.175866
2,Bournemouth,3,1.321221
5,Chelsea,3,1.224065
18,West Ham,3,1.224602
11,Liverpool,3,1.46174


What this means for selection: a defender who plays away (A) to these clubs will have a difficulty of 4 (Brentford, Brighton, Tottenham, ...).

## Away attackers

In [46]:
# Rank away attacks. Note that this rebinds `away_attack` from the per-match
# frame built earlier to the ranking table.
away_attack = rank_data_attack(best_away_attack)
away_attack.sort_values('Difficulty', ascending=False)

Unnamed: 0,Team,Difficulty,Rank
19,Wolves,4,2.366667
17,Tottenham,4,2.353333
14,Newcastle Utd,4,2.5
12,Manchester City,4,2.228
11,Liverpool,4,2.473333
8,Fulham,3,2.193333
9,Ipswich Town,3,1.78
10,Leicester City,3,2.178667
6,Crystal Palace,3,1.961333
5,Chelsea,3,1.885333


What this means for selection: a defender who plays at home (H) to these clubs will have a difficulty of 4 (Wolves, Tottenham, Newcastle, ...).

## Home Defence

In [48]:
# Rank home defences
home_def = rank_data_defense(best_home_defense)

# Difficulty is a categorical whose order follows the labels given to pd.qcut
# ([4, 3, 2, 1]), so this ascending sort lists difficulty 4 first
home_def.sort_values('Difficulty', ascending=True)

Unnamed: 0,Team,Difficulty,Rank
0,Arsenal,4,0.25
1,Aston Villa,4,0.682781
2,Bournemouth,4,0.605574
13,Manchester Utd,4,0.382946
11,Liverpool,4,0.511387
17,Tottenham,3,0.976619
15,Nott'ham Forest,3,0.838593
14,Newcastle Utd,3,0.696664
6,Crystal Palace,3,1.133041
7,Everton,3,0.766232


What this means for selection: an attacker who plays away (A) to these clubs will have a difficulty of 4 (Arsenal, Villa, Man Utd, Liverpool).

## Away defence

In [49]:
# Rank away defences
away_def = rank_data_defense(best_away_defence)

# Difficulty is a categorical whose order follows the labels given to pd.qcut
# ([4, 3, 2, 1]), so this ascending sort lists difficulty 4 first
away_def.sort_values('Difficulty', ascending=True)

Unnamed: 0,Team,Difficulty,Rank
0,Arsenal,4,0.819436
17,Tottenham,4,0.851667
5,Chelsea,4,0.901538
8,Fulham,4,0.346538
7,Everton,4,0.148667
15,Nott'ham Forest,3,0.901667
14,Newcastle Utd,3,0.913205
11,Liverpool,3,0.988846
19,Wolves,3,0.938846
3,Brentford,3,0.980513


What this means for selection: an attacker who plays at home (H) to these clubs will have a difficulty of 4 (Arsenal, Tottenham, Chelsea, Fulham, Everton).

## NOTE: sample sizes are small when split into home and away, and there may be a regression towards the mean yet to happen. A short-term way around this may be to use the 'Per 90' data, which takes the average performance both home and away from the start of the season.