## Correlation analysis

In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import random
import pingouin as pg
import glob
import re
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")


In [30]:
# Current gameweek 
gameweek = 13

## Collect available player data

In [31]:
# Initialize an empty list to store all individual, player gameweek data 
all_player_sep = []

# Loop through each gameweek
for i in range(1, gameweek + 1):  # Adjusting the range to start from 1 to gameweek
    # Read the CSV for the current gameweek
    x = pd.read_csv(rf'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Players\Seperate_GW\GW_{i}.csv')
    
    # Append the current gameweek data to the list
    all_player_sep.append(x)

# Concatenate all dataframes in the list into a single dataframe
player_data = pd.concat(all_player_sep, axis=0, ignore_index=True)

# Drop unnamed column
player_data = player_data.drop(columns = ['Unnamed: 0'])

# Sort dataset correctly IMPORTANT
player_data = player_data.sort_values(by= ['Player ID','Gameweek'])

## Updated Difficulty Rating

In [32]:
# Read the difficulty data
difficulty = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Difficulty_ratings\Model_3_FD.csv', index_col=0)

# Create a mapping dictionary
mapping = difficulty.set_index(['Opponent', 'Position'])['Difficulty'].to_dict()

# Apply the mapping to a new column in player_data
player_data['M3_Difficulty'] = player_data.apply(
    lambda row: mapping.get((row['Opponent'], row['Position']), None), axis=1
)

## Team data

In [33]:
# Specify the path to the files
attack = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Attacking\*.csv')
defense = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Defensive\*.csv')

# Define a function to extract the week number from the filename
def extract_week_number(filename):
    match = re.search(r'GW_(\d+)', filename)
    return int(match.group(1)) if match else None

# Read each attacking file and add the 'Week' column
att_weekly_data = pd.concat(
    [pd.read_csv(file).assign(Week=extract_week_number(file)) for file in attack],
    ignore_index=True
)

# Read each defensive file and add the 'Week' column
def_weekly_data = pd.concat(
    [pd.read_csv(file).assign(Week=extract_week_number(file)) for file in defense],
    ignore_index=True
)
# Remove 'VS' team
def_weekly_data['Team'] = def_weekly_data['Team'].str[3:]

# Choose columns data 
columns_new = ['Team','Week', 'Playing TimeMP', 'Possession','PerformanceGls','PerformanceAst','ExpectedxG','ExpectedxAG',
               'Per 90 MinutesGls','Per 90 MinutesAst','Per 90 MinutesxG','Per 90 MinutesxAG']

# Attacking data
attacking_data = pd.DataFrame(att_weekly_data[columns_new]).sort_values(by = 'Week')

# # Defensive data
defensive_data = pd.DataFrame(def_weekly_data[columns_new]).sort_values(by = 'Week')

# Collect fixture list
fixtures = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Schedule\Fixtures_alt_names.csv')

# Create function to collect homedata
def team_data(team, fixtures, gameweek):
    # Create a list to store the results
    data = []

    # Iterate over each row of the fixtures DataFrame
    for index, row in fixtures.iterrows():
        # Check if the row's team matches the input team
        if row['Team'] == team:
            # Loop through the columns corresponding to gameweeks
            for col in fixtures.columns[1:gameweek + 1]:
                if '(H)' or '(A)' in row[col]:  # Check if it's a home game and add GW and opponent
                    data.append([col, row[col]])

    # Return the collected home data
    return data
# Get games
games = []

# List of unique teams 
teams = attacking_data['Team'].unique()

for team in teams:
    data = team_data(team, fixtures, gameweek)  # Fetch home game data for the team
    for game in data:  # Assuming 'data' is a list of games
        # You can extract relevant information from 'game', like opponent, week, etc.
        games.append([game[0], team, game[1]])

# Creating DataFrame from the home_games list
home = pd.DataFrame(games, columns=['Week', 'Team', 'Opponent'])

# Remove 'GW' from the 'Week' string and convert it to an integer
home['Week'] = home['Week'].str[2:].astype(int)

# Define columns
cols = ['Team', 'Week', 'Possession', 'PerformanceGls',
       'PerformanceAst', 'ExpectedxG', 'ExpectedxAG', 'Per 90 MinutesGls',
       'Per 90 MinutesAst', 'Per 90 MinutesxG', 'Per 90 MinutesxAG']

# Get attacking and defensive data
attacking = attacking_data[cols]
defensive = defensive_data[cols]

# Get all data
team_attack = home.merge(attacking, on=['Week', 'Team'])
team_defense = home.merge(defensive, on=['Week', 'Team'])

# Rename team_names to align with player_data
# Define a dictionary of old team names as keys and new names as values
name_changes = {
    "Nott'ham Forest": 'Nottingham Forest',
    'Manchester Utd': 'Man Utd',
    'Manchester City': 'Man City',
    'Newcastle Utd': 'Newcastle',
    'Leicester City': 'Leicester',
    'Ipswich Town': 'Ipswich',
    'Tottenham': 'Spurs',
    # Add more teams as needed
}
# Replace the team names using the dictionary
team_attack['Team'] = team_attack['Team'].replace(name_changes)
team_defense['Team'] = team_defense['Team'].replace(name_changes)

# Rename team columns
team_defense.rename(columns=lambda col: f"{col} against", inplace=True)
team_defense.rename(columns={'Week against': 'Week', 'Team against': 'Team', 'Opponent against': 'Opponent'}, inplace=True)

# Merge the data with team information
merged_df = pd.merge(player_data, team_attack, on=['Team', 'Opponent'], how='left')
player_d = pd.merge(merged_df, team_defense, on=['Team', 'Opponent'], how='left')

# Drop uneeded columns
player_data = player_d.drop(columns = ['Week_x', 'Week_y', 'KO_time'])

# Collect columns that are averages of team performance for weeks we have nan data 
team = player_data[['Player ID', 'Gameweek','Per 90 MinutesxG', 'Per 90 MinutesGls', 'Per 90 MinutesxG against','Per 90 MinutesGls against']]

# Filter on earliest GW possible
team = team[team['Gameweek'] == 8]

# Merge the data on 'Player ID'
complete = player_data.merge(team, on='Player ID', how='left', suffixes=('_post8', '_pre8'))

# Replace NaN values in POST_8 variables with average values from 'Per 90 MinutesxG_team'
complete['PerformanceGls'] = complete['PerformanceGls'].fillna(complete['Per 90 MinutesGls_pre8'])
complete['ExpectedxG'] = complete['ExpectedxG'].fillna(complete['Per 90 MinutesxG_pre8'])
complete['PerformanceGls against'] = complete['PerformanceGls against'].fillna(complete['Per 90 MinutesGls against_pre8'])
complete['ExpectedxG against'] = complete['ExpectedxG against'].fillna(complete['Per 90 MinutesxG against_pre8'])

In [34]:
# Rename column
complete = complete.rename(columns={'Gameweek_post8': 'Gameweek',
                                    'PerformanceGls': 'Team_gls',
                                    'ExpectedxG': 'TeamxG',
                                    'PerformanceGls against': 'Team_gls_against',
                                    'ExpectedxG against': 'TeamxG_against',
                                    })

columns_to_keep = ['Player ID', 'Name', 'Last_Name', 'Team', 'Position', 'Cost_Today',
       'GW Points', 'Minutes', 'Goals', 'Assists', 'Clean Sheets',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Influence', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek', 'Opponent', 'Difficulty',
       'M3_Difficulty','Team_gls', 'TeamxG', 'Team_gls_against', 'TeamxG_against']

player_data = complete[columns_to_keep]

In [35]:
# Sort dataset correctly IMPORTANT
player_data = player_data.sort_values(by= ['Player ID','Gameweek'])

In [36]:
player_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8522 entries, 0 to 8521
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Player ID           8522 non-null   int64  
 1   Name                8522 non-null   object 
 2   Last_Name           8522 non-null   object 
 3   Team                8522 non-null   object 
 4   Position            8522 non-null   object 
 5   Cost_Today          8522 non-null   int64  
 6   GW Points           8522 non-null   int64  
 7   Minutes             8522 non-null   int64  
 8   Goals               8522 non-null   int64  
 9   Assists             8522 non-null   int64  
 10  Clean Sheets        8522 non-null   int64  
 11  Goals Conceded      8522 non-null   int64  
 12  Penalties Saved     8522 non-null   int64  
 13  Penalties Missed    8522 non-null   int64  
 14  YC                  8522 non-null   int64  
 15  RC                  8522 non-null   int64  
 16  Saves 

## Feature Engineering and rolling averages

In [37]:
number_of_games = 3  # Define the window size

# Apply the logic for rolling mean only for Home ('H') games
player_data["Home_Form_3"] = player_data.groupby("Player ID")["GW Points"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(H\)")]  # Exact match for '(H)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Apply the logic for rolling mean only for Away ('A') games
player_data["Away_Form_3"] = player_data.groupby("Player ID")["GW Points"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(A\)")]  # Exact match for '(A)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Combined form
player_data['Combined_Form_3'] = player_data['Home_Form_3'].fillna(player_data['Away_Form_3'])

# Form/Fixture Difficulty
player_data['F_FD_1'] = player_data['Combined_Form_3']/player_data['M3_Difficulty']

# Apply the logic for rolling mean only for Home ('H') games
player_data["Home_xG"] = player_data.groupby("Player ID")["xG"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(H\)")]  # Exact match for '(H)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Apply the logic for rolling mean only for Away ('A') games
player_data["Away_xG"] = player_data.groupby("Player ID")["xG"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(A\)")]  # Exact match for '(A)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Combined form
player_data['Combined_xG_roll'] = player_data['Home_xG'].fillna(player_data['Away_xG'])

# xG/Fixture Difficulty
player_data['xG_FD_1'] = player_data['Combined_xG_roll']/player_data['M3_Difficulty']


# Apply the logic for rolling mean only for Home ('H') games
player_data["Home_xGc"] = player_data.groupby("Player ID")["xGc"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(H\)")]  # Exact match for '(H)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Apply the logic for rolling mean only for Away ('A') games
player_data["Away_xGc"] = player_data.groupby("Player ID")["xGc"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(A\)")]  # Exact match for '(A)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Combined form
player_data['Combined_xGc_roll'] = player_data['Home_xGc'].fillna(player_data['Away_xGc'])

# xGc/Fixture Difficulty
player_data['xGc_FD_1'] = player_data['Combined_xGc_roll']/player_data['M3_Difficulty']


# Apply the logic for rolling mean only for Home ('H') games
player_data["Home_team_goals"] = player_data.groupby("Player ID")["Team_gls"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(H\)")]  # Exact match for '(H)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Apply the logic for rolling mean only for Away ('A') games
player_data["Away_team_goals"] = player_data.groupby("Player ID")["Team_gls_against"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(A\)")]  # Exact match for '(A)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Combined goals
player_data['Combined_team_goals'] = player_data['Home_team_goals'].fillna(player_data['Away_team_goals'])

# Goals/Fixture Difficulty
player_data['Team_goals_FD_1'] = player_data['Combined_team_goals']/player_data['M3_Difficulty']


# Apply the logic for rolling mean only for Home ('H') games
player_data["Home_team_goals_against"] = player_data.groupby("Player ID")["Team_gls_against"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(H\)")]  # Exact match for '(H)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Apply the logic for rolling mean only for Away ('A') games
player_data["Away_team_goals_against"] = player_data.groupby("Player ID")["Team_gls_against"].transform(
    lambda x: (
        x.shift(1)  # Shift values to exclude the current game
        .loc[player_data["Opponent"].str.contains(r"\(A\)")]  # Exact match for '(A)' using regex
        .rolling(window=number_of_games)  # Apply rolling mean
        .mean()  # Calculate mean
        .round(3)  # Round to three decimal places
    )
)

# Combined form
player_data['Combined_team_goals_against'] = player_data['Home_team_goals_against'].fillna(player_data['Away_team_goals_against'])

# Goals/Fixture Difficulty
player_data['Team_goal_against_FD_1'] = player_data['Combined_team_goals_against']/player_data['M3_Difficulty']

In [38]:
# Remove players who play less than 61 minutes in a game (i.e. they do not recieve their 2 points minimum for playoing this amount)
player_data = player_data[player_data['Minutes'] > 60].copy()

In [39]:
player_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2661 entries, 23 to 8433
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Player ID                    2661 non-null   int64  
 1   Name                         2661 non-null   object 
 2   Last_Name                    2661 non-null   object 
 3   Team                         2661 non-null   object 
 4   Position                     2661 non-null   object 
 5   Cost_Today                   2661 non-null   int64  
 6   GW Points                    2661 non-null   int64  
 7   Minutes                      2661 non-null   int64  
 8   Goals                        2661 non-null   int64  
 9   Assists                      2661 non-null   int64  
 10  Clean Sheets                 2661 non-null   int64  
 11  Goals Conceded               2661 non-null   int64  
 12  Penalties Saved              2661 non-null   int64  
 13  Penalties Missed      

In [40]:
player_data.columns

Index(['Player ID', 'Name', 'Last_Name', 'Team', 'Position', 'Cost_Today',
       'GW Points', 'Minutes', 'Goals', 'Assists', 'Clean Sheets',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Influence', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek', 'Opponent', 'Difficulty',
       'M3_Difficulty', 'Team_gls', 'TeamxG', 'Team_gls_against',
       'TeamxG_against', 'Home_Form_3', 'Away_Form_3', 'Combined_Form_3',
       'F_FD_1', 'Home_xG', 'Away_xG', 'Combined_xG_roll', 'xG_FD_1',
       'Home_xGc', 'Away_xGc', 'Combined_xGc_roll', 'xGc_FD_1',
       'Home_team_goals', 'Away_team_goals', 'Combined_team_goals',
       'Team_goals_FD_1', 'Home_team_goals_against', 'Away_team_goals_against',
       'Combined_team_goals_against', 'Team_goal_against_FD_1'],
      dtype='object')

## Separate by position

In [41]:
# Defensive and Forward players
attackers = player_data[player_data['Position'].isin(['MID', 'FWD'])].copy()
defenders = player_data[player_data['Position'].isin(['GK', 'DEF'])].copy()

# Combined data
data = [attackers, defenders]

## Defensive

In [42]:
# Define correlation columns
correlations = ['GW Points', 'Minutes', 'Goals', 'Assists', 'Clean Sheets',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Influence', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek', 'Difficulty',
       'M3_Difficulty', 'Team_gls', 'TeamxG', 'Team_gls_against',
       'TeamxG_against', 'Home_Form_3', 'Away_Form_3', 'Combined_Form_3',
       'F_FD_1', 'Home_xG', 'Away_xG', 'Combined_xG_roll', 'xG_FD_1',
       'Home_xGc', 'Away_xGc', 'Combined_xGc_roll', 'xGc_FD_1',
       'Home_team_goals', 'Away_team_goals', 'Combined_team_goals',
       'Team_goals_FD_1', 'Home_team_goals_against', 'Away_team_goals_against',
       'Combined_team_goals_against', 'Team_goal_against_FD_1']

# Sort the correlation matrix
corr = defenders[correlations].corr().sort_values(by='GW Points', ascending=False)
corr.head(25)

Unnamed: 0,GW Points,Minutes,Goals,Assists,Clean Sheets,Goals Conceded,Penalties Saved,Penalties Missed,YC,RC,...,Combined_xGc_roll,xGc_FD_1,Home_team_goals,Away_team_goals,Combined_team_goals,Team_goals_FD_1,Home_team_goals_against,Away_team_goals_against,Combined_team_goals_against,Team_goal_against_FD_1
GW Points,1.0,0.024924,0.403171,0.226196,0.771949,-0.62458,0.199188,,-0.210013,-0.113041,...,0.03435,0.190621,-0.132523,-0.130273,-0.126628,0.11123,-0.080499,-0.130273,-0.10208,0.132703
Total BPS,0.89524,0.072275,0.21893,0.189636,0.754053,-0.726792,0.106428,,-0.224672,-0.120277,...,-0.000451,0.190005,-0.119165,-0.127913,-0.117581,0.156257,-0.164863,-0.127913,-0.142149,0.142232
Clean Sheets,0.771949,-0.035516,-0.013495,-0.019604,1.0,-0.639607,0.050145,,-0.056345,-0.028665,...,0.014398,0.211792,-0.159165,-0.116863,-0.131943,0.143363,-0.088679,-0.116863,-0.096091,0.180413
Total Bonus Points,0.683009,0.036207,0.213614,0.125636,0.457865,-0.325788,0.118532,,-0.066278,-0.017,...,0.002516,0.09705,-0.073039,-0.112329,-0.088891,0.054247,-0.034384,-0.112329,-0.068877,0.060603
Influence,0.428304,0.238979,0.49401,0.209451,0.002962,0.039888,0.160724,,-0.141151,-0.057729,...,0.10866,0.068345,-0.023996,0.093968,0.040596,0.007814,0.028782,0.093968,0.063889,0.02014
Goals,0.403171,0.018999,1.0,0.002819,-0.013495,0.066233,-0.012145,,-0.015739,-0.009908,...,0.061477,0.028525,0.02853,-0.026593,-0.005708,-0.030182,0.083268,-0.026593,0.017349,-0.014927
ICT Index,0.34057,0.126194,0.504704,0.289482,-0.016685,0.038484,0.050627,,-0.079399,-0.051541,...,0.039802,0.033709,0.069372,0.046494,0.058775,0.045819,-0.049392,0.046494,-0.002273,-0.009693
Assists,0.226196,-0.025645,0.002819,1.0,-0.019604,0.034843,-0.01498,,0.019952,-0.012222,...,0.00709,-0.025704,0.017266,0.053107,0.032966,-0.01333,-0.053299,0.053107,-0.009201,-0.057556
Penalties Saved,0.199188,0.02373,-0.012145,-0.01498,0.050145,-0.036588,1.0,,-0.003204,-0.003882,...,-0.017637,-0.015365,-0.030245,-0.075249,-0.058555,-0.051975,0.041806,-0.075249,-0.03261,-0.036254
xGc_FD_1,0.190621,0.129112,0.028525,-0.025704,0.211792,-0.179325,-0.015365,,-0.023773,-0.053015,...,0.830828,1.0,-0.108005,0.073931,-0.026536,0.386786,0.31415,0.073931,0.210177,0.60324


## Forwards

In [43]:
# Sort the correlation matrix
corr_f = attackers[correlations].corr().sort_values(by='Goals', ascending=False)
corr_f.head(15)

Unnamed: 0,GW Points,Minutes,Goals,Assists,Clean Sheets,Goals Conceded,Penalties Saved,Penalties Missed,YC,RC,...,Combined_xGc_roll,xGc_FD_1,Home_team_goals,Away_team_goals,Combined_team_goals,Team_goals_FD_1,Home_team_goals_against,Away_team_goals_against,Combined_team_goals_against,Team_goal_against_FD_1
Goals,0.866078,0.069949,1.0,0.061711,-0.009657,-0.000799,,-0.004593,-0.051881,7.3e-05,...,0.000197,0.044716,0.04634,-0.052216,0.005291,0.070833,-0.040434,-0.052216,-0.043545,0.048186
GW Points,1.0,0.085264,0.866078,0.480751,0.104043,-0.103851,,-0.038021,-0.158653,-0.067448,...,0.007121,0.079228,0.07175,-0.078944,0.008241,0.096237,-0.053466,-0.078944,-0.062081,0.069063
Influence,0.895288,0.207744,0.848594,0.371491,0.002917,-0.020974,,-0.000123,-0.05387,-0.024576,...,0.026858,0.104309,0.056297,-0.034892,0.01823,0.126621,-0.079729,-0.034892,-0.055032,0.083385
Total BPS,0.900487,0.180502,0.804039,0.428711,0.013453,-0.035021,,-0.035551,-0.127282,-0.050574,...,0.001439,0.081954,0.097916,-0.064095,0.027026,0.133654,-0.085951,-0.064095,-0.07199,0.077398
Total Bonus Points,0.855506,0.121537,0.759848,0.276876,-0.005065,-0.072155,,-0.016067,-0.055491,-0.025806,...,0.018661,0.065924,0.102186,-0.055159,0.036604,0.07076,-0.022122,-0.055159,-0.034322,0.057651
ICT Index,0.739703,0.230289,0.667589,0.348247,0.021512,-0.06959,,0.023682,-0.068096,-0.022164,...,0.021873,0.100193,0.101229,-0.10449,0.015713,0.134446,-0.185296,-0.10449,-0.141022,0.026911
xG,0.538817,0.070081,0.634131,0.07772,-0.00404,-0.021655,,0.149813,-0.079035,-0.00232,...,0.032139,0.059804,0.057267,-0.118638,-0.022106,0.046573,-0.064741,-0.118638,-0.088608,0.005097
Threat,0.534244,0.11808,0.58509,0.104027,0.016458,-0.067074,,0.046926,-0.075404,0.006931,...,0.026569,0.067351,0.066456,-0.115922,-0.007947,0.077138,-0.159889,-0.115922,-0.133915,-0.009553
xGi,0.576391,0.11475,0.577104,0.237343,-0.013038,-0.030639,,0.133921,-0.068089,-0.015963,...,0.018543,0.075314,0.097954,-0.14737,-0.013346,0.092725,-0.086646,-0.14737,-0.112808,0.022106
Team_gls,0.28101,0.055385,0.221427,0.219934,-0.016785,-0.043797,,-0.005875,-0.035715,-0.043017,...,-0.097566,0.023444,0.111013,-0.246632,-0.084569,0.083478,-0.056757,-0.246632,-0.162488,0.048838


In [44]:
player_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2661 entries, 23 to 8433
Data columns (total 57 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Player ID                    2661 non-null   int64  
 1   Name                         2661 non-null   object 
 2   Last_Name                    2661 non-null   object 
 3   Team                         2661 non-null   object 
 4   Position                     2661 non-null   object 
 5   Cost_Today                   2661 non-null   int64  
 6   GW Points                    2661 non-null   int64  
 7   Minutes                      2661 non-null   int64  
 8   Goals                        2661 non-null   int64  
 9   Assists                      2661 non-null   int64  
 10  Clean Sheets                 2661 non-null   int64  
 11  Goals Conceded               2661 non-null   int64  
 12  Penalties Saved              2661 non-null   int64  
 13  Penalties Missed      

# Moderators

When there is a difference in the relationship between two variables when you include another variable. 

For example, is there a different relationship between GW Points and Form/FD for defensive and attacking players, or players with high/low teamxG. If you can work out this difference, you can then work out a parameter to create a ratio on.

To assess for moderators, you need to create an interaction variable (which is )

In [48]:
player_data['GW Points']

23       1
25       6
26       6
27       2
28      15
        ..
8406     1
8407     3
8428     3
8429     5
8433     1
Name: GW Points, Length: 2661, dtype: int64

In [49]:
import statsmodels.formula.api as smf

# Centering variables to reduce multicollinearity
player_data['GW_Points_c'] = player_data['GW Points'] - player_data['GW Points'].mean()
player_data['Combined_team_goals_against'] = player_data['Combined_team_goals_against'] - player_data['Combined_team_goals_against'].mean()

# Creating interaction term
player_data['interaction'] = player_data['GW_Points_c'] * player_data['Combined_team_goals_against']

# Fitting the regression model with interaction
model = smf.ols('F_FD_1 ~ GW_Points_c * Combined_team_goals_against', data=player_data).fit()

# Displaying the summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 F_FD_1   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     39.13
Date:                Tue, 03 Dec 2024   Prob (F-statistic):           2.08e-24
Time:                        17:50:50   Log-Likelihood:                -1776.2
No. Observations:                1625   AIC:                             3560.
Df Residuals:                    1621   BIC:                             3582.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

## Partial Correlation

Assesses the unique relationship between two variables, when controlling for another variable (also known as a mediator or confounder). 

The confounding variable may suppress the interation or real relationship between the other variables. The null hypothesis is that after controlling for the third variable (or confounder), there is no relationship between a and b. 

rab.c = rab - rac-rbc/ sqrt(1-r^2 ac)sqrt(1-r^2bc)

Another way to think about it is the residuals. Look at the residuals between ac and the residuals between bc (c is the confounder) and then take the correlation between these residuals to calculate the direct relationship between ab.


In [173]:
partial_corr_results = pg.partial_corr(data = data[0], x = 'GW Points', y = 'Form_4', covar = ['M3_Difficulty'])
print(partial_corr_results)

           n         r          CI95%     p-val
pearson  699  0.006747  [-0.07, 0.08]  0.858768


In [None]:
# If you managed to find the better difficulty measure, it should really impact the relationship between GW Points and Total BPS??

## Semi partial correlation

Assesses the unique relationship between two variables, as a function of total variance.

Usually when a confounder is thought to only affect one of the variables and not both.



In [None]:
## Semi 

Correlation analysis

Clean sheets and Total BPS

or Clean sheets controlled for by fixture difficulty

Variables we are interested in doing further analysis on correlated with clean sheets and goals: 

ICT index
Total BPS
Influence
Creativity
xG

We can control for other impact of variables (moderation and partial correlation techniques)


## Multi-collineraity

This occurs when 2 or more predictors share over 80% variance with each other.
This could be indicated with an r^2 value of over 0.8. It means one could be predicted from the other to a substantial degree.
This is problematic, as the parameters of the model (b) become interchangeable (and therefore unreliable) and the mathmatical techniques cannot discriminate between
each predictor. 
One other test is the Variance Inflation Factor (VIF) = 1/ 1 r^2.
If the number is greater than 5 this is moderate, if over 10 then severe multicollineraity.