## Correlation analysis

In [246]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import random
import pingouin as pg
import glob
import re
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

In [247]:
# Current gameweek: the most recent gameweek to include. It bounds the range of
# per-gameweek player files that are loaded and the number of fixture columns
# that are scanned in the cells below.
gameweek = 14

## Collect available player data

In [248]:
# Read one CSV per gameweek (GW_1 ... GW_<gameweek>) into a list of frames
all_player_sep = [
    pd.read_csv(rf'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Players\Seperate_GW\GW_{week}.csv')
    for week in range(1, gameweek + 1)
]

# Stack the weekly frames into one long table, then tidy it up in a single chain
player_data = (
    pd.concat(all_player_sep, axis=0, ignore_index=True)
    # Drop the 'Unnamed: 0' column (presumably the index saved with each CSV)
    .drop(columns=['Unnamed: 0'])
    # IMPORTANT: keep each player's rows together and in gameweek order
    .sort_values(by=['Player ID', 'Gameweek'])
)

## Updated Difficulty Rating

In [249]:
# Load the current fixture-difficulty table (the first CSV column is used as the index)
difficulty = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Difficulty_ratings\Model\FD_combined\Current_FD.csv', index_col=0)

# Lookup table: (Opponent, Position) -> FD_combined
mapping = difficulty.set_index(['Opponent', 'Position'])['FD_combined'].to_dict()


def _lookup_fd_combined(row):
    """Return the FD_combined rating for one player row, or None when there is no match."""
    key = (row['Opponent'], row['Position'])
    return mapping.get(key)


# Attach the rating to every player row
player_data['FD_combined'] = player_data.apply(_lookup_fd_combined, axis=1)

## Team data

In [250]:
# Collect the per-gameweek team CSV files from the attacking and defensive folders.
# NOTE: glob does not guarantee any particular file order; the frames built from
# these lists are sorted by 'Week' further down in this cell.
attack = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Attacking\*.csv')
defense = glob.glob(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Team\Seperate_GW\Defensive\*.csv')

def extract_week_number(filename):
    """Return the gameweek number embedded in `filename` as 'GW_<digits>'.

    The first 'GW_<digits>' occurrence is used. Returns None when the name
    contains no such pattern.
    """
    found = re.search(r'GW_(\d+)', filename)
    if found is None:
        return None
    return int(found.group(1))

def _read_weekly(files):
    """Read every CSV in `files`, tag each frame with its gameweek in a 'Week' column, and stack them."""
    frames = []
    for path in files:
        weekly = pd.read_csv(path)
        frames.append(weekly.assign(Week=extract_week_number(path)))
    return pd.concat(frames, ignore_index=True)


# Attacking and defensive team tables, one row per team per gameweek file
att_weekly_data = _read_weekly(attack)
def_weekly_data = _read_weekly(defense)

# Strip the first three characters from the defensive team names
# (presumably a 'vs ' prefix) so they line up with the attacking table
def_weekly_data['Team'] = def_weekly_data['Team'].str[3:]

# Columns carried forward from the raw team tables
columns_new = [
    'Team', 'Week', 'Playing TimeMP', 'Possession',
    'PerformanceGls', 'PerformanceAst', 'ExpectedxG', 'ExpectedxAG',
    'Per 90 MinutesGls', 'Per 90 MinutesAst', 'Per 90 MinutesxG', 'Per 90 MinutesxAG',
]

# Attacking data, ordered by gameweek
attacking_data = att_weekly_data[columns_new].sort_values(by='Week')

# Defensive data, ordered by gameweek
defensive_data = def_weekly_data[columns_new].sort_values(by='Week')

# Fixture list
fixtures = pd.read_csv(r'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Fixtures\Schedule\Fixtures_alt_names.csv')

def team_data(team, fixtures, gameweek):
    """Collect the fixtures listed for `team` in the first `gameweek` gameweek columns.

    Parameters
    ----------
    team : str
        Value matched against the 'Team' column of `fixtures`.
    fixtures : pandas.DataFrame
        Fixture schedule with a 'Team' column. The columns at positions
        1..gameweek are scanned as gameweek cells (this assumes the first
        column is the team column, as the positional slice implies).
    gameweek : int
        Number of gameweek columns to scan.

    Returns
    -------
    list of [str, str]
        One ``[column_name, fixture]`` pair for every scanned cell that is a
        string containing a home '(H)' or away '(A)' marker. Cells without a
        marker (for example blanks or NaN) are skipped.
    """
    data = []
    gameweek_cols = fixtures.columns[1:gameweek + 1]

    # Only rows belonging to the requested team are relevant
    for _, row in fixtures[fixtures['Team'] == team].iterrows():
        for col in gameweek_cols:
            cell = row[col]
            # Each marker needs its own membership test: the previous
            # `'(H)' or '(A)' in cell` was always true because the literal
            # '(H)' is truthy, so every cell was appended regardless of content.
            if isinstance(cell, str) and ('(H)' in cell or '(A)' in cell):
                data.append([col, cell])

    return data
# Unique team names as they appear in the attacking table
teams = attacking_data['Team'].unique()

# One [gameweek column, team, opponent] record for every fixture team_data returns
games = [
    [fixture[0], club, fixture[1]]
    for club in teams
    for fixture in team_data(club, fixtures, gameweek)
]

# Tabulate the fixture records
home = pd.DataFrame(games, columns=['Week', 'Team', 'Opponent'])

# Drop the two-character prefix of the gameweek label and store the week as an integer
home['Week'] = home['Week'].str[2:].astype(int)

# Team-level columns to carry into the fixture tables
cols = [
    'Team', 'Week', 'Possession', 'PerformanceGls', 'PerformanceAst',
    'ExpectedxG', 'ExpectedxAG', 'Per 90 MinutesGls', 'Per 90 MinutesAst',
    'Per 90 MinutesxG', 'Per 90 MinutesxAG',
]

# Narrow the attacking and defensive tables to those columns
attacking = attacking_data[cols]
defensive = defensive_data[cols]

# Team names in the team tables (keys) and the names used in player_data (values)
name_changes = {
    "Nott'ham Forest": 'Nottingham Forest',
    'Manchester Utd': 'Man Utd',
    'Manchester City': 'Man City',
    'Newcastle Utd': 'Newcastle',
    'Leicester City': 'Leicester',
    'Ipswich Town': 'Ipswich',
    'Tottenham': 'Spurs',
    # Add more teams as needed
}

# Fixture-level attacking stats, with team names aligned to player_data
team_attack = home.merge(attacking, on=['Week', 'Team'])
team_attack['Team'] = team_attack['Team'].replace(name_changes)

# Fixture-level defensive stats, with team names aligned to player_data
team_defense = home.merge(defensive, on=['Week', 'Team'])
team_defense['Team'] = team_defense['Team'].replace(name_changes)

# Suffix every defensive stat with ' against', leaving the key columns untouched
_key_columns = ('Week', 'Team', 'Opponent')
team_defense = team_defense.rename(
    columns=lambda col: col if col in _key_columns else f"{col} against"
)

# Attach the team attacking stats, then the defensive stats, to every player row
_join_keys = ['Team', 'Opponent']
merged_df = player_data.merge(team_attack, on=_join_keys, how='left')
player_d = merged_df.merge(team_defense, on=_join_keys, how='left')

# Drop the duplicated week columns produced by the two merges, plus the kick-off time
player_data = player_d.drop(columns=['Week_x', 'Week_y', 'KO_time'])

# Per-90 team figures from one reference gameweek, used to fill gaps in the team stats
_baseline_gameweek = 8  # earliest gameweek possible, per the original note
_per90_columns = [
    'Per 90 MinutesxG', 'Per 90 MinutesGls',
    'Per 90 MinutesxG against', 'Per 90 MinutesGls against',
]
team = player_data[['Player ID', 'Gameweek'] + _per90_columns]
team = team[team['Gameweek'] == _baseline_gameweek]

# Bring the reference-gameweek figures alongside every row of the same player;
# overlapping columns get the '_post8' (all rows) and '_pre8' (reference) suffixes
complete = player_data.merge(team, on='Player ID', how='left', suffixes=('_post8', '_pre8'))

# Missing team stats fall back to the matching reference-gameweek per-90 value
_fallbacks = {
    'PerformanceGls': 'Per 90 MinutesGls_pre8',
    'ExpectedxG': 'Per 90 MinutesxG_pre8',
    'PerformanceGls against': 'Per 90 MinutesGls against_pre8',
    'ExpectedxG against': 'Per 90 MinutesxG against_pre8',
}
for _target, _fallback in _fallbacks.items():
    complete[_target] = complete[_target].fillna(complete[_fallback])

# Restore the gameweek column name and give the team stats shorter names
complete = complete.rename(
    columns={
        'Gameweek_post8': 'Gameweek',
        'PerformanceGls': 'Team_gls',
        'ExpectedxG': 'TeamxG',
        'PerformanceGls against': 'Team_gls_against',
        'ExpectedxG against': 'TeamxG_against',
    }
)

# Final column selection for the analysis table
columns_to_keep = [
    'Player ID', 'Name', 'Last_Name', 'Team', 'Position', 'Cost_Today',
    'GW Points', 'Minutes', 'Goals', 'Assists', 'Clean Sheets',
    'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
    'Saves', 'Total Bonus Points', 'Total BPS', 'Influence', 'Creativity',
    'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
    'Transfers Out GW', 'Gameweek', 'Opponent', 'FD_combined',
    'Team_gls', 'TeamxG', 'Team_gls_against', 'TeamxG_against',
]

player_data = complete[columns_to_keep]

In [251]:
# Sort dataset correctly IMPORTANT: re-sort after the merges above so each
# player's rows are in gameweek order. The grouped shift/rolling features
# computed later work through each player's rows in this order.
player_data = player_data.sort_values(by= ['Player ID','Gameweek'])

# Difficulty rating difference

In [252]:
# Display the difficulty table for reference (one row per Opponent/Position pair)
difficulty

Unnamed: 0,Opponent,Position,xG,Difficulty_xG,Av_GW_Points,Difficulty_points,FD_combined,Team
40,ARS (A),DEF,1.433,3,1.71,5,5,Arsenal
41,ARS (H),DEF,1.675,5,2.76,3,5,Arsenal
42,AVL (A),DEF,2.533,5,2.81,3,4,Aston Villa
43,AVL (H),DEF,1.375,4,2.56,4,4,Aston Villa
44,BHA (A),DEF,1.700,4,2.41,4,4,Brighton
...,...,...,...,...,...,...,...,...
115,TOT (H),MID,0.933,5,3.56,3,4,Spurs
116,WHU (A),MID,2.167,1,4.68,1,2,West Ham
117,WHU (H),MID,1.950,1,4.26,2,2,West Ham
118,WOL (A),MID,2.000,1,4.63,1,2,Wolves


In [253]:
# Opponent difficulty is a plain copy of FD_combined. The previous code routed
# this through `Series.rename(inplace=True)`, which does no renaming here, clears
# the series name, and is documented to return None for in-place calls.
player_data['Opponent_Difficulty'] = player_data['FD_combined']

# Which positional rating of the difficulty table applies to each player position:
# attacking players use the 'DEF' rating, defensive players use the 'FWD' rating
_rating_position = {'MID': 'DEF', 'FWD': 'DEF', 'GK': 'FWD', 'DEF': 'FWD'}

# One record per player row that has usable venue and position information
player_difficulty = []

for _, row in player_data.iterrows():
    opponent_info = row['Opponent']

    # Rows without a fixture string cannot be classified as home or away
    if not isinstance(opponent_info, str):
        continue

    # A player at home is matched to the '(A)' rows of the player's own team in
    # the difficulty table, and a player away to the '(H)' rows
    if "(H)" in opponent_info:
        venue_pattern = r"\(A\)"
    elif "(A)" in opponent_info:
        venue_pattern = r"\(H\)"
    else:
        continue  # Skip if no valid home/away info found

    rating_position = _rating_position.get(row['Position'])
    if rating_position is None:
        continue  # Skip if position is not recognized

    # Narrow the difficulty table step by step: team, then venue, then position
    team_rows = difficulty[difficulty['Team'] == row['Team']]
    venue_rows = team_rows[team_rows['Opponent'].str.contains(venue_pattern)]
    score = venue_rows.loc[venue_rows['Position'] == rating_position, 'FD_combined'].sum()

    player_difficulty.append({
        'Player ID': row['Player ID'],
        'Opponent': opponent_info,
        'Player_Difficulty': score
    })

# One row per processed player fixture: Player ID, Opponent, Player_Difficulty
player_difficulty_summary = pd.DataFrame(player_difficulty)

In [254]:
# Full data: attach Player_Difficulty to each player row, matching on player and opponent.
# No `how` is given, so this is pandas' default inner join: player rows with no
# entry in player_difficulty_summary are dropped here.
player_data = player_data.merge(player_difficulty_summary, on = ['Player ID', 'Opponent'])

In [255]:
# Create difficulty difference: Player_Difficulty minus Opponent_Difficulty
# (Opponent_Difficulty holds the same values as FD_combined)
player_data['Difficulty_diff'] = player_data['Player_Difficulty'] - player_data['Opponent_Difficulty']

## Feature Engineering and rolling averages

In [256]:
number_of_games = 4  # Rolling window size (number of previous rows per player)


def _rolling_form(series):
    """Rolling mean over the previous `number_of_games` rows, excluding the current row.

    The shift(1) keeps the current gameweek out of its own average. The result
    is NaN until a full window of earlier rows is available.
    """
    return series.shift(1).rolling(window=number_of_games).mean().round(3)


# New rolling-average column -> source column it is computed from.
# (The original cell computed Form_xGc twice; it is computed once here.)
_form_sources = {
    "Form": "GW Points",
    "Form_xG": "xG",
    "Form_xGc": "xGc",
    "Form_TeamxG": "TeamxG",
    "Form_TeamxG_against": "TeamxG_against",
}

# Apply the rolling mean per player for each source column
for _form_col, _source_col in _form_sources.items():
    player_data[_form_col] = (
        player_data.groupby("Player ID")[_source_col].transform(_rolling_form)
    )

# Form/Difficulty_difference
# NOTE(review): Difficulty_diff is a difference of two ratings and may be 0,
# which would give inf/NaN here - confirm that is acceptable downstream.
player_data['Form_Fix_Diff'] = player_data['Form'] / player_data['Difficulty_diff']
# xG/Fixture Difficulty
player_data['xG_FD'] = player_data['Form_xG'] / player_data['FD_combined']
# xGc/Fixture Difficulty
player_data['xGc_FD'] = player_data['Form_xGc'] / player_data['FD_combined']
# Form/Fixture Difficulty
player_data['Form_FD'] = player_data['Form'] / player_data['FD_combined']

In [257]:
# Keep only appearances of more than 60 minutes, i.e. remove rows where the player
# played fewer than 61 minutes (the original note ties this to the 2-point minimum
# awarded for playing that amount).
# NOTE(review): rows with exactly 60 minutes are removed too - confirm the threshold is intended.
player_data = player_data[player_data['Minutes'] > 60].copy()

In [258]:
# Columns for correlations: list every available column so the set used in the
# correlation matrices below can be chosen from them
player_data.columns

Index(['Player ID', 'Name', 'Last_Name', 'Team', 'Position', 'Cost_Today',
       'GW Points', 'Minutes', 'Goals', 'Assists', 'Clean Sheets',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Influence', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek', 'Opponent', 'FD_combined', 'Team_gls',
       'TeamxG', 'Team_gls_against', 'TeamxG_against', 'Opponent_Difficulty',
       'Player_Difficulty', 'Difficulty_diff', 'Form', 'Form_xG', 'Form_xGc',
       'Form_TeamxG', 'Form_TeamxG_against', 'Form_Fix_Diff', 'xG_FD',
       'xGc_FD', 'Form_FD'],
      dtype='object')

## Separate by position

In [259]:
# Split the table into attacking (MID/FWD) and defensive (GK/DEF) players,
# each as an independent copy
_position_col = player_data['Position']
attackers = player_data.loc[_position_col.isin(['MID', 'FWD'])].copy()
defenders = player_data.loc[_position_col.isin(['GK', 'DEF'])].copy()

# Both subsets together: index 0 is attackers, index 1 is defenders
data = [attackers, defenders]

## Defensive

In [266]:
# Columns included in the correlation matrices
correlations = [
    'GW Points', 'Minutes', 'Goals', 'Assists', 'Clean Sheets',
    'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
    'Saves', 'Total Bonus Points', 'Total BPS', 'Influence', 'Creativity',
    'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
    'Transfers Out GW', 'Gameweek', 'FD_combined', 'Team_gls',
    'TeamxG', 'Team_gls_against', 'TeamxG_against', 'Opponent_Difficulty',
    'Player_Difficulty', 'Difficulty_diff', 'Form', 'Form_xG', 'Form_xGc',
    'Form_Fix_Diff', 'xG_FD', 'xGc_FD', 'Form_FD',
]

# Pairwise correlations for defensive players, with the rows ordered by how
# strongly each variable correlates with clean sheets (highest first)
_defender_matrix = defenders[correlations].corr()
corr = _defender_matrix.sort_values(by='Clean Sheets', ascending=False)
corr.head(20)

Unnamed: 0,GW Points,Minutes,Goals,Assists,Clean Sheets,Goals Conceded,Penalties Saved,Penalties Missed,YC,RC,...,Opponent_Difficulty,Player_Difficulty,Difficulty_diff,Form,Form_xG,Form_xGc,Form_Fix_Diff,xG_FD,xGc_FD,Form_FD
Clean Sheets,0.771023,-0.025059,0.010997,-0.02036,1.0,-0.640152,0.047385,,-0.061775,-0.027957,...,-0.2285,0.24759,0.32004,-0.002797,0.000136,-0.01281,0.231895,0.060693,0.115173,0.108227
GW Points,1.0,0.040318,0.435076,0.221614,0.771023,-0.627345,0.186905,,-0.209781,-0.107099,...,-0.237555,0.225858,0.312038,0.032374,0.001748,0.010355,0.188646,0.043485,0.118523,0.125213
Total BPS,0.894425,0.083829,0.245905,0.192318,0.751086,-0.729302,0.10076,,-0.226476,-0.114225,...,-0.299501,0.227485,0.355849,-0.012641,-0.025826,-0.013305,0.237148,0.039281,0.133785,0.112449
Total Bonus Points,0.69172,0.04289,0.253681,0.126415,0.463389,-0.328465,0.1126,,-0.068945,-0.016491,...,-0.121931,0.119189,0.1623,0.032194,-0.024262,-0.008799,0.145199,-0.008562,0.064643,0.108701
Difficulty_diff,0.312038,-0.026025,0.002562,0.012479,0.32004,-0.455065,0.030853,,-0.049467,-0.010144,...,-0.757993,0.727301,1.0,0.150415,0.001875,-0.184061,0.580464,0.117665,0.193886,0.413676
Player_Difficulty,0.225858,-0.050333,0.017511,-0.0119,0.24759,-0.299594,0.014254,,-0.006087,-0.006322,...,-0.103628,1.0,0.727301,0.188054,-0.031113,-0.361785,0.449498,-0.02755,-0.280144,0.174335
Form_Fix_Diff,0.188646,-0.000816,-0.048837,0.027864,0.231895,-0.295734,-0.027234,,-0.058986,-0.037708,...,-0.495419,0.449498,0.580464,0.092969,0.001596,-0.165353,1.0,0.10442,0.163242,0.389947
xGc_FD,0.118523,0.135571,0.009919,-0.02987,0.115173,-0.150852,0.004451,,-0.062857,-0.041853,...,-0.539693,-0.280144,0.193886,0.199996,0.163692,0.82825,0.163242,0.267543,1.0,0.425013
Form_FD,0.125213,0.117938,0.027062,-0.009974,0.108227,-0.102289,0.029483,,-0.042071,-0.027266,...,-0.419244,0.174335,0.413676,0.870273,0.17702,0.255305,0.389947,0.253796,0.425013,1.0
TeamxG,0.112317,-0.01344,0.058584,0.043655,0.075786,-0.141788,0.0274,,0.014533,-0.00977,...,-0.174792,0.253237,0.286684,-0.011556,0.021492,-0.108681,0.118095,0.047882,-0.023252,0.035633


## Forwards

In [267]:
# Pairwise correlations for attacking players, with the rows ordered by how
# strongly each variable correlates with goals (highest first)
_attacker_matrix = attackers[correlations].corr()
corr_f = _attacker_matrix.sort_values(by='Goals', ascending=False)
corr_f.head(25)

Unnamed: 0,GW Points,Minutes,Goals,Assists,Clean Sheets,Goals Conceded,Penalties Saved,Penalties Missed,YC,RC,...,Opponent_Difficulty,Player_Difficulty,Difficulty_diff,Form,Form_xG,Form_xGc,Form_Fix_Diff,xG_FD,xGc_FD,Form_FD
Goals,0.865781,0.075501,1.0,0.072872,-0.008592,-0.000265,,-0.00434,-0.050532,0.000164,...,-0.124425,0.147361,0.18342,0.165149,0.18879,0.007917,0.200975,0.241454,0.077228,0.221208
GW Points,1.0,0.086007,0.865781,0.493423,0.112114,-0.107917,,-0.036549,-0.162333,-0.064678,...,-0.151972,0.211068,0.245546,0.164138,0.17108,-0.01405,0.199336,0.223935,0.063046,0.222099
Influence,0.892757,0.211277,0.847898,0.37706,0.011129,-0.024162,,-0.000408,-0.051222,-0.023708,...,-0.146749,0.192822,0.229497,0.110871,0.081222,0.02487,0.203848,0.1364,0.098063,0.173204
Total BPS,0.899571,0.18137,0.803963,0.437782,0.020889,-0.037496,,-0.034296,-0.127385,-0.04871,...,-0.172896,0.211807,0.259746,0.087831,0.06042,0.003713,0.216546,0.122473,0.089643,0.163676
Total Bonus Points,0.855993,0.117355,0.765724,0.282509,-0.000346,-0.072805,,-0.01524,-0.055985,-0.024669,...,-0.12342,0.16439,0.194552,0.13861,0.163907,0.011128,0.185806,0.206994,0.080886,0.186857
ICT Index,0.742268,0.223248,0.670898,0.356186,0.030987,-0.070843,,0.022059,-0.071891,-0.02142,...,-0.192183,0.253341,0.301118,0.199241,0.197287,0.030655,0.264874,0.249218,0.120039,0.263684
xG,0.536613,0.065611,0.626788,0.092092,-0.004135,-0.017965,,0.143587,-0.079415,-0.002603,...,-0.158651,0.183642,0.230928,0.226854,0.316049,0.044155,0.198527,0.376194,0.115715,0.285581
Threat,0.538616,0.105558,0.584381,0.119755,0.018913,-0.062618,,0.044316,-0.082811,0.006323,...,-0.156716,0.210654,0.248363,0.267792,0.36102,0.037035,0.226007,0.402389,0.109255,0.305439
xGi,0.572865,0.112581,0.571739,0.244371,-0.009488,-0.027639,,0.127623,-0.07389,-0.015665,...,-0.194408,0.235039,0.289904,0.237872,0.277732,0.039032,0.235569,0.340828,0.130442,0.311939
xG_FD,0.223935,0.062904,0.241454,0.073737,-0.030122,-0.020947,,0.000235,-0.091126,-0.005389,...,-0.186055,0.16148,0.239776,0.641103,0.916038,0.199536,0.340408,1.0,0.267452,0.695411


# Moderators

A moderator is a third variable that changes the strength or direction of the relationship between two other variables.

For example, is the relationship between GW Points and Form/FD different for defensive and attacking players, or for players whose team has a high versus a low TeamxG? If you can quantify this difference, you can then derive a parameter to build a ratio on.

To test for moderation, you need to create an interaction variable, which is the product of the predictor and the moderator (usually after mean-centering both).

In [49]:
# NOTE(review): this cell's execution count (In [49]) is out of sequence with the
# cells above, and the columns it uses ('Combined_team_goals_against', 'F_FD_1')
# are not created anywhere in the visible pipeline. It appears to have been run
# against an earlier version of player_data - confirm before relying on its output.
import statsmodels.formula.api as smf

# Centering variables to reduce multicollinearity
player_data['GW_Points_c'] = player_data['GW Points'] - player_data['GW Points'].mean()
# NOTE: unlike GW_Points_c, this column is centered in place (the raw values are overwritten)
player_data['Combined_team_goals_against'] = player_data['Combined_team_goals_against'] - player_data['Combined_team_goals_against'].mean()

# Creating interaction term
# NOTE(review): this column is not referenced by the formula below; in a formula
# `a * b` expands to a + b + a:b, so the model builds the interaction term itself
player_data['interaction'] = player_data['GW_Points_c'] * player_data['Combined_team_goals_against']

# Fitting the regression model with interaction (main effects plus their product)
model = smf.ols('F_FD_1 ~ GW_Points_c * Combined_team_goals_against', data=player_data).fit()

# Displaying the summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 F_FD_1   R-squared:                       0.068
Model:                            OLS   Adj. R-squared:                  0.066
Method:                 Least Squares   F-statistic:                     39.13
Date:                Tue, 03 Dec 2024   Prob (F-statistic):           2.08e-24
Time:                        17:50:50   Log-Likelihood:                -1776.2
No. Observations:                1625   AIC:                             3560.
Df Residuals:                    1621   BIC:                             3582.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------

## Partial Correlation

Assesses the unique relationship between two variables, when controlling for another variable (also known as a mediator or confounder). 

The confounding variable may suppress the interaction or the real relationship between the other variables. The null hypothesis is that, after controlling for the third variable (the confounder), there is no relationship between a and b.

$$r_{ab \cdot c} = \frac{r_{ab} - r_{ac}\, r_{bc}}{\sqrt{1 - r_{ac}^{2}}\,\sqrt{1 - r_{bc}^{2}}}$$

Another way to think about it is in terms of residuals: regress a on c and b on c (c is the confounder), then take the correlation between the two sets of residuals to obtain the direct relationship between a and b.


In [173]:
# Partial correlation between GW Points and Form_4 for the attackers subset (data[0]),
# controlling for M3_Difficulty.
# NOTE(review): 'Form_4' and 'M3_Difficulty' are not created anywhere in the visible
# pipeline, and this cell's execution count (In [173]) is out of sequence - it appears
# to have been run against an earlier version of the data. Confirm before relying on it.
partial_corr_results = pg.partial_corr(data = data[0], x = 'GW Points', y = 'Form_4', covar = ['M3_Difficulty'])
print(partial_corr_results)

           n         r          CI95%     p-val
pearson  699  0.006747  [-0.07, 0.08]  0.858768


In [None]:
# Open question: if we find a better difficulty measure, should controlling for it noticeably change the relationship between GW Points and Total BPS?

## Semi partial correlation

Assesses the unique relationship between two variables, as a function of total variance.

It is usually used when a confounder is thought to affect only one of the variables rather than both.



In [None]:
## Semi 

Correlation analysis

Clean sheets and Total BPS

or Clean sheets controlled for by fixture difficulty

Variables we are interested in doing further analysis on correlated with clean sheets and goals: 

ICT index
Total BPS
Influence
Creativity
xG

We can control for the impact of other variables (using moderation and partial correlation techniques).


## Multicollinearity

This occurs when 2 or more predictors share over 80% variance with each other.
This could be indicated with an r^2 value of over 0.8. It means one could be predicted from the other to a substantial degree.
This is problematic, as the parameters of the model (b) become interchangeable (and therefore unreliable) and the mathematical techniques cannot discriminate between
each predictor. 
One other test is the Variance Inflation Factor: $\mathrm{VIF} = \frac{1}{1 - R^{2}}$.
If the VIF is greater than 5 the multicollinearity is moderate; if it is over 10 it is severe.