## Correlation, partial correlation and multicollinerarity

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from scipy.stats import boxcox, zscore
import random

In [72]:
# Global parameters

# Current gameweek 
gameweek = 11

# Number of gameweeks to calculate rolling averages off 
rolling_number = 3

## Collect available player data

In [73]:
# Initialize an empty list to store all individual, player gameweek data 
all_player_sep = []

# Loop through each gameweek
for i in range(1, gameweek + 1):  # Adjusting the range to start from 1 to gameweek
    # Read the CSV for the current gameweek
    x = pd.read_csv(rf'C:\Users\thoma\Code\Projects\Fantasy-Premier-League\Data\Players\Seperate_GW\GW_{i}.csv')
    
    # Append the current gameweek data to the list
    all_player_sep.append(x)

# Concatenate all dataframes in the list into a single dataframe
player_data = pd.concat(all_player_sep, axis=0, ignore_index=True)

# Drop unnamed column
player_data = player_data.drop(columns = ['Unnamed: 0'])

In [74]:
# Remove players who play less than 61 minutes in a game (i.e. they do not recieve their 2 points minimum for playoing this amount)
player_data = player_data[player_data['Minutes'] > 60].copy()

In [75]:
# Filter by Goalkeepers, Defenders, Midfielders, and Forwards
final_data_mids = player_data[player_data['Position'] == 'MID'].copy()
final_data_defs = player_data[player_data['Position'] == 'DEF'].copy()
final_data_gks = player_data[player_data['Position'] == 'GK'].copy()
final_data_fwds = player_data[player_data['Position'] == 'FWD'].copy()

# Combined data
data = [final_data_gks, final_data_defs, final_data_mids, final_data_fwds]

In [76]:
## Assess sample size of each category
print(f'GK:',final_data_gks.shape)
print(f'DEF:',final_data_defs.shape)
print(f'MID:',final_data_mids.shape)
print(f'FWD:',final_data_fwds.shape)

GK: (219, 33)
DEF: (859, 33)
MID: (952, 33)
FWD: (218, 33)


## Salient Predictors

From my analysis on features, we want to focus on:

- Goalkeepers: CLEAN SHEETS
- Defenders: CLEAN SHEETS + goals 
- Midfielders: GOALS + assists 
- Forwards: GOALS

## Goalkeepers

In [None]:
# Define correlation columns
correlations = ['Clean Sheets','Influence', 'Minutes', 'Goals', 'Assists', 'GW Points',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek','Difficulty']

# Sort the correlation matrix
corr = data[0][correlations].corr().sort_values(by='Clean Sheets', ascending=False)
corr.head(15)

Unnamed: 0,Clean Sheets,Influence,Minutes,Goals,Assists,GW Points,Goals Conceded,Penalties Saved,Penalties Missed,YC,...,Threat,ICT Index,xG,xA,xGi,xGc,Transfers In GW,Transfers Out GW,Gameweek,Difficulty
Clean Sheets,1.0,0.001367,-0.13318,,-0.059933,0.841103,-0.646412,0.183818,,0.074161,...,-0.034443,-0.007833,-0.034443,-0.069543,-0.071224,-0.290621,-0.025933,-0.049293,-0.027213,-0.070897
GW Points,0.841103,0.372487,-0.156254,,0.093204,1.0,-0.664103,0.445926,,-0.022894,...,-0.057026,0.364927,-0.057026,-0.057723,-0.060552,-0.192663,-0.026375,-0.036567,-0.034459,-0.058054
Total BPS,0.780782,0.445929,-0.123405,,0.050851,0.911805,-0.736402,0.281609,,-0.045187,...,-0.086861,0.43268,-0.086861,-0.054648,-0.058985,-0.226133,-0.030378,-0.060221,-0.040009,-0.04829
Total Bonus Points,0.672468,0.292074,-0.260277,,0.012811,0.821636,-0.443704,0.285417,,0.064057,...,-0.024252,0.287988,-0.024252,-0.044179,-0.045367,-0.113798,-0.000609,0.030924,-0.003572,-0.024643
Penalties Saved,0.183818,0.220646,0.009238,,-0.016075,0.445926,-0.142854,1.0,,0.075132,...,-0.009238,0.214983,-0.009238,-0.018652,-0.019103,0.036343,0.001663,0.03312,-0.011372,-0.046261
YC,0.074161,-0.074262,0.021471,,-0.037361,-0.022894,-0.119205,0.075132,,1.0,...,-0.021471,-0.080786,-0.021471,-0.019617,-0.020684,-0.049908,0.05964,0.088507,0.028696,0.072457
Influence,0.001367,1.0,0.005577,,0.068504,0.372487,0.07693,0.220646,,-0.074262,...,-0.10014,0.993473,-0.10014,-0.06571,-0.070708,0.414427,-0.078325,-0.0333,-0.028165,0.160274
ICT Index,-0.007833,0.993473,0.005097,,0.117097,0.364927,0.095419,0.214983,,-0.080786,...,-0.083281,1.0,-0.083281,-0.002837,-0.007035,0.404175,-0.077988,-0.03513,-0.026712,0.153887
Saves,-0.016392,0.943758,-0.022068,,-0.015855,0.304504,0.105584,0.044443,,-0.064544,...,-0.102653,0.931288,-0.102653,-0.087648,-0.092755,0.410523,-0.062334,-0.039355,0.004734,0.173167
Transfers In GW,-0.025933,-0.078325,0.014874,,0.000652,-0.026375,-0.063559,0.001663,,0.05964,...,0.032759,-0.077988,0.032759,0.077155,0.078745,-0.181432,1.0,0.404279,-0.011469,-0.235937


## Defenders

In [92]:
# Define correlation columns
correlations = ['Clean Sheets','Influence', 'Minutes', 'Goals', 'Assists', 'GW Points',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek','Difficulty']

# Sort the correlation matrix
corr = data[1][correlations].corr().sort_values(by='Clean Sheets', ascending=False)
corr.head(15)

Unnamed: 0,Clean Sheets,Influence,Minutes,Goals,Assists,GW Points,Goals Conceded,Penalties Saved,Penalties Missed,YC,...,Threat,ICT Index,xG,xA,xGi,xGc,Transfers In GW,Transfers Out GW,Gameweek,Difficulty
Clean Sheets,1.0,-0.022733,-0.044252,-0.028743,-0.027724,0.755845,-0.647349,,,-0.05149,...,-0.033375,-0.035701,-0.021674,0.014898,-0.00479,-0.28979,-0.027069,0.029328,-0.007676,-0.081007
GW Points,0.755845,0.420141,0.02064,0.462684,0.266254,1.0,-0.609091,,,-0.206265,...,0.21702,0.365922,0.137406,0.142875,0.196793,-0.309598,0.02217,0.05295,0.007484,-0.119303
Total BPS,0.741986,0.398072,0.072532,0.262583,0.216237,0.893727,-0.703661,,,-0.212979,...,0.142662,0.360338,0.037567,0.200016,0.166673,-0.383752,0.017004,0.084932,0.02437,-0.151602
Total Bonus Points,0.422636,0.303869,0.041201,0.232771,0.163836,0.655749,-0.306072,,,-0.081766,...,0.095891,0.27987,0.0256,0.179169,0.143641,-0.181406,0.01915,0.040892,0.000757,-0.043948
Transfers Out GW,0.029328,0.044764,0.06406,0.048824,-0.03418,0.05295,-0.072443,,,0.016676,...,0.069357,0.117106,-0.020572,0.11464,0.065929,-0.116152,0.346953,1.0,-0.000691,0.064498
xA,0.014898,0.159587,-0.069221,0.024383,0.310502,0.142875,0.013322,,,-0.056912,...,0.111463,0.448885,0.014176,1.0,0.711219,-0.075902,0.07871,0.11464,0.059798,-0.055506
xGi,-0.00479,0.269769,-0.000578,0.240183,0.257177,0.196793,0.045391,,,-0.032173,...,0.541971,0.60559,0.712982,0.711219,1.0,-0.054882,0.086519,0.065929,0.068408,-0.061757
Gameweek,-0.007676,0.061994,0.065366,0.031972,-0.005669,0.007484,0.011602,,,0.002221,...,0.010857,0.06985,0.037657,0.059798,0.068408,0.03254,-0.005907,-0.000691,1.0,0.006723
Creativity,-0.020598,0.242725,-0.007397,0.070887,0.310951,0.144263,0.01065,,,-0.040417,...,0.117556,0.680078,0.053048,0.632495,0.480834,-0.070271,0.116699,0.128596,0.068242,-0.082078
xG,-0.021674,0.224536,0.068223,0.317313,0.056094,0.137406,0.051276,,,0.011004,...,0.659716,0.413644,1.0,0.014176,0.712982,-0.002355,0.044554,-0.020572,0.037657,-0.032478


## Midfielders

In [95]:
# Define correlation columns
correlations = ['Goals','Influence', 'Minutes', 'Clean Sheets', 'Assists', 'GW Points',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek','Difficulty']

# Sort the correlation matrix
corr = data[1][correlations].corr().sort_values(by='Goals', ascending=False)
corr.head(15)


Unnamed: 0,Goals,Influence,Minutes,Clean Sheets,Assists,GW Points,Goals Conceded,Penalties Saved,Penalties Missed,YC,...,Threat,ICT Index,xG,xA,xGi,xGc,Transfers In GW,Transfers Out GW,Gameweek,Difficulty
Goals,1.0,0.649023,0.051358,-0.028743,0.000508,0.462684,0.073759,,,-0.030757,...,0.4782,0.547832,0.317313,0.024383,0.240183,0.044173,0.032899,0.048824,0.031972,-0.002215
Influence,0.649023,1.0,0.271489,-0.022733,0.295749,0.420141,0.073985,,,-0.11328,...,0.391974,0.767385,0.224536,0.159587,0.269769,0.093942,0.062577,0.044764,0.061994,-0.050364
ICT Index,0.547832,0.767385,0.179363,-0.035701,0.318232,0.365922,0.063038,,,-0.08476,...,0.670927,1.0,0.413644,0.448885,0.60559,0.000107,0.145881,0.117106,0.06985,-0.088125
Threat,0.4782,0.391974,0.124319,-0.033375,0.041435,0.21702,0.051837,,,-0.024287,...,1.0,0.670927,0.659716,0.111463,0.541971,-0.019089,0.132579,0.069357,0.010857,-0.050261
GW Points,0.462684,0.420141,0.02064,0.755845,0.266254,1.0,-0.609091,,,-0.206265,...,0.21702,0.365922,0.137406,0.142875,0.196793,-0.309598,0.02217,0.05295,0.007484,-0.119303
xG,0.317313,0.224536,0.068223,-0.021674,0.056094,0.137406,0.051276,,,0.011004,...,0.659716,0.413644,1.0,0.014176,0.712982,-0.002355,0.044554,-0.020572,0.037657,-0.032478
Total BPS,0.262583,0.398072,0.072532,0.741986,0.216237,0.893727,-0.703661,,,-0.212979,...,0.142662,0.360338,0.037567,0.200016,0.166673,-0.383752,0.017004,0.084932,0.02437,-0.151602
xGi,0.240183,0.269769,-0.000578,-0.00479,0.257177,0.196793,0.045391,,,-0.032173,...,0.541971,0.60559,0.712982,0.711219,1.0,-0.054882,0.086519,0.065929,0.068408,-0.061757
Total Bonus Points,0.232771,0.303869,0.041201,0.422636,0.163836,0.655749,-0.306072,,,-0.081766,...,0.095891,0.27987,0.0256,0.179169,0.143641,-0.181406,0.01915,0.040892,0.000757,-0.043948
Goals Conceded,0.073759,0.073985,0.042116,-0.647349,0.068396,-0.609091,1.0,,,0.036157,...,0.051837,0.063038,0.051276,0.013322,0.045391,0.582567,0.000153,-0.072443,0.011602,0.172248


## Forwards

In [96]:
# Define correlation columns
correlations = ['Goals','Influence', 'Minutes', 'Clean Sheets', 'Assists', 'GW Points',
       'Goals Conceded', 'Penalties Saved', 'Penalties Missed', 'YC', 'RC',
       'Saves', 'Total Bonus Points', 'Total BPS', 'Creativity',
       'Threat', 'ICT Index', 'xG', 'xA', 'xGi', 'xGc', 'Transfers In GW',
       'Transfers Out GW', 'Gameweek','Difficulty']

# Sort the correlation matrix
corr = data[2][correlations].corr().sort_values(by='Goals', ascending=False)
corr.head(15)

Unnamed: 0,Goals,Influence,Minutes,Clean Sheets,Assists,GW Points,Goals Conceded,Penalties Saved,Penalties Missed,YC,...,Threat,ICT Index,xG,xA,xGi,xGc,Transfers In GW,Transfers Out GW,Gameweek,Difficulty
Goals,1.0,0.805712,0.059245,-0.004392,0.032392,0.84585,-0.010813,,-0.016179,-0.06132,...,0.520349,0.602691,0.595684,0.098901,0.519607,-0.002473,0.145502,0.147364,-0.01808,-0.097846
GW Points,0.84585,0.875345,0.095178,0.134694,0.484845,1.0,-0.127996,,-0.044476,-0.181627,...,0.498576,0.704561,0.519531,0.27692,0.558803,-0.076643,0.157731,0.190201,-0.02394,-0.144448
Influence,0.805712,1.0,0.22954,-0.006979,0.406213,0.875345,-0.018268,,-0.028155,-0.065837,...,0.496917,0.800828,0.482769,0.315973,0.551715,0.001784,0.119033,0.130986,0.002095,-0.136279
Total BPS,0.735835,0.905418,0.219888,-0.010403,0.453674,0.873635,-0.025634,,-0.048181,-0.167031,...,0.429942,0.758192,0.403679,0.377728,0.524183,-0.026513,0.109397,0.133401,-0.006781,-0.171122
Total Bonus Points,0.710528,0.738433,0.142942,-0.020625,0.299828,0.840608,-0.060051,,-0.01714,-0.064588,...,0.418353,0.614484,0.385204,0.252032,0.440173,-0.04409,0.147868,0.170984,-0.011906,-0.106386
ICT Index,0.602691,0.800828,0.231775,0.013887,0.373823,0.704561,-0.079885,,0.000797,-0.075222,...,0.764372,1.0,0.581459,0.534925,0.749971,-0.114518,0.218205,0.177046,0.0066,-0.201162
xG,0.595684,0.482769,0.067926,0.015621,0.078013,0.519531,-0.051387,,0.156448,-0.076187,...,0.708222,0.581459,1.0,0.097318,0.834425,-0.081988,0.209747,0.185564,0.003094,-0.118543
Threat,0.520349,0.496917,0.094537,0.018277,0.119432,0.498576,-0.083258,,0.034244,-0.070171,...,1.0,0.764372,0.708222,0.218986,0.674033,-0.131419,0.244508,0.170132,0.011129,-0.155031
xGi,0.519607,0.551715,0.130808,0.003158,0.269494,0.558803,-0.062696,,0.131907,-0.065234,...,0.674033,0.749971,0.834425,0.629707,1.0,-0.12651,0.239896,0.203223,-0.017257,-0.172654
Transfers Out GW,0.147364,0.130986,0.087464,-0.000712,0.112345,0.190201,-0.024965,,0.0021,-0.047367,...,0.170132,0.177046,0.185564,0.10535,0.203223,-0.047444,0.25488,1.0,-0.001203,0.082029


Variables we are interested in doing further analysis on correlated with clean sheets and goals: 

ICT index
Total BPS
Influence
Creativity
xG

We can control for other impact of variables (moderation and partial correlation techniques)


## Multi-collineraity

This occurs when 2 or more predictors share over 80% variance with each other.
This could be indicated with an r^2 value of over 0.8. It means one could be predicted from the other to a substantial degree.
This is problematic, as the parameters of the model (b) become interchangeable (and therefore unreliable) and the mathmatical techniques cannot discriminate between
each predictor. 
One other test is the Variance Inflation Factor (VIF) = 1/ 1 r^2.
If the number is greater than 5 this is moderate, if over 10 then severe multicollineraity.