In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.feature_selection import VarianceThreshold

%matplotlib inline

# Helper functions


In [91]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Shared fit/evaluate routine; every model wrapper below delegates to it so the
# metric definitions cannot drift apart between models.
def _fit_and_evaluate(regressor, features_train, features_test, target_train, target_test, cv=10):
    """Fit ``regressor`` on a standardised target and report train/test/CV metrics.

    The regressor is wrapped in a ``TransformedTargetRegressor`` whose transformer
    is a ``StandardScaler``. That wrapper standardises the *target* before fitting
    and applies the inverse transform to predictions, so every metric below is on
    the original target scale. The features are passed through unchanged; no
    feature scaling happens here.

    Parameters
    ----------
    regressor : estimator
        Unfitted regressor to wrap.
    features_train, features_test :
        Feature matrices handed to ``fit`` / ``predict``.
    target_train, target_test :
        Target values matching the feature matrices.
    cv : int, default 10
        Number of folds used for the cross-validated RMSE on the training set.

    Returns
    -------
    dict
        Keys ``train_MAE``, ``test_MAE``, ``train_MSE``, ``test_MSE``,
        ``train_RMSE``, ``test_RMSE``, ``cv_rmse`` (mean RMSE over the folds),
        ``R2_train`` and ``R2_test``.
    """
    model = TransformedTargetRegressor(regressor, transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    train_MSE = mean_squared_error(target_train, pred_train)
    test_MSE = mean_squared_error(target_test, pred_test)

    # RMSE is derived from the MSE instead of mean_squared_error(..., squared=False):
    # the `squared` argument is deprecated in recent scikit-learn releases.
    train_RMSE = np.sqrt(train_MSE)
    test_RMSE = np.sqrt(test_MSE)

    # Coefficient of determination: the share of the target's variance explained by the model.
    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    # The scorer is a utility (greater is better), i.e. the negated RMSE, hence the sign flip.
    cv_rmses = -cross_val_score(model, features_train, target_train,
                                scoring="neg_root_mean_squared_error", cv=cv)

    return {
        'train_MAE': mean_absolute_error(target_train, pred_train),
        'test_MAE': mean_absolute_error(target_test, pred_test),
        'train_MSE': train_MSE,
        'test_MSE': test_MSE,
        'train_RMSE': train_RMSE,
        'test_RMSE': test_RMSE,
        'cv_rmse': cv_rmses.mean(),
        'R2_train': R2_train,
        'R2_test': R2_test,
    }


# For the linear model
def Linear_regression(features_train, features_test, target_train, target_test):
    """Fit an ordinary least squares model and return its evaluation metrics.

    Also prints the train/test RMSE and R2. A test error that differs markedly
    from the train error points to over- or under-fitting.

    Returns the metrics dict produced by ``_fit_and_evaluate``.
    """
    metrics = _fit_and_evaluate(
        LinearRegression(), features_train, features_test, target_train, target_test)

    print('Training set RMSE: {}'.format(metrics['train_RMSE']))
    print('Test set RMSE: {}'.format(metrics['test_RMSE']))

    print('Training set R2: {}'.format(metrics['R2_train']))
    print('Test set R2: {}'.format(metrics['R2_test']))

    return metrics


# Decision Tree Model
def DecisionTreeRegression(features_train, features_test, target_train, target_test):
    """Fit a default ``DecisionTreeRegressor`` and return its evaluation metrics.

    Returns the metrics dict produced by ``_fit_and_evaluate``.
    """
    return _fit_and_evaluate(
        DecisionTreeRegressor(), features_train, features_test, target_train, target_test)


# RandomForestRegressor
def RandomForestRegression(features_train, features_test, target_train, target_test, hyperparameters):
    """Fit a ``RandomForestRegressor`` (an ensemble of trees) and return its metrics.

    ``hyperparameters`` is a mapping that must provide:
      * ``n_estimators`` - number of trees in the forest,
      * ``max_depth``    - maximum depth of each tree,
      * ``criterion``    - function measuring the quality of a split.
    Any other keys are ignored. ``random_state`` is fixed at 18.

    Returns the metrics dict produced by ``_fit_and_evaluate``.
    """
    regressor = RandomForestRegressor(
        n_estimators=hyperparameters['n_estimators'],
        max_depth=hyperparameters['max_depth'],
        criterion=hyperparameters['criterion'],
        random_state=18)

    return _fit_and_evaluate(
        regressor, features_train, features_test, target_train, target_test)


def XGBoostRegression(features_train, features_test, target_train, target_test, hyperparameters):
    """Fit an XGBoost regressor and return its evaluation metrics.

    ``hyperparameters`` is a mapping that must provide ``learning_rate``,
    ``n_estimators`` and ``max_depth``.

    Returns the metrics dict produced by ``_fit_and_evaluate``.
    """
    # NOTE(review): no eval_set is passed to fit(), so eval_metric='rmsle' presumably
    # has no effect on training here - confirm before relying on it.
    regressor = xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle')

    # Early stopping (hold out a validation split and pass eval_set /
    # early_stopping_rounds to fit) was sketched here earlier but is not used.
    return _fit_and_evaluate(
        regressor, features_train, features_test, target_train, target_test)


def GridSearchParams(features_train, target_train, param_grid=None, cv=3, n_jobs=None):
    """Grid-search ``RandomForestRegressor`` hyper-parameters and return the best set.

    Parameters
    ----------
    features_train, target_train :
        Training data handed to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Hyper-parameter grid to search. Defaults to the grid defined below.
    cv : int, default 3
        Number of cross-validation folds.
    n_jobs : int or None, default None
        Passed straight to ``GridSearchCV``.

    Returns
    -------
    dict
        ``best_params_`` of the search, i.e. the combination with the smallest
        cross-validated RMSE. It contains the ``n_estimators``, ``max_depth``
        and ``criterion`` keys that ``RandomForestRegression`` reads.
    """
    if param_grid is None:
        # Candidate values for each hyper-parameter.
        # NOTE(review): the 'poisson' criterion presumably needs a non-negative
        # target - confirm it is valid for this data.
        param_grid = {
            'n_estimators': [8, 10, 12, 14, 16, 18, 20, 200, 300, 400, 500],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [4, 5, 6, 7, 8],
            'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
            'random_state': [18]
        }

    # The scorer is the negated RMSE, so the "highest score" is the smallest error.
    grid_search = GridSearchCV(
        RandomForestRegressor(), param_grid, cv=cv,
        scoring='neg_root_mean_squared_error', n_jobs=n_jobs)
    grid_search.fit(features_train, target_train)

    # Show the estimator refitted with the best parameter combination
    print(grid_search.best_estimator_)

    # The previous return statement referenced names that were never defined in
    # this function and raised NameError after the search had finished.
    return grid_search.best_params_

In [92]:

# define a function that splits and returns features_train, features_test, target_train, target_test

def split_data(data, test_size=0.2, random_state=None):
    """Split ``data`` into train/test features and targets.

    The ``pts_bps`` column is the target; every other column is a feature.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``pts_bps`` column.
    test_size : float, default 0.2
        Share of the rows placed in the test set.
    random_state : int or None, default None
        Seed for the shuffle. Pass an int to make the split reproducible;
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    dict
        Keys ``feature_train``, ``features_test``, ``target_train`` and
        ``target_test``. The first key is singular (``feature_train``); it is
        kept as-is so existing lookups keep working.
    """
    player_target = data['pts_bps']
    player_features = data.drop("pts_bps", axis=1)

    # No `stratify` argument is passed, so the split is a plain random one.
    features_train, features_test, target_train, target_test = train_test_split(
        player_features, player_target, test_size=test_size, random_state=random_state)

    return {'feature_train': features_train, 'features_test': features_test, 'target_train': target_train, 'target_test': target_test}

# Get the data


```
    Total Points – Bonus Points (tp-bp), Minutes, Yellow Cards, Red Cards, Expected Goals (xG), Expected Assists (xA), Non-penalty Expected Goals (npxG),
    Shots, Expected Goals Against, _Expected_goal_involvements_,  clean_sheets, ict_index, opponent_team, Expected Goals Buildup (xG Buildup), threat, value,
    Key Passes,


    _Games_,  Expected Goals Chain (xG Chain),  _Non-penalty Expected Goal Difference (npxGD)_, _Non-penalty Expected Goals Against (npxGA)_, Expected Points (xPts)
```

```text
    Total Points – Bonus Points (tp-bp)	for, mid, def, gk
    Minutes	for, mid, def, gk
    Yellow Cards	for, mid, def, gk
    Red Cards	for, mid, def, gk
    Expected Goals (xG)	for, mid, def, gk
    Expected Assists (xA)	for, mid, def, gk
    Non-penalty Expected Goals (npxG)	for, mid, def
    Games	for, mid, def, gk
    Shots	for, mid, def
    Key Passes	for, mid, def, gk
    Expected Goals Chain (xG Chain)	for, mid, def
    Expected Goals Buildup (xG Buildup)	for, mid, def
    Non-penalty Expected Goal Difference (npxGD)	def, gk
    Expected Goals Against	def, gk
    Non-penalty Expected Goals Against (npxGA)	def, gk
    Expected Points (xPts)	def, gk
    expected_goal_involvements	for, mid, def, gk
    clean_sheets	mid, def, gk
    ict_index	for, mid, def, gk
    opponent_team	for, mid, def, gk
    threat	for, mid, def, gk
    value	for, mid, def, gk

```


In [93]:
# Candidate column list.
# An earlier version of this list also contained 'assists_x', 'bonus', 'goals_conceded',
# 'goals_scored', 'ict_index', 'round' and 'total_points'; they are no longer included.
# 'season' used to be listed twice, which would duplicate the column in `df[cols]`;
# it now appears once.
cols = [
    'red_cards', 'selected', 'threat', 'transfers_in', 'transfers_out',
    'value', 'was_home', 'xG', 'season', 'npg', 'npxG', 'xGChain', 'xGBuildup',
    'team_h_difficulty', 'team_a_difficulty', 'event', 'position', 'clean_sheets_3',
    'expected_assists_3', 'expected_goal_involvements_3', 'expected_goals_3',
    'expected_goals_conceded_3', 'goals_conceded_3', 'goals_scored_3', 'influence_3', 'minutes_3',
    'penalties_missed_3', 'penalties_saved_3', 'red_cards_3', 'saves_3', 'starts_3',
    'team_a_score_3', 'team_h_score_3', 'total_points_3', 'yellow_cards_3', 'goals_3',
    'shots_3', 'xA_3', 'key_passes_3', 'npg_3', 'npxG_3', 'xGChain_3', 'xGBuildup_3',
    'xP_3', 'pts_bps', 'WHH', 'WHD', 'WHA',
]

In [94]:
# Un-suffixed per-match columns.
columns = [
    'total_points', 'bonus', 'minutes', 'yellow_cards', 'red_cards',
    'expected_goals', 'expected_assists', 'npxG', 'shots',
    'expected_goal_involvements', 'expected_goals_conceded', 'clean_sheets',
    'ict_index', 'xGBuildup', 'threat', 'value', 'key_passes', 'xGChain', 'xP',
    'team_h_difficulty', 'team_a_difficulty', 'was_home', 'position',
]

# Per-position column selections. Each list contains the 'pts_bps' target plus the
# features kept for that position.
# NOTE(review): the '_3' suffix presumably marks a 3-match aggregate - confirm against
# the code that builds merged_player_data.csv.

# Goalkeepers.
# Left out on purpose: 'threat', 'transfers_in', 'transfers_out', 'value', 'xG', 'season',
# 'npg', 'npxG', 'xGChain', 'xGBuildup', 'event', 'penalties_missed_3', 'penalties_saved_3',
# 'team_a_score_3', 'team_h_score_3', 'position', 'total_points_3', 'goals_3', 'shots_3',
# 'xA_3', 'npg_3', 'npxG_3'
gk_cols = [
    'selected_3', 'pts_bps', 'minutes_3', 'yellow_cards_3', 'red_cards_3',
    'expected_assists_3', 'expected_goal_involvements_3',
    'expected_goals_conceded_3', 'clean_sheets_3', 'goals_conceded_3',
    'ict_index_3', 'influence_3', 'creativity_3', 'threat_3', 'key_passes_3',
    'xP_3', 'team_h_difficulty', 'team_a_difficulty', 'xGChain_3', 'xGBuildup_3',
    'saves_3', 'starts_3', 'was_home', 'WHH', 'WHD', 'WHA',
]

# Defenders.
# Left out on purpose: 'threat', 'transfers_in', 'transfers_out', 'value', 'season', 'npg',
# 'npxG', 'xGChain', 'xGBuildup', 'event', 'penalties_missed_3', 'penalties_saved_3',
# 'team_a_score_3', 'team_h_score_3', 'total_points_3', 'goals_3', 'npg_3', 'npxG_3',
# 'position'
def_cols = [
    'selected_3', 'pts_bps', 'minutes_3', 'yellow_cards_3', 'red_cards_3',
    'expected_goals_3', 'xG', 'goals_scored_3', 'expected_assists_3', 'xA_3',
    'expected_goal_involvements_3', 'expected_goals_conceded_3', 'clean_sheets_3',
    'goals_conceded_3', 'ict_index_3', 'influence_3', 'creativity_3', 'threat_3',
    'key_passes_3', 'xP_3', 'team_h_difficulty', 'team_a_difficulty',
    'xGChain_3', 'xGBuildup_3', 'saves_3', 'starts_3', 'shots_3',
    'was_home', 'WHH', 'WHD', 'WHA',
]

# Midfielders.
# Left out on purpose: 'threat', 'transfers_in', 'transfers_out', 'value', 'season', 'npg',
# 'npxG', 'xGChain', 'xGBuildup', 'event', 'penalties_missed_3', 'penalties_saved_3',
# 'team_a_score_3', 'team_h_score_3', 'total_points_3', 'npg_3', 'npxG_3', 'position'
mid_cols = [
    'selected_3', 'pts_bps', 'minutes_3', 'yellow_cards_3', 'red_cards_3',
    'expected_goals_3', 'xG', 'goals_3', 'goals_scored_3', 'expected_assists_3',
    'xA_3', 'expected_goal_involvements_3', 'expected_goals_conceded_3',
    'clean_sheets_3', 'goals_conceded_3', 'ict_index_3', 'influence_3',
    'creativity_3', 'threat_3', 'key_passes_3', 'xP_3', 'team_h_difficulty',
    'team_a_difficulty', 'xGChain_3', 'xGBuildup_3', 'starts_3', 'shots_3',
    'was_home', 'WHH', 'WHD', 'WHA',
]

# Forwards.
# Left out on purpose: 'threat', 'transfers_in', 'transfers_out', 'value', 'season', 'npg',
# 'npxG', 'xGChain', 'xGBuildup', 'event', 'penalties_missed_3', 'penalties_saved_3',
# 'team_a_score_3', 'team_h_score_3', 'total_points_3', 'npg_3', 'npxG_3',
# 'expected_goals_conceded_3', 'clean_sheets_3', 'goals_conceded_3', 'position'
fwd_cols = [
    'selected_3', 'pts_bps', 'minutes_3', 'yellow_cards_3', 'red_cards_3',
    'expected_goals_3', 'xG', 'goals_3', 'goals_scored_3', 'expected_assists_3',
    'xA_3', 'expected_goal_involvements_3', 'ict_index_3', 'influence_3',
    'creativity_3', 'threat_3', 'key_passes_3', 'xP_3', 'team_h_difficulty',
    'team_a_difficulty', 'xGChain_3', 'xGBuildup_3', 'starts_3', 'shots_3',
    'was_home', 'WHH', 'WHD', 'WHA',
]

In [95]:
# Load the merged per-player data for each season, one frame per season.
# The 21-22 file is intentionally not loaded.
player_22_23, player_23_24, player_24_25 = map(pd.read_csv, (
    './data/joint/22-23/merged_player_data.csv',
    './data/joint/23-24/merged_player_data.csv',
    './data/joint/24-25/merged_player_data.csv',
))

In [96]:
# Keep only the rows whose points total is non-zero, season by season
player_22_23_no = player_22_23.loc[player_22_23['total_points'].ne(0)]
player_23_24_no = player_23_24.loc[player_23_24['total_points'].ne(0)]
player_24_25_no = player_24_25.loc[player_24_25['total_points'].ne(0)]

# Stack the three seasons, then discard every row that has a missing value
player_data_all = pd.concat(
    [player_22_23_no, player_23_24_no, player_24_25_no]
)
player_data_all = player_data_all.dropna()

# The 'pts_bps' derivation (total_points - bonus) is disabled here.
# NOTE(review): the per-position column lists still select 'pts_bps', so it is
# presumably already present in the CSVs - TODO confirm.

In [97]:
def _select_position(data, position, position_cols):
    """Return the rows of ``data`` whose 'position' equals ``position``, restricted to ``position_cols``."""
    # Selecting the columns already leaves 'position' out (it is in none of the
    # lists), so no separate drop is needed.
    return data.loc[data['position'] == position, position_cols]


# One frame per position, each restricted to that position's column list
gk_player_data = _select_position(player_data_all, 'GK', gk_cols)
def_player_data = _select_position(player_data_all, 'DEF', def_cols)
mid_player_data = _select_position(player_data_all, 'MID', mid_cols)
fwd_player_data = _select_position(player_data_all, 'FWD', fwd_cols)

In [98]:
# %run ./goalkeepers.ipynb

In [4]:

import requests

# The base URL already ends with '/', so endpoint paths are appended without a
# leading slash (the previous concatenation produced '.../api//fixtures/').
base_url = "https://fantasy.premierleague.com/api/"

# Fixtures for gameweek 12. The timeout stops the cell from hanging indefinitely and
# raise_for_status() surfaces HTTP errors instead of failing later inside .json().
response = requests.get(base_url + "fixtures/", params={"event": 12}, timeout=30)
response.raise_for_status()
general_info = response.json()

In [5]:
general_info

[{'code': 2444586,
  'event': 12,
  'finished': True,
  'finished_provisional': True,
  'id': 117,
  'kickoff_time': '2024-11-23T12:30:00Z',
  'minutes': 90,
  'provisional_start_time': False,
  'started': True,
  'team_a': 6,
  'team_a_score': 2,
  'team_h': 11,
  'team_h_score': 1,
  'stats': [{'identifier': 'goals_scored',
    'a': [{'value': 1, 'element': 168}, {'value': 1, 'element': 180}],
    'h': [{'value': 1, 'element': 192}]},
   {'identifier': 'assists',
    'a': [{'value': 1, 'element': 168}, {'value': 1, 'element': 180}],
    'h': [{'value': 1, 'element': 290}]},
   {'identifier': 'own_goals', 'a': [], 'h': []},
   {'identifier': 'penalties_saved', 'a': [], 'h': []},
   {'identifier': 'penalties_missed', 'a': [], 'h': []},
   {'identifier': 'yellow_cards',
    'a': [{'value': 1, 'element': 157},
     {'value': 1, 'element': 174},
     {'value': 1, 'element': 185}],
    'h': [{'value': 1, 'element': 285},
     {'value': 1, 'element': 291},
     {'value': 1, 'element': 300},