<a href="https://colab.research.google.com/github/ilyandho/FPL-Optimal-Transfer/blob/main/FPL_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fetch and extract important attributes from FPL API
 Store the data to *`player_data.csv`*

In [1]:
import requests, json
import pandas as pd

base_url = 'https://fantasy.premierleague.com/api/'

# This functionn fetches the player details using FPL API up to a given week
def get_player_data(gw):
  # Get the general, inorder to have access to the team ids in the elements component
  general_info = requests.get(base_url + "/bootstrap-static/").json()

  # player_history will contain the data
  player_history = []

  # For each game week, the stats for all players
  for gwk in range(1,gw+1):
    r = requests.get(base_url + 'event/' + str(gwk) + '/live/').json()
    # Get the fixtures for each game week gwk.
    # It will contain 10 matches with each match containing the stats for the home and away team difficulty rating.
    fixture = requests.get(base_url + 'fixtures?event='+ str(gwk)).json()

    # For each player we will also need the match difficulty for both player's team and plyaer's opponent team.
    for player in r['elements']:

      team_h_difficulty = 0
      team_a_difficulty = 0

      # Get the player element id in the player component and compare it to the id of the element in elements component of general_info
      # Also get the player position which will later be used to filter the data by position
      player_team = None
      position = None
      for element in general_info['elements']:
        if player['id'] == element['id']:
          player_team = element['team']
          position = element['element_type']

      # For each match in the fixture array, we compare the player team id with the away team and home team ids in the match.
      # If they match either, then update team_h_difficulty and team_a_difficulty
      for match_ in fixture:
        if player_team == match_['team_a'] or player_team == match_['team_h']:
          team_h_difficulty = match_['team_h_difficulty']
          team_a_difficulty = match_['team_a_difficulty']

      # Create a temporary variable containing the gwk, team_a_difficulty, team_h_difficulty and position
      temp_data = {'id': int(player['id']), 'gw':gwk,'team_h_difficulty': team_h_difficulty, 'team_a_difficulty':team_a_difficulty, 'position':position}

      # The stats element in each player component contains:
      #  - minutes, goals_scored, assists, clean_sheets, goals_conceded, own_goals, penalties_saved,
      #  - penalties_missed, yellow_cards, red_cards, saves, bonus, bps, influence,
      #  - creativity, threat, ict_index, starts, expected_goals, expected_assists,
      #  - expected_goal_involvements, expected_goals_conceded, total_points, in_dreamteam
      for key in player['stats'].keys():
        temp_data.update({key: player['stats'][key]})

      # Append the player info to the player_history array
      player_history.append(temp_data)
  return player_history


player_data = pd.DataFrame(get_player_data(30))

player_data



Unnamed: 0,id,gw,team_h_difficulty,team_a_difficulty,position,minutes,goals_scored,assists,clean_sheets,goals_conceded,...,creativity,threat,ict_index,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,total_points,in_dreamteam
0,1,1,2,5,4,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
1,2,1,2,5,2,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
2,3,1,2,5,3,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
3,4,1,2,5,3,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
4,5,1,2,5,2,4,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.02,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22833,839,30,2,4,3,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
22834,840,30,2,4,4,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
22835,841,30,2,2,3,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False
22836,842,30,2,2,3,0,0,0,0,0,...,0.0,0.0,0.0,0,0.00,0.00,0.00,0.00,0,False


In [2]:
player_data.columns


Index(['id', 'gw', 'team_h_difficulty', 'team_a_difficulty', 'position',
       'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
       'own_goals', 'penalties_saved', 'penalties_missed', 'yellow_cards',
       'red_cards', 'saves', 'bonus', 'bps', 'influence', 'creativity',
       'threat', 'ict_index', 'starts', 'expected_goals', 'expected_assists',
       'expected_goal_involvements', 'expected_goals_conceded', 'total_points',
       'in_dreamteam'],
      dtype='object')

In [3]:
# Store data to csv
player_data.to_csv('/content/player_data.csv', encoding = 'utf-8-sig')

## Fetch stored player data in csv

In [4]:
import pandas as pd
# Load the player detaills
df = pd.read_csv('/content/player_data.csv')
df.tail()

Unnamed: 0.1,Unnamed: 0,id,gw,team_h_difficulty,team_a_difficulty,position,minutes,goals_scored,assists,clean_sheets,...,creativity,threat,ict_index,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,total_points,in_dreamteam
22833,22833,839,30,2,4,3,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False
22834,22834,840,30,2,4,4,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False
22835,22835,841,30,2,2,3,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False
22836,22836,842,30,2,2,3,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False
22837,22837,843,30,2,5,3,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,False


In [5]:
# Get the data statics
df.describe()

Unnamed: 0.1,Unnamed: 0,id,gw,team_h_difficulty,team_a_difficulty,position,minutes,goals_scored,assists,clean_sheets,...,influence,creativity,threat,ict_index,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,total_points
count,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,...,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0,22838.0
mean,11418.5,383.021806,16.069884,2.513618,2.916367,2.575401,25.265216,0.039977,0.036124,0.063053,...,5.991208,4.052855,3.798713,1.383619,0.282249,0.040455,0.025396,0.065842,0.443002,1.046677
std,6592.90706,222.936607,8.636192,1.048487,1.233877,0.856158,38.149475,0.217718,0.205797,0.243063,...,12.36945,10.516684,10.739667,2.793724,0.456959,0.150496,0.083918,0.192029,0.813138,2.287807
min,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4.0
25%,5709.25,191.0,9.0,2.0,2.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,11418.5,381.0,16.0,2.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17127.75,571.0,24.0,3.0,4.0,3.0,65.0,0.0,0.0,0.0,...,6.4,1.2,0.0,1.8,1.0,0.0,0.01,0.01,0.63,1.0
max,22837.0,843.0,30.0,5.0,5.0,4.0,180.0,3.0,4.0,1.0,...,130.0,181.8,142.0,27.4,2.0,2.77,1.47,3.88,7.82,23.0


In [6]:
# Get details about each feature
## From the data info, all features have numerical values except the 'in_dreamteam' which has a boolean value
## No observation (row) for each feature (column) is null
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22838 entries, 0 to 22837
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  22838 non-null  int64  
 1   id                          22838 non-null  int64  
 2   gw                          22838 non-null  int64  
 3   team_h_difficulty           22838 non-null  int64  
 4   team_a_difficulty           22838 non-null  int64  
 5   position                    22838 non-null  int64  
 6   minutes                     22838 non-null  int64  
 7   goals_scored                22838 non-null  int64  
 8   assists                     22838 non-null  int64  
 9   clean_sheets                22838 non-null  int64  
 10  goals_conceded              22838 non-null  int64  
 11  own_goals                   22838 non-null  int64  
 12  penalties_saved             22838 non-null  int64  
 13  penalties_missed            228

In [7]:
# Of the features, we want to use features that are available at the time the model is running.
## This makes sure that the model only depends on games stats that are available before the match starts.
## These will store in the 'attributes' variable and we will subsquently be using these for the rest of the work.
attributes = ['team_h_difficulty', 'team_a_difficulty', 'position',
       'threat', 'ict_index', 'expected_goals', 'expected_assists',
       'expected_goal_involvements', 'expected_goals_conceded', 'total_points']
players_data = df[attributes]
players_data

Unnamed: 0,team_h_difficulty,team_a_difficulty,position,threat,ict_index,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,total_points
0,2,5,4,0.0,0.0,0.0,0.0,0.0,0.00,0
1,2,5,2,0.0,0.0,0.0,0.0,0.0,0.00,0
2,2,5,3,0.0,0.0,0.0,0.0,0.0,0.00,0
3,2,5,3,0.0,0.0,0.0,0.0,0.0,0.00,0
4,2,5,2,0.0,0.0,0.0,0.0,0.0,0.02,1
...,...,...,...,...,...,...,...,...,...,...
22833,2,4,3,0.0,0.0,0.0,0.0,0.0,0.00,0
22834,2,4,4,0.0,0.0,0.0,0.0,0.0,0.00,0
22835,2,2,3,0.0,0.0,0.0,0.0,0.0,0.00,0
22836,2,2,3,0.0,0.0,0.0,0.0,0.0,0.00,0


In [8]:
# Group the players by position
## Here we will train models for each position i.e each position will have its own model for predicting the data
goalkeepers = players_data.loc[players_data['position'] == 1]
defenders = players_data.loc[players_data['position'] == 2]
midfielders = players_data.loc[players_data['position'] == 3]
forwards = players_data.loc[players_data['position'] == 4]

forwards

Unnamed: 0,team_h_difficulty,team_a_difficulty,position,threat,ict_index,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,total_points
0,2,5,4,0.0,0.0,0.00,0.00,0.00,0.00,0
7,2,5,4,0.0,0.0,0.00,0.00,0.00,0.00,0
12,2,5,4,36.0,7.2,0.21,0.01,0.22,0.30,8
32,2,2,4,0.0,0.0,0.00,0.00,0.00,0.00,0
38,3,4,4,0.0,0.0,0.00,0.00,0.00,0.00,0
...,...,...,...,...,...,...,...,...,...,...
22815,2,2,4,0.0,0.0,0.00,0.00,0.00,0.00,0
22816,2,2,4,0.0,0.0,0.00,0.00,0.00,0.00,0
22818,2,5,4,0.0,0.0,0.00,0.00,0.00,0.00,0
22830,2,4,4,7.0,1.3,0.04,0.00,0.04,0.97,2


In [9]:
from sklearn.model_selection import train_test_split

# define a function that splits and returns features_train, features_test, target_train, target_test
def split_position_data(data):
  # Store the 'total_points' target in the 'player_target' variable
  # and the rest in the player_features variable
  player_target = data['total_points']
  player_features = data.drop("total_points", axis=1)

  # The train_test_split function splits the set into train and test sets while maintain the same data distribution over both sets.
  ## It takes the feature and target sets and reutrns the respective train and test sets
  features_train, features_test, target_train, target_test = train_test_split(player_features, player_target, test_size=0.2)

  return {'feature_train': features_train, 'features_test': features_test, 'target_train': target_train, 'target_test': target_test}

In [10]:
# Split positions into the training and the test sets
goalkeepers_splits =  split_position_data(goalkeepers)
defenders_splits =  split_position_data(defenders)
midfielders_splits =  split_position_data(midfielders)
forwards_splits =  split_position_data(forwards)

## Using ML Models to Predict Points in Fantasy Premier League
For this project we are going to compare to Linear Regression, Decision Trees and Random Forests and find out how they all perform.


#### Helper functions


In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# For the linear model
def Linear_regression(features_train, features_test, target_train, target_test):
    # Before using our data, we need to do feature scaling and we opt for the 'standardization' method of scaling.
    ## The 'standardization' is avaliable thorugh the StandardScaler() method
    # Transformers help in batching tasks in a pipepline. In this case, the data is scaled and then a linear regression model is fitted on the scaled data.
    # We use a transformer that takes the regression model and the transformation method
    # The TransformedTargetRegressor does the transformation and when we do the prediction, it automatically does the inverse transformation (scaling) and returns the values
    model = TransformedTargetRegressor(LinearRegression(), transformer=StandardScaler())

    # fit the transofrmer on the train data
    model.fit(features_train, target_train)

    # With the model fitted, we can predict the total_points given the feature_train and feature_test set
    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    # Evaluate the performance of the model on both sets using the root mean square error
    train_MSE = mean_squared_error(target_train, pred_train)
    test_MSE = mean_squared_error(target_test, pred_test)


    # Get the score of the model or the coeeficient of determination i.e how much of the target value can be explained by the model.
    ## In this case, 0.6 implies that 60% of the variations in the target value can be explained by the model and 40% is not explainable
    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    # If the test error significantly differs from the train error, then there is either overfitting or underfitting

    print('Training set MSE: {}'.format(train_MSE))
    print('Test set MSE: {}'.format(test_MSE))


    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))



    # Carry out cross validation of the model.
    # The evaluation method is the root mean square error
    # The method expects a utility function (greater is better) and so the scoring function is the opposite of the the RMSE. Hence the -ve
    tree_rmses = -cross_val_score(model, features_train, target_train,
                                  scoring="neg_root_mean_squared_error", cv=10)

    print(pd.Series(tree_rmses).describe())


# Decision Tree Model
def DecionTreeRegression(features_train, features_test, target_train, target_test):
    # The DecisionTreeRegressor is passed as the model to the TransformedTreeRegressor together with the StandardScaler
    model = TransformedTargetRegressor(DecisionTreeRegressor(), transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    train_MSE = mean_squared_error(target_train, pred_train)
    test_MSE = mean_squared_error(target_test, pred_test)

    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    print('Training set MSE: {}'.format(train_MSE))
    print('Test set MSE: {}'.format(test_MSE))
    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))

    from sklearn.model_selection import cross_val_score

    tree_rmses = -cross_val_score(model, features_train, target_train,
                                  scoring="neg_root_mean_squared_error", cv=10)
    print(pd.Series(tree_rmses).describe())


# RandomForestRegressor
def RandomForestRegression(features_train, features_test, target_train, target_test):
    # RandomForestRegressor is an ensemble method
    # The TransformedTargetRegressor is passed the RandomForestRegressor model
    # The RandomForestRegressor is passed some hyper-parameters such as;
    ## n_esimtaors: number of trees in the forest,
    ## max_depth: the maximum depth of the tree,
    ## criterion: the function to measure the quality of the split

    model = TransformedTargetRegressor(RandomForestRegressor(n_estimators=200,  max_depth=8, criterion="squared_error", random_state=18), transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    train_MSE = mean_squared_error(target_train, pred_train)
    test_MSE = mean_squared_error(target_test, pred_test)

    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    print('Training set MSE: {}'.format(train_MSE))
    print('Test set MSE: {}'.format(test_MSE))
    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))


    tree_rmses = -cross_val_score(model, features_train, target_train,
                                  scoring="neg_root_mean_squared_error", cv=10)
    print(pd.Series(tree_rmses).describe())


def GridSearchParams(features_train, target_train):
    # Instatiate the model
    model = RandomForestRegressor()


    param_grid = {'n_estimators': [8,10,12,14,16,18,20]}

    # Define the possible values of the hyperparameter
    grid = {
        'n_estimators': [8, 10, 12, 14, 16, 18, 20, 200, 300, 400, 500],
        'max_features': ['sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        'random_state' : [18]
    }

    # Deine the model with cv=3 for a 3-fold cross validation
    # GridSearchCV has the best_estimator_ parameter that returns the  estimator
    # which gave highest score (or smallest loss if specified)

    grid_search = GridSearchCV(model, grid, cv=3, scoring='neg_root_mean_squared_error')
    grid_search.fit(features_train, target_train)

    # Get the best param combination
    print(grid_search.best_estimator_)

### Linear Regression

#### Goalkeepers

In [40]:
Linear_regression(goalkeepers_splits['feature_train'], goalkeepers_splits['features_test'], goalkeepers_splits['target_train'], goalkeepers_splits['target_test'])

Training set MSE: 1.4777769174219517
Test set MSE: 1.3815357070515257
Training set R2: 0.5724696799985334
Test set R2: 0.5532251533410851
count    10.000000
mean      1.213898
std       0.181890
min       0.944873
25%       1.084946
50%       1.231925
75%       1.316268
max       1.500201
dtype: float64


#### Defenders


In [41]:
Linear_regression(defenders_splits['feature_train'], defenders_splits['features_test'], defenders_splits['target_train'], defenders_splits['target_test'])

Training set MSE: 2.253626756366973
Test set MSE: 2.6475898601744565
Training set R2: 0.5129571713775085
Test set R2: 0.5047604717148642
count    10.000000
mean      1.504566
std       0.098543
min       1.371166
25%       1.402343
50%       1.540565
75%       1.588740
max       1.610302
dtype: float64


#### Midfielders

In [42]:
Linear_regression(midfielders_splits['feature_train'], midfielders_splits['features_test'], midfielders_splits['target_train'], midfielders_splits['target_test'])

Training set MSE: 1.4573912446956916
Test set MSE: 1.293127483791271
Training set R2: 0.7465070103905842
Test set R2: 0.7438729649955487
count    10.000000
mean      1.210221
std       0.079891
min       1.088600
25%       1.166196
50%       1.195352
75%       1.258717
max       1.369121
dtype: float64


#### Forwards

In [43]:
Linear_regression(forwards_splits['feature_train'], forwards_splits['features_test'], forwards_splits['target_train'], forwards_splits['target_test'])

Training set MSE: 1.0039239666377573
Test set MSE: 1.0000550446408045
Training set R2: 0.8498979820293926
Test set R2: 0.8408476639991612
count    10.000000
mean      1.006067
std       0.092849
min       0.872329
25%       0.960783
50%       0.995949
75%       1.069438
max       1.150083
dtype: float64


### DecionTreeRegressor

#### Goalkeepers

In [44]:
DecionTreeRegression(goalkeepers_splits['feature_train'], goalkeepers_splits['features_test'], goalkeepers_splits['target_train'], goalkeepers_splits['target_test'])

Training set MSE: 0.0018960791942247005
Test set MSE: 1.7262422389466736
Training set R2: 0.9994514521541795
Test set R2: 0.44175050440967156
count    10.000000
mean      1.436349
std       0.161433
min       1.272421
25%       1.359261
50%       1.399811
75%       1.452989
max       1.858073
dtype: float64


#### Defenders

In [45]:
DecionTreeRegression(defenders_splits['feature_train'], defenders_splits['features_test'], defenders_splits['target_train'], defenders_splits['target_test'])

Training set MSE: 0.0037760189224556972
Test set MSE: 4.560226267140264
Training set R2: 0.9991839451977889
Test set R2: 0.1469961645557828
count    10.000000
mean      1.983287
std       0.137577
min       1.786877
25%       1.902610
50%       1.967359
75%       2.016096
max       2.211768
dtype: float64


#### Midfielders

In [46]:
DecionTreeRegression(midfielders_splits['feature_train'], midfielders_splits['features_test'], midfielders_splits['target_train'], midfielders_splits['target_test'])

Training set MSE: 0.0047547036323420085
Test set MSE: 2.454938352171454
Training set R2: 0.9991729852619495
Test set R2: 0.5137555352107253
count    10.000000
mean      1.610496
std       0.094334
min       1.444768
25%       1.561283
50%       1.580187
75%       1.671618
max       1.780387
dtype: float64


#### Forwards


In [47]:
DecionTreeRegression(forwards_splits['feature_train'], forwards_splits['features_test'], forwards_splits['target_train'], forwards_splits['target_test'])

Training set MSE: 0.007274753905774058
Test set MSE: 1.8072258896139535
Training set R2: 0.9989123128067623
Test set R2: 0.7123916092873044
count    10.000000
mean      1.435281
std       0.232165
min       1.081491
25%       1.339719
50%       1.409952
75%       1.474294
max       1.961489
dtype: float64


### RandomForestRegressor

#### Goalkeeprs

In [48]:
RandomForestRegression(goalkeepers_splits['feature_train'], goalkeepers_splits['features_test'], goalkeepers_splits['target_train'], goalkeepers_splits['target_test'])

Training set MSE: 0.542725421736506
Test set MSE: 1.1600145634999697
Training set R2: 0.8429860620419428
Test set R2: 0.6248628782560466
count    10.000000
mean      1.061845
std       0.164132
min       0.848590
25%       0.995718
50%       1.015753
75%       1.159645
max       1.403496
dtype: float64


#### Defenders


In [49]:
RandomForestRegression(defenders_splits['feature_train'], defenders_splits['features_test'], defenders_splits['target_train'], defenders_splits['target_test'])

Training set MSE: 1.1872838902288627
Test set MSE: 2.398469681375406
Training set R2: 0.7434099934067258
Test set R2: 0.5513591393146189
count    10.000000
mean      1.403177
std       0.088586
min       1.284151
25%       1.321954
50%       1.419753
75%       1.479265
max       1.520489
dtype: float64


#### Midfielders

In [50]:
RandomForestRegression(midfielders_splits['feature_train'], midfielders_splits['features_test'], midfielders_splits['target_train'], midfielders_splits['target_test'])

Training set MSE: 0.6672342655247372
Test set MSE: 1.091991170639396
Training set R2: 0.8839438556027381
Test set R2: 0.7837116105777131
count    10.000000
mean      1.142626
std       0.099386
min       1.001977
25%       1.082784
50%       1.114699
75%       1.195172
max       1.329462
dtype: float64


#### Forwards

In [51]:
RandomForestRegression(forwards_splits['feature_train'], forwards_splits['features_test'], forwards_splits['target_train'], forwards_splits['target_test'])

Training set MSE: 0.30324271522597984
Test set MSE: 0.9451327238504087
Training set R2: 0.9546605669324262
Test set R2: 0.8495881985319563
count    10.000000
mean      0.999950
std       0.139053
min       0.840827
25%       0.896659
50%       0.987925
75%       1.020344
max       1.250391
dtype: float64


#### Fine Tunning the RandomForestRegressor
From the above, we see that RandomForestRegressor performs the best MSE and they do not differ that much on the test and the train sets.
In order to improve its performance we carry out Cross validation.

#### GridSearchCV

At this stage we are doing hyperparameter tuning i.e. finding the best combination of hyperparameters to be passed to the RandomForestRegressor.

#### Goalkeepers

In [52]:
GridSearchParams(goalkeepers_splits['feature_train'], goalkeepers_splits['target_train'])

330 fits failed out of a total of 1320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 373, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -1.174839   -1.17076003 -1.1727467  -1.17126744 -1.17213955 -1.18874209
 -1.1822541  -1.17087299 -1.17472833 -1.17004882 -1.1749628  -1.174839
 -1.17076003 -1.1727467  -1.17126744 -1.17213955 -1.17717395 -1.

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=300,
                      random_state=18)


##### RandomForestRegressor using the best hyperparameters from the GridSearchCV


In [53]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# RandomForestRegressor is an ensemble method
# The TransformedTargetRegressor is passed the RandomForestRegressor model
# The RandomForestRegressor is passed some hyper-parameters such as;
## n_esimtaors: number of trees in the forest,
## max_depth: the maximum depth of the tree,
## criterion: the function to measure the quality of the split

model = TransformedTargetRegressor(RandomForestRegressor(criterion='friedman_mse', max_depth=8,
                      max_features='sqrt', n_estimators=300, random_state=18), transformer=StandardScaler())
model.fit(goalkeepers_splits['feature_train'], goalkeepers_splits['target_train'])

pred_train = model.predict(goalkeepers_splits['feature_train'])
pred_test = model.predict(goalkeepers_splits['features_test'])

train_MSE = mean_squared_error(goalkeepers_splits['target_train'], pred_train)
test_MSE = mean_squared_error(goalkeepers_splits['target_test'], pred_test)

R2_train = model.score(goalkeepers_splits['feature_train'], goalkeepers_splits['target_train'])
R2_test = model.score(goalkeepers_splits['features_test'], goalkeepers_splits['target_test'])

print('Training set MSE: {}'.format(train_MSE))
print('Test set MSE: {}'.format(test_MSE))
print('Training set R2: {}'.format(R2_train))
print('Test set R2: {}'.format(R2_test))

tree_rmses = -cross_val_score(model, goalkeepers_splits['feature_train'], goalkeepers_splits['target_train'],
                              scoring="neg_root_mean_squared_error", cv=10)

pd.Series(tree_rmses).describe()

Training set MSE: 0.7227174395204712
Test set MSE: 1.1875550077600365
Training set R2: 0.7909132193458107
Test set R2: 0.6159565737006112


count    10.000000
mean      1.089370
std       0.154952
min       0.886496
25%       1.012531
50%       1.037961
75%       1.189111
max       1.361156
dtype: float64

#### Defenders

In [54]:
GridSearchParams(defenders_splits['feature_train'], defenders_splits['target_train'])

330 fits failed out of a total of 1320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 373, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -1.49267374 -1.49383713 -1.4953359  -1.49603629 -1.49716652 -1.48887686
 -1.49803383 -1.49445552 -1.49659105 -1.49350163 -1.49286483 -1.49267374
 -1.49383713 -1.4953359  -1.49603629 -1.49716652 -1.48254503 -

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=200,
                      random_state=18)


##### RandomForestRegressor using the best hyperparameters from the GridSearchCV

In [58]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# RandomForestRegressor is an ensemble method
# The TransformedTargetRegressor is passed the RandomForestRegressor model
# The RandomForestRegressor is passed some hyper-parameters such as;
## n_esimtaors: number of trees in the forest,
## max_depth: the maximum depth of the tree,
## criterion: the function to measure the quality of the split

model = TransformedTargetRegressor(RandomForestRegressor(criterion='friedman_mse', max_depth=8,
                      max_features='sqrt', n_estimators=200, random_state=18), transformer=StandardScaler())
model.fit(defenders_splits['feature_train'], defenders_splits['target_train'])

pred_train = model.predict(defenders_splits['feature_train'])
pred_test = model.predict(defenders_splits['features_test'])

train_MSE = mean_squared_error(defenders_splits['target_train'], pred_train)
test_MSE = mean_squared_error(defenders_splits['target_test'], pred_test)

R2_train = model.score(defenders_splits['feature_train'], defenders_splits['target_train'])
R2_test = model.score(defenders_splits['features_test'], defenders_splits['target_test'])

print('Training set MSE: {}'.format(train_MSE))
print('Test set MSE: {}'.format(test_MSE))
print('Training set R2: {}'.format(R2_train))
print('Test set R2: {}'.format(R2_test))

tree_rmses = -cross_val_score(model, defenders_splits['feature_train'], defenders_splits['target_train'],
                              scoring="neg_root_mean_squared_error", cv=10)

pd.Series(tree_rmses).describe()

Training set MSE: 1.3525282695655276
Test set MSE: 2.5111368385378436
Training set R2: 0.7076981836766001
Test set R2: 0.5302844137290342


count    10.000000
mean      1.423372
std       0.098705
min       1.269727
25%       1.351930
50%       1.443910
75%       1.514239
max       1.527302
dtype: float64

#### Midfielders

In [55]:
GridSearchParams(midfielders_splits['feature_train'], midfielders_splits['target_train'])

330 fits failed out of a total of 1320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 373, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -1.26984521 -1.26285002 -1.26070868 -1.26142277 -1.26205675 -1.27956709
 -1.27553694 -1.25921887 -1.26523084 -1.26322399 -1.26705438 -1.26984521
 -1.26285002 -1.26070868 -1.26142277 -1.26205675 -1.2407147  -

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=300,
                      random_state=18)


##### RandomForestRegressor using the best hyperparameters from the GridSearchCV

In [59]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# RandomForestRegressor is an ensemble method
# The TransformedTargetRegressor is passed the RandomForestRegressor model
# The RandomForestRegressor is passed some hyper-parameters such as;
## n_esimtaors: number of trees in the forest,
## max_depth: the maximum depth of the tree,
## criterion: the function to measure the quality of the split

model = TransformedTargetRegressor(RandomForestRegressor(criterion='friedman_mse', max_depth=8,
                      max_features='sqrt', n_estimators=300, random_state=18), transformer=StandardScaler())
model.fit(midfielders_splits['feature_train'], midfielders_splits['target_train'])

pred_train = model.predict(midfielders_splits['feature_train'])
pred_test = model.predict(midfielders_splits['features_test'])

train_MSE = mean_squared_error(midfielders_splits['target_train'], pred_train)
test_MSE = mean_squared_error(midfielders_splits['target_test'], pred_test)

R2_train = model.score(midfielders_splits['feature_train'], midfielders_splits['target_train'])
R2_test = model.score(midfielders_splits['features_test'], midfielders_splits['target_test'])

print('Training set MSE: {}'.format(train_MSE))
print('Test set MSE: {}'.format(test_MSE))
print('Training set R2: {}'.format(R2_train))
print('Test set R2: {}'.format(R2_test))

tree_rmses = -cross_val_score(model, midfielders_splits['feature_train'], midfielders_splits['target_train'],
                              scoring="neg_root_mean_squared_error", cv=10)

pd.Series(tree_rmses).describe()

Training set MSE: 0.8122982640234496
Test set MSE: 1.1723222478916933
Training set R2: 0.8587119854388601
Test set R2: 0.7678006034316724


count    10.000000
mean      1.173845
std       0.100744
min       1.062463
25%       1.100407
50%       1.149468
75%       1.221723
max       1.338032
dtype: float64

#### Forwards

In [56]:
GridSearchParams(forwards_splits['feature_train'], forwards_splits['target_train'])

330 fits failed out of a total of 1320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 373, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

 -1.19803871 -1.17931974 -1.17790588 -1.17943999 -1.18152415 -1.20488469
 -1.21735673 -1.20065864 -1.19054805 -1.1918743  -1.19044922 -1.19803871
 -1.17931974 -1.17790588 -1.17943999 -1.18152415 -1.1623152  -

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=500,
                      random_state=18)


##### RandomForestRegressor using the best hyperparameters from the GridSearchCV

In [62]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


# RandomForestRegressor is an ensemble method
# The TransformedTargetRegressor is passed the RandomForestRegressor model
# The RandomForestRegressor is passed some hyper-parameters such as;
## n_esimtaors: number of trees in the forest,
## max_depth: the maximum depth of the tree,
## criterion: the function to measure the quality of the split

model = TransformedTargetRegressor(RandomForestRegressor(criterion='friedman_mse', max_depth=8,
                      max_features='sqrt', n_estimators=500, random_state=18), transformer=StandardScaler())
model.fit(forwards_splits['feature_train'], forwards_splits['target_train'])

pred_train = model.predict(forwards_splits['feature_train'])
pred_test = model.predict(forwards_splits['features_test'])

train_MSE = mean_squared_error(forwards_splits['target_train'], pred_train)
test_MSE = mean_squared_error(forwards_splits['target_test'], pred_test)

R2_train = model.score(forwards_splits['feature_train'], forwards_splits['target_train'])
R2_test = model.score(forwards_splits['features_test'], forwards_splits['target_test'])

print('Training set MSE: {}'.format(train_MSE))
print('Test set MSE: {}'.format(test_MSE))
print('Training set R2: {}'.format(R2_train))
print('Test set R2: {}'.format(R2_test))

tree_rmses = -cross_val_score(model, forwards_splits['feature_train'], forwards_splits['target_train'],
                              scoring="neg_root_mean_squared_error", cv=10)

pd.Series(tree_rmses).describe()

Training set MSE: 0.41698846058138783
Test set MSE: 1.1133190145000793
Training set R2: 0.9376538348682457
Test set R2: 0.8228224308038157


count    10.000000
mean      1.089309
std       0.140496
min       0.969474
25%       0.984216
50%       1.056134
75%       1.084890
max       1.358957
dtype: float64

### RadomGridSearchCV
This allows for specifying the range in which values can be picked instead of listing the hyperparamter values.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor

# Generate random numbers between 2 aand 100 for the n_estimators parameter
param_distribs = {'n_estimators': randint(low=2, high=100)}

rnd_search = RandomizedSearchCV(
    RandomForestRegressor(), param_distributions=param_distribs,
    cv=3,
    scoring='neg_root_mean_squared_error', random_state=42
  )
rnd_search.fit(features_train,target_train)

# Get the best param combination
rnd_search.best_params_

KeyboardInterrupt: 

### XGBoost

In [12]:
!pip install xgboost



In [13]:
from xgboost import XGBRegressor as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np

#=========================================================================
# XGBoost regression:
# Parameters:
# n_estimators  "Number of gradient boosted trees. Equivalent to number
#                of boosting rounds."
# learning_rate "Boosting learning rate (also known as “eta”)"
# max_depth     "Maximum depth of a tree. Increasing this value will make
#                the model more complex and more likely to overfit."
#=========================================================================
regressor = xgb(eval_metric='rmsle')

#=========================================================================
# exhaustively search for the optimal hyperparameters
#=========================================================================
# set up our search grid
param_grid = {"max_depth":    [4, 5, 6],
              "n_estimators": np.arange(100,1000, 50),
              "learning_rate": np.arange(0.01, .2, 0.005)}

# try out every combination of the above values
search = GridSearchCV(regressor, param_grid, cv=5).fit(goalkeepers_splits['feature_train'], goalkeepers_splits['target_train'])

print("The best hyperparameters are ",search.best_params_)

NameError: name 'np' is not defined

In [1]:
from xgboost import XGBRegressor as xgb
from sklearn.model_selection import GridSearchCV
import numpy as np

#=========================================================================
# XGBoost regression:
# Parameters:
# n_estimators  "Number of gradient boosted trees. Equivalent to number
#                of boosting rounds."
# learning_rate "Boosting learning rate (also known as “eta”)"
# max_depth     "Maximum depth of a tree. Increasing this value will make
#                the model more complex and more likely to overfit."
#=========================================================================
regressor = xgb(eval_metric='rmsle')

#=========================================================================
# exhaustively search for the optimal hyperparameters
#=========================================================================
# set up our search grid
param_grid = {"max_depth":    [4, 5, 6],
              "n_estimators": [100, 200, 300, 400, 500, 600, 700],
              "learning_rate": np.arange(0.01, .2, 0.005)}

# try out every combination of the above values
goalkeepers_search = GridSearchCV(regressor, param_grid, cv=5).fit(goalkeepers_splits['feature_train'], goalkeepers_splits['target_train'])
defenders_search = GridSearchCV(regressor, param_grid, cv=5).fit(defenders_splits['feature_train'], defenders_splits['target_train'])
midfielders_search = GridSearchCV(regressor, param_grid, cv=5).fit(midfielders_splits['feature_train'], midfielders_splits['target_train'])
forwards_search = GridSearchCV(regressor, param_grid, cv=5).fit(forwards_splits['feature_train'], forwards_splits['target_train'])

print("The best hyperparameters are ",goalkeepers_search.best_params_)
print("The best defenders hyperparameters are ",defenders_search.best_params_)
print("The best midfielders hyperparameters are ",midfielders_search.best_params_)
print("The best midfielder hyperparameters are ",forwards_search.best_params_)

NameError: name 'goalkeepers_splits' is not defined