In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.feature_selection import VarianceThreshold

%matplotlib inline

# Helper functions


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# For the linear model


def Linear_regression(features_train, features_test, target_train, target_test):
    # Before using our data, we need to do feature scaling and we opt for the 'standardization' method of scaling.
    # The 'standardization' is avaliable thorugh the StandardScaler() method
    # Transformers help in batching tasks in a pipepline. In this case, the data is scaled and then a linear regression model is fitted on the scaled data.
    # We use a transformer that takes the regression model and the transformation method
    # The TransformedTargetRegressor does the transformation and when we do the prediction, it automatically does the inverse transformation (scaling) and returns the values
    model = TransformedTargetRegressor(
        LinearRegression(), transformer=StandardScaler())

    # fit the transofrmer on the train data
    model.fit(features_train, target_train)

    # With the model fitted, we can predict the total_points given the feature_train and feature_test set
    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    # Evaluate the performance of the model on both sets using the root mean square error
    train_RMSE = mean_squared_error(target_train, pred_train, squared=False)
    test_RMSE = mean_squared_error(target_test, pred_test, squared=False)

    # Get the score of the model or the coeeficient of determination i.e how much of the target value can be explained by the model.
    # In this case, 0.6 implies that 60% of the variations in the target value can be explained by the model and 40% is not explainable
    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    # If the test error significantly differs from the train error, then there is either overfitting or underfitting
    # RMSE, just like the squared loss function that it derives from, effectively penalizes larger errors more severely.
    print('Training set RMSE: {}'.format(train_RMSE))
    print('Test set RMSE: {}'.format(test_RMSE))



    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))

    # Carry out cross validation of the model.
    # The evaluation method is the root mean square error
    # The method expects a utility function (greater is better) and so the scoring function is the opposite of the the RMSE. Hence the -ve
    tree_rmses = -cross_val_score(model, features_train, target_train,
                                  scoring="neg_root_mean_squared_error", cv=10)

    print(pd.Series(tree_rmses).describe())

    return {'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


# Decision Tree Model
def DecisionTreeRegression(features_train, features_test, target_train, target_test):
    # The DecisionTreeRegressor is passed as the model to the TransformedTreeRegressor together with the StandardScaler
    model = TransformedTargetRegressor(
        DecisionTreeRegressor(), transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    train_RMSE = mean_squared_error(target_train, pred_train, squared=False)
    test_RMSE = mean_squared_error(target_test, pred_test, squared=False)

    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    print('Training set RMSE: {}'.format(train_RMSE))
    print('Test set RMSE: {}'.format(test_RMSE))
    print('Training set R2: {}'.format(R2_train))
    print('Test set R2: {}'.format(R2_test))

    from sklearn.model_selection import cross_val_score

    tree_rmses = -cross_val_score(model, features_train, target_train,
                                  scoring="neg_root_mean_squared_error", cv=10)
    print(pd.Series(tree_rmses).describe())
    return {'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


# RandomForestRegressor
def RandomForestRegression(features_train, features_test, target_train, target_test, hyperparameters):
    # RandomForestRegressor is an ensemble method
    # The TransformedTargetRegressor is passed the RandomForestRegressor model
    # The RandomForestRegressor is passed some hyper-parameters such as;
    # n_esimtaors: number of trees in the forest,
    # max_depth: the maximum depth of the tree,
    # criterion: the function to measure the quality of the split

    model = TransformedTargetRegressor(RandomForestRegressor(
        n_estimators=hyperparameters['n_estimators'],  max_depth=hyperparameters['max_depth'], criterion=hyperparameters['criterion'], random_state=18), transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    train_RMSE = mean_squared_error(target_train, pred_train, squared=False)
    test_RMSE = mean_squared_error(target_test, pred_test, squared=False)

    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    # print('Training set RMSE: {}'.format(train_RMSE))
    # print('Test set RMSE: {}'.format(test_RMSE))
    # print('Training set R2: {}'.format(R2_train))
    # print('Test set R2: {}'.format(R2_test))

    tree_rmses = -cross_val_score(model, features_train, target_train,
                                  scoring="neg_root_mean_squared_error", cv=10)
    print(pd.Series(tree_rmses).describe())

    return {'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


def XGBoostRegression(features_train, features_test, target_train, target_test, hyperparameters):
    regressor = xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle')

    model = TransformedTargetRegressor(regressor, transformer=StandardScaler())


    model.fit(features_train, target_train)

    # =========================================================================
    # To use early_stopping_rounds:
    # "Validation metric needs to improve at least once in every
    # early_stopping_rounds round(s) to continue training."
    # =========================================================================
    # first perform a test/train split
    # from sklearn.model_selection import train_test_split

    # X_train,X_test,y_train,y_test = train_test_split(X_train,y_train, test_size = 0.2)
    # model.fit(X_train, y_train, early_stopping_rounds=6, eval_set=[(X_test, y_test)], verbose=False)

    # =========================================================================
    # use the model to predict the prices for the test data
    # =========================================================================
    # predictions = model.predict(goalkeepers_splits['feature_test'])

    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)

    train_RMSE = mean_squared_error(target_train,  pred_train, squared=False)
    test_RMSE = mean_squared_error(target_test, pred_test, squared=False)

    R2_train = model.score(features_train, target_train)
    R2_test = model.score(features_test, target_test)

    # print('Training set RMSE: {}'.format(train_RMSE))
    # print('Test set RMSE: {}'.format(test_RMSE))
    # print('Training set R2: {}'.format(R2_train))
    # print('Test set R2: {}'.format(R2_test))

    tree_rmses = -cross_val_score(model, features_train, target_train, scoring="neg_root_mean_squared_error", cv=10)
    pd.Series(tree_rmses).describe()

    return {'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE,  'cv_rmse': tree_rmses.mean(), 'R2_train': R2_train, 'R2_test': R2_test}


def GridSearchParams(features_train, target_train):
    # Instatiate the model
    model = RandomForestRegressor()

    param_grid = {'n_estimators': [8, 10, 12, 14, 16, 18, 20]}

    # Define the possible values of the hyperparameter
    grid = {
        'n_estimators': [8, 10, 12, 14, 16, 18, 20, 200, 300, 400, 500],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8],
        'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        'random_state': [18]
    }

    # Deine the model with cv=3 for a 3-fold cross validation
    # GridSearchCV has the best_estimator_ parameter that returns the  estimator
    # which gave highest score (or smallest loss if specified)

    grid_search = GridSearchCV(
        model, grid, cv=3, scoring='neg_root_mean_squared_error')
    grid_search.fit(features_train, target_train)

    # Get the best param combination
    print(grid_search.best_estimator_)

    return {'train_RMSE': train_RMSE, 'test_RMSE': test_RMSE, 'R2_train': R2_train, 'R2_test': R2_test}

In [11]:

# define a function that splits and returns features_train, features_test, target_train, target_test

def split_data(data):
    # Store the 'total_points' target in the 'player_target' variable
    # and the rest in the player_features variable
    player_target = data['pts_bps']
    player_features = data.drop("pts_bps", axis=1)

    # The train_test_split function splits the set into train and test sets while maintain the same data distribution over both sets.
    # It takes the feature and target sets and reutrns the respective train and test sets
    features_train, features_test, target_train, target_test = train_test_split(
        player_features, player_target, test_size=0.2)

    return {'feature_train': features_train, 'features_test': features_test, 'target_train': target_train, 'target_test': target_test}

# Get the data


```
    Total Points – Bonus Points (tp-bp), Minutes, Yellow Cards, Red Cards, Expected Goals (xG), Expected Assists (xA), Non-penalty Expected Goals (npxG),
    Shots, Expected Goals Against, _Expected_goal_involvements_,  clean_sheets, ict_index, opponent_team, Expected Goals Buildup (xG Buildup), threat, value,
    Key Passes,


    _Games_,  Expected Goals Chain (xG Chain),  _Non-penalty Expected Goal Difference (npxGD)_, _Non-penalty Expected Goals Against (npxGA)_, Expected Points (xPts)
```

```js
    Total Points – Bonus Points (tp-bp)	for, mid, def, gk
    Minutes	for, mid, def, gk
    Yellow Cards	for, mid, def, gk
    Red Cards	for, mid, def, gk
    Expected Goals (xG)	for, mid, def, gk
    Expected Assists (xA)	for, mid, def, gk
    Non-penalty Expected Goals (npxG)	for, mid, def
    Games	for, mid, def, gk
    Shots	for, mid, def
    Key Passes	for, mid, def, gk
    Expected Goals Chain (xG Chain)	for, mid, def
    Expected Goals Buildup (xG Buildup)	for, mid, def
    Non-penalty Expected Goal Difference (npxGD)	def, gk
    Expected Goals Against	def, gk
    Non-penalty Expected Goals Against (npxGA)	def, gk
    Expected Points (xPts)	def, gk
    expected_goal_involvements	for, mid, def, gk
    clean_sheets	mid, def, gk
    ict_index	for, mid, def, gk
    opponent_team	for, mid, def, gk
    threat	for, mid, def, gk
    value	for, mid, def, gk

```


In [40]:
columns = [
    'total_points', 'bonus', 'minutes', 'yellow_cards', 'red_cards', 'expected_goals', 'expected_assists', 'npxG', 'shots', 'expected_goal_involvements', 'expected_goals_conceded', 'clean_sheets', 'ict_index',
    'xGBuildup', 'threat', 'value', 'key_passes', 'xGChain', 'xP', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'position']

gk_cols = [
            'pts_bps', 'minutes', 'yellow_cards', 'red_cards', 'expected_goals', 'expected_assists', 'expected_goal_involvements', 'expected_goals_conceded', 'clean_sheets', 'ict_index',
            'threat', 'value', 'key_passes', 'xP', 'team_h_difficulty', 'team_a_difficulty', 'was_home']
def_cols = [
            'pts_bps', 'minutes', 'yellow_cards', 'red_cards', 'expected_goals', 'expected_assists', 'npxG', 'shots', 'expected_goal_involvements', 'expected_goals_conceded', 'clean_sheets', 'ict_index',
            'xGBuildup', 'threat', 'value', 'key_passes', 'xGChain', 'xP', 'team_h_difficulty', 'team_a_difficulty', 'was_home',
]
mid_cols = [
            'pts_bps', 'minutes', 'yellow_cards', 'red_cards', 'expected_goals', 'expected_assists', 'npxG', 'shots', 'expected_goal_involvements', 'clean_sheets', 'ict_index',
            'xGBuildup', 'threat', 'value', 'key_passes', 'xGChain', 'xP', 'team_h_difficulty', 'team_a_difficulty', 'was_home',
]
fwd_cols = [
            'pts_bps', 'minutes', 'yellow_cards', 'red_cards', 'expected_goals', 'expected_assists', 'npxG', 'shots', 'expected_goal_involvements', 'ict_index',
            'xGBuildup', 'threat', 'value', 'key_passes', 'xGChain', 'xP', 'team_h_difficulty', 'team_a_difficulty', 'was_home',
]



In [3]:
# player_21_22 = pd.read_csv('./data/joint/21-22/merged_player_data.csv')[columns]
player_22_23 = pd.read_csv('./data/joint/22-23/merged_player_data.csv')[columns]
player_23_24 = pd.read_csv('./data/joint/23-24/merged_player_data.csv')[columns]


In [4]:
## Filter out players with zero points
player_22_23_no = player_22_23[player_22_23['total_points'] !=0]
player_23_24_no = player_23_24[player_23_24['total_points'] !=0]

player_data = pd.concat([player_22_23_no, player_23_24_no])

def points_(row):
    return row['total_points'] - row['bonus']
player_data['pts_bps'] = player_data.apply(points_, axis=1)
player_data = player_data.drop(['total_points', 'bonus'], axis=1)

In [41]:
# Group by position
gk_player_data = player_data[player_data['position']=='GK']
gk_player_data = gk_player_data.drop('position', axis=1)
gk_player_data = gk_player_data[gk_cols]

def_player_data = player_data[player_data['position']=='DEF']
def_player_data = def_player_data.drop('position', axis=1)
def_player_data = def_player_data[def_cols]

mid_player_data = player_data[player_data['position']=='MID']
mid_player_data = mid_player_data.drop('position', axis=1)
mid_player_data = mid_player_data[mid_cols]

fwd_player_data = player_data[player_data['position']=='FWD']
fwd_player_data = fwd_player_data.drop('position', axis=1)
fwd_player_data = fwd_player_data[fwd_cols]

# Modeling


## Baseline Model


### Goalkeepers


In [42]:
# Split the gk data into train and test sets
gk_splits = split_data(gk_player_data)

#### Linear Model


In [43]:

base_gk_lin_reg = Linear_regression(gk_splits['feature_train'], gk_splits['features_test'],
                               gk_splits['target_train'], gk_splits['target_test'])

# Store the model evaluation details in a DataFrame
gk_evaluation_stats = pd.DataFrame({"base_gk_lin_reg": [base_gk_lin_reg['train_RMSE'], base_gk_lin_reg['test_RMSE'], base_gk_lin_reg['cv_rmse'], base_gk_lin_reg['R2_train'], base_gk_lin_reg['R2_test']]},
                                                        index=(['train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

gk_evaluation_stats

Training set RMSE: 0.8666694545135065
Test set RMSE: 0.9547367922783395
Training set R2: 0.8617362962638505
Test set R2: 0.8240696338751616
count    10.000000
mean      0.887170
std       0.144102
min       0.707084
25%       0.767397
50%       0.914407
75%       0.969839
max       1.145134
dtype: float64


Unnamed: 0,base_gk_lin_reg
train_RMSE,0.866669
test_RMSE,0.954737
cv_rmse,0.88717
R2_train,0.861736
R2_test,0.82407


#### DecisionTree Model


In [44]:
base_gk_dt_reg = DecisionTreeRegression(gk_splits['feature_train'], gk_splits['features_test'],
                               gk_splits['target_train'], gk_splits['target_test'])

# Store the model evaluation details in a DataFrame
gk_evaluation_stats = gk_evaluation_stats.assign(base_gk_dt_reg = [base_gk_dt_reg['train_RMSE'], base_gk_dt_reg['test_RMSE'], base_gk_dt_reg['cv_rmse'], base_gk_dt_reg['R2_train'], base_gk_dt_reg['R2_test']])

gk_evaluation_stats



Training set RMSE: 1.76330628745295e-16
Test set RMSE: 1.0440737953277488
Training set R2: 1.0
Test set R2: 0.7896047746604633


count    10.000000
mean      1.129773
std       0.123166
min       0.917011
25%       1.054332
50%       1.147251
75%       1.210570
max       1.323937
dtype: float64


Unnamed: 0,base_gk_lin_reg,base_gk_dt_reg
train_RMSE,0.866669,1.763306e-16
test_RMSE,0.954737,1.044074
cv_rmse,0.88717,1.129773
R2_train,0.861736,1.0
R2_test,0.82407,0.7896048


#### RandomForest Model


In [45]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
base_gk_rf_reg = RandomForestRegression(gk_splits['feature_train'], gk_splits['features_test'],
                               gk_splits['target_train'], gk_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
gk_evaluation_stats = gk_evaluation_stats.assign(base_gk_rf_reg = [base_gk_rf_reg['train_RMSE'], base_gk_rf_reg['test_RMSE'], base_gk_rf_reg['cv_rmse'], base_gk_rf_reg['R2_train'], base_gk_rf_reg['R2_test']])

gk_evaluation_stats

count    10.000000
mean      0.849519
std       0.156072
min       0.656151
25%       0.717453
50%       0.872263
75%       0.945775
max       1.106741
dtype: float64


Unnamed: 0,base_gk_lin_reg,base_gk_dt_reg,base_gk_rf_reg
train_RMSE,0.866669,1.763306e-16,0.534741
test_RMSE,0.954737,1.044074,0.718032
cv_rmse,0.88717,1.129773,0.849519
R2_train,0.861736,1.0,0.947363
R2_test,0.82407,0.7896048,0.900491


#### XgBoost Model


In [46]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
base_gk_xgb_reg = XGBoostRegression(gk_splits['feature_train'], gk_splits['features_test'], gk_splits['target_train'], gk_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
gk_evaluation_stats = gk_evaluation_stats.assign(base_gk_xgb_reg = [base_gk_xgb_reg['train_RMSE'], base_gk_xgb_reg['test_RMSE'], base_gk_xgb_reg['cv_rmse'], base_gk_xgb_reg['R2_train'], base_gk_xgb_reg['R2_test']])
gk_evaluation_stats

Unnamed: 0,base_gk_lin_reg,base_gk_dt_reg,base_gk_rf_reg,base_gk_xgb_reg
train_RMSE,0.866669,1.763306e-16,0.534741,0.680271
test_RMSE,0.954737,1.044074,0.718032,0.707018
cv_rmse,0.88717,1.129773,0.849519,0.856708
R2_train,0.861736,1.0,0.947363,0.914815
R2_test,0.82407,0.7896048,0.900491,0.903521


### Defenders


In [47]:
# Split the gk data into train and test sets
def_splits = split_data(def_player_data)

#### Linear Model


In [48]:

base_def_lin_reg = Linear_regression(def_splits['feature_train'], def_splits['features_test'],
                               def_splits['target_train'], def_splits['target_test'])

# Store the model evaluation details in a DataFrame
def_evaluation_stats = pd.DataFrame({"base_def_lin_reg": [base_def_lin_reg['train_RMSE'], base_def_lin_reg['test_RMSE'], base_def_lin_reg['cv_rmse'], base_def_lin_reg['R2_train'], base_def_lin_reg['R2_test']]},
                                                        index=(['train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

def_evaluation_stats

Training set RMSE: 0.9881638099068195
Test set RMSE: 1.0500368551635082
Training set R2: 0.843257828690887
Test set R2: 0.8422845501458033
count    10.000000
mean      0.996349
std       0.064262
min       0.868749
25%       0.977478
50%       0.986592
75%       1.025145
max       1.102427
dtype: float64


Unnamed: 0,base_def_lin_reg
train_RMSE,0.988164
test_RMSE,1.050037
cv_rmse,0.996349
R2_train,0.843258
R2_test,0.842285


#### DecisionTree Model


In [49]:
base_def_dt_reg = DecisionTreeRegression(def_splits['feature_train'], def_splits['features_test'],
                               def_splits['target_train'], def_splits['target_test'])

# Store the model evaluation details in a DataFrame
def_evaluation_stats = def_evaluation_stats.assign(base_def_dt_reg = [base_def_dt_reg['train_RMSE'], base_def_dt_reg['test_RMSE'], base_def_dt_reg['cv_rmse'], base_def_dt_reg['R2_train'], base_def_dt_reg['R2_test']])

def_evaluation_stats



Training set RMSE: 3.263849605818619e-15
Test set RMSE: 1.3908150322617885
Training set R2: 1.0
Test set R2: 0.7233033470082894
count    10.000000
mean      1.298078
std       0.111831
min       1.178188
25%       1.211493
50%       1.286162
75%       1.343359
max       1.515521
dtype: float64


Unnamed: 0,base_def_lin_reg,base_def_dt_reg
train_RMSE,0.988164,3.26385e-15
test_RMSE,1.050037,1.390815
cv_rmse,0.996349,1.298078
R2_train,0.843258,1.0
R2_test,0.842285,0.7233033


#### RandomForest Model


In [50]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
base_def_rf_reg = RandomForestRegression(def_splits['feature_train'], def_splits['features_test'],
                               def_splits['target_train'], def_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
def_evaluation_stats = def_evaluation_stats.assign(base_def_rf_reg = [base_def_rf_reg['train_RMSE'], base_def_rf_reg['test_RMSE'], base_def_rf_reg['cv_rmse'], base_def_rf_reg['R2_train'], base_def_rf_reg['R2_test']])

def_evaluation_stats

count    10.000000
mean      0.947927
std       0.104976
min       0.756752
25%       0.907261
50%       0.926384
75%       0.983453
max       1.158156
dtype: float64


Unnamed: 0,base_def_lin_reg,base_def_dt_reg,base_def_rf_reg
train_RMSE,0.988164,3.26385e-15,0.671091
test_RMSE,1.050037,1.390815,1.040306
cv_rmse,0.996349,1.298078,0.947927
R2_train,0.843258,1.0,0.927708
R2_test,0.842285,0.7233033,0.845194


#### XgBoost Model


In [51]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
base_def_xgb_reg = XGBoostRegression(def_splits['feature_train'], def_splits['features_test'], def_splits['target_train'], def_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
def_evaluation_stats = def_evaluation_stats.assign(base_def_xgb_reg = [base_def_xgb_reg['train_RMSE'], base_def_xgb_reg['test_RMSE'], base_def_xgb_reg['cv_rmse'], base_def_xgb_reg['R2_train'], base_def_xgb_reg['R2_test']])
def_evaluation_stats

Unnamed: 0,base_def_lin_reg,base_def_dt_reg,base_def_rf_reg,base_def_xgb_reg
train_RMSE,0.988164,3.26385e-15,0.671091,0.854862
test_RMSE,1.050037,1.390815,1.040306,1.070279
cv_rmse,0.996349,1.298078,0.947927,0.967495
R2_train,0.843258,1.0,0.927708,0.882694
R2_test,0.842285,0.7233033,0.845194,0.836145


### Midfielders


In [52]:
# Split the gk data into train and test sets
mid_splits = split_data(mid_player_data)

#### Linear Model


In [53]:

base_mid_lin_reg = Linear_regression(mid_splits['feature_train'], mid_splits['features_test'],
                               mid_splits['target_train'], mid_splits['target_test'])

# Store the model evaluation details in a DataFrame
mid_evaluation_stats = pd.DataFrame({"base_mid_lin_reg": [base_mid_lin_reg['train_RMSE'], base_mid_lin_reg['test_RMSE'], base_mid_lin_reg['cv_rmse'], base_mid_lin_reg['R2_train'], base_mid_lin_reg['R2_test']]},
                                                        index=(['train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

mid_evaluation_stats

Training set RMSE: 1.0225933223651604
Test set RMSE: 1.0071337033302326
Training set R2: 0.827081791423851
Test set R2: 0.8259616543207144
count    10.000000
mean      1.027555
std       0.023931
min       0.998427
25%       1.008550
50%       1.028332
75%       1.037493
max       1.078652
dtype: float64


Unnamed: 0,base_mid_lin_reg
train_RMSE,1.022593
test_RMSE,1.007134
cv_rmse,1.027555
R2_train,0.827082
R2_test,0.825962


#### DecisionTree Model


In [54]:
base_mid_dt_reg = DecisionTreeRegression(mid_splits['feature_train'], mid_splits['features_test'],
                               mid_splits['target_train'], mid_splits['target_test'])

# Store the model evaluation details in a DataFrame
mid_evaluation_stats = mid_evaluation_stats.assign(base_mid_dt_reg = [base_mid_dt_reg['train_RMSE'], base_mid_dt_reg['test_RMSE'], base_mid_dt_reg['cv_rmse'], base_mid_dt_reg['R2_train'], base_mid_dt_reg['R2_test']])

mid_evaluation_stats



Training set RMSE: 4.665718631925586e-15
Test set RMSE: 1.4548466562509097
Training set R2: 1.0
Test set R2: 0.6368340460953038
count    10.000000
mean      1.493043
std       0.103292
min       1.351210
25%       1.426836
50%       1.506224
75%       1.559925
max       1.674371
dtype: float64


Unnamed: 0,base_mid_lin_reg,base_mid_dt_reg
train_RMSE,1.022593,4.665719e-15
test_RMSE,1.007134,1.454847
cv_rmse,1.027555,1.493043
R2_train,0.827082,1.0
R2_test,0.825962,0.636834


#### RandomForest Model


In [55]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
base_mid_rf_reg = RandomForestRegression(mid_splits['feature_train'], mid_splits['features_test'],
                               mid_splits['target_train'], mid_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
mid_evaluation_stats = mid_evaluation_stats.assign(base_mid_rf_reg = [base_mid_rf_reg['train_RMSE'], base_mid_rf_reg['test_RMSE'], base_mid_rf_reg['cv_rmse'], base_mid_rf_reg['R2_train'], base_mid_rf_reg['R2_test']])

mid_evaluation_stats

count    10.000000
mean      1.091387
std       0.062725
min       0.988239
25%       1.050997
50%       1.098002
75%       1.119026
max       1.186334
dtype: float64


Unnamed: 0,base_mid_lin_reg,base_mid_dt_reg,base_mid_rf_reg
train_RMSE,1.022593,4.665719e-15,0.798703
test_RMSE,1.007134,1.454847,1.104558
cv_rmse,1.027555,1.493043,1.091387
R2_train,0.827082,1.0,0.894511
R2_test,0.825962,0.636834,0.790662


#### XgBoost Model


In [56]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
base_mid_xgb_reg = XGBoostRegression(mid_splits['feature_train'], mid_splits['features_test'], mid_splits['target_train'], mid_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
mid_evaluation_stats = mid_evaluation_stats.assign(base_mid_xgb_reg = [base_mid_xgb_reg['train_RMSE'], base_mid_xgb_reg['test_RMSE'], base_mid_xgb_reg['cv_rmse'], base_mid_xgb_reg['R2_train'], base_mid_xgb_reg['R2_test']])
mid_evaluation_stats

Unnamed: 0,base_mid_lin_reg,base_mid_dt_reg,base_mid_rf_reg,base_mid_xgb_reg
train_RMSE,1.022593,4.665719e-15,0.798703,1.049512
test_RMSE,1.007134,1.454847,1.104558,1.166173
cv_rmse,1.027555,1.493043,1.091387,1.161116
R2_train,0.827082,1.0,0.894511,0.817858
R2_test,0.825962,0.636834,0.790662,0.766656


### Forwards


In [57]:
# Split the gk data into train and test sets
for_splits = split_data(fwd_player_data)

#### Linear Model


In [58]:

base_for_lin_reg = Linear_regression(for_splits['feature_train'], for_splits['features_test'],
                               for_splits['target_train'], for_splits['target_test'])

# Store the model evaluation details in a DataFrame
for_evaluation_stats = pd.DataFrame({"base_for_lin_reg": [base_for_lin_reg['train_RMSE'], base_for_lin_reg['test_RMSE'], base_for_lin_reg['cv_rmse'], base_for_lin_reg['R2_train'], base_for_lin_reg['R2_test']]},
                                                        index=(['train_RMSE', 'test_RMSE', 'cv_rmse', 'R2_train', 'R2_test']))

for_evaluation_stats

Training set RMSE: 0.900325055225056
Test set RMSE: 0.9259475119208623
Training set R2: 0.8783408896949323
Test set R2: 0.9035256079638062
count    10.000000
mean      0.912752
std       0.070553
min       0.816379
25%       0.870412
50%       0.887105
75%       0.937166
max       1.057395
dtype: float64


Unnamed: 0,base_for_lin_reg
train_RMSE,0.900325
test_RMSE,0.925948
cv_rmse,0.912752
R2_train,0.878341
R2_test,0.903526


#### DecisionTree Model


In [59]:
base_for_dt_reg = DecisionTreeRegression(for_splits['feature_train'], for_splits['features_test'],
                               for_splits['target_train'], for_splits['target_test'])

# Store the model evaluation details in a DataFrame
for_evaluation_stats = for_evaluation_stats.assign(base_for_dt_reg = [base_for_dt_reg['train_RMSE'], base_for_dt_reg['test_RMSE'], base_for_dt_reg['cv_rmse'], base_for_dt_reg['R2_train'], base_for_dt_reg['R2_test']])

for_evaluation_stats



Training set RMSE: 5.649877792470089e-15
Test set RMSE: 1.5397947077242096
Training set R2: 1.0
Test set R2: 0.7332128193287577
count    10.000000
mean      1.666214
std       0.182951
min       1.412168
25%       1.555949
50%       1.645276
75%       1.827235
max       1.921885
dtype: float64


Unnamed: 0,base_for_lin_reg,base_for_dt_reg
train_RMSE,0.900325,5.649878e-15
test_RMSE,0.925948,1.539795
cv_rmse,0.912752,1.666214
R2_train,0.878341,1.0
R2_test,0.903526,0.7332128


#### RandomForest Model


In [60]:
hyperparameters = {"criterion": 'friedman_mse', "max_depth": 8, "max_features": 'sqrt', "n_estimators": 20}
base_for_rf_reg = RandomForestRegression(for_splits['feature_train'], for_splits['features_test'],
                               for_splits['target_train'], for_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
for_evaluation_stats = for_evaluation_stats.assign(base_for_rf_reg = [base_for_rf_reg['train_RMSE'], base_for_rf_reg['test_RMSE'], base_for_rf_reg['cv_rmse'], base_for_rf_reg['R2_train'], base_for_rf_reg['R2_test']])

for_evaluation_stats

count    10.000000
mean      1.162595
std       0.137746
min       0.916681
25%       1.090327
50%       1.145427
75%       1.265113
max       1.375213
dtype: float64


Unnamed: 0,base_for_lin_reg,base_for_dt_reg,base_for_rf_reg
train_RMSE,0.900325,5.649878e-15,0.706578
test_RMSE,0.925948,1.539795,1.201435
cv_rmse,0.912752,1.666214,1.162595
R2_train,0.878341,1.0,0.925068
R2_test,0.903526,0.7332128,0.83758


#### XgBoost Model


In [61]:
hyperparameters = {'learning_rate': 0.02, 'max_depth': 4, 'n_estimators': 150}
base_for_xgb_reg = XGBoostRegression(for_splits['feature_train'], for_splits['features_test'], for_splits['target_train'], for_splits['target_test'], hyperparameters)

# Store the model evaluation details in a DataFrame
for_evaluation_stats = for_evaluation_stats.assign(base_for_xgb_reg = [base_for_xgb_reg['train_RMSE'], base_for_xgb_reg['test_RMSE'], base_for_xgb_reg['cv_rmse'], base_for_xgb_reg['R2_train'], base_for_xgb_reg['R2_test']])
for_evaluation_stats

Unnamed: 0,base_for_lin_reg,base_for_dt_reg,base_for_rf_reg,base_for_xgb_reg
train_RMSE,0.900325,5.649878e-15,0.706578,0.903485
test_RMSE,0.925948,1.539795,1.201435,1.205441
cv_rmse,0.912752,1.666214,1.162595,1.200257
R2_train,0.878341,1.0,0.925068,0.877486
R2_test,0.903526,0.7332128,0.83758,0.836495
