In [38]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline  # Use the imblearn pipeline

from sklearn.utils.validation import column_or_1d
from sklearn.ensemble import RandomForestClassifier

In [39]:
%run ./ml_model_init.ipynb

In [40]:
data_tar_preds = data_tar.copy()
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,npg_5,npxG_5,xGChain_5,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,0.0,0.00,0.44,0.44,2.1,289678.0,3.0,0.62,0.25,0.20
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,0.0,0.15,1.70,1.35,28.7,3267128.0,0.0,0.15,0.20,0.71
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,0.0,0.17,0.33,0.11,9.5,79189.0,2.0,0.38,0.32,0.36
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,0.0,1.05,2.76,1.27,13.0,524668.0,5.0,0.62,0.25,0.20
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,0.0,0.00,0.02,0.00,-0.3,2784.0,1.0,0.62,0.25,0.20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,2.0,2.48,2.19,2.17,38.7,15131904.0,2.0,0.17,0.22,0.69
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,0.0,0.00,0.00,0.00,0.2,13728.0,0.0,0.38,0.32,0.36
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,0.0,0.73,0.94,0.21,7.3,14658.0,1.0,0.51,0.26,0.28
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,1.0,0.70,0.89,0.07,22.1,5536769.0,2.0,0.17,0.22,0.69


## Functions


In [41]:
# For the linear model
def Linear_regression(features_train, features_pred, target_train):
    # bool_cols = features_train.drop(columns=['was_home']).columns.tolist()

    bool_cols = features_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = features_train.select_dtypes(include=['object', 'category']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, bool_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

    model = TransformedTargetRegressor(regressor=pipeline, transformer=StandardScaler())
    model.fit(features_train, target_train)

    return model.predict(features_pred)

# Decision Tree Model
def DecisionTreeRegression(features_train, features_pred, target_train):
    bool_cols = features_train.drop(columns=['was_home']).columns.tolist()
    categorical_cols = ['was_home']

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, bool_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', DecisionTreeRegressor())
    ])

    model = TransformedTargetRegressor(regressor=pipeline, transformer=StandardScaler())
    model.fit(features_train, target_train)

    return model.predict(features_pred)

# RandomForestRegressor
def RandomForestRegression(features_train, features_pred, target_train, hyperparameters):
    model = TransformedTargetRegressor(RandomForestRegressor(
        n_estimators=hyperparameters['n_estimators'],  max_depth=hyperparameters['max_depth'], criterion=hyperparameters['criterion'], random_state=18), transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_pred = model.predict(features_pred)

def XGBoostRegression(features_train, features_pred, target_train, hyperparameters):
    regressor = xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle')

    model = TransformedTargetRegressor(regressor, transformer=StandardScaler())


    model.fit(features_train, target_train)

    return model.predict(features_pred)


def Random_Forest_Classifier(features_train, features_pred, target_train, hy_params):
    encoder = LabelEncoder()
    cs_train_ = encoder.fit_transform(target_train)

    # bool_cols = features_train.drop(columns=['was_home']).columns.tolist()
    # categorical_cols = ['was_home']
    bool_cols = features_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = features_train.select_dtypes(include=['object', 'category']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder(sparse=False)),
        # ('to_dense', ToDense())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, bool_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', BorderlineSMOTE(sampling_strategy='auto', random_state=42)),  # Apply SMOTE to the data
        ('classifier', RandomForestClassifier(bootstrap = hy_params['bootstrap'], min_samples_split = hy_params['min_samples_split'], n_estimators =hy_params['n_estimators']  , class_weight='balanced', random_state=42))  # Random Forest Classifier
    ])

    pipeline.fit(features_train, cs_train_)

    # Make predictions
    return pipeline.predict(features_pred)

In [65]:
len(data[data['position'] == 'GK']['element'].unique()) + len(data[data['position'] == 'DEF']['element'].unique()) + len(data[data['position'] == 'MID']['element'].unique()) + len(data[data['position'] == 'FWD']['element'].unique())

987

In [64]:
len(data[data['position'] == 'GK']['element'].unique()), len(data[data['position'] == 'DEF']['element'].unique()), len(data[data['position'] == 'MID']['element'].unique()), len(data[data['position'] == 'FWD']['element'].unique())

(81, 332, 443, 131)

In [76]:
len(data[(data['position'] != 'GK')]['element'].unique())

660

## Goals Predictor


In [77]:

# (data['minutes_5'] >= 300) &
xg_data= data[(data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5',
                                                                     'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5', 'was_home', 'xG', 'xG_3',
                                                                      'xG_5', 'expected_goals_3', 'expected_goals_5', 'goals_scored_3', 'whh', 'whd', 'wha']]

# (data_tar['minutes_5'] >= 300) &
xg_data_tar = data_tar[ (data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',
                                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3',
                                                                                        'threat_5', 'was_home', 'xG', 'xG_3', 'xG_5', 'expected_goals_3', 'expected_goals_5',
                                                                                        'goals_scored_3','whh', 'whd', 'wha']]

In [79]:
xg_data_tar

Unnamed: 0,position,minutes_3,minutes_5,team_h_difficulty,team_a_difficulty,ict_index_3,ict_index_5,influence_3,influence_5,creativity_3,...,was_home,xG,xG_3,xG_5,expected_goals_3,expected_goals_5,goals_scored_3,whh,whd,wha
35,DEF,270.0,450.0,5.0,2.0,11.9,21.5,55.4,91.2,52.4,...,True,0.306113,0.11,0.15,0.13,0.17,0.0,0.15,0.20,0.71
51,MID,270.0,352.0,3.0,3.0,5.9,7.7,29.0,32.0,15.9,...,True,0.056160,0.15,0.17,0.13,0.15,0.0,0.38,0.32,0.36
67,FWD,243.0,394.0,2.0,3.0,9.0,11.1,21.2,23.6,18.1,...,False,0.134276,1.05,1.05,0.73,0.73,0.0,0.62,0.25,0.20
77,MID,3.0,3.0,2.0,3.0,1.8,1.8,4.2,4.2,14.1,...,False,0.000000,0.00,0.00,0.00,0.00,0.0,0.62,0.25,0.20
115,MID,97.0,175.0,3.0,3.0,11.5,12.6,17.8,21.6,65.5,...,True,0.000000,0.15,0.15,0.14,0.14,0.0,0.42,0.28,0.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,DEF,270.0,450.0,4.0,4.0,9.1,26.4,57.0,151.8,27.5,...,False,0.000000,0.00,2.48,0.00,1.88,0.0,0.17,0.22,0.69
3863,DEF,0.0,1.0,3.0,3.0,0.0,0.7,0.0,7.2,0.0,...,False,0.000000,0.00,0.00,0.00,0.00,0.0,0.38,0.32,0.36
3895,MID,128.0,198.0,3.0,4.0,11.5,15.1,19.2,31.6,46.7,...,False,0.000000,0.68,0.73,0.54,0.59,0.0,0.51,0.26,0.28
3911,FWD,255.0,411.0,4.0,4.0,6.1,18.8,8.8,51.2,16.1,...,True,0.062779,0.43,0.70,0.41,0.62,0.0,0.17,0.22,0.69


In [44]:

xg = xg_data[['xG']]
feats = xg_data.drop('xG', axis=1)

xg_tar = xg_data_tar[['xG']].copy()
feats_tar = xg_data_tar.drop('xG', axis=1)

In [45]:
xg_pred = Linear_regression(feats, feats_tar, xg)
xg_tar['xg_pred'] = xg_pred
# xg_tar = xg_tar.drop('xG', axis=1)
# comb_data = xg_data_tar.join(xg_tar)
# comb_data

def add_preds(row):
    if row.name in list(xg_tar.index):
        # print(row.name, xg_tar.loc[row.name, 'xg_pred'])
        row['xg_pred'] = xg_tar.loc[row.name, 'xg_pred']
    else:
        row['xg_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,npxG_5,xGChain_5,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,0.00,0.44,0.44,2.1,289678.0,3.0,0.62,0.25,0.20,0.000000
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,0.15,1.70,1.35,28.7,3267128.0,0.0,0.15,0.20,0.71,0.051633
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,0.17,0.33,0.11,9.5,79189.0,2.0,0.38,0.32,0.36,0.065206
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,1.05,2.76,1.27,13.0,524668.0,5.0,0.62,0.25,0.20,0.270399
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,0.00,0.02,0.00,-0.3,2784.0,1.0,0.62,0.25,0.20,0.071579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,2.48,2.19,2.17,38.7,15131904.0,2.0,0.17,0.22,0.69,0.150608
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,0.00,0.00,0.00,0.2,13728.0,0.0,0.38,0.32,0.36,0.013862
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,0.73,0.94,0.21,7.3,14658.0,1.0,0.51,0.26,0.28,0.145535
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,0.70,0.89,0.07,22.1,5536769.0,2.0,0.17,0.22,0.69,0.287291


## Assists Predictor


In [46]:

xa_data= data[ (data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'was_home', 'expected_assists', 'xA_3', 'xA_5',  'expected_assists_3', 'expected_assists_5', 'whh', 'whd', 'wha']]

xa_data_tar = data_tar[(data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5','was_home', 'expected_assists', 'xA_3', 'xA_5',  'expected_assists_3',
                    'expected_assists_5', 'whh', 'whd', 'wha']]



In [47]:

xa = xa_data[['expected_assists']]
feats = xa_data.drop('expected_assists', axis=1)

xa_tar = xa_data_tar[['expected_assists']].copy()
feats_tar = xa_data_tar.drop('expected_assists', axis=1)

In [48]:
xa_pred = Linear_regression(feats, feats_tar, xa)
xa_tar['xa_pred'] = xa_pred
# xa_tar = xa_tar.drop('expected_assists', axis=1)
# comb_data = comb_data.join(xa_tar, rsuffix='x')
# comb_data

def add_preds(row):
    if row.name in list(xa_tar.index):
        # print(row.name, xa_tar.loc[row.name, 'xa_pred'])
        row['xa_pred'] = xa_tar.loc[row.name, 'xa_pred']
    else:
        row['xa_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,xGChain_5,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,0.44,0.44,2.1,289678.0,3.0,0.62,0.25,0.20,0.000000,0.000000
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,1.70,1.35,28.7,3267128.0,0.0,0.15,0.20,0.71,0.051633,0.102801
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,0.33,0.11,9.5,79189.0,2.0,0.38,0.32,0.36,0.065206,0.044179
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,2.76,1.27,13.0,524668.0,5.0,0.62,0.25,0.20,0.270399,0.028191
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,0.02,0.00,-0.3,2784.0,1.0,0.62,0.25,0.20,0.071579,0.064975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,2.19,2.17,38.7,15131904.0,2.0,0.17,0.22,0.69,0.150608,0.051394
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,0.00,0.00,0.2,13728.0,0.0,0.38,0.32,0.36,0.013862,0.024985
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,0.94,0.21,7.3,14658.0,1.0,0.51,0.26,0.28,0.145535,0.092502
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,0.89,0.07,22.1,5536769.0,2.0,0.17,0.22,0.69,0.287291,0.040909


## Clean Sheets


In [49]:
cs_data= data[(data['position'] != 'FWD')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'clean_sheets', 'clean_sheets_3', 'clean_sheets_5',
                                                                         'expected_goals_conceded_3', 'expected_goals_conceded_5', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'whh', 'whd', 'wha']]

cs_data_tar = data_tar[ (data_tar['position'] != 'FWD')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'clean_sheets', 'clean_sheets_3', 'clean_sheets_5',
                                                                         'expected_goals_conceded_3', 'expected_goals_conceded_5', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'whh', 'whd', 'wha']]

In [50]:
cs = cs_data[['clean_sheets']]
feats = cs_data.drop('clean_sheets', axis=1)

cs_tar = cs_data_tar[['clean_sheets']].copy()
feats_tar = cs_data_tar.drop('clean_sheets', axis=1)

In [51]:
cs_hy_params = {'bootstrap': False, 'min_samples_split': 2, 'n_estimators': 100}
cs_pred = Random_Forest_Classifier(feats, feats_tar, column_or_1d(cs), cs_hy_params)
cs_tar['cs_pred'] = cs_pred
# cs_tar = cs_tar.drop('clean_sheets', axis=1)
# comb_data = comb_data.join(cs_tar, rsuffix='x')
# comb_data

def add_preds(row):
    if row.name in list(cs_tar.index):
        # print(row.name, cs_tar.loc[row.name, 'cs_pred'])
        row['cs_pred'] = cs_tar.loc[row.name, 'cs_pred']
    else:
        row['cs_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,0.44,2.1,289678.0,3.0,0.62,0.25,0.20,0.000000,0.000000,0
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,1.35,28.7,3267128.0,0.0,0.15,0.20,0.71,0.051633,0.102801,0
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,0.11,9.5,79189.0,2.0,0.38,0.32,0.36,0.065206,0.044179,0
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,1.27,13.0,524668.0,5.0,0.62,0.25,0.20,0.270399,0.028191,0
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,0.00,-0.3,2784.0,1.0,0.62,0.25,0.20,0.071579,0.064975,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,2.17,38.7,15131904.0,2.0,0.17,0.22,0.69,0.150608,0.051394,0
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,0.00,0.2,13728.0,0.0,0.38,0.32,0.36,0.013862,0.024985,0
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,0.21,7.3,14658.0,1.0,0.51,0.26,0.28,0.145535,0.092502,0
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,0.07,22.1,5536769.0,2.0,0.17,0.22,0.69,0.287291,0.040909,0


## Yellow Cards


In [52]:
#
yc_data= data[ (data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'yellow_cards', 'yellow_cards_3', 'yellow_cards_5', 'whh', 'whd', 'wha']]

yc_data_tar = data_tar[ (data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'yellow_cards', 'yellow_cards_3', 'yellow_cards_5', 'whh', 'whd', 'wha']]


In [53]:
yc = yc_data[['yellow_cards']]
feats = yc_data.drop('yellow_cards', axis=1)

yc_tar = yc_data_tar[['yellow_cards']].copy()
feats_tar = yc_data_tar.drop('yellow_cards', axis=1)

In [54]:
params =  {'bootstrap': False, 'min_samples_split': 10, 'n_estimators': 300}
yc_pred = Random_Forest_Classifier(feats, feats_tar, column_or_1d(yc), params)
yc_tar['yc_pred'] = yc_pred
# yc_tar = yc_tar.drop('yellow_cards', axis=1)
# comb_data = comb_data.merge(yc_tar, left_index=True, right_index=True, how='inner')
# comb_data

def add_preds(row):
    if row.name in list(yc_tar.index):
        # print(row.name, yc_tar.loc[row.name, 'yc_pred'])
        row['yc_pred'] = yc_tar.loc[row.name, 'yc_pred']
    else:
        row['yc_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred,yc_pred
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,2.1,289678.0,3.0,0.62,0.25,0.20,0.000000,0.000000,0,0
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,28.7,3267128.0,0.0,0.15,0.20,0.71,0.051633,0.102801,0,0
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,9.5,79189.0,2.0,0.38,0.32,0.36,0.065206,0.044179,0,0
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,13.0,524668.0,5.0,0.62,0.25,0.20,0.270399,0.028191,0,0
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,-0.3,2784.0,1.0,0.62,0.25,0.20,0.071579,0.064975,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,38.7,15131904.0,2.0,0.17,0.22,0.69,0.150608,0.051394,0,0
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,0.2,13728.0,0.0,0.38,0.32,0.36,0.013862,0.024985,0,0
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,7.3,14658.0,1.0,0.51,0.26,0.28,0.145535,0.092502,0,0
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,22.1,5536769.0,2.0,0.17,0.22,0.69,0.287291,0.040909,0,0


## Saves


In [55]:
sv_data= data[ (data['position'] == 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'clean_sheets_3', 'clean_sheets_5', 'expected_goals_conceded_3', 'expected_goals_conceded_5','saves',
                                                                          'saves_3', 'saves_5', 'whh', 'whd', 'wha']]

sv_data_tar = data_tar[(data_tar['position'] == 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'clean_sheets_3', 'clean_sheets_5', 'expected_goals_conceded_3', 'expected_goals_conceded_5','saves',
                                                                          'saves_3', 'saves_5', 'whh', 'whd', 'wha']]


In [56]:
sv = sv_data[['saves']]
feats = sv_data.drop('saves', axis=1)

sv_tar = sv_data_tar[['saves']].copy()
feats_tar = sv_data_tar.drop('saves', axis=1)

In [57]:
sv_pred = Linear_regression(feats, feats_tar, sv)
sv_tar['sv_pred'] = sv_pred
# sv_tar = sv_tar.drop('saves', axis=1)
# comb_data = comb_data.join(sv_tar, rsuffix='x')
# comb_data

def add_preds(row):
    if row.name in list(sv_tar.index):
        # print(row.name, sv_tar.loc[row.name, 'sv_pred'])
        row['sv_pred'] = sv_tar.loc[row.name, 'sv_pred']
    else:
        row['sv_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred,yc_pred,sv_pred
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,289678.0,3.0,0.62,0.25,0.20,0.000000,0.000000,0,0,2.927312
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,3267128.0,0.0,0.15,0.20,0.71,0.051633,0.102801,0,0,0.000000
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,79189.0,2.0,0.38,0.32,0.36,0.065206,0.044179,0,0,0.000000
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,524668.0,5.0,0.62,0.25,0.20,0.270399,0.028191,0,0,0.000000
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,2784.0,1.0,0.62,0.25,0.20,0.071579,0.064975,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,15131904.0,2.0,0.17,0.22,0.69,0.150608,0.051394,0,0,0.000000
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,13728.0,0.0,0.38,0.32,0.36,0.013862,0.024985,0,0,0.000000
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,14658.0,1.0,0.51,0.26,0.28,0.145535,0.092502,0,0,0.000000
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,5536769.0,2.0,0.17,0.22,0.69,0.287291,0.040909,0,0,0.000000


In [58]:
data_tar_preds[['sv_pred']].max()

sv_pred    3.301961
dtype: float64

## Total Points


In [59]:
# For playing up to 60 minutes	1
# For playing 60 minutes or more (excluding stoppage time)	2
# For each goal scored by a goalkeeper	10
# For each goal scored by a defender	6
# For each goal scored by a midfielder	5
# For each goal scored by a forward	4
# For each goal assist	3
# For a clean sheet by a goalkeeper or defender	4
# For a clean sheet by a midfielder	1
# For every 3 shot saves by a goalkeeper	1
# For each penalty save	5
# For each penalty miss	-2
# Bonus points for the best players in a match	1-3
# For every 2 goals conceded by a goalkeeper or defender	-1
# For each yellow card	-1
# For each red card	-3
# For each own goal	-2

## GK
## ---> >60  = 2
## ---> cs*4
## ---> ((sv)/3)*1
## ---> gc
## ---> yc*-1


## DEF
## ---> >60  = 2
## ---> xg*6
## ---> xa*3
## ---> cs*4
## ---> gc
## ---> yc*-1


## MID
## ---> >60  = +2
## ---> xg*5
## ---> xa*3
## ---> cs*1
## ---> yc*-1


## FWD
## ---> >60  = 2
## ---> xg*4
## ---> xa*3
## ---> yc*-1

In [60]:
def calc_total_pts(row):
    if(row.position == 'GK'):
                ## ---> >60  = 2
        ## ---> cs*4
        ## ---> ((sv)/3)*1
        ## ---> gc
        ## ---> yc*-1
        row['xPts'] = 2 + row['cs_pred']*4 + ((row['sv_pred'])/3)*1 + row['yc_pred']*(-1)

    elif(row.position == 'DEF'):
        ## ---> >60  = 2
        ## ---> xg*6
        ## ---> xa*3
        ## ---> cs*4
        ## ---> gc
        ## ---> yc*-1
        row['xPts'] = 2 + row['xg_pred']*6 + row['xa_pred']*3 + row['cs_pred']*4 + row['yc_pred']*(-1)
    elif(row.position == 'MID'):
        ## MID
        ## ---> >60  = +2
        ## ---> xg*5
        ## ---> xa*3
        ## ---> cs*1
        ## ---> yc*-1

        row['xPts'] = 2 + row['xg_pred']*5 + row['xa_pred']*3 + row['cs_pred']*1 + row['yc_pred']*(-1)

    elif(row.position == 'FWD'):
        ## ---> >60  = 2
        ## ---> xg*4
        ## ---> xa*3
        ## ---> yc*-1

        row['xPts'] = 2 + row['xg_pred']*4 + row['xa_pred']*3 + row['yc_pred']*(-1)

    return row

data_tar_preds = data_tar_preds.apply(calc_total_pts, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred,yc_pred,sv_pred,xPts
17,0,0,17,0,0.0,14,0.01,0.01,0.00,2.03,...,3.0,0.62,0.25,0.20,0.000000,0.000000,0,0,2.927312,2.975771
35,0,0,-8,0,21.8,388,0.02,0.17,0.15,3.15,...,0.0,0.15,0.20,0.71,0.051633,0.102801,0,0,0.000000,2.618200
51,0,0,10,0,27.8,217,0.04,0.09,0.05,1.48,...,2.0,0.38,0.32,0.36,0.065206,0.044179,0,0,0.000000,2.458567
67,1,0,19,0,13.7,453,0.01,0.12,0.11,1.82,...,5.0,0.62,0.25,0.20,0.270399,0.028191,0,0,0.000000,3.166167
77,0,0,4,0,0.7,463,0.00,0.00,0.00,0.11,...,1.0,0.62,0.25,0.20,0.071579,0.064975,0,0,0.000000,2.552820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3859,0,0,14,0,3.3,18,0.08,0.08,0.00,0.29,...,2.0,0.17,0.22,0.69,0.150608,0.051394,0,0,0.000000,3.057830
3863,0,0,1,0,0.0,425,0.00,0.00,0.00,0.43,...,0.0,0.38,0.32,0.36,0.013862,0.024985,0,0,0.000000,2.158125
3895,0,0,9,0,11.0,600,0.01,0.01,0.00,1.59,...,1.0,0.51,0.26,0.28,0.145535,0.092502,0,0,0.000000,3.005180
3911,0,0,0,0,10.8,110,0.01,0.15,0.14,1.95,...,2.0,0.17,0.22,0.69,0.287291,0.040909,0,0,0.000000,3.271892


## Player Points


In [61]:
data_tar_preds[['element', 'position', 'xPts']]

Unnamed: 0,element,position,xPts
17,14,GK,2.975771
35,388,DEF,2.618200
51,217,MID,2.458567
67,453,FWD,3.166167
77,463,MID,2.552820
...,...,...,...
3859,18,DEF,3.057830
3863,425,DEF,2.158125
3895,600,MID,3.005180
3911,110,FWD,3.271892


In [62]:
pd.read_csv('./data/vaastav/data/2024-25/player_idlist.csv')

Unnamed: 0,first_name,second_name,id
0,Fábio,Ferreira Vieira,1
1,Gabriel,Fernando de Jesus,2
2,Gabriel,dos Santos Magalhães,3
3,Kai,Havertz,4
4,Karl,Hein,5
...,...,...,...
704,Bastien,Meupiyou,637
705,André,Trindade da Costa Neto,642
706,Carlos Roberto,Forbs Borges,655
707,Alfie,Pond,664
