In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline  # Use the imblearn pipeline

from sklearn.utils.validation import column_or_1d
from sklearn.ensemble import RandomForestClassifier

In [29]:
%run ./ml_model_init.ipynb

In [30]:
data_tar_preds = data_tar.copy()
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,npg_5,npxG_5,xGChain_5,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha
12,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.740,0.174,0.087
18,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
36,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
39,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
56,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6306,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
6309,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
6315,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473
6336,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,,0.278,0.249,0.473


## Functions


In [None]:
# For the linear model
def Linear_regression(features_train, features_pred, target_train):
    # bool_cols = features_train.drop(columns=['was_home']).columns.tolist()

    bool_cols = features_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = features_train.select_dtypes(include=['object', 'category']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, bool_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

    model = TransformedTargetRegressor(regressor=pipeline, transformer=StandardScaler())
    model.fit(features_train, target_train)

    return model.predict(features_pred)

# Decision Tree Model
def DecisionTreeRegression(features_train, features_pred, target_train):
    bool_cols = features_train.drop(columns=['was_home']).columns.tolist()
    categorical_cols = ['was_home']

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, bool_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', DecisionTreeRegressor())
    ])

    model = TransformedTargetRegressor(regressor=pipeline, transformer=StandardScaler())
    model.fit(features_train, target_train)

    return model.predict(features_pred)

# RandomForestRegressor
def RandomForestRegression(features_train, features_pred, target_train, hyperparameters):
    model = TransformedTargetRegressor(RandomForestRegressor(
        n_estimators=hyperparameters['n_estimators'],  max_depth=hyperparameters['max_depth'], criterion=hyperparameters['criterion'], random_state=18), transformer=StandardScaler())
    model.fit(features_train, target_train)

    pred_pred = model.predict(features_pred)

def XGBoostRegression(features_train, features_pred, target_train, hyperparameters):
    regressor = xgb(learning_rate=hyperparameters["learning_rate"],
                    n_estimators=hyperparameters["n_estimators"],
                    max_depth=hyperparameters["max_depth"],
                    eval_metric='rmsle')

    model = TransformedTargetRegressor(regressor, transformer=StandardScaler())


    model.fit(features_train, target_train)

    return model.predict(features_pred)


def Random_Forest_Classifier(features_train, features_pred, target_train, hy_params):
    encoder = LabelEncoder()
    cs_train_ = encoder.fit_transform(target_train)

    # bool_cols = features_train.drop(columns=['was_home']).columns.tolist()
    # categorical_cols = ['was_home']
    bool_cols = features_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = features_train.select_dtypes(include=['object', 'category']).columns.tolist()

    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('one_hot_encoder', OneHotEncoder(sparse=False)),
        # ('to_dense', ToDense())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, bool_cols),
            ('cat', categorical_transformer, categorical_cols),
        ])

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', BorderlineSMOTE(sampling_strategy='auto', random_state=42)),  # Apply SMOTE to the data
        ('classifier', RandomForestClassifier(bootstrap = hy_params['bootstrap'], min_samples_split = hy_params['min_samples_split'], n_estimators =hy_params['n_estimators']  , class_weight='balanced', random_state=42))  # Random Forest Classifier
    ])

    pipeline.fit(features_train, cs_train_)

    # Make predictions
    return pipeline.predict(features_pred)

In [32]:
len(data[data['position'] == 'GK']['element'].unique()) + len(data[data['position'] == 'DEF']['element'].unique()) + len(data[data['position'] == 'MID']['element'].unique()) + len(data[data['position'] == 'FWD']['element'].unique())

1220

In [33]:
len(data[data['position'] == 'GK']['element'].unique()), len(data[data['position'] == 'DEF']['element'].unique()), len(data[data['position'] == 'MID']['element'].unique()), len(data[data['position'] == 'FWD']['element'].unique())

(113, 403, 529, 175)

In [34]:
len(data[(data['position'] != 'GK')]['element'].unique())

751

## Goals Predictor


In [45]:

# (data['minutes_5'] >= 300) &
xg_data= data[(data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5',
                                                                     'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5', 'was_home', 'xG', 'xG_3',
                                                                      'xG_5', 'expected_goals_3', 'expected_goals_5', 'goals_scored_3', 'whh', 'whd', 'wha', 'value']]

# (data_tar['minutes_5'] >= 300) &
xg_data_tar = data_tar[ (data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',
                                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3',
                                                                                        'threat_5', 'was_home', 'xG', 'xG_3', 'xG_5', 'expected_goals_3', 'expected_goals_5',
                                                                                        'goals_scored_3','whh', 'whd', 'wha', 'value']]

In [46]:
xg_data

Unnamed: 0,position,minutes_3,minutes_5,team_h_difficulty,team_a_difficulty,ict_index_3,ict_index_5,influence_3,influence_5,creativity_3,...,xG,xG_3,xG_5,expected_goals_3,expected_goals_5,goals_scored_3,whh,whd,wha,value
0,DEF,0.0,0.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.00,0.00,0.00,0.0,0.105,0.188,0.707,50.0
1,DEF,90.0,90.0,2.0,2.0,4.1,4.1,22.0,22.0,18.7,...,0.054937,0.00,0.00,0.00,0.00,0.0,0.224,0.268,0.508,50.0
2,DEF,180.0,180.0,3.0,3.0,12.2,12.2,36.6,36.6,84.5,...,0.063659,0.05,0.05,0.05,0.05,0.0,0.419,0.277,0.304,50.0
3,DEF,270.0,270.0,2.0,2.0,18.9,18.9,46.8,46.8,123.3,...,0.062027,0.12,0.12,0.10,0.10,0.0,0.419,0.277,0.304,50.0
4,DEF,270.0,360.0,3.0,3.0,17.6,21.7,38.6,60.6,110.4,...,0.000000,0.18,0.18,0.15,0.15,0.0,0.255,0.262,0.483,49.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6352,MID,270.0,450.0,2.0,4.0,21.9,31.4,87.8,126.6,99.9,...,0.584815,0.36,0.43,0.38,0.47,1.0,0.700,0.173,0.127,76.0
6353,MID,270.0,450.0,3.0,3.0,28.5,38.0,123.0,155.0,92.1,...,0.041791,0.95,0.95,0.91,0.91,2.0,0.413,0.271,0.316,76.0
6354,MID,270.0,450.0,2.0,3.0,35.3,44.2,135.0,159.6,113.1,...,0.030085,0.99,0.99,0.98,0.98,2.0,0.576,0.226,0.198,76.0
6355,MID,270.0,450.0,4.0,3.0,25.5,43.5,83.4,159.0,84.6,...,0.133722,0.66,1.02,0.63,1.01,1.0,0.155,0.202,0.644,76.0


In [47]:

xg = xg_data[['xG']]
feats = xg_data.drop('xG', axis=1)

xg_tar = xg_data_tar[['xG']].copy()
feats_tar = xg_data_tar.drop('xG', axis=1)

feats_tar.isnull().sum()

position             488
minutes_3              0
minutes_5              0
team_h_difficulty      0
team_a_difficulty      0
ict_index_3            0
ict_index_5            0
influence_3            0
influence_5            0
creativity_3           0
creativity_5           0
threat_3               0
threat_5               0
was_home               0
xG_3                   0
xG_5                   0
expected_goals_3       0
expected_goals_5       0
goals_scored_3         0
whh                    0
whd                    0
wha                    0
value                488
dtype: int64

In [38]:
xg_pred = Linear_regression(feats, feats_tar, xg)
xg_tar['xg_pred'] = xg_pred
# xg_tar = xg_tar.drop('xG', axis=1)
# comb_data = xg_data_tar.join(xg_tar)
# comb_data

def add_preds(row):
    if row.name in list(xg_tar.index):
        # print(row.name, xg_tar.loc[row.name, 'xg_pred'])
        row['xg_pred'] = xg_tar.loc[row.name, 'xg_pred']
    else:
        row['xg_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

ValueError: Found unknown categories [nan] in column 0 during transform

## Assists Predictor


In [12]:

xa_data= data[ (data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'was_home', 'expected_assists', 'xA_3', 'xA_5',  'expected_assists_3', 'expected_assists_5', 'whh', 'whd', 'wha']]

xa_data_tar = data_tar[(data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5','was_home', 'expected_assists', 'xA_3', 'xA_5',  'expected_assists_3',
                    'expected_assists_5', 'whh', 'whd', 'wha']]



In [13]:

xa = xa_data[['expected_assists']]
feats = xa_data.drop('expected_assists', axis=1)

xa_tar = xa_data_tar[['expected_assists']].copy()
feats_tar = xa_data_tar.drop('expected_assists', axis=1)

In [14]:
xa_pred = Linear_regression(feats, feats_tar, xa)
xa_tar['xa_pred'] = xa_pred
# xa_tar = xa_tar.drop('expected_assists', axis=1)
# comb_data = comb_data.join(xa_tar, rsuffix='x')
# comb_data

def add_preds(row):
    if row.name in list(xa_tar.index):
        # print(row.name, xa_tar.loc[row.name, 'xa_pred'])
        row['xa_pred'] = xa_tar.loc[row.name, 'xa_pred']
    else:
        row['xa_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,xGChain_5,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred
19,0.0,0.0,0.0,0.0,0.0,14.0,0.00,0.00,0.00,4.46,...,0.44,0.44,4.1,440897.0,1.0,0.351,0.263,0.386,0.000000,0.000000
39,0.0,0.0,0.0,0.0,1.9,388.0,0.02,0.04,0.02,1.91,...,1.38,0.74,23.9,3414251.0,0.0,0.717,0.164,0.119,0.060483,0.103306
57,0.0,0.0,9.0,0.0,13.8,217.0,0.03,0.09,0.06,1.84,...,0.58,0.33,9.3,79060.0,1.0,0.543,0.250,0.207,0.095729,0.067574
87,0.0,0.0,4.0,0.0,12.0,453.0,0.00,0.00,0.00,2.94,...,3.23,0.69,12.0,552787.0,1.0,0.351,0.263,0.386,0.265463,0.036136
99,0.0,0.0,2.0,0.0,0.0,463.0,0.00,0.00,0.00,0.96,...,0.02,0.00,-0.5,5643.0,1.0,0.351,0.263,0.386,0.062366,0.060773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292,0.0,0.0,-1.0,0.0,14.3,135.0,0.04,0.15,0.11,0.29,...,0.76,0.48,9.8,226849.0,0.0,0.225,0.249,0.526,0.110177,0.060260
6310,0.0,0.0,19.0,0.0,18.2,600.0,0.01,0.05,0.04,0.88,...,1.39,0.61,8.6,18169.0,2.0,0.225,0.249,0.526,0.126854,0.086651
6326,0.0,0.0,2.0,0.0,0.7,111.0,0.02,0.10,0.08,0.00,...,0.97,0.66,4.6,1471073.0,1.0,0.351,0.263,0.386,0.055883,0.053082
6350,1.0,2.0,37.0,1.0,15.9,110.0,0.11,1.49,1.38,0.29,...,0.91,0.07,18.1,5515000.0,9.0,0.351,0.263,0.386,0.296086,0.045563


## Clean Sheets


In [15]:
cs_data= data[(data['position'] != 'FWD')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'clean_sheets', 'clean_sheets_3', 'clean_sheets_5',
                                                                         'expected_goals_conceded_3', 'expected_goals_conceded_5', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'whh', 'whd', 'wha']]

cs_data_tar = data_tar[ (data_tar['position'] != 'FWD')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'clean_sheets', 'clean_sheets_3', 'clean_sheets_5',
                                                                         'expected_goals_conceded_3', 'expected_goals_conceded_5', 'ict_index_3',  'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5',
                                                                         'threat_3', 'threat_5', 'whh', 'whd', 'wha']]

In [16]:
cs = cs_data[['clean_sheets']]
feats = cs_data.drop('clean_sheets', axis=1)

cs_tar = cs_data_tar[['clean_sheets']].copy()
feats_tar = cs_data_tar.drop('clean_sheets', axis=1)

In [17]:
cs_hy_params = {'bootstrap': False, 'min_samples_split': 2, 'n_estimators': 100}
cs_pred = Random_Forest_Classifier(feats, feats_tar, column_or_1d(cs), cs_hy_params)
cs_tar['cs_pred'] = cs_pred
# cs_tar = cs_tar.drop('clean_sheets', axis=1)
# comb_data = comb_data.join(cs_tar, rsuffix='x')
# comb_data

def add_preds(row):
    if row.name in list(cs_tar.index):
        # print(row.name, cs_tar.loc[row.name, 'cs_pred'])
        row['cs_pred'] = cs_tar.loc[row.name, 'cs_pred']
    else:
        row['cs_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,xGBuildup_5,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred
19,0.0,0.0,0.0,0.0,0.0,14.0,0.00,0.00,0.00,4.46,...,0.44,4.1,440897.0,1.0,0.351,0.263,0.386,0.000000,0.000000,0
39,0.0,0.0,0.0,0.0,1.9,388.0,0.02,0.04,0.02,1.91,...,0.74,23.9,3414251.0,0.0,0.717,0.164,0.119,0.060483,0.103306,0
57,0.0,0.0,9.0,0.0,13.8,217.0,0.03,0.09,0.06,1.84,...,0.33,9.3,79060.0,1.0,0.543,0.250,0.207,0.095729,0.067574,0
87,0.0,0.0,4.0,0.0,12.0,453.0,0.00,0.00,0.00,2.94,...,0.69,12.0,552787.0,1.0,0.351,0.263,0.386,0.265463,0.036136,0
99,0.0,0.0,2.0,0.0,0.0,463.0,0.00,0.00,0.00,0.96,...,0.00,-0.5,5643.0,1.0,0.351,0.263,0.386,0.062366,0.060773,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292,0.0,0.0,-1.0,0.0,14.3,135.0,0.04,0.15,0.11,0.29,...,0.48,9.8,226849.0,0.0,0.225,0.249,0.526,0.110177,0.060260,0
6310,0.0,0.0,19.0,0.0,18.2,600.0,0.01,0.05,0.04,0.88,...,0.61,8.6,18169.0,2.0,0.225,0.249,0.526,0.126854,0.086651,0
6326,0.0,0.0,2.0,0.0,0.7,111.0,0.02,0.10,0.08,0.00,...,0.66,4.6,1471073.0,1.0,0.351,0.263,0.386,0.055883,0.053082,0
6350,1.0,2.0,37.0,1.0,15.9,110.0,0.11,1.49,1.38,0.29,...,0.07,18.1,5515000.0,9.0,0.351,0.263,0.386,0.296086,0.045563,0


## Yellow Cards


In [18]:
#
yc_data= data[ (data['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'yellow_cards', 'yellow_cards_3', 'yellow_cards_5', 'whh', 'whd', 'wha']]

yc_data_tar = data_tar[ (data_tar['position'] != 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'yellow_cards', 'yellow_cards_3', 'yellow_cards_5', 'whh', 'whd', 'wha']]


In [19]:
yc = yc_data[['yellow_cards']]
feats = yc_data.drop('yellow_cards', axis=1)

yc_tar = yc_data_tar[['yellow_cards']].copy()
feats_tar = yc_data_tar.drop('yellow_cards', axis=1)

In [20]:
params =  {'bootstrap': False, 'min_samples_split': 10, 'n_estimators': 300}
yc_pred = Random_Forest_Classifier(feats, feats_tar, column_or_1d(yc), params)
yc_tar['yc_pred'] = yc_pred
# yc_tar = yc_tar.drop('yellow_cards', axis=1)
# comb_data = comb_data.merge(yc_tar, left_index=True, right_index=True, how='inner')
# comb_data

def add_preds(row):
    if row.name in list(yc_tar.index):
        # print(row.name, yc_tar.loc[row.name, 'yc_pred'])
        row['yc_pred'] = yc_tar.loc[row.name, 'yc_pred']
    else:
        row['yc_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,xP_5,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred,yc_pred
19,0.0,0.0,0.0,0.0,0.0,14.0,0.00,0.00,0.00,4.46,...,4.1,440897.0,1.0,0.351,0.263,0.386,0.000000,0.000000,0,0
39,0.0,0.0,0.0,0.0,1.9,388.0,0.02,0.04,0.02,1.91,...,23.9,3414251.0,0.0,0.717,0.164,0.119,0.060483,0.103306,0,0
57,0.0,0.0,9.0,0.0,13.8,217.0,0.03,0.09,0.06,1.84,...,9.3,79060.0,1.0,0.543,0.250,0.207,0.095729,0.067574,0,0
87,0.0,0.0,4.0,0.0,12.0,453.0,0.00,0.00,0.00,2.94,...,12.0,552787.0,1.0,0.351,0.263,0.386,0.265463,0.036136,0,0
99,0.0,0.0,2.0,0.0,0.0,463.0,0.00,0.00,0.00,0.96,...,-0.5,5643.0,1.0,0.351,0.263,0.386,0.062366,0.060773,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292,0.0,0.0,-1.0,0.0,14.3,135.0,0.04,0.15,0.11,0.29,...,9.8,226849.0,0.0,0.225,0.249,0.526,0.110177,0.060260,0,0
6310,0.0,0.0,19.0,0.0,18.2,600.0,0.01,0.05,0.04,0.88,...,8.6,18169.0,2.0,0.225,0.249,0.526,0.126854,0.086651,0,0
6326,0.0,0.0,2.0,0.0,0.7,111.0,0.02,0.10,0.08,0.00,...,4.6,1471073.0,1.0,0.351,0.263,0.386,0.055883,0.053082,0,0
6350,1.0,2.0,37.0,1.0,15.9,110.0,0.11,1.49,1.38,0.29,...,18.1,5515000.0,9.0,0.351,0.263,0.386,0.296086,0.045563,0,0


## Saves


In [21]:
sv_data= data[ (data['position'] == 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'clean_sheets_3', 'clean_sheets_5', 'expected_goals_conceded_3', 'expected_goals_conceded_5','saves',
                                                                          'saves_3', 'saves_5', 'whh', 'whd', 'wha']]

sv_data_tar = data_tar[(data_tar['position'] == 'GK')][['position', 'minutes_3', 'minutes_5', 'team_h_difficulty', 'team_a_difficulty', 'was_home', 'ict_index_3',
                                                                         'ict_index_5', 'influence_3', 'influence_5', 'creativity_3', 'creativity_5', 'threat_3', 'threat_5',
                                                                         'clean_sheets_3', 'clean_sheets_5', 'expected_goals_conceded_3', 'expected_goals_conceded_5','saves',
                                                                          'saves_3', 'saves_5', 'whh', 'whd', 'wha']]


In [22]:
sv = sv_data[['saves']]
feats = sv_data.drop('saves', axis=1)

sv_tar = sv_data_tar[['saves']].copy()
feats_tar = sv_data_tar.drop('saves', axis=1)

In [23]:
sv_pred = Linear_regression(feats, feats_tar, sv)
sv_tar['sv_pred'] = sv_pred
# sv_tar = sv_tar.drop('saves', axis=1)
# comb_data = comb_data.join(sv_tar, rsuffix='x')
# comb_data

def add_preds(row):
    if row.name in list(sv_tar.index):
        # print(row.name, sv_tar.loc[row.name, 'sv_pred'])
        row['sv_pred'] = sv_tar.loc[row.name, 'sv_pred']
    else:
        row['sv_pred'] = 0
    return row
data_tar_preds = data_tar_preds.apply(add_preds, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,selected_5,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred,yc_pred,sv_pred
19,0.0,0.0,0.0,0.0,0.0,14.0,0.00,0.00,0.00,4.46,...,440897.0,1.0,0.351,0.263,0.386,0.000000,0.000000,0,0,3.378631
39,0.0,0.0,0.0,0.0,1.9,388.0,0.02,0.04,0.02,1.91,...,3414251.0,0.0,0.717,0.164,0.119,0.060483,0.103306,0,0,0.000000
57,0.0,0.0,9.0,0.0,13.8,217.0,0.03,0.09,0.06,1.84,...,79060.0,1.0,0.543,0.250,0.207,0.095729,0.067574,0,0,0.000000
87,0.0,0.0,4.0,0.0,12.0,453.0,0.00,0.00,0.00,2.94,...,552787.0,1.0,0.351,0.263,0.386,0.265463,0.036136,0,0,0.000000
99,0.0,0.0,2.0,0.0,0.0,463.0,0.00,0.00,0.00,0.96,...,5643.0,1.0,0.351,0.263,0.386,0.062366,0.060773,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292,0.0,0.0,-1.0,0.0,14.3,135.0,0.04,0.15,0.11,0.29,...,226849.0,0.0,0.225,0.249,0.526,0.110177,0.060260,0,0,0.000000
6310,0.0,0.0,19.0,0.0,18.2,600.0,0.01,0.05,0.04,0.88,...,18169.0,2.0,0.225,0.249,0.526,0.126854,0.086651,0,0,0.000000
6326,0.0,0.0,2.0,0.0,0.7,111.0,0.02,0.10,0.08,0.00,...,1471073.0,1.0,0.351,0.263,0.386,0.055883,0.053082,0,0,0.000000
6350,1.0,2.0,37.0,1.0,15.9,110.0,0.11,1.49,1.38,0.29,...,5515000.0,9.0,0.351,0.263,0.386,0.296086,0.045563,0,0,0.000000


In [24]:
data_tar_preds[['sv_pred']].max()

sv_pred    3.791921
dtype: float64

## Total Points


In [25]:
# For playing up to 60 minutes	1
# For playing 60 minutes or more (excluding stoppage time)	2
# For each goal scored by a goalkeeper	10
# For each goal scored by a defender	6
# For each goal scored by a midfielder	5
# For each goal scored by a forward	4
# For each goal assist	3
# For a clean sheet by a goalkeeper or defender	4
# For a clean sheet by a midfielder	1
# For every 3 shot saves by a goalkeeper	1
# For each penalty save	5
# For each penalty miss	-2
# Bonus points for the best players in a match	1-3
# For every 2 goals conceded by a goalkeeper or defender	-1
# For each yellow card	-1
# For each red card	-3
# For each own goal	-2

## GK
## ---> >60  = 2
## ---> cs*4
## ---> ((sv)/3)*1
## ---> gc
## ---> yc*-1


## DEF
## ---> >60  = 2
## ---> xg*6
## ---> xa*3
## ---> cs*4
## ---> gc
## ---> yc*-1


## MID
## ---> >60  = +2
## ---> xg*5
## ---> xa*3
## ---> cs*1
## ---> yc*-1


## FWD
## ---> >60  = 2
## ---> xg*4
## ---> xa*3
## ---> yc*-1

In [26]:
def calc_total_pts(row):
    if(row.position == 'GK'):
                ## ---> >60  = 2
        ## ---> cs*4
        ## ---> ((sv)/3)*1
        ## ---> gc
        ## ---> yc*-1
        row['xPts'] = 2 + row['cs_pred']*4 + ((row['sv_pred'])/3)*1 + row['yc_pred']*(-1)

    elif(row.position == 'DEF'):
        ## ---> >60  = 2
        ## ---> xg*6
        ## ---> xa*3
        ## ---> cs*4
        ## ---> gc
        ## ---> yc*-1
        row['xPts'] = 2 + row['xg_pred']*6 + row['xa_pred']*3 + row['cs_pred']*4 + row['yc_pred']*(-1)
    elif(row.position == 'MID'):
        ## MID
        ## ---> >60  = +2
        ## ---> xg*5
        ## ---> xa*3
        ## ---> cs*1
        ## ---> yc*-1

        row['xPts'] = 2 + row['xg_pred']*5 + row['xa_pred']*3 + row['cs_pred']*1 + row['yc_pred']*(-1)

    elif(row.position == 'FWD'):
        ## ---> >60  = 2
        ## ---> xg*4
        ## ---> xa*3
        ## ---> yc*-1

        row['xPts'] = 2 + row['xg_pred']*4 + row['xa_pred']*3 + row['yc_pred']*(-1)

    return row

data_tar_preds = data_tar_preds.apply(calc_total_pts, axis=1)
data_tar_preds

Unnamed: 0,assists_x,bonus,bps,clean_sheets,creativity,element,expected_assists,expected_goal_involvements,expected_goals,expected_goals_conceded,...,pts_bps,whh,whd,wha,xg_pred,xa_pred,cs_pred,yc_pred,sv_pred,xPts
19,0.0,0.0,0.0,0.0,0.0,14.0,0.00,0.00,0.00,4.46,...,1.0,0.351,0.263,0.386,0.000000,0.000000,0,0,3.378631,3.126210
39,0.0,0.0,0.0,0.0,1.9,388.0,0.02,0.04,0.02,1.91,...,0.0,0.717,0.164,0.119,0.060483,0.103306,0,0,0.000000,2.672815
57,0.0,0.0,9.0,0.0,13.8,217.0,0.03,0.09,0.06,1.84,...,1.0,0.543,0.250,0.207,0.095729,0.067574,0,0,0.000000,2.681370
87,0.0,0.0,4.0,0.0,12.0,453.0,0.00,0.00,0.00,2.94,...,1.0,0.351,0.263,0.386,0.265463,0.036136,0,0,0.000000,3.170258
99,0.0,0.0,2.0,0.0,0.0,463.0,0.00,0.00,0.00,0.96,...,1.0,0.351,0.263,0.386,0.062366,0.060773,0,0,0.000000,2.494148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6292,0.0,0.0,-1.0,0.0,14.3,135.0,0.04,0.15,0.11,0.29,...,0.0,0.225,0.249,0.526,0.110177,0.060260,0,0,0.000000,2.731664
6310,0.0,0.0,19.0,0.0,18.2,600.0,0.01,0.05,0.04,0.88,...,2.0,0.225,0.249,0.526,0.126854,0.086651,0,0,0.000000,2.894222
6326,0.0,0.0,2.0,0.0,0.7,111.0,0.02,0.10,0.08,0.00,...,1.0,0.351,0.263,0.386,0.055883,0.053082,0,0,0.000000,2.438661
6350,1.0,2.0,37.0,1.0,15.9,110.0,0.11,1.49,1.38,0.29,...,9.0,0.351,0.263,0.386,0.296086,0.045563,0,0,0.000000,3.321032


## Player Points


In [27]:
data_tar_preds[['element', 'position', 'xPts']]

Unnamed: 0,element,position,xPts
19,14.0,GK,3.126210
39,388.0,DEF,2.672815
57,217.0,MID,2.681370
87,453.0,FWD,3.170258
99,463.0,MID,2.494148
...,...,...,...
6292,135.0,MID,2.731664
6310,600.0,MID,2.894222
6326,111.0,MID,2.438661
6350,110.0,FWD,3.321032


In [30]:
data_tar_preds.to_csv('./predicted.csv', index=False)