In [None]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` path used previously is deprecated for these names.
from IPython.display import display, HTML
import pandas as pd

# Widen the notebook container and stop pandas from truncating wide frames.
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', None)

<center>
    <img src="https://logodownload.org/wp-content/uploads/2016/03/premier-league-5.png" width="75" alt="Premier League logo">
</center>

# Premier League API Retrieval and Database Building: Create & Access SQLite database using Python

<!-- Estimated time needed: **15** minutes -->

## Functions of notebook

This notebook was built to:

*   Retrieve data from Premier League API
*   Create a database from data
*   Insert retrieved data into database
*   Query data from the table to build visualization and statistics


In [None]:
# pip install scikit-learn numpy pandas matplotlib --upgrade --user

In [None]:
import sys
import os
# Put the parent of the current working directory on sys.path so the
# `src.functions.*` imports below resolve.
sys.path.append(os.path.abspath('..'))
from src.functions.api_operations import FPLAPIParser
from src.functions.raw_data_compiler import RawDataCompiler
from src.functions.data_processing import DataAnalytics
from src.functions.helper_fns import GeneralHelperFns
from src.functions.notebook_operations import VisualizationOperations

# Shared project objects used throughout the notebook. Each one is built from
# the ones created before it, so the order of these three lines matters.
api_ops = FPLAPIParser()
data_compiler = RawDataCompiler(api_ops)
helper_fns = GeneralHelperFns(api_ops, data_compiler)

In [None]:
data_analytics = DataAnalytics(api_ops, data_compiler, helper_fns)

## Understat

In [None]:
from src.functions.helper_fns import UnderStatHelperFns
from src.functions.understat_operations import UnderstatProcessing

In [None]:
understat_helper_fns = UnderStatHelperFns(api_ops, data_compiler)
understat_ops = UnderstatProcessing(api_ops, data_analytics, helper_fns, understat_helper_fns)

In [None]:
understat_ops.tabulate_ratings_table()

In [None]:
# help(understat_ops)

In [None]:
team_data = understat_ops.fetch_all_team_expanded_stats()

# Flatten each team record: a field holding a list of dicts becomes one
# "<field>_<sub_field>" list per sub-field; every other field is copied as-is.
new_data = []
for team_record in team_data:
    flattened = {}
    for field, payload in team_record.items():
        is_list_of_dicts = isinstance(payload, list) and all(isinstance(entry, dict) for entry in payload)
        if not is_list_of_dicts:
            flattened[field] = payload
            continue
        for entry in payload:
            for sub_field, sub_value in entry.items():
                flattened.setdefault(f"{field}_{sub_field}", []).append(sub_value)
    new_data.append(flattened)

# Index the flattened records by team id, then order them by team title.
team_dict = {record['id']: record for record in new_data}
team_dict = dict(sorted(team_dict.items(), key=lambda pair: pair[1]['title']))

team_dict

In [None]:
# NOTE(review): 12 is a hard-coded position in `team_data`; which team it picks
# depends on the order returned by fetch_all_team_expanded_stats().
team_df = pd.DataFrame(team_data[12])

def extract_values(row, key):
    """Return the value stored under `key` in `row`."""
    return row[key]

# Iterate over a snapshot of the labels: the loop adds and drops columns as it goes.
for col in list(team_df.columns):
    column = team_df[col]
    # Only expand columns whose cells are all dicts. An empty column would pass
    # the all() check and then fail on the first-row lookup, so skip it.
    if column.empty or not all(isinstance(val, dict) for val in column):
        continue
    # One new "<col>_<key>" column per key of the first row's dict
    # (positional lookup, so a non-default index still works).
    expanded = {
        f"{col}_{key}": column.apply(lambda cell: cell.get(key))
        for key in column.iloc[0].keys()
    }
    # Replace the dict column with its expanded columns without mutating in place.
    team_df = team_df.drop(columns=col).assign(**expanded)

team_df

In [None]:
understat_helper_fns.team_nums

In [None]:
# understat = UnderstatClient()
# player_shot_data = understat.player(player=str(understat_helper_fns.grab_player_USID_from_FPLID(308))).get_shot_data()
# pd.DataFrame(data=player_shot_data)

In [None]:
# NOTE(review): `understat` is not defined by any active cell above -- the
# `UnderstatClient()` construction only appears in commented-out cells -- so
# this line raises NameError on a fresh kernel. It also rebinds `team_data`,
# which an earlier cell assigned from fetch_all_team_expanded_stats().
team_data = understat.league(league="EPL").get_team_data(season="2023")

In [None]:
team_data

In [None]:
team_data['71']['history']

In [None]:
# from understatapi import UnderstatClient

# understat = UnderstatClient()
# player_shot_data = understat.player(player=str(understat_helper_fns.grab_player_USID_from_FPLID(308))).get_shot_data()
# player_shot_data

In [None]:
understat_helper_fns.grab_team_USname_from_FPLID(11)

## Forming master ML data

#### *Model on data you have*

Initializing with raw element summary from API

In [None]:
import pandas as pd
from datetime import datetime
import pytz
# `tqdm.tqdm_notebook` is a deprecated alias. Import the notebook progress bar
# from `tqdm.notebook` and keep the old name, which later cells still call.
from tqdm.notebook import tqdm as tqdm_notebook

# One row per entry of the compiled element summaries.
ml_data = pd.DataFrame(data_compiler.master_summary_temp)
# ml_data['team'] = ml_data['element'].apply(lambda x: helper_fns.grab_player_team_id(x))

Merging raw element summary with bootstrap for teams positions and names

In [None]:
# Columns taken from the bootstrap `elements` payload to describe each player.
raw_data_cols_of_interest = ['id', 'team', 'element_type', 'first_name', 'second_name']
bootstrap_df = pd.DataFrame(api_ops.raw_data['elements'])
# Left-join on the player id, then drop the join key duplicated by the merge.
ml_data = (
    ml_data
    .merge(bootstrap_df[raw_data_cols_of_interest], how='left', left_on='element', right_on='id')
    .drop(columns='id')
)

Only analyzing MIDs and FWDs (for this iteration), and taking players who played that game

In [None]:
# filtered_ml_data = ml_data.loc[ml_data['element_type'].isin([3,4])]
# filtered_ml_data = filtered_ml_data[grab_fpl_stats_col_names(filtered_ml_data)]
# filtered_ml_data = filtered_ml_data.loc[filtered_ml_data['minutes'] > 0].reset_index(drop=True)
# filtered_ml_data = pd.get_dummies(data=filtered_ml_data, columns=['opponent_team', 'team'])
# filtered_ml_data.replace([False, True], [0,1], inplace=True)
# filtered_ml_data

In [None]:
# Keep element types 3 and 4 with minutes played, renumber the rows, and turn
# booleans into 0/1.
filtered_ml_data = (
    ml_data
    .loc[ml_data['element_type'].isin([3, 4])]
    .loc[lambda frame: frame['minutes'] > 0]
    .reset_index(drop=True)
    .replace([False, True], [0, 1])
)
# filtered_ml_data = pd.get_dummies(data=filtered_ml_data, columns=['opponent_team', 'team'])

Adding understat team data per match

In [None]:
# NOTE(review): `understat` is never created by an active cell (the
# `UnderstatClient()` line is commented out wherever it appears), so this raises
# NameError on a fresh kernel. grab_understat_history below reads `team_data`
# as a mapping keyed by team-id strings, each with a 'history' list.
team_data = understat.league(league="EPL").get_team_data(season="2023")

In [None]:
def grab_understat_history(df):
    """Return the understat history entry played on the same day as an FPL row.

    Args:
        df: A row-like mapping with 'kickoff_time' (ISO string, trailing 'Z'
            accepted) and 'team' (FPL team id).

    Returns:
        The first entry of the team's understat 'history' whose date matches
        the kickoff date, or None when no entry matches.
    """
    kickoff_day = datetime.fromisoformat(df['kickoff_time'].replace('Z', '+00:00')).strftime('%Y-%m-%d')
    understat_team_id = str(understat_helper_fns.grab_team_USID_from_FPLID(df['team']))
    # NOTE(review): only calendar dates are compared; this assumes both sources
    # report the match on the same day -- confirm time zones.
    for match in team_data[understat_team_id]['history']:
        match_day = datetime.strptime(match['date'], '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
        if match_day == kickoff_day:
            return match
    return None

updated_ml_data = filtered_ml_data.copy()
prefix_id = 'team'

# Build one flat record per row and attach all of them in a single step,
# rather than creating columns on the fly and writing cell by cell with `.at`.
understat_records = []
for idx, row in tqdm_notebook(updated_ml_data.iterrows(), total=len(updated_ml_data)):
    # Rows without a matching understat entry get an empty record, which
    # becomes missing values in every understat column.
    result_dict = grab_understat_history(row) or {}
    record = {}
    for key, value in result_dict.items():
        if key == 'date':
            continue
        if isinstance(value, dict):
            # Nested dicts are flattened to "<prefix>_<key>_<sub_key>".
            for sub_key, sub_value in value.items():
                record[f'{prefix_id}_{key}_{sub_key}'] = sub_value
        else:
            record[f'{prefix_id}_{key}'] = value
    understat_records.append(record)

# Same index as the target frame so the column assignment aligns row for row.
understat_columns = pd.DataFrame(understat_records, index=updated_ml_data.index)
for column_name in understat_columns.columns:
    updated_ml_data[column_name] = understat_columns[column_name]

In [None]:
# Drops, in place, every row with a missing value in ANY column -- not only
# the understat columns added above -- so re-running this cell is harmless but
# re-running the enrichment cell is required to get the dropped rows back.
updated_ml_data.dropna(inplace=True)

In [None]:
updated_ml_data

Adding averages, sums and std deviations for several team and player performance metrics 

Building column parser for direct grab of relevant information as desired

In [None]:
def grab_fpl_stats_col_names(data: pd.DataFrame = None, col_filter = None):
    """Return the column names of `data` that survive the substring blacklist.

    A column is dropped when its name contains any blacklisted substring.

    Args:
        data: Frame whose column names are filtered. Defaults to the
            notebook-level `ml_data`, looked up at call time so the default
            always reflects the current frame (the previous default was bound
            once, when the function was defined).
        col_filter: Pass 'known' to additionally drop further player and team
            columns (minutes, assists, bonus/bps, ICT, expected_* and team xG
            style columns). Any other value leaves the base blacklist unchanged.

    Returns:
        List of surviving column names, in their original order.
    """
    if data is None:
        data = ml_data

    remove_col_descripts = ['kickoff_time', 'red_cards', 'yellow_cards', 'missed', 'saved', 'name', 'transfers', 'element', 'fixture', 'selected', 'value', 'round', 'own_goals', 'score']
    remove_col_descripts += ['saves', 'clean_sheets','conceded']
    remove_col_descripts += ['team_result', 'team_wins','team_draws','team_loses','team_pts', 'team_h_a']
    if col_filter == 'known':
        remove_col_descripts += ['minutes', 'assists', 'bonus','bps', 'influence','creativity', 'threat','ict_index', 'starts', 'expected_goals', 'expected_assists', 'expected_goal_involvements']
        remove_col_descripts += ['team_xG', 'team_xGA', 'team_npxG','team_npxGA', 'team_xpts']
    # Substring match, so e.g. 'element' also removes 'element_type'.
    return [x for x in data.keys() if not any(y in x for y in remove_col_descripts)]

#     if position_filters is None:
#         return [x for x in data.keys() if all(y not in x for y in remove_col_descripts)]
#     else:
#         filtered_elements = data.loc[data['element_type'].isin(position_filters)]
#         if position_filters == [3,4]:
#             remove_col_descripts += ['saves', 'clean_sheets','conceded']
#             return [x for x in data.keys() if all(y not in x for y in remove_col_descripts)]
#         else:
#             return None

In [None]:
updated_ml_data[grab_fpl_stats_col_names(updated_ml_data)]

In [None]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Assuming df_player contains player performance metrics and df_team contains team metrics
# # Let's assume we want to analyze the correlation between player performance metrics and team metrics

# # Select relevant columns
# us_cols = ['team_xG', 'team_xGA', 'team_npxG', 'team_npxGA', 'team_ppda_att', 'team_ppda_def', 'team_ppda_allowed_att', 'team_ppda_allowed_def', 'team_deep', 'team_deep_allowed', 'team_xpts', 'team_npxGD']
# player_cols = ['total_points', 'was_home', 'goals_scored', 'assists']

# team_data = updated_ml_data[us_cols]
# player_data = updated_ml_data[player_cols]

# # Calculate correlation matrix
# correlation_matrix = player_data.apply(lambda x: x.corr(team_data.mean(axis=1)))


# # Visualize correlation matrix
# plt.figure(figsize=(10, 8))
# sns.heatmap(correlation_matrix.to_frame(), annot=True, cmap='coolwarm', fmt=".2f")
# plt.title('Correlation Matrix between Player and Team Metrics')
# plt.xlabel('Team Metrics')
# plt.ylabel('Player Metrics')
# plt.show()


In [None]:
# NOTE(review): `correlation_matrix` is only assigned inside the commented-out
# cell above, so this cell raises NameError on a fresh kernel.
correlation_matrix

---

# Model Testing

In [None]:
import pandas as pd
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
def convert_param_to_label(param_name: str):
    """Turn a stat/column name into a human-readable axis label.

    Args:
        param_name: Name to convert.

    Returns:
        The 'label' of the matching entry in `api_ops.raw_data['element_stats']`
        when one has this 'name'; otherwise the name split on '_' and '-' with
        each word capitalised (e.g. 'team_ppda_att' -> 'Team Ppda Att').
    """
    # Imported here because the notebook's own `import re` lives in a later
    # cell; without it the fallback branch fails when cells run top to bottom.
    import re

    for stat in api_ops.raw_data['element_stats']:
        if stat['name'] == param_name:
            return stat['label']
    return ' '.join(word.capitalize() for word in re.split('[_-]', param_name))

### SLR

In [None]:
# api_ops.raw_data.keys()

In [None]:
# api_ops.raw_data['element_types']

In [None]:
# api_ops.raw_data['element_stats']

In [None]:
# raw_elements = pd.json_normalize(api_ops.raw_data['elements'])
# raw_elements.columns

In [None]:
# raw_players = pd.json_normalize(api_ops.raw_data['element_stats'])
# raw_positions = pd.json_normalize(api_ops.raw_data['element_types'])

In [None]:
def plot_slr(input_att: str, output_att: str, data: pd.DataFrame, naming_data = api_ops.raw_data['element_stats']):
    """Plot a simple linear regression of `output_att` on `input_att`, then its residuals.

    Two figures are drawn: a seaborn regression plot and a residual plot.
    Titles and axis labels come from `convert_param_to_label`.

    Args:
        input_att: Column used as the independent variable.
        output_att: Column used as the dependent variable.
        data: Frame containing both columns; they are cast to float.
        naming_data: Not used by the function; kept so existing calls keep working.
    """
    params_of_interest = [input_att, output_att]
    reg_df = data[params_of_interest].astype('float')
    # Blank out negative cells (they become NaN). This is the vectorised
    # equivalent of the former row-by-row apply-inside-apply mask.
    reg_df = reg_df.where(reg_df.ge(0))

    width = 6
    height = 5

    plt.figure(figsize=(width, height))
    sns.regplot(x=input_att, y=output_att, data=reg_df, line_kws={"color": "black"})
    plt.ylim(0,)

    param_name_x = convert_param_to_label(input_att)
    param_name_y = convert_param_to_label(output_att)
    plt.title(f'Simple Linear Regression for {param_name_y} vs {param_name_x}')
    plt.xlabel(f'{param_name_x}')
    plt.ylabel(f'{param_name_y}')

    plt.figure(figsize=(width, height))
    sns.residplot(x=reg_df[input_att], y=reg_df[output_att])

    plt.title(f'Residual Error for {param_name_y} vs {param_name_x}')
    plt.xlabel(f'{param_name_x} (Indep. Variable)')
    plt.ylabel(f'Residuals of SLR model')

    plt.show()

In [None]:
# grab_fpl_stats_col_names([3, 4])

In [None]:
plot_slr("ict_index", "bps", ml_data)

### MLR

In [None]:
import re

def plot_mlr(input_atts: list, output_att: str, data: pd.DataFrame):
    """Fit a multiple linear regression and compare actual vs fitted distributions.

    Prints the fitted intercept and coefficients, then draws the kernel density
    of the actual target against that of the in-sample predictions.

    Args:
        input_atts: Feature column names.
        output_att: Target column name.
        data: Frame containing all of the above; columns are cast to float.
    """
    params_of_interest = [output_att] + input_atts
    filtered_data = data[params_of_interest].copy().astype('float')
    # Keep only rows in which every selected column is a number >= 0.1
    # (vectorised form of the former row-by-row mask followed by dropna).
    filtered_data = filtered_data.where(filtered_data.ge(0.1)).dropna()

    output_df = filtered_data[output_att]
    Z = filtered_data[input_atts]

    lm = LinearRegression()
    lm.fit(Z, output_df)
    print(f"LM Intercept: {lm.intercept_}")
    print(f"LM Coefficients: {lm.coef_}")
    Y_hat = lm.predict(Z)

    plt.figure(figsize=(6, 5))

    # `kdeplot` replaces the deprecated `distplot(hist=False)` and matches
    # DistributionPlot elsewhere in this notebook.
    ax1 = sns.kdeplot(x=output_df, color="r", label="Actual Value")
    sns.kdeplot(x=Y_hat, color="b", label="Fitted Values", ax=ax1)
    # The labels above are only visible once a legend is drawn.
    ax1.legend()

    # Same name -> label rule as every other plot in the notebook.
    param_name = convert_param_to_label(output_att)

    plt.title(f'Distribution Plot of Actual vs Fitted Values for {param_name}')
    plt.xlabel(f'{param_name}')
    plt.ylabel(f'Proportion of {param_name}')

    plt.show()
    plt.close()

In [None]:
output_attribute = 'bps'
# attributes = ['ict_index', 'expected_goal_involvements', 'bps']
attributes = grab_fpl_stats_col_names(updated_ml_data, col_filter='known')

# _position_data = api_ops.raw_data['element_types']
# ml_data = ml_data.loc[ml_data['element_type'].isin([1,2])]
plot_mlr(attributes, output_attribute, updated_ml_data)

### PR

In [None]:
def PlotPolly(model, independent_variable, dependent_variable, param_names):
    """Scatter the observations and overlay a fitted polynomial curve.

    Args:
        model: Callable evaluated on an array of x values (e.g. an `np.poly1d`).
        independent_variable: Observed x values.
        dependent_variable: Observed y values.
        param_names: `[x_name, y_name]`; the y name goes in the title and both
            are converted to axis labels.
    """
    x_name, y_name = param_names[0], param_names[1]

    # 100 evenly spaced points across the observed x range for the curve.
    lower, upper = min(independent_variable), max(independent_variable)
    x_new = np.linspace(lower, upper, 100)
    y_new = model(x_new)

    plt.plot(independent_variable, dependent_variable, '.', x_new, y_new, '-')
    plt.title(f'Polynomial Fit with Matplotlib for {y_name}')
    plt.gca().set_facecolor((0.898, 0.898, 0.898))

    plt.xlabel(convert_param_to_label(x_name))
    plt.ylabel(convert_param_to_label(y_name))

    plt.show()
    plt.close()

In [None]:
def plot_polyreg(input_att: str, output_att: str, data: pd.DataFrame, order: int):
    """Fit and plot a single-variable polynomial regression.

    Prints the fitted polynomial and hands it to `PlotPolly` for plotting.

    Args:
        input_att: Column used as the independent variable.
        output_att: Column used as the dependent variable.
        data: Frame containing both columns; they are cast to float.
        order: Degree of the polynomial.
    """
    params_of_interest = [input_att, output_att]
    filtered_data = data[params_of_interest].copy().astype('float')
    # Keep only rows where both columns are numbers >= 0.1 (vectorised form of
    # the former row-by-row mask followed by dropna).
    filtered_data = filtered_data.where(filtered_data.ge(0.1)).dropna()

    x = filtered_data[input_att]
    y = filtered_data[output_att]

    # Fit once; the previous version repeated the fit and discarded the result.
    p = np.poly1d(np.polyfit(x, y, order))
    print(p)

    PlotPolly(p, x, y, [input_att, output_att])

In [None]:
input_att = 'ict_index'
output_att = 'bps'

plot_polyreg(input_att, output_att, filtered_ml_data, order = 5)

### HeatMap

In [None]:
# all_params = ['id','ict_index', 'expected_goal_involvements', 'event_points','bonus']
# filtered_raw_elements = raw_elements[all_params].copy().astype('float')
# filtered_raw_elements = df_with_numeric_values(all_params, 0.1, filtered_raw_elements).dropna()
# df_group = filtered_raw_elements[all_params].astype('float')

# params_of_interest = ['event_points','ict_index', 'expected_goal_involvements','bonus']
# df_group = df_group.groupby(params_of_interest[1],as_index=False).mean()

# grouped_pivot = df_group.pivot(index=params_of_interest[0],columns=params_of_interest[1])

# width = 30
# height = 11

# plt.figure(figsize=(width, height))
# plt.pcolor(grouped_pivot, cmap='RdBu')
# plt.colorbar()
# plt.show()

# Model Evaluation & Refinement

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the kernel density estimates of two samples on one figure.

    Args:
        RedFunction: Values drawn in red.
        BlueFunction: Values drawn in blue.
        RedName: Legend label for the red curve.
        BlueName: Legend label for the blue curve.
        Title: Figure title.
    """
    width = 12
    height = 10
    plt.figure(figsize=(width, height))

    ax1 = sns.kdeplot(RedFunction, color="r", label=RedName)
    sns.kdeplot(BlueFunction, color="b", label=BlueName, ax=ax1)
    # Without a legend the RedName/BlueName labels are never shown.
    ax1.legend()

    plt.title(Title)
    plt.xlabel('Distribution Parameter')
    plt.ylabel('Proportion')
    plt.show()
    plt.close()

In [None]:
def PollyPlot(xtrain, xtest, y_train, y_test, lr, poly_transform, axes_names):
    """Plot train/test points together with the fitted polynomial curve.

    Args:
        xtrain: Training inputs (must expose `.values`).
        xtest: Testing inputs (must expose `.values`).
        y_train: Training targets.
        y_test: Testing targets.
        lr: Linear regression object used to predict along the curve.
        poly_transform: Polynomial transformation object applied to the x grid.
        axes_names: `[x_name, y_name]`, converted to axis labels.
    """
    plt.figure(figsize=(12, 10))

    # Evaluation grid spanning both splits in steps of 0.1.
    upper = max([xtrain.values.max(), xtest.values.max()])
    lower = min([xtrain.values.min(), xtest.values.min()])
    x = np.arange(lower, upper, 0.1)

    plt.plot(xtrain, y_train, 'ro', label='Training Data')
    plt.plot(xtest, y_test, 'go', label='Test Data')
    grid_features = poly_transform.fit_transform(x.reshape(-1, 1))
    plt.plot(x, lr.predict(grid_features), label='Predicted Function')
    plt.xlabel(f"{convert_param_to_label(axes_names[0])}")
    plt.ylabel(f"{convert_param_to_label(axes_names[1])}")
    plt.legend()

## Training/Testing

In [None]:
def _rows_at_or_above(frame, min_value):
    """Keep only the rows of a float frame in which every cell is a number >= `min_value`."""
    # Cells below the threshold (and NaN, which never compares >=) are blanked,
    # then any row with a blank is dropped. Same result as the former
    # apply-inside-apply mask, without the per-cell Python calls.
    return frame.where(frame.ge(min_value)).dropna()


def filter_raw_df_and_return_split(df, indep_vars, dep_var, data_threshold = 0.001):
    """Select, clean and split columns into features and target.

    Args:
        df: Source frame.
        indep_vars: List of feature column names.
        dep_var: Target column name.
        data_threshold: Minimum value every selected cell must reach for its
            row to be kept.

    Returns:
        `(features, target)`: a float DataFrame of `indep_vars` and a float
        Series of `dep_var`, restricted to the surviving rows.
    """
    params_of_interest = indep_vars + [dep_var]
    filtered_data = _rows_at_or_above(df[params_of_interest].copy().astype('float'), data_threshold)

    return filtered_data[indep_vars], filtered_data[dep_var]

def build_data_to_model(input_atts: list, output_att: str, data: pd.DataFrame):
    """Build the feature frame and target series used by the models.

    Rows containing a missing or negative value in any selected column are
    removed; all values are cast to float.

    Args:
        input_atts: List of feature column names.
        output_att: Target column name.
        data: Source frame.

    Returns:
        `(x_data, y_data)`: the feature DataFrame (selected columns minus the
        target) and the target Series, with the original index labels kept.
    """
    params_of_interest = input_atts + [output_att]
    filtered_data = _rows_at_or_above(data[params_of_interest].copy().astype('float'), 0)

    y_data = filtered_data[output_att]
    x_data = filtered_data.drop(output_att,axis=1)

    return x_data, y_data
#     x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_size, random_state=random_state)

#     print("[0] number of test samples :", x_test.shape[0])
#     print("[0] number of training samples:",x_train.shape[0])

In [None]:
output_attribute = 'bps'
# Features: the 'known' column subset of updated_ml_data, minus any column
# whose name contains the target name and minus 'opponent_team' and 'team'.
attributes = [x for x in grab_fpl_stats_col_names(updated_ml_data, col_filter='known') if output_attribute not in x and x not in ['opponent_team', 'team']]

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, updated_ml_data)
# Hold out 40% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.4, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
attributes

### Testing single param using trained model

In [None]:
attribute_in = "team_ppda_att"

In [None]:
lre=LinearRegression()
lre.fit(x_train[[attribute_in]], y_train)
print(lre.score(x_test[[attribute_in]], y_test))
print(lre.score(x_train[[attribute_in]], y_train))

### Cross-Validation

A single train/test split evaluates the model on only one held-out portion of the data. Cross-validation instead partitions the dataset into folds, uses each fold in turn as the test set while training on the remaining folds, and averages the resulting scores to give a better estimate of how well the model generalizes.

In [None]:
def cross_validate_stats(x_data, y_data, input_att, folds):
    """Print cross-validation statistics for a one-feature linear regression.

    Args:
        x_data: Feature frame.
        y_data: Target values.
        input_att: Name of the single feature column to regress on.
        folds: Number of cross-validation folds, used for every step below.
    """
    # Use the requested column and a fresh estimator instead of the
    # notebook-level `attribute_in` and `lre`, so the result depends only on
    # the arguments.
    features = x_data[[input_att]]
    estimator = LinearRegression()

    Rcross = cross_val_score(estimator, features, y_data, cv=folds)
    print(f"R_cross: {Rcross}\n")
    print(f"Folds_mean: {Rcross.mean()} | Folds_std_dev: {Rcross.std()}\n")
    # The scorer returns negated MSE; flip the sign and report it (this value
    # used to be computed and thrown away).
    fold_mse = -1 * cross_val_score(estimator, features, y_data, cv=folds, scoring='neg_mean_squared_error')
    print(f"Folds_MSE: {fold_mse}\n")
    yhat = cross_val_predict(estimator, features, y_data, cv=folds)
    print(f"Predictions for {input_att} for [0:5]: {yhat[0:5]}")

In [None]:
attribute_in = "team_ppda_att"

In [None]:
cross_validate_stats(pd.concat([x_train, x_test]), pd.concat([y_train, y_test]), attribute_in, folds = 4)

## Overfitting, Underfitting & Model Selection

In [None]:
output_attribute = 'bps'
# Features: the 'known' column subset of updated_ml_data, minus any column
# whose name contains the target name.
attributes = [x for x in grab_fpl_stats_col_names(updated_ml_data, col_filter='known') if output_attribute not in x]

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, updated_ml_data)
# Hold out 10% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
# params_of_interest = ['ict_index', 'expected_goal_involvements','bonus']
# filtered_raw_elements = raw_elements[params_of_interest].copy().astype('float')
# filtered_raw_elements = df_with_numeric_values(params_of_interest, 0.1, filtered_raw_elements).dropna()

# attribute_out = "bonus"

# reg_df = filtered_raw_elements[params_of_interest].copy().astype('float')

# y_data = reg_df[attribute_out]
# x_data = reg_df.drop(attribute_out,axis=1)

# x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.10, random_state=1)

In [None]:
lr = LinearRegression()
lr.fit(x_train[x_data.columns.tolist()], y_train)

In [None]:
yhat_train = lr.predict(x_train[x_data.columns.tolist()])
yhat_train[0:5]

In [None]:
yhat_test = lr.predict(x_test[x_data.columns.tolist()])
yhat_test[0:5]

In [None]:
Title = 'Distribution  Plot of  Predicted Value Using Training Data vs Training Data Distribution'
DistributionPlot(y_train, yhat_train, "Actual Values (Train)", "Predicted Values (Train)", Title)

In [None]:
Title='Distribution  Plot of  Predicted Value Using Test Data vs Data Distribution of Test Data'
DistributionPlot(y_test,yhat_test,"Actual Values (Test)","Predicted Values (Test)",Title)

### Overfitting

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual

In [None]:
output_attribute = 'bps'
# Features: the 'known' column subset of updated_ml_data, minus any column
# whose name contains the target name (so 'bps' itself is not a feature).
attributes = [x for x in grab_fpl_stats_col_names(updated_ml_data, col_filter='known') if output_attribute not in x]

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, updated_ml_data)
# Hold out 45% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
# The feature must be a column of `x_data`. 'bps' is the target and every
# column containing that name was excluded when `attributes` was built, so
# selecting it raised KeyError. Use the same team feature as the
# single-parameter test cells earlier in the notebook.
attribute_in = "team_ppda_att"

pr = PolynomialFeatures(degree=6)
x_train_pr = pr.fit_transform(x_train[[attribute_in]])
# Apply the transform fitted on the training split instead of refitting on test data.
x_test_pr = pr.transform(x_test[[attribute_in]])

poly = LinearRegression()
poly.fit(x_train_pr, y_train)
yhat = poly.predict(x_test_pr)
PollyPlot(x_train[attribute_in], x_test[attribute_in], y_train, y_test, poly,pr, [attribute_in, output_attribute])

In [None]:
yhat = poly.predict(x_test_pr)
# yhat[0:15]
print("Predicted values:", yhat[0:6])
print("True values:", y_test[0:6].values)

In [None]:
print(poly.score(x_train_pr, y_train))
print(poly.score(x_test_pr, y_test))

In [None]:
def evaluate_R2_across_orders(x_train, x_test, y_train, y_test, input_atts: list = None):
    """Plot test-set R^2 of a polynomial regression for degrees 1 to 12.

    Degrees whose test R^2 is not positive are left off the plot, except
    degree 1, which is always shown.

    Args:
        x_train: Training feature frame.
        x_test: Testing feature frame.
        y_train: Training targets.
        y_test: Testing targets.
        input_atts: Feature columns to use; defaults to every column.
    """
    if input_atts is None:
        input_atts = pd.concat([x_train, x_test]).columns

    R2_order_pairs = []
    iter_order = np.arange(1,13)
    for n in iter_order:
        pr = PolynomialFeatures(degree=n)

        x_train_pr = pr.fit_transform(x_train[input_atts])
        # Reuse the transform fitted on the training split.
        x_test_pr = pr.transform(x_test[input_atts])

        # A fresh estimator per degree: refitting the notebook-level `lr` here
        # would silently replace the model other cells rely on.
        model = LinearRegression()
        model.fit(x_train_pr, y_train)
        R2 = model.score(x_test_pr, y_test)
        if R2 > 0 or n == 1:
            R2_order_pairs.append((R2, n))
    R2s = [x[0] for x in R2_order_pairs]
    orders = [x[1] for x in R2_order_pairs]
    plt.plot(orders, R2s)
    plt.xlabel('order')
    plt.ylabel('R^2')
    plt.title('R^2 Using Test Data')
evaluate_R2_across_orders(x_train, x_test, y_train, y_test, [attribute_in])

In [None]:
output_attribute = 'bps'
# Features: every column of filtered_ml_data that survives the base blacklist
# (no 'known' filter here), minus any column whose name contains the target name.
attributes = [x for x in grab_fpl_stats_col_names(filtered_ml_data) if output_attribute not in x]

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, filtered_ml_data)
# Hold out 45% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
def f(order, test_data, attribute_in):
    """Widget callback: fit a degree-`order` polynomial on one feature and plot it.

    Reads the notebook-level `x_data`, `y_data` and `output_attribute`.

    Args:
        order: Polynomial degree.
        test_data: Fraction of rows held out for testing.
        attribute_in: Name of the feature column to fit on.
    """
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=test_data, random_state=0)
    pr = PolynomialFeatures(degree=order)
    x_train_pr = pr.fit_transform(x_train[[attribute_in]])
    # The transformed test split was never used here (and refit the
    # transformer on test data), so it is no longer computed.
    poly = LinearRegression()
    poly.fit(x_train_pr,y_train)
    PollyPlot(x_train[attribute_in], x_test[attribute_in], y_train, y_test, poly,pr, [attribute_in, output_attribute])
    
x_data=pd.concat([x_test, x_train])
y_data=pd.concat([y_test, y_train])
interact(f, order=(0, 6, 1), test_data=(0.05, 0.95, 0.05), attribute_in="ict_index")

## Ridge Regression

Ridge regression is a method of estimating the coefficients of multiple-regression models in scenarios where the independent variables are highly correlated.

In [None]:
from sklearn.linear_model import Ridge
# `tqdm.tqdm_notebook` is a deprecated alias; import the notebook progress bar
# from `tqdm.notebook` under the name the cells below already use.
from tqdm.notebook import tqdm as tqdm_notebook

In [None]:
output_attribute = 'total_points'
# Features: every column of filtered_ml_data that survives the base blacklist,
# minus any column whose name contains the target name.
attributes = [x for x in grab_fpl_stats_col_names(filtered_ml_data) if output_attribute not in x]

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, filtered_ml_data)
# Hold out 40% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.4, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
evaluate_R2_across_orders(x_train, x_test, y_train, y_test, ['ict_index'])

In [None]:
# Degree-3 polynomial expansion of every feature column.
pr = PolynomialFeatures(degree=3)
x_train_pr = pr.fit_transform(x_train[x_data.columns.tolist()])
# Apply the transform fitted on the training split instead of refitting on test data.
x_test_pr = pr.transform(x_test[x_data.columns.tolist()])

In [None]:
RigeModel=Ridge(alpha=1)
RigeModel.fit(x_train_pr, y_train)
yhat = RigeModel.predict(x_test_pr)
print('predicted:', yhat[0:4])
print('test set :', y_test[0:4].values)

In [None]:
# R^2 per alpha on the test and training splits.
Rsqu_test, Rsqu_train, dummy1 = [], [], []
# Alphas 0, 10, 20, ..., 990.
Alpha = 10 * np.arange(100)
pbar = tqdm_notebook(Alpha)

for alpha in pbar:
    RigeModel = Ridge(alpha=alpha).fit(x_train_pr, y_train)
    test_score = RigeModel.score(x_test_pr, y_test)
    train_score = RigeModel.score(x_train_pr, y_train)

    # Show the current scores next to the progress bar.
    pbar.set_postfix({"Test Score": test_score, "Train Score": train_score})

    Rsqu_test.append(test_score)
    Rsqu_train.append(train_score)

In [None]:
width = 12
height = 10
plt.figure(figsize=(width, height))

plt.plot(Alpha,Rsqu_test, label='validation data  ')
plt.plot(Alpha,Rsqu_train, 'r', label='training Data ')
plt.xlabel('alpha')
plt.ylabel('R^2')
plt.legend()

## Grid Search

Grid search is used to find the best hyperparameters for the model.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
output_attribute = 'bps'
# Features: every column of filtered_ml_data that survives the base blacklist,
# minus any column whose name contains the target name.
attributes = [x for x in grab_fpl_stats_col_names(filtered_ml_data) if output_attribute not in x]

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, filtered_ml_data)
# Hold out 40% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.4, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
# !pip install scikit-learn==0.20.1

In [None]:
# Candidate regularisation strengths. The grid previously listed 100000 twice,
# which only re-evaluated the same candidate; the second one is now 1000000.
parameters1 = [{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]}]
RR = Ridge()
Grid1 = GridSearchCV(RR, parameters1, cv=4)
# Search on the training split only: fitting on all of `x_data` let the model
# see the very rows it is scored on below.
Grid1.fit(x_train, y_train)
BestRR = Grid1.best_estimator_
BestRR.score(x_test, y_test)

## Regression Trees

In [None]:
from sklearn.tree import DecisionTreeRegressor

The important parameters of `DecisionTreeRegressor` are

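`criterion`: {"squared_error", "friedman_mse", "absolute_error", "poisson"} - The function used to measure error (scikit-learn releases before 1.0 name the first and third options "mse" and "mae")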

`max_depth` - The max depth the tree can be

`min_samples_split` - The minimum number of samples required to split a node

`min_samples_leaf` - The minimum number of samples that a leaf can contain

`max_features`: an int, a float, or one of {"sqrt", "log2"} - The number of features examined when looking for the best split, used to speed up training

In [None]:
# The first argument is the frame whose columns are filtered, not a list of
# position ids (a list has no `.keys()` and raised AttributeError).
# `filtered_ml_data` is already restricted to element types 3 and 4.
grab_fpl_stats_col_names(filtered_ml_data)

In [None]:
output_attribute = 'bps'
# attributes = [x for x in [x for x in grab_fpl_stats_col_names([3, 4]) if 'expected' in x] if output_attribute not in x]
attributes = ['ict_index', 'expected_goal_involvements']

# `raw_elements` was only ever assigned in a commented-out cell, so it is built
# here from the bootstrap `elements` payload before being used.
raw_elements = pd.json_normalize(api_ops.raw_data['elements'])
filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
x_data, y_data = build_data_to_model(attributes, output_attribute, filtered_raw_elements)
# Hold out 45% of the rows for testing; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
# The criterion is left at its default (mean squared error): the explicit
# 'mse' spelling is rejected by recent scikit-learn releases, while the
# default works on old and new versions alike.
regression_tree = DecisionTreeRegressor(max_depth=4, min_samples_split=5, min_samples_leaf=5)
regression_tree.fit(x_train, y_train)
# Report the test-set R^2 (it used to be computed and discarded).
print("R^2 (test):", regression_tree.score(x_test, y_test))
prediction = regression_tree.predict(x_test)

# Mean absolute error on the test split.
print((prediction - y_test).abs().mean())

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

# Plot the decision tree
plt.figure(figsize=(100,20))
# Pass the feature names as a plain list; some scikit-learn releases reject a
# pandas Index for `feature_names`.
plot_tree(regression_tree, filled=True, rounded=True, feature_names=list(x_train.columns))
plt.show()


## Comparing different models

In [None]:
# Suppress warnings:
# Use the warnings filter instead of replacing `warnings.warn` with a no-op.
# The filter can be overridden locally (e.g. inside `warnings.catch_warnings()`),
# whereas monkey-patching the function disables it for the whole session.
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
try:
    from sklearn.metrics import jaccard_score
except ImportError:
    # Fallback for scikit-learn versions that do not provide `jaccard_score`.
    # Only a failed import is caught, so unrelated errors are not hidden.
    from sklearn.metrics import jaccard_similarity_score as jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
# Target column for the model comparison.
output_attribute = 'event_points'
# Candidate input columns, minus any whose name contains the target's name.
attributes = list(filter(
    lambda column_name: output_attribute not in column_name,
    ['influence', 'creativity', 'threat', 'expected_goal_involvements','bps'],
))
# attributes = [x for x in grab_fpl_stats_col_names([3, 4]) if output_attribute not in x]
attributes

In [None]:
# Keep only rows whose element_type is 3 or 4
# (presumably midfielders and forwards -- confirm against the FPL element_type codes).
filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
# build_data_to_model is defined elsewhere; it is unpacked here as (features, target).
x_data, y_data = build_data_to_model(attributes, output_attribute, filtered_raw_elements)
# 55% train / 45% test split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
y_data

In [None]:
# Fit ordinary least squares on the training split and predict the test split.
LinearReg = LinearRegression()
LinearReg.fit(x_train, y_train)
predictions = LinearReg.predict(x_test)

# Evaluate the predictions with each regression metric, in report order.
LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2 = (
    metric(y_test, predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# One column per model, one row per metric.
indices = ['MAE', 'MSE', 'R2']
data = {
    'LinearRegression': [LinearRegression_MAE, LinearRegression_MSE, LinearRegression_R2],
}

Report = pd.DataFrame(data, index=indices)
Report

In [None]:
predictions

#### Comparing different models for CLASSIFICATION

In [None]:
# help(f1_score)

In [None]:
# KNN = KNeighborsClassifier(n_neighbors = 4).fit(x_train, y_train)
# predictions = KNN.predict(x_test)
# KNN_Accuracy_Score = accuracy_score(y_test, predictions)
# KNN_JaccardIndex = jaccard_score(y_test, predictions)
# KNN_F1_Score = f1_score(y_test, predictions, average='weighted')

# Tree = DecisionTreeClassifier().fit(x_train,y_train)
# predictions = Tree.predict(x_test)
# Tree_Accuracy_Score = accuracy_score(y_test, predictions)
# Tree_JaccardIndex = jaccard_score(y_test, predictions)
# Tree_F1_Score = f1_score(y_test, predictions, average='weighted')


# x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=1)
# LR = LogisticRegression().fit(x_train, y_train)
# predictions = LR.predict(x_test)
# predict_proba = LR.predict_proba(x_test)

# LR_Accuracy_Score = accuracy_score(y_test, predictions)
# LR_JaccardIndex = jaccard_score(y_test, predictions)
# LR_F1_Score = f1_score(y_test, predictions, average='weighted')
# # LR_Log_Loss = log_loss(y_test, predict_proba)
# LR_Log_Loss = None

# SVM = svm.SVC().fit(x_train, y_train)
# predictions = SVM.predict(x_test)
# SVM_Accuracy_Score = accuracy_score(y_test, predictions)
# SVM_JaccardIndex = jaccard_score(y_test, predictions)
# SVM_F1_Score = f1_score(y_test, predictions, average='weighted')

# data = {
#     'KNN': [KNN_Accuracy_Score, KNN_JaccardIndex, KNN_F1_Score, None],
#     'Tree': [Tree_Accuracy_Score, Tree_JaccardIndex, Tree_F1_Score, None],
#     'LR': [LR_Accuracy_Score, LR_JaccardIndex, LR_F1_Score, LR_Log_Loss],
#     'SVM': [SVM_Accuracy_Score, SVM_JaccardIndex, SVM_F1_Score, None]
# }

# indices = ['Accuracy', 'Jaccard Index', 'F1 Score', 'LogLoss']

# Report = pd.DataFrame(data, index=indices)
# Report

## XGBoost

In [None]:
filtered_ml_data.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Target column for the XGBoost models.
output_attribute = 'total_points'
# attributes = [x for x in ['influence', 'creativity', 'threat', 'expected_goal_involvements','bps'] if output_attribute not in x]
# Every column name returned by grab_fpl_stats_col_names except those containing the target's name.
# NOTE(review): other cells call grab_fpl_stats_col_names with a list ([3, 4]); here it
# receives the filtered_ml_data object -- confirm the helper accepts both.
attributes = [x for x in grab_fpl_stats_col_names(filtered_ml_data) if output_attribute not in x]
# NOTE(review): a bare expression that is not the last line of a cell displays nothing.
attributes

# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
# build_data_to_model is defined elsewhere; it is unpacked here as (features, target).
x_data, y_data = build_data_to_model(attributes, output_attribute, filtered_ml_data)
# 55% train / 45% test split with a fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.45, random_state=1)
print("[0] number of test samples :", x_test.shape[0])
print("[0] number of training samples:",x_train.shape[0])

In [None]:
# Split the data into training and test sets
# (80/20 hold-out with a fixed seed; this rebinds y_train / y_test, which the
# previous cell set from its own 55/45 split).
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Create an XGBoost regressor
# (hand-picked hyperparameters; the grid-search cell further down tunes them).
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

# Fit the model to the training data
xg_reg.fit(X_train, y_train)

# Predict the points scored on the test set
y_pred = xg_reg.predict(X_test)

# Evaluate the model
# (root of the mean squared error, so it is in the same units as the target).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))


learning_rate: Controls the step size shrinkage used in each boosting round. Lower values make the model more robust but require more boosting rounds.

max_depth: Maximum depth of a tree. Deeper trees can model more complex relationships but are more prone to overfitting.

n_estimators: Number of boosting rounds (trees) to build. Higher values can lead to overfitting, so it's important to tune this parameter carefully.

subsample: Subsample ratio of the training instances. Lower values can prevent overfitting by introducing randomness.

colsample_bytree: Subsample ratio of columns when constructing each tree. Similar to subsample, but for features.

In [None]:
# GridSearchCV is imported here because the only other import of it visible in
# this notebook sits in a later cell, so a fresh "Restart & Run All" would
# otherwise fail in this cell with a NameError.
from sklearn.model_selection import GridSearchCV

# Split the data into training and test sets (80/20 hold-out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Base XGBoost regressor whose hyperparameters are tuned below.
xg_reg = xgb.XGBRegressor(objective='reg:squarederror')

# Hyperparameter grid: every combination is evaluated (3**5 = 243 candidates).
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 200, 300],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Exhaustive search with 3-fold cross-validation, scored by negative MSE.
grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best hyperparameters found by the search.
best_params = grid_search.best_params_

# GridSearchCV (refit=True by default) has already refitted the best candidate
# on the whole training split, so reuse it instead of training the same model again.
best_xg_reg = grid_search.best_estimator_

# Predict the points scored on the test set
y_pred = best_xg_reg.predict(X_test)

# Evaluate the tuned model on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Best RMSE: %f" % (rmse))

In [None]:
import matplotlib.pyplot as plt

# Scatter the held-out targets against the model's predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.5)
ax.set_xlabel('Actual Points')
ax.set_ylabel('Predicted Points')
ax.set_title('Actual vs Predicted Points')
plt.show()

## kMeans

In [None]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

In [None]:
# Columns used for clustering, plus total_points for the reference categories.
all_attributes = ['influence', 'creativity', 'threat', 'expected_goal_involvements','bps', 'total_points']
# attributes = [x for x in grab_fpl_stats_col_names([3, 4]) if output_attribute not in x]

#Grab MID and FWDs
# filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])][all_attributes]
# Take an explicit copy of the selected columns: the next statement adds a
# column, and doing that on a slice of filtered_ml_data triggers pandas'
# SettingWithCopyWarning and leaves it unclear which frame is modified.
filtered_raw_elements = filtered_ml_data[all_attributes].copy()
#Convert teams to numerical data
# filtered_raw_elements = pd.get_dummies(data=filtered_raw_elements, columns=['team'])
# Bucket total_points into reference categories:
# (-inf, 6] -> 'bad', (6, 10] -> 'good', (10, inf) -> 'above_expectation'.
filtered_raw_elements['points_category'] = pd.cut(filtered_raw_elements['total_points'], bins=[-np.inf, 6, 10, np.inf], labels=['bad', 'good', 'above_expectation'])
# Feature matrix for clustering: everything except the target and its category.
X = filtered_raw_elements.drop(['total_points', 'points_category'], axis=1)

In [None]:
# Perform clustering
kmeans = KMeans(n_clusters=3, random_state=0)
filtered_raw_elements['cluster'] = kmeans.fit_predict(X)

# K-means cluster ids are arbitrary labels, so a fixed {0: 'bad', 1: 'good', ...}
# mapping compares the categories against numbers that carry no meaning.
# Rank the clusters by their mean total_points instead: the lowest-scoring
# cluster becomes 'bad', the highest 'above_expectation'.
category_order = ['bad', 'good', 'above_expectation']
clusters_by_points = (
    filtered_raw_elements.groupby('cluster')['total_points'].mean().sort_values().index
)
cluster_mapping = dict(zip(clusters_by_points, category_order))
filtered_raw_elements['cluster_category'] = filtered_raw_elements['cluster'].map(cluster_mapping)

# Share of rows whose cluster-derived category matches the points-derived one.
accuracy = (filtered_raw_elements['points_category'] == filtered_raw_elements['cluster_category']).mean()
print("Accuracy:", accuracy)

In [None]:
filtered_raw_elements

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Cluster counts to try; shared by the fitting loop and the plot so the two
# cannot drift apart.
cluster_range = range(2, 21)

# Inertia (within-cluster sum of squares) for each candidate cluster count.
inertia = []
for n_clusters in cluster_range:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)

# Plot the inertia values
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, inertia, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

# Choose the optimal number of clusters based on the elbow method
optimal_n_clusters = 3  # Change this based on the plot

# Perform clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_n_clusters, random_state=0)
filtered_raw_elements['cluster'] = kmeans.fit_predict(X)

# Visualize the clustering: one scatter series per cluster.
plt.figure(figsize=(10, 6))
for cluster in range(optimal_n_clusters):
    cluster_data = filtered_raw_elements[filtered_raw_elements['cluster'] == cluster]
    plt.scatter(cluster_data['bps'], cluster_data['total_points'], label=f'Cluster {cluster}')
# Label the axes with the columns actually plotted (previously 'Metric 1' / 'Metric 2').
plt.xlabel('bps')
plt.ylabel('total_points')
plt.title('Clustering Visualization')
plt.legend()
plt.show()


In [None]:
filtered_raw_elements

In [None]:
import seaborn as sns

# Calculate the correlation matrix.
# The frame also holds the text/categorical label columns ('points_category',
# 'cluster_category'); recent pandas no longer drops non-numeric columns in
# .corr() silently and raises on them, so remove them explicitly and cast the
# rest to float first. errors='ignore' keeps this cell usable when a label
# column has not been created yet.
corr_matrix = (
    filtered_raw_elements
    .drop(columns=['points_category', 'cluster_category'], errors='ignore')
    .astype(float)
    .corr()
)

# Plot the heatmap
plt.figure(figsize=(24, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

# Remove the target variable and clustering columns from the DataFrame
X_corr = filtered_raw_elements.drop(['total_points', 'cluster', 'points_category', 'cluster_category'], axis=1).astype(float)

# Plot the clustermap.
# clustermap always builds its own figure, so a preceding plt.figure() only
# produces an empty extra figure and plt.title() lands on whichever axes
# happens to be current. Size and title the grid's own figure instead.
cluster_grid = sns.clustermap(X_corr, cmap='coolwarm', standard_scale=1, figsize=(10, 8))
cluster_grid.fig.suptitle('Clustermap of Metrics')
plt.show()

In [None]:
corr_matrix

## Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Work on a copy of filtered_ml_data without the columns listed below.
filtered_raw_elements = filtered_ml_data.copy().drop(
    columns=[
        'minutes', 'assists', 'bonus','bps',
        'influence','creativity', 'threat','ict_index',
        'starts',
        'expected_goals', 'expected_assists', 'expected_goal_involvements',
    ],
)

In [None]:
# Bucket total_points into four labelled ranges:
# (-inf, 4] 'bad', (4, 7] 'good', (7, 9] 'great', (9, inf) 'above_expectation'.
filtered_raw_elements = filtered_raw_elements.assign(
    points_category=pd.cut(
        filtered_raw_elements['total_points'],
        bins=[-np.inf, 4, 7, 9, np.inf],
        labels=['bad', 'good', 'great', 'above_expectation'],
    )
)
# Features are every remaining column; the label is the bucket.
X = filtered_raw_elements.drop(columns=['total_points', 'points_category'])
y_data = filtered_raw_elements['points_category']

In [None]:
X

In [None]:
# Encode the category labels as integers for the classifier.
le = LabelEncoder()
y_encoded = le.fit_transform(y_data)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
# Report the sizes of the split created on the line above. The previous code
# printed the lowercase x_test / x_train, which are left over from an earlier
# cell's split and describe different data.
print("[0] number of test samples :", X_test.shape[0])
print("[0] number of training samples:",X_train.shape[0])

In [None]:
# Fit the logistic regression model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed
model.fit(X_train, y_train)

# Make predictions (integer-encoded classes)
y_pred = model.predict(X_test)

# Decode predictions and ground truth back to the category names so the report
# is readable. y_pred_decoded used to be computed and never used, and the
# report listed the classes by their encoded integers.
y_pred_decoded = le.inverse_transform(y_pred)
y_test_decoded = le.inverse_transform(y_test)

# Evaluate the model: per-class precision / recall / F1.
print(classification_report(y_test_decoded, y_pred_decoded))

###

# Custom Testing

In [None]:
from itertools import combinations
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [None]:
def loop_optimum_indep_variables(input_atts: list, output_att: str, data: pd.DataFrame):
    """Cross-validate a random forest on every non-empty subset of ``input_atts``.

    Args:
        input_atts: Candidate independent-variable (column) names.
        output_att: Name of the dependent variable to predict.
        data: Frame handed to ``filter_raw_df_and_return_split`` (defined
            elsewhere; assumed to return ``(X, y)`` for the given attributes
            and target -- confirm against its definition).

    Returns:
        A list of ``(feature_combination, mean_mse)`` tuples, one per subset,
        ordered by subset size and then by ``itertools.combinations`` order.
        ``mean_mse`` is the mean 3-fold cross-validated mean squared error, so
        lower is better.
    """
    # Unseeded forest, as before: scores vary slightly between runs.
    rf = RandomForestRegressor()

    summed_metrics = []
    # Iterate over all possible feature combinations, smallest subsets first.
    for r in tqdm(range(1, len(input_atts) + 1)):
        for feature_combination in combinations(input_atts, r):
            # Score THIS combination. The previous code passed the full
            # input_atts list here, so every subset got the same score.
            X_subset, y = filter_raw_df_and_return_split(data, list(feature_combination), output_att)
            # The scorer returns negative MSE; flip the sign to report plain MSE.
            scores = -1 * cross_val_score(rf, X_subset, y, cv=3, scoring='neg_mean_squared_error')
            summed_metrics.append((feature_combination, scores.mean()))

    return summed_metrics

In [None]:
# Materialise the column names returned for element types 3 and 4 as a list.
list(grab_fpl_stats_col_names([3, 4]))

In [None]:
# Target column for the feature-subset search.
output_attribute = 'event_points'
# Candidate inputs, minus any column whose name contains the target's name.
input_atts = [x for x in ['team', 'bps', 'expected_goal_involvements', 'influence', 'creativity', 'threat'] if output_attribute not in x]
# Scores every non-empty subset of input_atts: 2**6 - 1 = 63 combinations, each cross-validated.
summed_metrics = loop_optimum_indep_variables(input_atts, output_attribute, raw_elements)

In [None]:
summed_metrics

In [None]:
# Target column for this experiment.
output_attribute = 'bps'
# attributes = [x for x in [x for x in grab_fpl_stats_col_names([3, 4]) if 'expected' in x] if output_attribute not in x]
# Single input column.
attributes = ['team']

# Keep only rows whose element_type is 3 or 4.
# NOTE(review): filtered_raw_elements is computed here but the call below passes the
# unfiltered raw_elements -- confirm which frame is intended.
filtered_raw_elements = raw_elements.loc[raw_elements['element_type'].isin([3,4])]
# build_model_sets is defined elsewhere; it is unpacked here as the four train/test pieces.
x_train, x_test, y_train, y_test = build_model_sets(attributes, output_attribute, raw_elements, test_size = 0.45, random_state=1)

In [None]:
plot_mlr(attributes, output_attribute, raw_elements)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Hyperparameter grid: 3 values for each of 4 settings (81 candidates).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator whose settings the search varies.
rf = RandomForestRegressor()

# Build the 3-fold search scored by negative MSE and run it on the training split.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
).fit(x_train, y_train)

# Best model found by the search.
best_rf = grid_search.best_estimator_

# Score the best model on the held-out split.
y_pred = best_rf.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [None]:
x_train

---

In [None]:
# Names of the total_summary columns that hold at least one Python list.
df = data_compiler.total_summary
[col for col in df.columns if any(type(cell) is list for cell in df[col])]

In [None]:
# Number of most recent gameweeks to hold back from each player's history.
gw_offset = 1
# Per-gameweek statistic to model and predict.
evaluation_param = 'ict_index'
# Passed to the fixture helpers below as their sample-size argument.
sample_size = 20

# Candidate players: replacement options plus beacon picks, de-duplicated.
replacement_options = [x['id'] for x in data_analytics.replacement_players]
beacon_picks = list(data_analytics.beacon_effective_ownership.keys())
combined_prospects = list(set(replacement_options).union(set(beacon_picks)))

# Short marker passed to compile_static_color_str for each supported statistic.
marker_dict = {'ict_index':'ict',
              'expected_goal_involvements':'xGI',
              'history':'history',
              'bps':'bps'}
# Columns that are bookkeeping or the target, i.e. not model inputs.
non_feature_cols = ['player_id', 'game_week', evaluation_param]
total_summary = data_compiler.total_summary


def _drop_latest_gws(values):
    """Return ``values`` without its last ``gw_offset`` entries (all of it when the offset is 0)."""
    return values[:len(values) - gw_offset]


tab_data=[]
for player_id in combined_prospects:
    # Select the player's row by boolean mask and take the first match by
    # position. The previous code read an index *label* and fed it to .iloc,
    # which is only correct while the frame has a default 0..n-1 index.
    player_rows = total_summary.loc[total_summary['id_player'] == player_id]
    if player_rows.empty:
        # No history for this player (previously an IndexError): skip.
        continue
    df_sliced = player_rows.iloc[0]

    # One row per past gameweek, with the latest gw_offset gameweeks held back.
    rounds = _drop_latest_gws(df_sliced['round'])
    df = pd.DataFrame({'player_id': [df_sliced['id_player']]*len(rounds),
                       'game_week': rounds,
                       'was_home': _drop_latest_gws(df_sliced['was_home']),
                       'fdr': _drop_latest_gws([helper_fns.team_rank(x) for x in df_sliced['opponent_team']]),
                       evaluation_param: _drop_latest_gws(df_sliced[evaluation_param])})

    # Try many train/test splits and keep the model with the best R^2 on its
    # own test split. NOTE(review): picking the split by test score makes the
    # reported score optimistic.
    outputscore, outputmodel, outputsize, outputrandstate = 0, 0, 0, 0
    for test_size in [0.2,0.3]:
        for random_state in range(10,90,1):
            try:
                train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

                # define the features and target variable
                X_train = train_df.drop(non_feature_cols, axis=1)
                y_train = train_df[evaluation_param]
                # train a linear regression model
                model = LinearRegression()
                model.fit(X_train, y_train)

                # evaluate the model on the test set
                X_test = test_df.drop(non_feature_cols, axis=1)
                y_test = test_df[evaluation_param]
                score = model.score(X_test, y_test)
            except Exception:
                # Deliberate best-effort, as before: a split that cannot be
                # built, fitted or scored is skipped.
                continue
            # outputscore starts at 0, so this also enforces score > 0.
            if score > outputscore:
                outputscore = score
                outputmodel = model
                outputsize = test_size
                outputrandstate = random_state

    # Keep only players for whom some split gave an R^2 strictly between 0 and 1.
    if not 0 < outputscore < 1:
        continue

    player_name = helper_fns.grab_player_name(player_id)
    # TIME TO PREDICT
    team_id = helper_fns.grab_player_team_id(df_sliced['id_player'])
    upcoming_fixtures = helper_fns.grab_player_fixtures('fwd',team_id,sample_size,api_ops.latest_gw-gw_offset)
    game_week, was_home, opponent_team, fdr = [], [], [], []
    for gw, fixtures in upcoming_fixtures:
        for opponent_id, opponent_name, loc_val, fdr_val in fixtures:
            game_week.append(gw)
            if loc_val == 'H':
                was_home.append(1)
            elif loc_val == 'A':
                was_home.append(0)
            opponent_team.append(opponent_id)
            fdr.append(fdr_val)
    if not game_week:
        # No upcoming fixtures: predict() would raise on an empty frame.
        continue

    # use the model for predictions
    new_data = pd.DataFrame({'player_id': [df_sliced['id_player']]*len(game_week),
                             'game_week': game_week,
                             'was_home': was_home,
                             'fdr': fdr})
    X_new = new_data.drop(['player_id', 'game_week'], axis=1)
    predictions = outputmodel.predict(X_new)
    prediction_tuple = (np.mean(predictions), predictions)
    tab_data.append([player_name,
                 player_id,
                 evaluation_param,
                 outputsize,
                 outputrandstate,
                 round(100*outputscore,2),
                 visualization_specs.compile_static_color_str(prediction_tuple,marker_dict[evaluation_param]),
                 visualization_specs.get_colored_fixtures(helper_fns.grab_player_team_id(player_id),sample_size,api_ops.latest_gw-gw_offset)])

# Tabulate the results, best model score first.
df = pd.DataFrame(tab_data, columns = ['Player','ID','Param','test_size','random_state','Model Score',f'Predicted {evaluation_param}','Upcoming Fixtures'])
df = df.sort_values(by=['Model Score'], ascending=False)
table = PrettyTable()
table.field_names = df.columns
for row in df.values:
    table.add_row(row)
table.align[f'Predicted {evaluation_param}'] = "l"
print(table)