# Imports and Config

In [81]:
import glob, os
import pandas as pd
import numpy as np
import configparser
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

In [82]:

configParser = configparser.RawConfigParser()   
configFilePath = r'config.txt'
configParser.read(configFilePath)
cacheDfPath = r'cache/gw19trainingdf.pkl'


# Data aggregation

In [83]:

def create_initial_dataframe(gw_dir_path='data/gws'): 
    gw_dfs = []
    gw_file_list = glob.glob(f'{gw_dir_path}/gw*.csv')
    for gw_file in gw_file_list:
        curr_gw_csv = pd.read_csv(gw_file)
        # Keep this list of columns
        column_names = configParser.get("Data", "pred_column_names").split(',') + configParser.get("Data", "res_column_names").split(',')
        curr_gw_csv = curr_gw_csv[column_names]
        curr_gw_csv['gw'] = int((gw_file.removeprefix(f"{gw_dir_path}\\gw")).removesuffix(".csv"))
        gw_dfs.append(curr_gw_csv)

    agg_df = pd.concat(gw_dfs, ignore_index=True)
    return agg_df

init_df = create_initial_dataframe()

display(init_df)


Unnamed: 0,name,position,team,creativity,influence,threat,was_home,opponent_team,xP,assists,...,expected_goals,expected_goals_conceded,goals_conceded,goals_scored,minutes,saves,selected,starts,total_points,gw
0,Femi Seriki,DEF,Sheffield Utd,0.0,0.0,0.0,True,8,0.5,0,...,0.00,0.00,0,0,0,0,0,0,0,1
1,Jack Hinshelwood,MID,Brighton,0.0,0.0,0.0,True,12,1.5,0,...,0.00,0.00,0,0,0,0,822,0,0,1
2,Jadon Sancho,MID,Man Utd,11.3,3.8,8.0,True,20,3.0,0,...,0.00,1.08,0,0,22,0,83993,0,1,1
3,Rhys Norrington-Davies,DEF,Sheffield Utd,0.0,0.0,0.0,True,8,0.1,0,...,0.00,0.00,0,0,0,0,6456,0,0,1
4,Vitaly Janelt,MID,Brentford,11.5,14.6,17.0,True,18,2.1,0,...,0.02,1.26,2,0,90,0,6508,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11552,Kyle Walker,DEF,Man City,3.1,8.6,0.0,True,5,2.3,0,...,0.00,0.83,1,0,90,0,1669338,1,2,9
11553,Jacob Brown,FWD,Luton,2.3,2.2,27.0,False,16,1.3,0,...,0.38,2.87,1,0,71,0,15587,1,2,9
11554,Vicente Guaita,GK,Crystal Palace,0.0,0.0,0.0,False,15,0.0,0,...,0.00,0.00,0,0,0,0,30016,0,0,9
11555,Braian Ojeda Rodríguez,MID,Nott'm Forest,0.0,0.0,0.0,True,12,0.0,0,...,0.00,0.00,0,0,0,0,243,0,0,9


In [78]:

def preprocess_df(df):
    # First get total stats so far this season
    total_stat_list = configParser.get("Preprocessing", "total_stat_list").split(',')

    for stat in total_stat_list:
        def tot_stat_helper(df_row):
            temp_df = df
            temp_df = temp_df[temp_df["name"] == df_row["name"]]
            temp_df = temp_df[temp_df["gw"] < int(df_row["gw"])]
            return np.sum(temp_df[stat])
        
        df[f'tot_{stat}'] = df.progress_apply(tot_stat_helper, axis=1)

    # Filter out players with <= min_starts
    min_starts = int(configParser.get("Preprocessing", "min_starts"))
    min_starts_df = df[df['tot_starts'] >= min_starts]
    
    # Now calculate recent stats
    recent_stat_list = configParser.get("Preprocessing", "recent_stat_list").split(',')
    recent_num_gws = min_starts = int(configParser.get("Preprocessing", "recent_num_gws"))

    for stat in recent_stat_list:
        # We take the previous 3 games
        def recent_stat_helper(df_row):
            temp_df = df
            temp_df = temp_df[temp_df["name"] == df_row["name"]]
            temp_df = temp_df[temp_df["gw"] < int(df_row["gw"])]
            temp_df = temp_df[temp_df["starts"]==1].sort_values("gw")

            return np.mean(temp_df.tail(recent_num_gws)[stat])
        
        min_starts_df[f'avg_{stat}'] = min_starts_df.progress_apply(recent_stat_helper, axis=1)
    
    # Now calculate avg stats
    avg_stat_list = configParser.get("Preprocessing", "avg_stat_list").split(',')
    for stat in avg_stat_list:
        # We take the overall average across starts
        def avg_stat_helper(df_row):
            temp_df = df
            temp_df = temp_df[temp_df["name"] == df_row["name"]]
            temp_df = temp_df[temp_df["gw"] < int(df_row["gw"])]
            temp_df = temp_df[temp_df["starts"]==1]

            return np.mean(temp_df[stat])
        
        min_starts_df[f'avg_{stat}'] = min_starts_df.progress_apply(avg_stat_helper, axis=1)

    return min_starts_df 

if os.path.exists(cacheDfPath):
    preprocessed_df = pd.read_pickle(cacheDfPath)
else:
    preprocessed_df = preprocess_df(init_df)
    preprocessed_df.to_pickle(cacheDfPath)

display(preprocessed_df)


100%|██████████| 11557/11557 [00:16<00:00, 696.70it/s]
100%|██████████| 11557/11557 [00:16<00:00, 720.32it/s]
100%|██████████| 2926/2926 [00:05<00:00, 571.50it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_starts_df[f'avg_{stat}'] = min_starts_df.progress_apply(avg_stat_helper, axis=1)
100%|██████████| 2926/2926 [00:05<00:00, 569.70it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  min_starts_df[f'avg_{stat}'] = min_starts_df.progress_apply(avg_stat_helper, axis=1)
100%|██████████| 2926/2926 [00:05<00:00, 577.70it/s]
A value is trying to be set on 

Unnamed: 0,name,position,team,creativity,influence,threat,was_home,opponent_team,xP,assists,...,avg_clean_sheets,avg_expected_assists,avg_expected_goal_involvements,avg_expected_goals,avg_expected_goals_conceded,avg_goals_conceded,avg_goals_scored,avg_minutes,avg_saves,avg_total_points
665,Vitaly Janelt,MID,Brentford,0.8,10.4,19.0,False,7,2.3,0,...,0.333333,0.068889,0.106667,0.037778,0.860000,0.777778,0.000,79.888889,0.0,2.444444
673,Curtis Jones,MID,Liverpool,0.0,0.0,0.0,True,16,0.0,0,...,0.250000,0.017500,0.045000,0.027500,1.610000,1.000000,0.000,63.500000,0.0,1.000000
682,Oliver Norwood,MID,Sheffield Utd,2.2,14.6,0.0,False,1,0.3,0,...,0.000000,0.085714,0.105714,0.020000,2.072857,2.285714,0.000,84.285714,0.0,1.714286
684,Bernardo Veiga de Carvalho e Silva,MID,Man City,42.0,34.6,21.0,False,14,3.8,1,...,0.200000,0.278000,0.348000,0.070000,0.642000,0.800000,0.200,90.000000,0.0,4.200000
685,Charlie Taylor,DEF,Burnley,1.1,34.8,9.0,False,3,1.5,0,...,0.000000,0.018333,0.031667,0.013333,1.825000,2.000000,0.000,90.000000,0.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11528,Dominic Solanke,FWD,Bournemouth,2.1,35.8,44.0,True,20,3.3,0,...,0.125000,0.070000,0.447500,0.377500,2.407500,2.250000,0.375,90.000000,0.0,4.000000
11535,Matty Cash,DEF,Aston Villa,3.3,24.6,2.0,True,19,3.3,0,...,0.250000,0.103750,0.493750,0.390000,1.495000,1.375000,0.250,88.375000,0.0,4.500000
11545,Nélson Cabral Semedo,DEF,Wolves,0.0,0.0,0.0,False,3,0.0,0,...,0.125000,0.041250,0.080000,0.040000,1.738750,1.750000,0.000,89.125000,0.0,1.500000
11552,Kyle Walker,DEF,Man City,3.1,8.6,0.0,True,5,2.3,0,...,0.375000,0.108750,0.116250,0.007500,0.698750,0.750000,0.000,90.000000,0.0,3.875000


In [79]:
def compute_team_agg(df, team, gw):
    df = df[df["team"] == team] 
    df = df[df["gw"] == gw]
    df = df[df["starts"] == 1]
    
    return df

display(compute_team_agg(preprocessed_df, "Man Utd", 15))

Unnamed: 0,name,position,team,creativity,influence,threat,was_home,opponent_team,xP,assists,...,avg_clean_sheets,avg_expected_assists,avg_expected_goal_involvements,avg_expected_goals,avg_expected_goals_conceded,avg_goals_conceded,avg_goals_scored,avg_minutes,avg_saves,avg_total_points
4418,Rasmus Højlund,FWD,Man Utd,20.9,8.6,23.0,True,7,1.7,0,...,0.375,0.03875,0.32875,0.29,1.14875,0.875,0.0,77.875,0.0,2.0
4446,Luke Shaw,DEF,Man Utd,40.2,20.2,4.0,True,7,2.7,0,...,0.5,0.14,0.145,0.005,2.075,0.75,0.0,86.25,0.0,3.5
4664,Antony Matheus dos Santos,MID,Man Utd,23.7,17.6,22.0,True,7,1.3,0,...,0.333333,0.115,0.236667,0.121667,1.263333,0.833333,0.0,72.166667,0.0,2.166667
4693,Scott McTominay,MID,Man Utd,2.5,80.8,129.0,True,7,6.0,0,...,0.5,0.00875,0.09125,0.0825,1.795,1.0,0.125,85.75,0.0,3.125
4696,Alejandro Garnacho,MID,Man Utd,41.4,16.2,61.0,True,7,4.7,1,...,0.666667,0.041667,0.19,0.148333,1.338333,0.333333,0.166667,75.0,0.0,3.333333
4756,Sofyan Amrabat,MID,Man Utd,12.5,9.6,0.0,True,7,1.5,0,...,0.0,0.045,0.05,0.005,1.17,1.0,0.0,77.75,0.0,1.25
4782,Bruno Borges Fernandes,MID,Man Utd,68.2,19.0,32.0,True,7,3.3,0,...,0.357143,0.301429,0.545,0.243571,1.748571,1.214286,0.214286,90.0,0.0,4.5
4812,André Onana,GK,Man Utd,0.0,15.0,0.0,True,7,6.0,0,...,0.357143,0.000714,0.000714,0.0,1.748571,1.214286,0.0,90.0,3.857143,3.928571
4837,Diogo Dalot Teixeira,DEF,Man Utd,36.4,13.2,2.0,True,7,5.7,0,...,0.333333,0.09,0.123333,0.033333,1.700833,1.25,0.083333,90.0,0.0,4.25
4886,Victor Lindelöf,DEF,Man Utd,0.7,4.0,7.0,True,7,6.0,0,...,0.333333,0.031111,0.055556,0.024444,1.484444,1.0,0.111111,84.777778,0.0,4.333333


# Basic Linear Regressions

In [80]:
result = sm.ols(formula="total_points ~ recent_total_points + recent_xP", data=preprocessed_df).fit()
result.summary()

PatsyError: Error evaluating factor: NameError: name 'recent_xP' is not defined
    total_points ~ recent_total_points + recent_xP
                                         ^^^^^^^^^