# Imports and Config

In [108]:
import glob, os
import pandas as pd
import numpy as np
import configparser
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

In [173]:
# Notebook-wide configuration: parsed config file, cache location, preview columns
# and the team-id -> team-name lookup.
configParser = configparser.RawConfigParser()
configFilePath = r'config.txt'
# RawConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed. Check that here so a missing config fails immediately instead of
# surfacing later as a NoSectionError inside the data-loading functions.
if not configParser.read(configFilePath):
    raise FileNotFoundError(f"Could not read config file: {configFilePath!r}")

# Pickle file used to cache the preprocessed training frame between runs.
cacheDfPath = r'cache/gw19trainingdf.pkl'
# Columns shown when previewing the preprocessed frame.
display_cols = ["name", "position", "team", "gw", "xP", "total_points", "tot_total_points", "recent_total_points", "avg_total_points"]

# Map each team's "id" to its "name" as listed in data/teams.csv.
id_to_team_name_df = pd.read_csv("data/teams.csv")
id_to_team_name_map = id_to_team_name_df.set_index("id")["name"].to_dict()

# Data Aggregation and Preprocessing

In [167]:

def create_initial_dataframe(gw_dir_path='data/gws'):
    """Load every per-gameweek CSV under ``gw_dir_path`` into a single DataFrame.

    Each file matching ``gw*.csv`` is read, reduced to the columns named by the
    ``pred_column_names`` and ``res_column_names`` options of the ``Data`` config
    section, and tagged with an integer ``gw`` column parsed from the file name
    (``gw7.csv`` -> 7).

    Args:
        gw_dir_path: Directory containing the ``gw<N>.csv`` files.

    Returns:
        The per-file frames concatenated with a fresh integer index.

    Raises:
        ValueError: If no matching files are found, or a matching file name does
            not have an integer after the ``gw`` prefix.
    """
    # The wanted columns are identical for every file, so read the config once.
    column_names = (
        configParser.get("Data", "pred_column_names").split(',')
        + configParser.get("Data", "res_column_names").split(',')
    )

    gw_file_list = glob.glob(f'{gw_dir_path}/gw*.csv')
    if not gw_file_list:
        raise ValueError(f"No gameweek files matching 'gw*.csv' found in {gw_dir_path!r}")

    gw_dfs = []
    for gw_file in gw_file_list:
        # Parse the gameweek from the bare file name so the result does not depend
        # on which path separator glob returned (the previous "\\gw" prefix only
        # matched Windows-style paths).
        file_stem = os.path.splitext(os.path.basename(gw_file))[0]
        gw_number = int(file_stem.removeprefix("gw"))

        # .copy() so that adding the 'gw' column does not write into a slice of
        # the frame returned by read_csv.
        curr_gw_df = pd.read_csv(gw_file)[column_names].copy()
        curr_gw_df['gw'] = gw_number
        gw_dfs.append(curr_gw_df)

    return pd.concat(gw_dfs, ignore_index=True)

init_df = create_initial_dataframe()

In [None]:
def preprocess_df(df):
    """Add per-player historical features to the rows where the player started.

    Rows with ``starts != 1`` are dropped, and players with fewer than
    ``min_starts`` remaining rows are skipped. For every stat in the
    ``stat_list`` config option, three columns are added per player, each
    computed only from that player's earlier rows (ordered by ``gw``):

    * ``tot_<stat>``    - running total before the current row.
    * ``avg_<stat>``    - ``tot_<stat>`` divided by the number of earlier starts
      (0 on the player's first row).
    * ``recent_<stat>`` - mean over the previous ``recent_num_gws`` rows; NaN
      until that many earlier rows exist.

    ``min_starts``, ``stat_list`` and ``recent_num_gws`` are read from the
    ``Preprocessing`` config section.

    Args:
        df: Frame with at least ``name``, ``gw``, ``starts`` and every column
            named in ``stat_list``.

    Returns:
        The per-player frames concatenated, keeping the original row index.

    Raises:
        ValueError: Propagated from ``pd.concat`` if no player has at least
            ``min_starts`` starts.
    """
    # First we ignore all non-starts
    df = df[df["starts"] == 1].copy()

    min_starts = int(configParser.get("Preprocessing", "min_starts"))
    stat_list = configParser.get("Preprocessing", "stat_list").split(',')
    recent_num_gws = int(configParser.get("Preprocessing", "recent_num_gws"))

    # Iterate through each player now
    player_dfs = []
    for _name, player_df in tqdm(df.groupby(by="name")):
        # Filter only players with >= min_starts
        if len(player_df) < min_starts:
            continue

        player_df = player_df.sort_values("gw")

        # Number of starts strictly before each row. Computed once up front so the
        # averages no longer depend on "starts" appearing in stat_list ahead of the
        # other stats (the old code read the derived "tot_starts" column).
        prior_starts = player_df["starts"].shift(fill_value=0).cumsum()

        for stat in stat_list:
            # Shift to ignore current row
            player_df[f"tot_{stat}"] = player_df[stat].shift(fill_value=0).cumsum()

            # 0/0 on a player's first row yields NaN; report it as 0. Assign the
            # filled result back instead of calling fillna(inplace=True) on a
            # column selection, which is not guaranteed to update player_df.
            player_df[f"avg_{stat}"] = (player_df[f"tot_{stat}"] / prior_starts).fillna(0)

            # closed = 'left' to ignore current row
            player_df[f"recent_{stat}"] = player_df[stat].rolling(recent_num_gws, closed="left").mean()

        player_dfs.append(player_df)

    return pd.concat(player_dfs)


def compute_team_agg(df, team, gw):
    """Return the rows of ``df`` for ``team`` in gameweek ``gw`` where ``starts`` is 1.

    The filters are applied one after another on the ``team``, ``gw`` and
    ``starts`` columns; the input frame is not modified.
    """
    for column, wanted in (("team", team), ("gw", gw), ("starts", 1)):
        df = df.loc[df[column] == wanted]

    return df

def compute_opp_team_stats(df, df_row, position, stat):
    """Mean of ``stat`` over the opponent's starters at ``position`` in the row's gameweek.

    The opponent is looked up by translating ``df_row["opponent_team"]`` (a team id)
    to a team name via ``id_to_team_name_map``. The mean is NaN when the opponent has
    no matching rows in ``df``.
    """
    opp_df = compute_team_agg(df, id_to_team_name_map[df_row["opponent_team"]], df_row["gw"])
    return np.mean(opp_df[opp_df["position"] == position][stat])


# Reuse the cached frame when present; otherwise build it and cache it.
# NOTE: the cache is keyed only by path, so delete it after changing config.txt or
# the input CSVs, otherwise the stale frame is loaded.
if os.path.exists(cacheDfPath):
    # NOTE(security): unpickling can execute arbitrary code; only load cache files
    # written by this notebook.
    preprocessed_df = pd.read_pickle(cacheDfPath)
else:
    # Create the cache directory before the slow computation so that a missing
    # directory cannot make to_pickle fail after all the work has been done.
    os.makedirs(os.path.dirname(cacheDfPath) or ".", exist_ok=True)

    preprocessed_df = preprocess_df(init_df)

    # For each row, the opposing team's defenders' mean recent_total_points in that gameweek.
    preprocessed_df["opp_def_recent_total_points"] = preprocessed_df.progress_apply(lambda df_row: compute_opp_team_stats(preprocessed_df, df_row, "DEF", "recent_total_points"),axis=1)
    preprocessed_df.to_pickle(cacheDfPath)

display(preprocessed_df[display_cols])


Unnamed: 0,name,position,team,creativity,influence,threat,was_home,opponent_team,xP,assists,...,tot_minutes,avg_minutes,recent_minutes,tot_saves,avg_saves,recent_saves,tot_total_points,avg_total_points,recent_total_points,opp_def_recent_total_points
460,Aaron Hickey,DEF,Brentford,10.8,11.0,4.0,True,18,2.0,0,...,0,0.000000,,0,0.0,,0,0.000000,,
6300,Aaron Hickey,DEF,Brentford,1.5,0.0,0.0,False,10,2.5,0,...,71,71.000000,,0,0.0,,0,0.000000,,
6951,Aaron Hickey,DEF,Brentford,3.8,12.4,2.0,True,8,2.3,0,...,148,74.000000,,0,0.0,,5,2.500000,,
7648,Aaron Hickey,DEF,Brentford,1.2,15.6,1.0,True,3,2.0,0,...,238,79.333333,79.333333,0,0.0,0.0,7,2.333333,2.333333,1.333333
8361,Aaron Hickey,DEF,Brentford,1.3,10.0,21.0,False,15,0.8,0,...,319,79.750000,82.666667,0,0.0,0.0,8,2.000000,2.666667,1.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2526,Zeki Amdouni,FWD,Burnley,0.2,5.2,9.0,False,1,0.7,0,...,661,66.100000,65.666667,0,0.0,0.0,22,2.200000,1.666667,4.000000
3270,Zeki Amdouni,FWD,Burnley,12.1,11.2,23.0,True,19,1.5,0,...,740,67.272727,67.000000,0,0.0,0.0,24,2.181818,1.666667,0.833333
4023,Zeki Amdouni,FWD,Burnley,26.8,62.8,34.0,True,17,4.2,1,...,812,67.666667,76.000000,0,0.0,0.0,26,2.166667,2.000000,1.466667
4777,Zeki Amdouni,FWD,Burnley,14.3,10.0,12.0,False,20,3.7,0,...,888,68.307692,75.666667,0,0.0,0.0,37,2.846154,5.000000,1.416667


# Basic Linear Regressions

In [None]:
# Subset of the preprocessed rows whose position is "FWD".
# NOTE(review): the regression cell visible below fits on preprocessed_df, not on
# this subset - confirm whether fwd_df is used elsewhere.
fwd_df = preprocessed_df.loc[preprocessed_df["position"].eq("FWD")]

In [165]:
# Ordinary least squares of total_points on the two "recent" rolling features,
# fitted on the whole preprocessed_df (not the fwd_df subset).
result = sm.ols(
    data=preprocessed_df,
    formula="total_points ~ recent_total_points + recent_xP",
).fit()
result.summary()

0,1,2,3
Dep. Variable:,total_points,R-squared:,0.027
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,33.14
Date:,"Sun, 05 Jan 2025",Prob (F-statistic):,6.4e-15
Time:,16:13:00,Log-Likelihood:,-6120.5
No. Observations:,2375,AIC:,12250.0
Df Residuals:,2372,BIC:,12260.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.4041,0.131,18.342,0.000,2.147,2.661
recent_total_points,-0.0275,0.054,-0.506,0.613,-0.134,0.079
recent_xP,0.3092,0.058,5.302,0.000,0.195,0.424

0,1,2,3
Omnibus:,781.254,Durbin-Watson:,1.889
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2399.144
Skew:,1.687,Prob(JB):,0.0
Kurtosis:,6.587,Cond. No.,10.7
