# Imports and Config

In [257]:
import glob, os
import pandas as pd
import numpy as np
import configparser
from tqdm import tqdm

tqdm.pandas()

import matplotlib.pyplot as plt
import statsmodels.formula.api as sm

In [258]:
configParser = configparser.RawConfigParser()
configFilePath = r"config.txt"
configParser.read(configFilePath)
cacheDfPath = r"cache/gw19trainingdf.pkl"
display_cols = [
    "name",
    "position",
    "team",
    "gw",
    "xP",
    "total_points",
    "tot_total_points",
    "recent_total_points",
    "avg_total_points",
]

id_to_team_name_df = pd.read_csv("data/teams.csv")
id_to_team_name_map = id_to_team_name_df.set_index("id")["name"].to_dict()

# Data aggregation

In [190]:
def create_initial_dataframe(gw_dir_path="data/gws"):
    gw_dfs = []
    gw_file_list = glob.glob(f"{gw_dir_path}/gw*.csv")
    for gw_file in gw_file_list:
        curr_gw_csv = pd.read_csv(gw_file)
        # Keep this list of columns
        column_names = configParser.get("Data", "pred_column_names").split(
            ","
        ) + configParser.get("Data", "res_column_names").split(",")
        curr_gw_csv = curr_gw_csv[column_names]
        curr_gw_csv["gw"] = int(
            (gw_file.removeprefix(f"{gw_dir_path}\\gw")).removesuffix(".csv")
        )
        gw_dfs.append(curr_gw_csv)

    agg_df = pd.concat(gw_dfs, ignore_index=True)
    return agg_df


init_df = create_initial_dataframe()

In [275]:
def preprocess_df(df):
    # First we ignore all non-starts
    df = df[df["starts"] == 1].copy()

    # Iterate through each player now
    player_dfs = []
    min_starts = int(configParser.get("Preprocessing", "min_starts"))
    stat_list = configParser.get("Preprocessing", "stat_list").split(",")
    recent_num_gws = int(configParser.get("Preprocessing", "recent_num_gws"))

    for name, player_df in tqdm(df.groupby(by="name")):
        # Filter only players with >= min_starts
        if len(player_df) < min_starts:
            continue

        player_df = player_df.sort_values("gw")

        for stat in stat_list:
            # Shift to ignore current row
            player_df[f"tot_{stat}"] = player_df[f"{stat}"].shift(fill_value=0).cumsum()
            player_df[f"avg_{stat}"] = (
                player_df[f"tot_{stat}"] / player_df[f"tot_starts"]
            )
            player_df[f"avg_{stat}"].fillna(0, inplace=True)

            # closed = 'left' to ignore current row
            player_df[f"recent_{stat}"] = (
                player_df[f"{stat}"].rolling(recent_num_gws, closed="left").mean()
            )

        player_dfs.append(player_df)

    return pd.concat(player_dfs)


def compute_team_agg(df, team, gw):
    df = df[df["team"] == team]
    df = df[df["gw"] == gw]
    df = df[df["starts"] == 1]

    return df


if os.path.exists(cacheDfPath):
    preprocessed_df = pd.read_pickle(cacheDfPath)
else:
    preprocessed_df = preprocess_df(init_df)

    def compute_opp_team_stats(df, df_row, position, stat):
        opp_df = compute_team_agg(
            df, id_to_team_name_map[df_row["opponent_team"]], df_row["gw"]
        )
        return np.mean(opp_df[opp_df["position"] == position][stat])

    def compute_team_stats(df, df_row, position, stat):
        team_df = compute_team_agg(df, df_row["team"], df_row["gw"])
        return np.mean(team_df[team_df["position"] == position][stat])

    for position in tqdm(["GK", "DEF", "MID", "FWD"]):
        for stat in configParser.get("Preprocessing", "stat_list").split(","):
            preprocessed_df[f"opp_{position}_recent_{stat}"] = preprocessed_df.apply(
                lambda df_row: compute_opp_team_stats(
                    preprocessed_df, df_row, position, f"recent_{stat}"
                ),
                axis=1,
            )
            preprocessed_df[f"team_{position}_recent_{stat}"] = preprocessed_df.apply(
                lambda df_row: compute_opp_team_stats(
                    preprocessed_df, df_row, position, f"recent_{stat}"
                ),
                axis=1,
            )

    preprocessed_df.to_pickle(cacheDfPath)

display(preprocessed_df[display_cols])

  0%|          | 0/429 [00:00<?, ?it/s]

100%|██████████| 429/429 [00:04<00:00, 93.54it/s] 
  preprocessed_df[f"opp_{position}_recent_{stat}"] = preprocessed_df.apply(lambda df_row: compute_opp_team_stats(preprocessed_df, df_row, position, f"recent_{stat}"),axis=1)
  preprocessed_df[f"team_{position}_recent_{stat}"] = preprocessed_df.apply(lambda df_row: compute_opp_team_stats(preprocessed_df, df_row, position, f"recent_{stat}"),axis=1)
  preprocessed_df[f"opp_{position}_recent_{stat}"] = preprocessed_df.apply(lambda df_row: compute_opp_team_stats(preprocessed_df, df_row, position, f"recent_{stat}"),axis=1)
  preprocessed_df[f"team_{position}_recent_{stat}"] = preprocessed_df.apply(lambda df_row: compute_opp_team_stats(preprocessed_df, df_row, position, f"recent_{stat}"),axis=1)
  preprocessed_df[f"opp_{position}_recent_{stat}"] = preprocessed_df.apply(lambda df_row: compute_opp_team_stats(preprocessed_df, df_row, position, f"recent_{stat}"),axis=1)
  preprocessed_df[f"team_{position}_recent_{stat}"] = preprocessed_df.apply(l

Unnamed: 0,name,position,team,gw,xP,total_points,tot_total_points,recent_total_points,avg_total_points
8334,Aaron Ramsdale,GK,Southampton,3,0.2,2,0,,0.000000
8985,Aaron Ramsdale,GK,Southampton,4,0.7,3,2,,2.000000
9644,Aaron Ramsdale,GK,Southampton,5,2.0,3,5,,2.500000
10306,Aaron Ramsdale,GK,Southampton,6,2.2,2,8,2.666667,2.666667
10970,Aaron Ramsdale,GK,Southampton,7,1.0,2,10,2.666667,2.500000
...,...,...,...,...,...,...,...,...,...
3699,Łukasz Fabiański,GK,West Ham,14,4.0,2,19,4.666667,3.800000
4362,Łukasz Fabiański,GK,West Ham,15,2.2,2,21,2.333333,3.500000
5024,Łukasz Fabiański,GK,West Ham,16,2.6,4,23,1.000000,3.285714
5725,Łukasz Fabiański,GK,West Ham,17,2.7,3,27,2.666667,3.375000


# Basic Linear Regressions

In [286]:
def normalize_columns(df, cols):
    for col in cols:
        if df[col].std() > 0.0001:
            df[col] = (df[col] - df[col].mean()) / df[col].std()

    return df


def flatten(xss):
    return [x for xs in xss for x in xs]


all_preds = (
    ["was_home"]
    + flatten(
        [
            [f"recent_{stat}", f"avg_{stat}"]
            for stat in configParser.get("Preprocessing", "pred_stat_list").split(",")
        ]
    )
    + flatten(
        [
            [
                f"opp_DEF_recent_{stat}",
                f"opp_GK_recent_{stat}",
                f"opp_MID_recent_{stat}",
                f"opp_FWD_recent_{stat}",
            ]
            for stat in configParser.get("Preprocessing", "pred_stat_list").split(",")
        ]
    )
    + flatten(
        [
            [
                f"team_DEF_recent_{stat}",
                f"team_GK_recent_{stat}",
                f"team_MID_recent_{stat}",
                f"team_FWD_recent_{stat}",
            ]
            for stat in configParser.get("Preprocessing", "pred_stat_list").split(",")
        ]
    )
)


def basic_lasso(df, position):
    print(position)
    position_df = df[df["position"] == position].copy()

    position_df = normalize_columns(position_df, all_preds)
    predictor_string = " + ".join(all_preds)
    model = sm.ols(
        formula=f"total_points ~ {predictor_string}", data=position_df
    ).fit_regularized(alpha=[0] + [0.3] * len(all_preds), L1_wt=1)
    for param, value in zip(model.params.index, model.params):
        if abs(value) > 0.0001:
            print(f"{param}: {value}")

In [287]:
for position in ["GK", "DEF", "MID", "FWD"]:
    basic_lasso(preprocessed_df, position)

GK
Intercept: 3.08907090511682
opp_FWD_recent_xP: -0.006330605150482923
opp_GK_recent_expected_goals: 0.2803909152300517
DEF
Intercept: 2.618405603158278
opp_FWD_recent_xP: -0.08782954877904618
opp_MID_recent_expected_goals_conceded: 0.09169092836542868
MID
Intercept: 3.352468759489459
avg_xP: 0.42331308846747573
avg_expected_goal_involvements: 0.279393049423836
opp_FWD_recent_clean_sheets: 0.02585468977446701
FWD
Intercept: 4.754775972621852
was_home: 0.07442961870900427
recent_bps: 0.10568787572731192
opp_MID_recent_xP: -0.17935448480026528
opp_GK_recent_bps: 0.10675113489991497
opp_GK_recent_expected_goal_involvements: -0.07753868220710669
opp_MID_recent_expected_goal_involvements: -0.019125282784378642
opp_GK_recent_goals_conceded: -0.05069670891375556
opp_DEF_recent_goals_scored: -0.02117411340237527
opp_GK_recent_total_points: 0.0774923699572253
team_MID_recent_xP: -0.017612469823629812
team_GK_recent_expected_goal_involvements: -0.004826726954604846
team_GK_recent_total_points: 