In [55]:
import pandas as pd

In [56]:
matches = pd.read_csv("matches.csv", index_col=0)

In [57]:
del matches["comp"]

In [58]:
del matches["notes"]

In [59]:
matches["date"] = pd.to_datetime(matches["date"])

In [60]:
matches["target"] = (matches["result"] == "W").astype("int")

In [61]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [62]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [63]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")

In [64]:
matches["day_code"] = matches["date"].dt.dayofweek

In [65]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [67]:
train = matches[matches["date"] < '2022-01-01']

In [68]:
test = matches[matches["date"] > '2022-01-01']

In [69]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [70]:
rf.fit(train[predictors], train["target"])

In [71]:
preds = rf.predict(test[predictors])

In [72]:
from sklearn.metrics import accuracy_score

In [73]:
error = accuracy_score(test["target"], preds)

In [74]:
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

In [75]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


In [76]:
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

np.float64(0.4745762711864407)

In [77]:
grouped_matches = matches.groupby("team")

In [78]:
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [79]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [80]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

rolling_averages(group, cols, new_cols)

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,17:30,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,1.5,...,17,5,2.000000,2.333333,17.333333,4.666667,18.900000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,1.1,...,12,5,1.333333,2.000000,17.333333,3.666667,17.733333,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,1.5,...,12,5,1.000000,0.666667,16.666667,4.333333,18.233333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,1.6,...,16,6,1.000000,0.333333,14.333333,6.666667,18.466667,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,1.3,...,17,5,1.000000,0.666667,12.000000,5.666667,19.366667,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2022-03-14,20:00,Matchweek 29,Mon,Away,D,0.0,0.0,Crystal Palace,2.3,...,20,0,2.333333,1.333333,19.000000,7.000000,15.366667,0.333333,0.333333,0.333333
44,2022-04-02,15:00,Matchweek 31,Sat,Away,W,2.0,0.0,Burnley,1.8,...,15,5,1.666667,0.333333,18.333333,7.333333,16.000000,0.333333,0.000000,0.000000
46,2022-04-10,16:30,Matchweek 32,Sun,Home,D,2.0,2.0,Liverpool,2.0,...,16,6,2.000000,0.333333,20.000000,6.666667,16.133333,0.333333,0.000000,0.000000
49,2022-04-20,20:00,Matchweek 30,Wed,Home,W,3.0,0.0,Brighton,1.2,...,20,2,1.333333,0.666667,15.666667,4.666667,16.700000,0.333333,0.000000,0.000000


In [81]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [82]:
matches_rolling = matches_rolling.droplevel('team')

In [83]:
matches_rolling.index = range(matches_rolling.shape[0])

In [84]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-01-01']
    test = data[data["date"] > '2022-01-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [85]:
combined, error = make_predictions(matches_rolling, predictors + new_cols)

In [86]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)

In [87]:
class MissingDict(dict):
    __missing__ = lambda self, key: key

map_values = {"Brighton and Hove Albion": "Brighton",
              "Manchester United": "Manchester Utd",
              "Newcastle United": "Newcastle Utd",
              "Tottenham Hotspur": "Tottenham",
              "West Ham United": "West Ham",
              "Wolverhampton Wanderers": "Wolves"}
mapping = MissingDict(**map_values)

In [88]:
combined["new_team"] = combined["team"].map(mapping)

In [89]:
merged = combined.merge(combined, left_on=["date", "new_team"], right_on=["date", "opponent"])

In [90]:
merged[(merged["predicted_x"] == 1) & (merged["predicted_y"] ==0)]["actual_x"].value_counts()

actual_x
1    27
0    13
Name: count, dtype: int64