In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("matches.csv", index_col=0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0.0,1.0,Tottenham,...,Match Report,,18.0,4.0,16.9,1.0,0.0,0.0,2022,Manchester City
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5.0,0.0,Norwich City,...,Match Report,,16.0,4.0,17.3,1.0,0.0,0.0,2022,Manchester City
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5.0,0.0,Arsenal,...,Match Report,,25.0,10.0,14.3,0.0,0.0,0.0,2022,Manchester City
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Away,W,1.0,0.0,Leicester City,...,Match Report,,25.0,8.0,14.0,0.0,0.0,0.0,2022,Manchester City
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0.0,0.0,Southampton,...,Match Report,,16.0,1.0,15.7,1.0,0.0,0.0,2022,Manchester City


In [4]:
matches["date"] = pd.to_datetime(matches["date"])

In [5]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [6]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [7]:
matches["hour"] = matches["time"].str.replace(":.+","", regex= True).astype("int")

In [8]:
matches["day_code"] = matches["date"].dt.dayofweek

In [9]:
matches["target"] = (matches["result"] == "W").astype("int")

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)

In [12]:
train = matches[matches["date"] < "2022-01-01"]

In [13]:
test = matches[matches["date"] > "2022-01-01"]

In [14]:
predictors = ["venue_code","opp_code","hour","day_code"]

In [15]:
rf.fit(train[predictors], train["target"])

In [16]:
preds = rf.predict(test[predictors])

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
acc = accuracy_score(test["target"], preds)

In [69]:
acc

0.6123188405797102

In [41]:
combined = pd.DataFrame(dict(actual=test["target"],prediction = preds))

In [43]:
pd.crosstab(index=combined["actual"],columns = combined["prediction"])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,141,31
1,76,28


In [45]:
from sklearn.metrics import precision_score

In [47]:
precision_score(test["target"], preds)

0.4745762711864407

In [49]:
grouped_matches = matches.groupby("team")

In [68]:
group = grouped_matches.get_group("Manchester City")

In [64]:
def rolling_averages(group,cols,new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed="left").mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [60]:
cols = ["gf","ga","sh","sot","dist","fk","pk","pkatt"]
new_cols =[f"{c}_rolling" for c in cols]

In [70]:
rolling_averages(group,cols,new_cols)

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
5,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Home,W,1.0,0.0,Arsenal,...,5,1,2.000000,2.333333,17.333333,4.666667,18.900000,1.333333,0.333333,0.333333
7,2020-10-24,12:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,West Ham,...,5,0,1.333333,2.000000,17.333333,3.666667,17.733333,0.666667,0.000000,0.000000
9,2020-10-31,12:30,Premier League,Matchweek 7,Sat,Away,W,1.0,0.0,Sheffield Utd,...,5,1,1.000000,0.666667,16.666667,4.333333,18.233333,0.666667,0.000000,0.000000
11,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Home,D,1.0,1.0,Liverpool,...,6,0,1.000000,0.333333,14.333333,6.666667,18.466667,1.000000,0.000000,0.000000
12,2020-11-21,17:30,Premier League,Matchweek 9,Sat,Away,L,0.0,2.0,Tottenham,...,5,0,1.000000,0.666667,12.000000,5.666667,19.366667,1.000000,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2022-03-14,20:00,Premier League,Matchweek 29,Mon,Away,D,0.0,0.0,Crystal Palace,...,0,0,2.333333,1.333333,19.000000,7.000000,15.366667,0.333333,0.333333,0.333333
44,2022-04-02,15:00,Premier League,Matchweek 31,Sat,Away,W,2.0,0.0,Burnley,...,5,1,1.666667,0.333333,18.333333,7.333333,16.000000,0.333333,0.000000,0.000000
46,2022-04-10,16:30,Premier League,Matchweek 32,Sun,Home,D,2.0,2.0,Liverpool,...,6,0,2.000000,0.333333,20.000000,6.666667,16.133333,0.333333,0.000000,0.000000
49,2022-04-20,20:00,Premier League,Matchweek 30,Wed,Home,W,3.0,0.0,Brighton,...,2,1,1.333333,0.666667,15.666667,4.666667,16.700000,0.333333,0.000000,0.000000


In [74]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x,cols,new_cols))

  matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x,cols,new_cols))


In [78]:
matches_rolling = matches_rolling.droplevel("team")

In [83]:
matches_rolling.index = range(matches_rolling.shape[0])