In [1]:
import pandas as pd

In [2]:
# Load the match log; the first CSV column is used as the row index.
MATCHES_CSV = "/content/drive/MyDrive/matches.csv"
matches = pd.read_csv(MATCHES_CSV, index_col=0)

In [3]:
# Preview the first rows of the loaded frame (head() shows 5 rows by default).
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
5,2022-09-07,21:00,Champions Lg,Group stage,Wed,Home,L,0.0,2.0,de Bayern Munich,...,Match Report,,9,2,13.5,0.0,0,0,2023,Internazionale
7,2022-09-13,18:45,Champions Lg,Group stage,Tue,Away,W,2.0,0.0,cz Viktoria Plzeň,...,Match Report,,19,8,15.1,1.0,0,0,2023,Internazionale
10,2022-10-04,21:00,Champions Lg,Group stage,Tue,Home,W,1.0,0.0,es Barcelona,...,Match Report,,5,2,23.1,0.0,0,0,2023,Internazionale
12,2022-10-12,21:00,Champions Lg,Group stage,Wed,Away,D,3.0,3.0,es Barcelona,...,Match Report,,11,8,12.9,0.0,0,0,2023,Internazionale
15,2022-10-26,18:45,Champions Lg,Group stage,Wed,Home,W,4.0,0.0,cz Viktoria Plzeň,...,Match Report,,22,9,14.5,0.0,0,0,2023,Internazionale


In [4]:
# (number of rows, number of columns) of the loaded frame.
matches.shape

(72, 27)

In [5]:
# Number of match rows per team.
matches["team"].value_counts()

Internazionale     36
Manchester City    36
Name: team, dtype: int64

In [9]:
# Number of match rows per competition round.
matches["round"].value_counts()

Group stage       36
Round of 16       12
Quarter-finals    12
Semi-finals       12
Name: round, dtype: int64

In [10]:
# Column dtypes as loaded from the CSV; `date` is still a plain object column at this point.
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes            object
sh                int64
sot               int64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [11]:
# Parse the textual `date` column into real timestamps so date comparisons
# and the `.dt` accessor work further down.
parsed_dates = pd.to_datetime(matches["date"])
matches["date"] = parsed_dates

In [14]:
# Re-check the dtypes: `date` should now be reported as datetime64[ns].
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                   object
sh                       int64
sot                      int64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
dtype: object

In [15]:
# Encode the venue label as an integer category code
# (categories are the sorted unique labels; missing values become -1).
matches["venue_code"] = pd.Categorical(matches["venue"]).codes

In [17]:
# Give every distinct opponent an integer id via its categorical code.
opponent_categories = pd.Categorical(matches["opponent"])
matches["opp_code"] = opponent_categories.codes

In [18]:
# Kick-off hour: strip everything from the first ':' onwards ("21:00" -> "21"),
# then cast the remaining text to an integer.
hour_text = matches["time"].str.replace(":.+", "", regex=True)
matches["hour"] = hour_text.astype("int")

In [19]:
# Day of the week as an integer (Monday=0 ... Sunday=6), taken from the parsed date.
matches["day_code"] = matches["date"].dt.weekday

In [21]:
# Binary label: 1 when the result is a win ("W"), 0 for any other result.
is_win = matches["result"] == "W"
matches["target"] = is_win.astype("int")

In [23]:
# Display the frame to confirm the engineered columns
# (venue_code, opp_code, hour, day_code, target) were appended.
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
5,2022-09-07,21:00,Champions Lg,Group stage,Wed,Home,L,0.0,2.0,de Bayern Munich,...,0.0,0,0,2023,Internazionale,1,1,21,2,0
7,2022-09-13,18:45,Champions Lg,Group stage,Tue,Away,W,2.0,0.0,cz Viktoria Plzeň,...,1.0,0,0,2023,Internazionale,0,0,18,1,1
10,2022-10-04,21:00,Champions Lg,Group stage,Tue,Home,W,1.0,0.0,es Barcelona,...,0.0,0,0,2023,Internazionale,1,5,21,1,1
12,2022-10-12,21:00,Champions Lg,Group stage,Wed,Away,D,3.0,3.0,es Barcelona,...,0.0,0,0,2023,Internazionale,0,5,21,2,0
15,2022-10-26,18:45,Champions Lg,Group stage,Wed,Home,W,4.0,0.0,cz Viktoria Plzeň,...,0.0,0,0,2023,Internazionale,1,0,18,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41,2023-03-14,20:00,Champions Lg,Round of 16,Tue,Home,W,7.0,0.0,de RB Leipzig,...,0.0,1,1,2021,Manchester City,1,3,20,1,1
45,2023-04-11,20:00,Champions Lg,Quarter-finals,Tue,Home,W,3.0,0.0,de Bayern Munich,...,0.0,0,0,2021,Manchester City,1,1,20,1,1
47,2023-04-19,21:00,Champions Lg,Quarter-finals,Wed,Away,D,1.0,1.0,de Bayern Munich,...,0.0,0,1,2021,Manchester City,0,1,21,2,0
53,2023-05-09,21:00,Champions Lg,Semi-finals,Tue,Away,D,1.0,1.0,es Real Madrid,...,0.0,0,0,2021,Manchester City,0,6,21,1,0


In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Random-forest settings: 50 trees, a node needs at least 10 samples before
# it may be split, and a fixed seed so repeated runs build the same forest.
rf_params = dict(n_estimators=50, min_samples_split=10, random_state=1)
rf = RandomForestClassifier(**rf_params)

In [26]:
# Training set: every match played strictly before the 2023-01-01 cutoff.
played_before_cutoff = matches["date"] < '2023-01-01'
train = matches[played_before_cutoff]

In [27]:
# Test set: every match played on or after the 2023-01-01 cutoff.
# `>=` (rather than `>`) makes this the exact complement of the
# `date < '2023-01-01'` training filter, so a match dated on the cutoff
# itself is evaluated instead of being silently dropped from both sets.
test = matches[matches["date"] >= '2023-01-01']

In [28]:
# Engineered feature columns passed to the model, in this order.
predictors = [
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
]

In [29]:
# Fit the forest on the training rows: predictor columns as features, `target` as label.
rf.fit(X=train[predictors], y=train["target"])

In [30]:
# Predict the win/no-win label for each held-out match.
test_features = test[predictors]
preds = rf.predict(test_features)

In [31]:
from sklearn.metrics import accuracy_score

In [33]:
# Fraction of held-out matches whose predicted label equals the actual label.
acc = accuracy_score(y_true=test["target"], y_pred=preds)

In [34]:
# Display the test-set accuracy stored in `acc`.
acc

0.8333333333333334

In [35]:
# Put actual and predicted labels side by side (rows keep the test set's index).
combined = pd.DataFrame({"actual": test["target"], "predicted": preds})

In [36]:
# Confusion table: rows are the actual labels, columns the predicted labels.
actual_labels = combined["actual"]
predicted_labels = combined["predicted"]
pd.crosstab(actual_labels, predicted_labels)

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,9,6
1,0,21


In [37]:
from sklearn.metrics import precision_score

In [38]:
# Precision for the positive class: of the matches predicted as wins, the share that were wins.
precision_score(y_true=test["target"], y_pred=preds)

0.7777777777777778