In [1]:
import pandas as pd

In [2]:
matches = pd.read_csv("matches.csv", index_col = 0)

In [3]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,Match Report,,10.0,2.0,14.6,1.0,0.0,0.0,2023,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,Match Report,,19.0,7.0,13.0,0.0,0.0,0.0,2023,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,Match Report,,14.0,6.0,14.8,0.0,0.0,0.0,2023,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,Match Report,,22.0,8.0,15.5,1.0,0.0,0.0,2023,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,Match Report,,22.0,8.0,16.3,1.0,0.0,0.0,2023,Arsenal


In [4]:
matches.shape

(1134, 27)

In [5]:
matches["team"].value_counts()

Leicester City              57
Newcastle United            57
Southampton                 57
Tottenham Hotspur           57
Everton                     57
West Ham United             57
Brentford                   57
Wolverhampton Wanderers     57
Chelsea                     57
Aston Villa                 57
Arsenal                     56
Leeds United                56
Crystal Palace              56
Manchester City             56
Liverpool                   56
Brighton and Hove Albion    56
Manchester United           56
Burnley                     38
Watford                     38
Norwich City                38
Fulham                      20
Nottingham Forest           19
Bournemouth                 19
Name: team, dtype: int64

In [6]:
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk              float64
pkatt           float64
season            int64
team             object
dtype: object

In [7]:
matches["date"] = pd.to_datetime(matches["date"])

In [8]:
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                     float64
pkatt                  float64
season                   int64
team                    object
dtype: object

In [None]:
# machine learning system can only deal with numbers without object so we have to cast some of the columns

In [9]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [10]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [11]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex = True).astype("int")

In [12]:
matches["day_code"] = matches["date"].dt.dayofweek

In [13]:
matches.loc[matches['result'] == 'W', 'target'] = 2
matches.loc[matches['result'] == 'D', 'target'] = 1
matches.loc[matches['result'] == 'L', 'target'] = 0




In [14]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,...,1.0,0.0,0.0,2023,Arsenal,0,7,20,4,2.0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,...,0.0,0.0,0.0,2023,Arsenal,1,11,15,5,2.0
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,...,0.0,0.0,0.0,2023,Arsenal,0,2,17,5,2.0
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,1.0,0.0,0.0,2023,Arsenal,1,9,17,5,2.0
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,1.0,0.0,0.0,2023,Arsenal,1,1,19,2,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2022-04-30,15:00,Premier League,Matchweek 35,Sat,Away,L,0.0,2.0,Aston Villa,...,0.0,0.0,0.0,2022,Norwich City,0,1,15,5,0.0
39,2022-05-08,14:00,Premier League,Matchweek 36,Sun,Home,L,0.0,4.0,West Ham,...,1.0,0.0,0.0,2022,Norwich City,1,21,14,6,0.0
40,2022-05-11,19:45,Premier League,Matchweek 21,Wed,Away,L,0.0,3.0,Leicester City,...,0.0,0.0,0.0,2022,Norwich City,0,11,19,2,0.0
41,2022-05-15,14:00,Premier League,Matchweek 37,Sun,Away,D,1.0,1.0,Wolves,...,0.0,0.0,0.0,2022,Norwich City,0,22,14,6,1.0


In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rf = RandomForestClassifier(n_estimators = 50, min_samples_split = 10, random_state = 1)

In [17]:
train = matches[matches["date"] < '2022-07-01']

In [18]:
test = matches[matches["date"] > '2022-07-01']

In [19]:
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [20]:
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [21]:
preds = rf.predict(test[predictors])

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
acc = accuracy_score(test["target"], preds)

In [24]:
print(f'accuracy:', acc)

accuracy: 0.4037433155080214


In [None]:
# The results of the tests:

In [25]:
combined = pd.DataFrame(dict(actual = test["target"], prediction = preds))

In [26]:
pd.crosstab(index = combined["actual"], columns = combined["prediction"])

prediction,0.0,1.0,2.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,83,21,42
1.0,34,6,42
2.0,60,24,62


In [None]:
# checking the accuracy rate:

In [27]:
from sklearn.metrics import precision_score

In [28]:
precision_score(test["target"], preds, average ='micro')

0.4037433155080214

In [None]:
# in order to increase it we'll add more data like follows:

In [29]:
grouped_matches = matches.groupby("team")

In [30]:
group = grouped_matches.get_group("Manchester City")

In [31]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,1.0,1.0,1.0,2023,Manchester City,0,21,16,6,2.0
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.0,0.0,0.0,2023,Manchester City,1,2,15,5,2.0
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,1.0,0.0,0.0,2023,Manchester City,0,15,16,6,1.0
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.0,0.0,0.0,2023,Manchester City,1,7,15,5,2.0
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.0,0.0,0.0,2023,Manchester City,1,17,19,2,2.0
6,2022-09-03,17:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,Aston Villa,...,1.0,0.0,0.0,2023,Manchester City,0,1,17,5,1.0
9,2022-09-17,12:30,Premier League,Matchweek 8,Sat,Away,W,3.0,0.0,Wolves,...,0.0,0.0,0.0,2023,Manchester City,0,22,12,5,2.0
10,2022-10-02,14:00,Premier League,Matchweek 9,Sun,Home,W,6.0,3.0,Manchester Utd,...,1.0,0.0,0.0,2023,Manchester City,1,14,14,6,2.0
12,2022-10-08,15:00,Premier League,Matchweek 10,Sat,Home,W,4.0,0.0,Southampton,...,0.0,0.0,0.0,2023,Manchester City,1,18,15,5,2.0
14,2022-10-16,16:30,Premier League,Matchweek 11,Sun,Away,L,0.0,1.0,Liverpool,...,0.0,0.0,0.0,2023,Manchester City,0,12,16,6,0.0


In [None]:
# below method will take in count the stats of 3 games before in order to take in count the momentum effect

In [32]:
def rolling_averages(group,cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed = 'left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset = new_cols)
    return group

In [33]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk","pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

In [34]:
new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [35]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))

In [None]:
# now we get the table with calculated rolling stats:

In [36]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,2.0,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
Arsenal,5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,2.0,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
Arsenal,7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,2.0,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
Arsenal,8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,1.0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
Arsenal,9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,1.0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,16,2022-11-12,19:45,Premier League,Matchweek 16,Sat,Home,L,0.0,2.0,Arsenal,...,5,0.0,1.000000,2.666667,13.666667,4.333333,18.133333,0.333333,0.333333,0.333333
Wolverhampton Wanderers,18,2022-12-26,15:00,Premier League,Matchweek 17,Mon,Away,W,2.0,1.0,Everton,...,0,2.0,1.000000,2.000000,10.333333,3.333333,19.566667,0.333333,0.333333,0.333333
Wolverhampton Wanderers,19,2022-12-31,12:30,Premier League,Matchweek 18,Sat,Home,L,0.0,1.0,Manchester Utd,...,5,0.0,1.333333,2.000000,8.333333,3.333333,18.966667,0.333333,0.333333,0.333333
Wolverhampton Wanderers,20,2023-01-04,20:00,Premier League,Matchweek 19,Wed,Away,D,1.0,1.0,Aston Villa,...,2,1.0,0.666667,1.333333,9.333333,3.000000,18.633333,1.000000,0.000000,0.000000


In [37]:
matches_rolling = matches_rolling.droplevel('team')

In [38]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,2.0,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
5,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,2.0,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
7,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,2.0,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
8,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,1.0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
9,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,1.0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,2022-11-12,19:45,Premier League,Matchweek 16,Sat,Home,L,0.0,2.0,Arsenal,...,5,0.0,1.000000,2.666667,13.666667,4.333333,18.133333,0.333333,0.333333,0.333333
18,2022-12-26,15:00,Premier League,Matchweek 17,Mon,Away,W,2.0,1.0,Everton,...,0,2.0,1.000000,2.000000,10.333333,3.333333,19.566667,0.333333,0.333333,0.333333
19,2022-12-31,12:30,Premier League,Matchweek 18,Sat,Home,L,0.0,1.0,Manchester Utd,...,5,0.0,1.333333,2.000000,8.333333,3.333333,18.966667,0.333333,0.333333,0.333333
20,2023-01-04,20:00,Premier League,Matchweek 19,Wed,Away,D,1.0,1.0,Aston Villa,...,2,1.0,0.666667,1.333333,9.333333,3.000000,18.633333,1.000000,0.000000,0.000000


In [39]:
matches_rolling.index = range(matches_rolling.shape[0])

In [40]:
matches_rolling

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2021-09-11,15:00,Premier League,Matchweek 4,Sat,Home,W,1.0,0.0,Norwich City,...,5,2.0,0.000000,3.000000,9.666667,2.333333,14.833333,0.333333,0.000000,0.000000
1,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Away,W,1.0,0.0,Burnley,...,5,2.0,0.333333,2.333333,12.333333,3.000000,14.133333,0.333333,0.000000,0.000000
2,2021-09-26,16:30,Premier League,Matchweek 6,Sun,Home,W,3.0,1.0,Tottenham,...,6,2.0,0.666667,1.666667,14.666667,3.000000,14.800000,0.666667,0.000000,0.000000
3,2021-10-02,17:30,Premier League,Matchweek 7,Sat,Away,D,0.0,0.0,Brighton,...,5,1.0,1.666667,0.333333,18.333333,5.333333,18.433333,0.666667,0.000000,0.000000
4,2021-10-18,20:00,Premier League,Matchweek 8,Mon,Home,D,2.0,2.0,Crystal Palace,...,0,1.0,1.333333,0.333333,11.000000,4.000000,19.833333,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1060,2022-11-12,19:45,Premier League,Matchweek 16,Sat,Home,L,0.0,2.0,Arsenal,...,5,0.0,1.000000,2.666667,13.666667,4.333333,18.133333,0.333333,0.333333,0.333333
1061,2022-12-26,15:00,Premier League,Matchweek 17,Mon,Away,W,2.0,1.0,Everton,...,0,2.0,1.000000,2.000000,10.333333,3.333333,19.566667,0.333333,0.333333,0.333333
1062,2022-12-31,12:30,Premier League,Matchweek 18,Sat,Home,L,0.0,1.0,Manchester Utd,...,5,0.0,1.333333,2.000000,8.333333,3.333333,18.966667,0.333333,0.333333,0.333333
1063,2023-01-04,20:00,Premier League,Matchweek 19,Wed,Away,D,1.0,1.0,Aston Villa,...,2,1.0,0.666667,1.333333,9.333333,3.000000,18.633333,1.000000,0.000000,0.000000


In [None]:
# below method will create predictions for the model with new tables for future improvments

In [41]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2022-07-01']
    test = data[data["date"] > '2022-07-01']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual = test["target"], predicted = preds), index = test.index)
    precision = precision_score(test["target"], preds, average ='micro')
    return combined, precision


In [42]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [43]:
print(f'precision:', precision)

precision: 0.4958904109589041


In [None]:
# we got improvment of 0.09% and almost total of 50% which is way better than random gambling which has 33.333% wining-rate

In [44]:
combined

Unnamed: 0,actual,predicted
35,2.0,0.0
36,2.0,2.0
37,2.0,2.0
38,2.0,2.0
39,2.0,2.0
...,...,...
1060,0.0,0.0
1061,2.0,1.0
1062,0.0,0.0
1063,1.0,2.0


In [45]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index = True, right_index = True)

In [46]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
35,2.0,0.0,2022-08-05,Arsenal,Crystal Palace,W
36,2.0,2.0,2022-08-13,Arsenal,Leicester City,W
37,2.0,2.0,2022-08-20,Arsenal,Bournemouth,W
38,2.0,2.0,2022-08-27,Arsenal,Fulham,W
39,2.0,2.0,2022-08-31,Arsenal,Aston Villa,W
...,...,...,...,...,...,...
1060,0.0,0.0,2022-11-12,Wolverhampton Wanderers,Arsenal,L
1061,2.0,1.0,2022-12-26,Wolverhampton Wanderers,Everton,W
1062,0.0,0.0,2022-12-31,Wolverhampton Wanderers,Manchester Utd,L
1063,1.0,2.0,2023-01-04,Wolverhampton Wanderers,Aston Villa,D


In [47]:
class MissingDic(dict):
    __missing__ = lambda self, key: key
    
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves"
}    

mapping = MissingDic(**map_values)

In [48]:
mapping["Wolverhampton Wanderers"]

'Wolves'

In [60]:
combined["team"] = combined["team"].map(mapping)

In [63]:
combined.to_csv("predictions.csv")

In [56]:
combined["isRight"] = combined["predicted"] == combined["actual"]

In [57]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result,new_team,isRight
35,2.0,0.0,2022-08-05,Arsenal,Crystal Palace,W,Arsenal,False
36,2.0,2.0,2022-08-13,Arsenal,Leicester City,W,Arsenal,True
37,2.0,2.0,2022-08-20,Arsenal,Bournemouth,W,Arsenal,True
38,2.0,2.0,2022-08-27,Arsenal,Fulham,W,Arsenal,True
39,2.0,2.0,2022-08-31,Arsenal,Aston Villa,W,Arsenal,True
...,...,...,...,...,...,...,...,...
1060,0.0,0.0,2022-11-12,Wolves,Arsenal,L,Wolves,True
1061,2.0,1.0,2022-12-26,Wolves,Everton,W,Wolves,False
1062,0.0,0.0,2022-12-31,Wolves,Manchester Utd,L,Wolves,True
1063,1.0,2.0,2023-01-04,Wolves,Aston Villa,D,Wolves,False


In [51]:
merged = combined.merge(combined, left_on =["date", "new_team"], right_on =["date", "opponent"])

In [52]:
merged

Unnamed: 0,actual_x,predicted_x,date,team_x,opponent_x,result_x,new_team_x,actual_y,predicted_y,team_y,opponent_y,result_y,new_team_y
0,2.0,0.0,2022-08-05,Arsenal,Crystal Palace,W,Arsenal,0.0,0.0,Crystal Palace,Arsenal,L,Crystal Palace
1,2.0,2.0,2022-08-13,Arsenal,Leicester City,W,Arsenal,0.0,0.0,Leicester City,Arsenal,L,Leicester City
2,2.0,2.0,2022-08-27,Arsenal,Fulham,W,Arsenal,0.0,0.0,Fulham,Arsenal,L,Fulham
3,2.0,2.0,2022-08-31,Arsenal,Aston Villa,W,Arsenal,0.0,0.0,Aston Villa,Arsenal,L,Aston Villa
4,0.0,2.0,2022-09-04,Arsenal,Manchester Utd,L,Arsenal,2.0,0.0,Manchester United,Arsenal,W,Manchester Utd
...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,0.0,0.0,2022-11-12,Wolverhampton Wanderers,Arsenal,L,Wolves,2.0,2.0,Arsenal,Wolves,W,Arsenal
336,2.0,1.0,2022-12-26,Wolverhampton Wanderers,Everton,W,Wolves,0.0,2.0,Everton,Wolves,L,Everton
337,0.0,0.0,2022-12-31,Wolverhampton Wanderers,Manchester Utd,L,Wolves,2.0,2.0,Manchester United,Wolves,W,Manchester Utd
338,1.0,2.0,2023-01-04,Wolverhampton Wanderers,Aston Villa,D,Wolves,1.0,0.0,Aston Villa,Wolves,D,Aston Villa
