# Checking for Missing Values

In [43]:
import pandas as pd

In [44]:
#dropping the index column
matches = pd.read_csv("matches.csv", index_col=0)

In [45]:
#viewing to see changes
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,Match Report,,13.0,1.0,17.8,0.0,0,0,2024,Liverpool
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,Match Report,,25.0,9.0,16.8,1.0,0,1,2024,Liverpool
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,Match Report,,9.0,4.0,17.2,1.0,0,0,2024,Liverpool
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,Match Report,,17.0,4.0,14.7,0.0,0,0,2024,Liverpool
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,Match Report,,16.0,5.0,15.8,0.0,0,0,2024,Liverpool


In [46]:
matches.shape

(1216, 27)

In [47]:
#Data was called Febrary  5, 2024 where most teams had played a max of 23 games in the 23/24 season.
#As of the time this code is being written, 23 games have been played in the 23/24 season, while the 22/23 season has been completed (38 games).
# 23/24 season = 23 matches * 20  squads while 22/23 = 38 matches * 20 squads
(23 * 20) + (38 * 20)

1220

In [48]:
# At the time the code is being written, two fixture are outstanding. This means that 4 teams have played 22 matches.
# This is the cause of the difference in row size (1220-1216 = 4)
# The code below is used see each team in the league and howmany games theyve played in the two seasons at the point this code is being written.
matches["team"].value_counts().sum()

# some teams have only 38 games completed because they got relegated out of the league after the 22/23 season
# some other teams have only 23 games or less because they got promoted into the league in the 23/24 season.

1216

### ... ∴ no missing values

# Cleaning our Data For Machine Learning

In [49]:
#overview of the current data types before machine learning
matches.dtypes

date             object
time             object
comp             object
round            object
day              object
venue            object
result           object
gf              float64
ga              float64
opponent         object
xg              float64
xga             float64
poss            float64
attendance      float64
captain          object
formation        object
referee          object
match report     object
notes           float64
sh              float64
sot             float64
dist            float64
fk              float64
pk                int64
pkatt             int64
season            int64
team             object
dtype: object

In [50]:
#changing date to a more useful format
matches["date"] = pd.to_datetime(matches["date"])

In [51]:
#checking the change
matches.dtypes

date            datetime64[ns]
time                    object
comp                    object
round                   object
day                     object
venue                   object
result                  object
gf                     float64
ga                     float64
opponent                object
xg                     float64
xga                    float64
poss                   float64
attendance             float64
captain                 object
formation               object
referee                 object
match report            object
notes                  float64
sh                     float64
sot                    float64
dist                   float64
fk                     float64
pk                       int64
pkatt                    int64
season                   int64
team                    object
dtype: object

# Creating Predictors for Machine Learning

In [52]:
#ranking Home/Away data in numerical form inorder to incorporate the idea of home advantage in our prediction
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [53]:
#transforming opposition data into numerical form
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [54]:
#creating an 'hour' column from the 'time' column 
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")


In [55]:
#show days of week as numeric values
matches["day_code"] = matches["date"].dt.dayofweek

In [56]:
#displaying wins as 1's and losses/draws as 0's with new column
matches["target"] = (matches["result"] == "W").astype("int")

In [57]:
matches

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
0,2023-08-13,16:30,Premier League,Matchweek 1,Sun,Away,D,1.0,1.0,Chelsea,...,0.0,0,0,2024,Liverpool,0,6,16,6,0
1,2023-08-19,15:00,Premier League,Matchweek 2,Sat,Home,W,3.0,1.0,Bournemouth,...,1.0,0,1,2024,Liverpool,1,2,15,5,1
2,2023-08-27,16:30,Premier League,Matchweek 3,Sun,Away,W,2.0,1.0,Newcastle Utd,...,1.0,0,0,2024,Liverpool,0,16,16,6,1
3,2023-09-03,14:00,Premier League,Matchweek 4,Sun,Home,W,3.0,0.0,Aston Villa,...,0.0,0,0,2024,Liverpool,1,1,14,6,1
4,2023-09-16,12:30,Premier League,Matchweek 5,Sat,Away,W,3.0,1.0,Wolves,...,0.0,0,0,2024,Liverpool,0,22,12,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2023-04-30,14:00,Premier League,Matchweek 34,Sun,Away,L,1.0,3.0,Newcastle Utd,...,0.0,0,0,2023,Southampton,0,16,14,6,0
43,2023-05-08,20:00,Premier League,Matchweek 35,Mon,Away,L,3.0,4.0,Nott'ham Forest,...,0.0,1,1,2023,Southampton,0,17,20,0,0
44,2023-05-13,15:00,Premier League,Matchweek 36,Sat,Home,L,0.0,2.0,Fulham,...,0.0,0,0,2023,Southampton,1,9,15,5,0
45,2023-05-21,14:00,Premier League,Matchweek 37,Sun,Away,L,1.0,3.0,Brighton,...,1.0,0,0,2023,Southampton,0,4,14,6,0


#  Creating the Initial Machine Learning Model

In [58]:
# RnadomForest is being used because it picks up non-linearities in the data eg; opponent codes dont rank opponents, it's just used to display them. Random Forest Classifier accounts for this. 
from sklearn.ensemble import RandomForestClassifier

In [59]:
#defining rf parameters
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

In [60]:
#defining the data range to be used for training
train = matches[matches["date"] < '2023-05-28']

In [61]:
#defining the data range to be used for testing
test = matches[matches["date"] > '2023-05-28']

In [62]:
#consolidating all the previously created predictors into a list
predictors = ["venue_code", "opp_code", "hour", "day_code"]

In [63]:
#fitting the random forest model inorder to predict our target
rf.fit(train[predictors], train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [64]:
#generating predicitions
preds = rf.predict(test[predictors])

In [65]:
#a metric to check the accuracy of the match predicitons
from sklearn.metrics import accuracy_score

In [66]:
acc = accuracy_score(test["target"], preds)

In [67]:
# therefore when we predicted something will happen based on the test data, 59% of the time it happened
acc

0.5899122807017544

In [68]:
# trying to see the situations where our accuracy was high vs low
combined = pd.DataFrame(dict(actual=test["target"], predicted=preds))

In [69]:
#using a pandas crosstab to view our accuracy
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,216,60
1,127,53


In [70]:
# to see the percentage of time the teams actually won when we predicted a win
from sklearn.metrics import precision_score

precision_score(test["target"], preds)

0.4690265486725664

# Creating New Predictors to Improve our Prediction Model

In [71]:
grouped_matches = matches.groupby("team")

In [72]:
group = grouped_matches.get_group("Manchester City").sort_values("date")

In [73]:
group

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,opp_code,hour,day_code,target
1,2022-08-07,16:30,Premier League,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,...,1.0,1,1,2023,Manchester City,0,21,16,6,1
2,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,...,0.0,0,0,2023,Manchester City,1,2,15,5,1
3,2022-08-21,16:30,Premier League,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,...,1.0,0,0,2023,Manchester City,0,16,16,6,0
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,0.0,0,0,2023,Manchester City,1,7,15,5,1
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,0.0,0,0,2023,Manchester City,1,17,19,2,1
6,2022-09-03,17:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,Aston Villa,...,1.0,0,0,2023,Manchester City,0,1,17,5,0
9,2022-09-17,12:30,Premier League,Matchweek 8,Sat,Away,W,3.0,0.0,Wolves,...,0.0,0,0,2023,Manchester City,0,22,12,5,1
10,2022-10-02,14:00,Premier League,Matchweek 9,Sun,Home,W,6.0,3.0,Manchester Utd,...,1.0,0,0,2023,Manchester City,1,15,14,6,1
12,2022-10-08,15:00,Premier League,Matchweek 10,Sat,Home,W,4.0,0.0,Southampton,...,0.0,0,0,2023,Manchester City,1,19,15,5,1
14,2022-10-16,16:30,Premier League,Matchweek 11,Sun,Away,L,0.0,1.0,Liverpool,...,0.0,0,0,2023,Manchester City,0,12,16,6,0


In [74]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [75]:
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]

new_cols

['gf_rolling',
 'ga_rolling',
 'sh_rolling',
 'sot_rolling',
 'dist_rolling',
 'fk_rolling',
 'pk_rolling',
 'pkatt_rolling']

In [76]:

rolling_averages(group, cols, new_cols).head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
4,2022-08-27,15:00,Premier League,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,...,5,1,3.0,1.0,17.666667,6.0,17.466667,0.666667,0.333333,0.333333
5,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,...,2,1,3.666667,1.666667,19.333333,7.333333,15.933333,0.333333,0.0,0.0
6,2022-09-03,17:30,Premier League,Matchweek 6,Sat,Away,D,1.0,1.0,Aston Villa,...,5,0,4.333333,1.666667,18.666667,8.0,15.033333,0.333333,0.0,0.0
9,2022-09-17,12:30,Premier League,Matchweek 8,Sat,Away,W,3.0,0.0,Wolves,...,5,1,3.666667,1.0,16.0,6.0,15.233333,0.333333,0.0,0.0
10,2022-10-02,14:00,Premier League,Matchweek 9,Sun,Home,W,6.0,3.0,Manchester Utd,...,6,1,3.333333,0.333333,15.333333,6.666667,17.0,0.333333,0.0,0.0


In [77]:
matches_rolling = matches.groupby("team").apply(lambda x: rolling_averages(x, cols, new_cols))


In [78]:
matches_rolling

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,day_code,target,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,...,5,1,3.000000,0.666667,14.333333,5.000000,14.133333,0.333333,0.000000,0.000000
Arsenal,4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,...,2,1,3.000000,1.000000,18.333333,7.000000,14.433333,0.333333,0.000000,0.000000
Arsenal,5,2022-09-04,16:30,Premier League,Matchweek 6,Sun,Away,L,1.0,3.0,Manchester Utd,...,6,0,2.333333,0.666667,19.333333,7.333333,15.533333,0.666667,0.000000,0.000000
Arsenal,7,2022-09-18,12:00,Premier League,Matchweek 8,Sun,Away,W,3.0,0.0,Brentford,...,6,1,1.666667,1.666667,20.000000,6.333333,16.800000,1.000000,0.000000,0.000000
Arsenal,8,2022-10-01,12:30,Premier League,Matchweek 9,Sat,Home,W,3.0,1.0,Tottenham,...,5,1,2.000000,1.333333,17.000000,6.000000,17.700000,0.666667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,20,2023-12-27,19:30,Premier League,Matchweek 19,Wed,Away,W,4.0,1.0,Brentford,...,2,1,1.000000,1.666667,12.666667,4.333333,17.466667,0.333333,0.000000,0.000000
Wolverhampton Wanderers,21,2023-12-30,15:00,Premier League,Matchweek 20,Sat,Home,W,3.0,0.0,Everton,...,5,1,2.000000,1.666667,13.000000,4.666667,18.333333,0.333333,0.000000,0.000000
Wolverhampton Wanderers,24,2024-01-22,19:45,Premier League,Matchweek 21,Mon,Away,D,0.0,0.0,Brighton,...,0,0,3.000000,0.666667,12.333333,5.666667,15.966667,0.333333,0.000000,0.000000
Wolverhampton Wanderers,26,2024-02-01,20:15,Premier League,Matchweek 22,Thu,Home,L,3.0,4.0,Manchester Utd,...,3,0,2.333333,0.333333,10.333333,4.666667,16.600000,0.000000,0.000000,0.000000


In [79]:
matches_rolling = matches_rolling.droplevel('team')

In [80]:
matches_rolling.index = range(matches_rolling.shape[0])

In [81]:
def make_predictions(data, predictors):
    train = data[data["date"] < '2023-05-28']
    test = data[data["date"] > '2023-05-28']
    rf.fit(train[predictors], train["target"])
    preds = rf.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    error = precision_score(test["target"], preds)
    return combined, error

In [82]:
combined, precision = make_predictions(matches_rolling, predictors + new_cols)

In [92]:
precision
#it improved by about 10 points.

0.5363636363636364

In [84]:
combined

Unnamed: 0,actual,predicted
35,1,1
36,1,1
37,0,1
38,1,1
39,1,1
...,...,...
1142,1,0
1143,1,1
1144,0,1
1145,0,0


In [85]:
combined = combined.merge(matches_rolling[["date", "team", "opponent", "result"]], left_index=True, right_index=True)


In [86]:
combined

Unnamed: 0,actual,predicted,date,team,opponent,result
35,1,1,2023-08-12,Arsenal,Nott'ham Forest,W
36,1,1,2023-08-21,Arsenal,Crystal Palace,W
37,0,1,2023-08-26,Arsenal,Fulham,D
38,1,1,2023-09-03,Arsenal,Manchester Utd,W
39,1,1,2023-09-17,Arsenal,Everton,W
...,...,...,...,...,...,...
1142,1,0,2023-12-27,Wolverhampton Wanderers,Brentford,W
1143,1,1,2023-12-30,Wolverhampton Wanderers,Everton,W
1144,0,1,2024-01-22,Wolverhampton Wanderers,Brighton,D
1145,0,0,2024-02-01,Wolverhampton Wanderers,Manchester Utd,L


In [None]:
#Things to do to improve
#use all competitions data
#involve other unused columns
