In [68]:
#https://www.youtube.com/watch?v=0irmDBWLrco
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
import pandas as pd
# Load the match data. low_memory = False makes pandas parse the file in a
# single pass, so each column's dtype is inferred from all of its values.
PL = pd.read_csv('PL.csv', low_memory = False)
# Bare last expression: the notebook displays (rows, columns).
PL.shape

(3040, 34)

In [69]:
# Chronological hold-out: rows dated on or before the cut-off are used for
# training, strictly later rows for testing. DATE is compared against a
# 'YYYY-MM-DD' string.
cutoff = '2023-08-01'
train = PL.loc[PL['DATE'] <= cutoff]
test = PL.loc[PL['DATE'] > cutoff]
print(train.shape, test.shape)


(2280, 34) (760, 34)


In [70]:
# Baseline model: only the three encoded categorical columns are used as
# features. random_state is fixed so the fitted forest is reproducible.
predictors = ['VENUE_CD', 'OPPONENT_CD', 'DAY_CD']
rf = RandomForestClassifier(
    n_estimators = 50,
    min_samples_split = 10,
    random_state = 1,
)
rf.fit(X = train[predictors], y = train['TARGET'])
preds = rf.predict(X = test[predictors])

In [71]:
# Share of test rows whose predicted TARGET equals the actual TARGET.
acc = accuracy_score(y_true = test['TARGET'], y_pred = preds)
acc

0.6381578947368421

In [72]:
# Pair every test row's actual TARGET with the baseline model's prediction.
combined = pd.DataFrame({'actual': test['TARGET'], 'prediction': preds})
# Confusion table: rows are actual classes, columns are predicted classes.
# The model identifies the loss/draw class (0) far more reliably than the
# win class (1).
pd.crosstab(combined['actual'], combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,412,91
1,184,73


In [73]:
# Precision of the baseline model: of the rows predicted as class 1, the
# fraction whose actual TARGET is 1.
prec = precision_score(y_true = test['TARGET'], y_pred = preds)
prec

0.4451219512195122

In [74]:
# Group the matches by team and pull out a single team's rows as a sample frame.
grouped_matches = PL.groupby('TEAM')
# NOTE(review): `group` is not referenced by the later visible cells
# (rolling_avergs takes its own `group` parameter) - presumably kept for
# interactive inspection; confirm before removing.
group = grouped_matches.get_group('Liverpool')

In [75]:
def rolling_avergs(group, cols, new_cols, window = 3):
    """Add rolling means of previous matches to one team's match frame.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches to process; must contain a ``DATE`` column and every
        column named in ``cols``. The caller's frame is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the resulting rolling columns, positionally matched to
        ``cols``.
    window : int, default 3
        Number of preceding rows averaged for each row.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``DATE`` with ``new_cols`` added. Rows whose
        rolling value is missing in any of ``new_cols`` (for example the
        first ``window`` rows) are dropped.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError('cols and new_cols must have the same length')

    # sort_values returns a new frame, so the assignments below never touch
    # the caller's data.
    ordered = group.sort_values(by = 'DATE')

    # closed = 'left' excludes the current row from its own window, so each
    # average is built only from earlier matches.
    rolling_stats = ordered[cols].rolling(window, closed = 'left').mean()
    ordered[new_cols] = rolling_stats

    return ordered.dropna(subset = new_cols)

In [76]:
# Match statistics to average, and the names of the derived rolling columns.
cols = ['GF', 'GA', 'SH', 'SOT', 'DIST', 'FK', 'PK', 'PKATT']
new_cols = [name + '_rolling' for name in cols]

# Compute the rolling averages separately for every team; the extra
# positional arguments are forwarded by apply to rolling_avergs.
per_team = PL.groupby('TEAM').apply(rolling_avergs, cols, new_cols)

# Discard the TEAM index level added by the groupby, then renumber the rows.
PL_rolling = per_team.droplevel('TEAM').reset_index(drop = True)
PL_rolling

  PL_rolling = PL.groupby('TEAM').apply(lambda x: rolling_avergs(x, cols, new_cols))


Unnamed: 0,DATE,TIME,COMP,ROUND,DAY,VENUE,RESULT,GF,GA,OPPONENT,...,TARGET,TARGET2,GF_rolling,GA_rolling,SH_rolling,SOT_rolling,DIST_rolling,FK_rolling,PK_rolling,PKATT_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,1,2,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,0,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,0,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,1,2,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,0,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,2024-04-02,19:45,Premier League,Matchweek 31,Tue,Away,D,1.0,1.0,Burnley,...,0,1,0.666667,2.000000,11.000000,3.000000,15.600000,0.333333,0.000000,0.000000
2876,2024-04-06,15:00,Premier League,Matchweek 32,Sat,Home,L,1.0,2.0,West Ham,...,0,0,1.000000,1.333333,9.666667,2.666667,14.433333,0.333333,0.000000,0.000000
2877,2024-04-13,15:00,Premier League,Matchweek 33,Sat,Away,D,2.0,2.0,Nott'ham Forest,...,0,1,0.666667,1.666667,11.333333,3.666667,17.466667,0.666667,0.333333,0.333333
2878,2024-04-20,19:30,Premier League,Matchweek 34,Sat,Home,L,0.0,2.0,Arsenal,...,0,0,1.333333,1.666667,10.666667,3.666667,17.966667,0.333333,0.333333,0.333333


In [77]:
def make_predictions(data, predictors, target = 'TARGET', split_date = '2022-01-01'):
    """Fit a random forest on earlier matches and score it on later ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``DATE`` column comparable with ``split_date``,
        every column named in ``predictors`` and the ``target`` column.
    predictors : list of str
        Feature columns passed to the classifier.
    target : str, default 'TARGET'
        Name of the label column.
    split_date : str, default '2022-01-01'
        Rows with ``DATE <= split_date`` form the training set; rows with
        a strictly later ``DATE`` form the test set.

    Returns
    -------
    combined : pandas.DataFrame
        Columns ``actual`` and ``prediction``, indexed like the test rows.
    precision : float
        Precision on the test rows, averaged over the classes with
        ``average = 'weighted'``.

    Raises
    ------
    ValueError
        If the split leaves the training set or the test set empty.
    """
    train = data[data['DATE'] <= split_date]
    test = data[data['DATE'] > split_date]
    if train.empty or test.empty:
        raise ValueError(
            f'split_date {split_date!r} leaves {len(train)} training rows '
            f'and {len(test)} test rows; both sets must be non-empty'
        )

    # random_state is fixed so repeated runs produce the same forest.
    rf = RandomForestClassifier(criterion = 'entropy', n_estimators = 50, min_samples_split = 10, random_state = 1)
    rf.fit(train[predictors], train[target])
    preds = rf.predict(test[predictors])

    # Keep the test index so callers can join the predictions back onto the
    # source frame.
    combined = pd.DataFrame(dict(actual = test[target], prediction = preds), index = test.index)
    precision = precision_score(test[target], preds, average = 'weighted')
    return combined, precision

In [78]:
# Restrict the rolling-feature frame to matches strictly between the two
# dates, then train and score using the base predictors plus rolling columns.
window_mask = (PL_rolling['DATE'] > '2021-08-01') & (PL_rolling['DATE'] < '2023-01-01')
combined, precision = make_predictions(PL_rolling[window_mask], predictors + new_cols)
precision

0.5828302430116235

In [79]:
# Display the actual-vs-predicted frame returned by make_predictions; its
# index is the index of the test rows it was built from.
combined

Unnamed: 0,actual,prediction
55,0,0
56,1,0
57,1,0
58,1,0
59,1,1
...,...,...
2820,0,0
2821,0,0
2822,0,0
2823,1,0


In [80]:
# Attach the match context to every prediction by joining on the shared row
# index. Only the two model-output columns are taken from `combined`, which
# keeps this cell idempotent: re-running it would otherwise merge a frame
# that already carries DATE/TEAM/OPPONENT/RESULT and produce duplicated
# `_x`/`_y` columns.
match_info = PL_rolling[['DATE', 'TEAM', 'OPPONENT', 'RESULT']]
combined = combined[['actual', 'prediction']].merge(match_info, left_index = True, right_index = True)
combined

Unnamed: 0,actual,prediction,DATE,TEAM,OPPONENT,RESULT
55,0,0,2022-01-23,Arsenal,Burnley,D
56,1,0,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,0,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
2820,0,0,2022-10-29,Wolves,Brentford,D
2821,0,0,2022-11-05,Wolves,Brighton,L
2822,0,0,2022-11-12,Wolves,Arsenal,L
2823,1,0,2022-12-26,Wolves,Everton,W


In [84]:
# Pair each team's row (_x) with the row for the same fixture seen from the
# opponent's side (_y). This is an inner merge, so a row without a matching
# opponent-side row is dropped.
merged = combined.merge(
    combined,
    how = 'inner',
    left_on = ['DATE', 'TEAM'],
    right_on = ['DATE', 'OPPONENT'],
)
merged

Unnamed: 0,actual_x,prediction_x,DATE,TEAM_x,OPPONENT_x,RESULT_x,actual_y,prediction_y,TEAM_y,OPPONENT_y,RESULT_y
0,0,0,2022-01-23,Arsenal,Burnley,D,0,0,Burnley,Arsenal,D
1,1,0,2022-02-10,Arsenal,Wolves,W,0,0,Wolves,Arsenal,L
2,1,0,2022-02-19,Arsenal,Brentford,W,0,0,Brentford,Arsenal,L
3,1,0,2022-02-24,Arsenal,Wolves,W,0,0,Wolves,Arsenal,L
4,1,1,2022-03-06,Arsenal,Watford,W,0,1,Watford,Arsenal,L
...,...,...,...,...,...,...,...,...,...,...,...
699,0,0,2022-10-29,Wolves,Brentford,D,0,0,Brentford,Wolves,D
700,0,0,2022-11-05,Wolves,Brighton,L,1,1,Brighton,Wolves,W
701,0,0,2022-11-12,Wolves,Arsenal,L,1,1,Arsenal,Wolves,W
702,1,0,2022-12-26,Wolves,Everton,W,0,0,Everton,Wolves,L


In [85]:
# Inspect one team's paired predictions in chronological order.
merged.loc[merged['TEAM_x'] == 'Arsenal'].sort_values('DATE')

Unnamed: 0,actual_x,prediction_x,DATE,TEAM_x,OPPONENT_x,RESULT_x,actual_y,prediction_y,TEAM_y,OPPONENT_y,RESULT_y
0,0,0,2022-01-23,Arsenal,Burnley,D,0,0,Burnley,Arsenal,D
1,1,0,2022-02-10,Arsenal,Wolves,W,0,0,Wolves,Arsenal,L
2,1,0,2022-02-19,Arsenal,Brentford,W,0,0,Brentford,Arsenal,L
3,1,0,2022-02-24,Arsenal,Wolves,W,0,0,Wolves,Arsenal,L
4,1,1,2022-03-06,Arsenal,Watford,W,0,1,Watford,Arsenal,L
5,1,1,2022-03-13,Arsenal,Leicester City,W,0,1,Leicester City,Arsenal,L
6,0,1,2022-03-16,Arsenal,Liverpool,L,1,1,Liverpool,Arsenal,W
7,1,0,2022-03-19,Arsenal,Aston Villa,W,0,1,Aston Villa,Arsenal,L
8,0,0,2022-04-04,Arsenal,Crystal Palace,L,1,0,Crystal Palace,Arsenal,W
9,0,0,2022-04-09,Arsenal,Brighton,L,1,1,Brighton,Arsenal,W


In [95]:
# Same date window and features as the binary run, but predicting the
# three-class label TARGET2.
window_mask2 = (PL_rolling['DATE'] > '2021-08-01') & (PL_rolling['DATE'] < '2023-01-01')
combined2, precision2 = make_predictions(
    PL_rolling[window_mask2],
    predictors + new_cols,
    target = 'TARGET2',
)

# Attach the match context to each prediction via the shared row index.
match_info2 = PL_rolling[['DATE', 'TEAM', 'OPPONENT', 'RESULT']]
combined2 = combined2.merge(match_info2, left_index = True, right_index = True)
precision2

0.4289161560176194

In [96]:
# Pair each team's three-class prediction (_x) with the opponent-side row
# for the same fixture (_y). This is an inner merge, so unmatched rows are
# dropped.
merged2 = combined2.merge(
    combined2,
    how = 'inner',
    left_on = ['DATE', 'TEAM'],
    right_on = ['DATE', 'OPPONENT'],
)
merged2

Unnamed: 0,actual_x,prediction_x,DATE,TEAM_x,OPPONENT_x,RESULT_x,actual_y,prediction_y,TEAM_y,OPPONENT_y,RESULT_y
0,1,2,2022-01-23,Arsenal,Burnley,D,1,0,Burnley,Arsenal,D
1,2,0,2022-02-10,Arsenal,Wolves,W,0,0,Wolves,Arsenal,L
2,2,2,2022-02-19,Arsenal,Brentford,W,0,0,Brentford,Arsenal,L
3,2,0,2022-02-24,Arsenal,Wolves,W,0,0,Wolves,Arsenal,L
4,2,2,2022-03-06,Arsenal,Watford,W,0,2,Watford,Arsenal,L
...,...,...,...,...,...,...,...,...,...,...,...
699,1,0,2022-10-29,Wolves,Brentford,D,1,0,Brentford,Wolves,D
700,0,2,2022-11-05,Wolves,Brighton,L,2,2,Brighton,Wolves,W
701,0,0,2022-11-12,Wolves,Arsenal,L,2,2,Arsenal,Wolves,W
702,2,1,2022-12-26,Wolves,Everton,W,0,0,Everton,Wolves,L


In [97]:
# Class balance of the actual three-class label among the paired rows.
merged2['actual_x'].value_counts()

actual_x
2    277
0    277
1    150
Name: count, dtype: int64

In [98]:
# Confusion table for the three-class model: rows are actual classes,
# columns are predicted classes.
pd.crosstab(combined2['actual'], combined2['prediction'])


prediction,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,157,44,78
1,66,24,61
2,115,30,135


In [99]:
# Confusion table for the binary model, for comparison with the three-class
# table: rows are actual classes, columns are predicted classes.
pd.crosstab(combined['actual'], combined['prediction'])


prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,356,74
1,205,75
