In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
# Load the match table. low_memory=False turns off pandas' chunked parsing,
# so column dtypes are inferred from the whole file at once.
PL = pd.read_csv(filepath_or_buffer='PL.csv', low_memory=False)
PL.shape

(3040, 33)

In [3]:
# Chronological split: rows dated on or before 2023-08-01 go to `train`,
# strictly later rows go to `test`.
train = PL.loc[PL['DATE'] <= '2023-08-01']
test = PL.loc[PL['DATE'] > '2023-08-01']
print(train.shape, test.shape)


(2280, 33) (760, 33)


In [4]:
# Columns used as model inputs.
predictors = ['VENUE_CD', 'VOPPONENT_CD', 'DAY_CD']

# random_state is fixed so repeated fits give the same forest.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

# Fit on the training rows, then label the held-out rows.
rf.fit(train[predictors], train['TARGET'])
preds = rf.predict(test[predictors])

In [5]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals TARGET.
acc = accuracy_score(y_true=test['TARGET'], y_pred=preds)
acc

0.6381578947368421

In [6]:
# Pair every actual label with its prediction; the frame keeps the index of `test`.
combined = pd.DataFrame({'actual': test.TARGET, 'prediction': preds})

# Confusion table: rows are actual labels, columns are predicted labels.
# Note: the model predicts the loss/draw class (0) more accurately than the win class (1).
pd.crosstab(index=combined['actual'], columns=combined['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,412,91
1,184,73


In [7]:
from sklearn.metrics import precision_score

# Precision of the positive class: of the rows predicted as 1, the share that really are 1.
precision_score(y_true=test['TARGET'], y_pred=preds)


0.4451219512195122

In [8]:
# Group the matches per team and pull out one team's rows as a sample group.
grouped_matches = PL.groupby(by='TEAM')
group = grouped_matches.get_group(name='Liverpool')

In [9]:
def rolling_avergs(group, cols, new_cols, window=3):
    """Add rolling means over the preceding rows as new columns.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process. Must contain a 'DATE' column and every column in `cols`.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names of the columns that receive the averages, in the same order as `cols`.
    window : int, default 3
        Number of preceding rows averaged for each row.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` sorted by 'DATE' with `new_cols` added. Rows whose
        rolling values are missing (for example the first `window` rows, which
        have too few predecessors) are dropped. The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's data is left untouched.
    group = group.sort_values(by='DATE')
    # closed='left' leaves the current row out of its own window, so each value
    # only summarises rows that come strictly before it.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    return group.dropna(subset=new_cols)
                                        


In [10]:
# Match statistics to summarise, and the names of the derived rolling columns.
cols = ['GF', 'GA', 'SH', 'SOT', 'DIST', 'FK', 'PK', 'PKATT']
new_cols = [f'{c}_rolling' for c in cols]

# Select every column explicitly (including TEAM) before apply: newer pandas
# versions deprecate passing the grouping column to the applied function
# implicitly, and the TEAM column must survive in the result.
PL_rolling = PL.groupby('TEAM')[list(PL.columns)].apply(lambda x: rolling_avergs(x, cols, new_cols))

# Drop the TEAM index level added by groupby and renumber the rows from 0.
PL_rolling = PL_rolling.droplevel('TEAM').reset_index(drop = True)
PL_rolling

  PL_rolling = PL.groupby('TEAM').apply(lambda x: rolling_avergs(x, cols, new_cols))


Unnamed: 0,DATE,TIME,COMP,ROUND,DAY,VENUE,RESULT,GF,GA,OPPONENT,...,DAY_CD,TARGET,GF_rolling,GA_rolling,SH_rolling,SOT_rolling,DIST_rolling,FK_rolling,PK_rolling,PKATT_rolling
0,2020-10-04,14:00,Premier League,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Premier League,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,2024-04-02,19:45,Premier League,Matchweek 31,Tue,Away,D,1.0,1.0,Burnley,...,1,0,0.666667,2.000000,11.000000,3.000000,15.600000,0.333333,0.000000,0.000000
2876,2024-04-06,15:00,Premier League,Matchweek 32,Sat,Home,L,1.0,2.0,West Ham,...,5,0,1.000000,1.333333,9.666667,2.666667,14.433333,0.333333,0.000000,0.000000
2877,2024-04-13,15:00,Premier League,Matchweek 33,Sat,Away,D,2.0,2.0,Nott'ham Forest,...,5,0,0.666667,1.666667,11.333333,3.666667,17.466667,0.666667,0.333333,0.333333
2878,2024-04-20,19:30,Premier League,Matchweek 34,Sat,Home,L,0.0,2.0,Arsenal,...,5,0,1.333333,1.666667,10.666667,3.666667,17.966667,0.333333,0.333333,0.333333


In [18]:
def make_predictions(data, predictors, cutoff='2022-01-01'):
    """Train a random forest on earlier rows and score it on later rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'DATE', 'TARGET' and every column in `predictors`.
    predictors : list of str
        Feature columns passed to the classifier.
    cutoff : str, default '2022-01-01'
        Rows with DATE <= cutoff are used for training; rows with DATE > cutoff
        are used for testing.

    Returns
    -------
    combined : pandas.DataFrame
        Columns 'actual' and 'prediction', indexed like the test rows.
    precision : float
        Precision score of the predictions on the test rows.
    """
    train = data[data.DATE <= cutoff]
    test = data[data.DATE > cutoff]

    # Build a fresh model on every call so the result does not depend on, or
    # overwrite, a classifier that happens to exist in the notebook namespace.
    model = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
    model.fit(train[predictors], train['TARGET'])
    preds = model.predict(test[predictors])

    combined = pd.DataFrame(dict(actual=test.TARGET, prediction=preds), index=test.index)
    precision = precision_score(test['TARGET'], preds)
    return combined, precision




In [19]:
# Evaluate again, this time feeding the rolling-average columns alongside the base predictors.
combined, precision = make_predictions(
    data=PL_rolling,
    predictors=predictors + new_cols,
)
precision

0.539568345323741

In [20]:
# Display the actual-vs-predicted table currently held in `combined`.
combined

Unnamed: 0,actual,prediction
55,0,1
56,1,1
57,1,0
58,1,0
59,1,1
...,...,...
2875,0,0
2876,0,0
2877,0,0
2878,0,0


In [21]:
# Attach match details to each prediction by joining on the shared row index.
# Only the two original columns are taken from `combined`, so re-running this
# cell gives the same table instead of adding suffixed duplicate columns.
combined = combined[['actual', 'prediction']].merge(
    PL_rolling[['DATE', 'TEAM', 'OPPONENT', 'RESULT']],
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,actual,prediction,DATE,TEAM,OPPONENT,RESULT
55,0,1,2022-01-23,Arsenal,Burnley,D
56,1,1,2022-02-10,Arsenal,Wolves,W
57,1,0,2022-02-19,Arsenal,Brentford,W
58,1,0,2022-02-24,Arsenal,Wolves,W
59,1,1,2022-03-06,Arsenal,Watford,W
...,...,...,...,...,...,...
2875,0,0,2024-04-02,Wolverhampton-Wanderers,Burnley,D
2876,0,0,2024-04-06,Wolverhampton-Wanderers,West Ham,L
2877,0,0,2024-04-13,Wolverhampton-Wanderers,Nott'ham Forest,D
2878,0,0,2024-04-20,Wolverhampton-Wanderers,Arsenal,L


In [None]:
# TODO: Check the missing mapping class at the end of the video.