In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load every match that has already been played; the first CSV column is
# used as the row index rather than as a data column.
matches = pd.read_csv(
    "final_all_matches_played.csv",
    index_col=0,
)
matches


Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,match report,team
1,2022-08-07,16:30,Matchweek 1,Sun,Away,W,2.0,0.0,West Ham,2.2,0.5,75.0,Match Report,Manchester City
2,2022-08-13,15:00,Matchweek 2,Sat,Home,W,4.0,0.0,Bournemouth,1.7,0.1,67.0,Match Report,Manchester City
3,2022-08-21,16:30,Matchweek 3,Sun,Away,D,3.0,3.0,Newcastle Utd,2.1,1.8,69.0,Match Report,Manchester City
4,2022-08-27,15:00,Matchweek 4,Sat,Home,W,4.0,2.0,Crystal Palace,2.2,0.1,74.0,Match Report,Manchester City
5,2022-08-31,19:30,Matchweek 5,Wed,Home,W,6.0,0.0,Nott'ham Forest,3.3,0.7,74.0,Match Report,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,2023-04-15,15:00,Matchweek 31,Sat,Home,L,0.0,2.0,Crystal Palace,0.8,0.9,58.0,Match Report,Southampton
40,2023-04-21,20:00,Matchweek 32,Fri,Away,D,3.0,3.0,Arsenal,1.3,2.5,26.0,Match Report,Southampton
41,2023-04-27,19:45,Matchweek 33,Thu,Home,L,0.0,1.0,Bournemouth,0.5,0.9,57.0,Match Report,Southampton
42,2023-04-30,14:00,Matchweek 34,Sun,Away,L,1.0,3.0,Newcastle Utd,0.8,4.3,37.0,Match Report,Southampton


In [3]:
# Load the fixtures to be predicted; the first CSV column is used as the
# row index rather than as a data column.
predict_matches = pd.read_csv(
    "final_predict_matches.csv",
    index_col=0,
)
predict_matches

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,xga,poss,match report,team
54,2023-05-14,14:00,Matchweek 36,Sun,Away,,,,Everton,2.155882,0.761765,64.941176,Head-to-Head,Manchester City
56,2023-05-21,16:00,Matchweek 37,Sun,Home,,,,Chelsea,2.155882,0.761765,64.941176,Head-to-Head,Manchester City
57,2023-05-24,20:00,Matchweek 32,Wed,Away,,,,Brighton,2.155882,0.761765,64.941176,Head-to-Head,Manchester City
58,2023-05-28,16:30,Matchweek 38,Sun,Away,,,,Brentford,2.155882,0.761765,64.941176,Head-to-Head,Manchester City
46,2023-05-14,16:30,Matchweek 36,Sun,Home,,,,Brighton,1.925714,1.125714,59.457143,Head-to-Head,Arsenal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42,2023-05-21,13:30,Matchweek 37,Sun,Away,,,,West Ham,1.228571,1.725714,47.314286,Head-to-Head,Leeds United
43,2023-05-28,16:30,Matchweek 38,Sun,Home,,,,Tottenham,1.228571,1.725714,47.314286,Head-to-Head,Leeds United
44,2023-05-13,15:00,Matchweek 36,Sat,Home,,,,Fulham,1.002857,1.522857,45.371429,Head-to-Head,Southampton
45,2023-05-21,14:00,Matchweek 37,Sun,Away,,,,Brighton,1.002857,1.522857,45.371429,Head-to-Head,Southampton


In [4]:

# Feature engineering on the played matches: derive the numeric columns the
# classifier is trained on.
matches['date'] = pd.to_datetime(matches['date'])
# Integer codes for the categorical text columns (codes follow the sorted
# unique values of each column).
matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes
# Kick-off hour: drop everything from the first ":" onwards ("16:30" -> 16).
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype("int")
# Day of week as an integer (Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek
# Target is the league points earned: win=3, draw=1, loss=0.
# `.map` is used instead of `.replace` because `replace` on an object column
# relies on implicit downcasting to int, which pandas has deprecated; `.map`
# yields a numeric column directly.
matches["target"] = matches["result"].map({"W": 3, "D": 1, "L": 0})

In [5]:
# Feature engineering on the unplayed fixtures. The categorical codes MUST use
# the same value -> integer mapping as the training frame `matches`; encoding
# this frame on its own would number opponents by whichever teams happen to
# appear here, so a code could refer to a different team than the one the
# model saw during training.
predict_matches['date'] = pd.to_datetime(predict_matches['date'])
venue_dtype = pd.CategoricalDtype(categories=matches["venue"].astype("category").cat.categories)
opp_dtype = pd.CategoricalDtype(categories=matches["opponent"].astype("category").cat.categories)
# Values that never occur in `matches` are encoded as -1.
predict_matches["venue_code"] = predict_matches["venue"].astype(venue_dtype).cat.codes
predict_matches["opp_code"] = predict_matches["opponent"].astype(opp_dtype).cat.codes
# Kick-off hour: drop everything from the first ":" onwards ("16:30" -> 16).
predict_matches["hour"] = predict_matches["time"].str.replace(":.+", "", regex=True).astype("int")
# Day of week as an integer (Monday=0 ... Sunday=6).
predict_matches["day_code"] = predict_matches["date"].dt.dayofweek


In [6]:
# Hold-out evaluation: train on the matches played in 2022 and score the
# predictions for the matches played in 2023, once with and once without the
# xg / xga / poss columns.
rf = RandomForestClassifier(n_estimators=100, min_samples_split=20, random_state=1)
# predict result in 2023 by data in 2022
train = matches[matches["date"] < '2023-01-01']
# ">=" (not ">") so that matches played on 2023-01-01 itself are part of the
# test set instead of being silently dropped from both splits.
test = matches[matches["date"] >= '2023-01-01']

# NOTE(review): in `matches` the xg / xga / poss values appear to be each
# match's own statistics, so this score is probably optimistic compared with
# predicting fixtures for which only per-team averages exist — confirm.
predictors = ["venue_code", "opp_code", "hour", "day_code", "xg", "xga", "poss"]
rf.fit(train[predictors], train["target"])
preds2023 = rf.predict(test[predictors])
accuracy = accuracy_score(test["target"], preds2023)
print(f"accuracy with venue_code, opponent, hour, day, expected goal, expected goal against, possession: {accuracy}")
listpreds2023 = preds2023.tolist()

# predict without xg, xga, poss (the same estimator object is refit)
predictors = ["venue_code", "opp_code", "hour", "day_code"]
rf.fit(train[predictors], train["target"])
preds2023without = rf.predict(test[predictors])

listpreds2023without = preds2023without.tolist()
accuracy = accuracy_score(test["target"], preds2023without)
print(f"accuracy with venue_code, opponent, hour, day: {accuracy}")

accuracy with venue_code, opponent, hour, day, expected goal, expected goal against, possession: 0.5883977900552486
accuracy with venue_code, opponent, hour, day: 0.4585635359116022


In [7]:
# Write the 2023 hold-out rows to disk, read them back (the former index comes
# back as an ordinary column), attach both prediction lists positionally and
# overwrite the same file without an index column.
test.to_csv("result2023WLD.csv")
df = pd.read_csv('result2023WLD.csv').assign(
    **{
        'predict': listpreds2023,
        'predict without xg, xga, poss': listpreds2023without,
    }
)
df.to_csv('result2023WLD.csv', index=False)

In [8]:
# predict unplayed matches (data crawled in 2023/05/10)
# The existing `rf` estimator is reused: it is refit on every played match and
# then applied to the unplayed fixtures, once per predictor set.
train, test = matches, predict_matches

# Predictor set 1: schedule features plus xg / xga / poss.
predictors = ["venue_code", "opp_code", "hour", "day_code", "xg", "xga", "poss"]
preds_unplayed = rf.fit(train[predictors], train["target"]).predict(test[predictors])
listpreds_unplayed = preds_unplayed.tolist()
predict_matches['predict'] = listpreds_unplayed

# Predictor set 2: schedule features only.
predictors = ["venue_code", "opp_code", "hour", "day_code"]
preds_unplayed_without = rf.fit(train[predictors], train["target"]).predict(test[predictors])
listpreds_unplayed_without = preds_unplayed_without.tolist()
predict_matches['predict without xg, xga, poss'] = listpreds_unplayed_without

# Persist the fixtures together with both prediction columns (index dropped).
predict_matches.to_csv('result_unplayedWDL.csv', index=False)

In [10]:
# Score the stored predictions against the recorded results: the share of rows
# whose "predict" value equals the "result" value.
# NOTE(review): this CSV is not written by any cell shown here — presumably it
# was prepared separately; confirm its provenance.
validateDF = pd.read_csv("2205_predict_matchesWDL.csv")
accuracy = accuracy_score(
    y_true=validateDF["result"],
    y_pred=validateDF["predict"],
)
accuracy

0.55