# CMSE 202 Semester Project

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [None]:
# Match-level results table hosted on GitHub; the CSV's first column is used as the row index.
url = 'https://raw.githubusercontent.com/dataquestio/project-walkthroughs/master/football_matches/matches.csv'

matches = pd.read_csv(filepath_or_buffer=url, index_col=0)

In [None]:
# Number of rows per team — a quick check for teams with fewer recorded matches.
matches["team"].value_counts()

Southampton                 72
Brighton and Hove Albion    72
Manchester United           72
West Ham United             72
Newcastle United            72
Burnley                     71
Leeds United                71
Crystal Palace              71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Arsenal                     71
Leicester City              70
Chelsea                     70
Aston Villa                 70
Everton                     70
Liverpool                   38
Fulham                      38
West Bromwich Albion        38
Sheffield United            38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

In [None]:
# Inspect one team's matches in chronological order.
matches.loc[matches["team"].eq("Liverpool")].sort_values(by="date")

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,2020-09-12,17:30,Premier League,Matchweek 1,Sat,Home,W,4.0,3.0,Leeds United,...,Match Report,,20.0,4.0,17.0,0.0,2.0,2.0,2021,Liverpool
2,2020-09-20,16:30,Premier League,Matchweek 2,Sun,Away,W,2.0,0.0,Chelsea,...,Match Report,,17.0,5.0,17.7,1.0,0.0,0.0,2021,Liverpool
4,2020-09-28,20:00,Premier League,Matchweek 3,Mon,Home,W,3.0,1.0,Arsenal,...,Match Report,,21.0,9.0,16.8,0.0,0.0,0.0,2021,Liverpool
6,2020-10-04,19:15,Premier League,Matchweek 4,Sun,Away,L,2.0,7.0,Aston Villa,...,Match Report,,14.0,8.0,15.8,1.0,0.0,0.0,2021,Liverpool
7,2020-10-17,12:30,Premier League,Matchweek 5,Sat,Away,D,2.0,2.0,Everton,...,Match Report,,22.0,8.0,15.0,1.0,0.0,0.0,2021,Liverpool
9,2020-10-24,20:00,Premier League,Matchweek 6,Sat,Home,W,2.0,1.0,Sheffield Utd,...,Match Report,,17.0,5.0,18.2,1.0,0.0,0.0,2021,Liverpool
11,2020-10-31,17:30,Premier League,Matchweek 7,Sat,Home,W,2.0,1.0,West Ham,...,Match Report,,8.0,2.0,18.6,1.0,1.0,1.0,2021,Liverpool
13,2020-11-08,16:30,Premier League,Matchweek 8,Sun,Away,D,1.0,1.0,Manchester City,...,Match Report,,9.0,2.0,21.5,0.0,1.0,1.0,2021,Liverpool
14,2020-11-22,19:15,Premier League,Matchweek 9,Sun,Home,W,3.0,0.0,Leicester City,...,Match Report,,24.0,12.0,11.9,0.0,0.0,0.0,2021,Liverpool
16,2020-11-28,12:30,Premier League,Matchweek 10,Sat,Away,D,1.0,1.0,Brighton,...,Match Report,,6.0,2.0,20.9,0.0,0.0,0.0,2021,Liverpool


In [None]:
# Drop columns that are not used by the model.
# `errors="ignore"` keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError (which `del matches[...]` did).
matches = matches.drop(columns=["comp", "notes"], errors="ignore")

In [None]:
# Parse the date strings into datetimes so they can be compared and sorted.
parsed_dates = pd.to_datetime(matches["date"])
matches["date"] = parsed_dates

# Binary label: 1 when the result is a win ("W"), 0 otherwise.
is_win = matches["result"].eq("W")
matches["target"] = is_win.astype("int")

In [None]:
# Encode the categorical text columns as integer category codes.
for source_col, code_col in (("venue", "venue_code"), ("opponent", "opp_code")):
    matches[code_col] = matches[source_col].astype("category").cat.codes

# Kick-off hour: strip everything from the first ':' onwards, then cast to int.
kickoff_hour = matches["time"].str.replace(":.+", "", regex=True)
matches["hour"] = kickoff_hour.astype("int")

# Day of week as an integer (pandas convention: Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek

In [None]:
# Random forest used for every fit in this notebook; random_state is fixed for reproducibility.
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)

In [None]:
# Chronological split: everything before 2022 trains the model, everything from
# 2022-01-01 onwards is held out for evaluation.
# The test side uses `>=` so matches played exactly on 2022-01-01 are not silently
# dropped from both sets (the previous `<` / `>` pair excluded that date entirely).
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] >= '2022-01-01']

In [None]:
# Baseline feature columns (all derived in the encoding cell above the model definition).
predictors = [
    "venue_code",
    "opp_code",
    "hour",
    "day_code",
]

In [None]:
# Fit the forest on the baseline predictors of the training rows.
rf.fit(X=train[predictors], y=train["target"])

RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)

In [None]:
# Predict the held-out rows and report the fraction classified correctly.
preds = rf.predict(test[predictors])
accuracy = accuracy_score(y_true=test["target"], y_pred=preds)
accuracy

0.6123188405797102

In [None]:
# Side-by-side table of true labels and predictions, then the precision of the predictions.
combined = pd.DataFrame({"actual": test["target"], "predicted": preds})
precision_score(y_true=test["target"], y_pred=preds)

0.4745762711864407

In [None]:
# Split the table by team and pull out one team's rows, in date order, as a sample group.
grouped_matches = matches.groupby(by="team")
group = grouped_matches.get_group("Manchester City").sort_values(by="date")

In [None]:
def rolling_averages(group, cols, new_cols, window=3):
    """Add rolling means of previous matches to one team's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain a "date" column and every column in `cols`.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the resulting rolling-average columns, one per entry in `cols`.
    window : int, default 3
        Number of preceding rows included in each average.

    Returns
    -------
    pandas.DataFrame
        A date-sorted copy of `group` with `new_cols` added. Rows that do not yet
        have `window` earlier rows (so their averages are NaN) are dropped.
        The input frame itself is not modified.

    Raises
    ------
    ValueError
        If `cols` and `new_cols` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError(
            f"cols and new_cols must have the same length "
            f"(got {len(cols)} and {len(new_cols)})"
        )
    # sort_values returns a new frame, so the caller's data is left untouched.
    group = group.sort_values("date")
    # closed='left' excludes the current row: each average only sees earlier matches,
    # so the value being predicted never leaks into its own features.
    rolling_stats = group[cols].rolling(window, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group
# Per-match statistics to average, and the matching names for the rolling-average columns.
cols = [
    "gf", "ga",
    "sh", "sot", "dist",
    "fk", "pk", "pkatt",
]
new_cols = [f"{stat}_rolling" for stat in cols]

In [None]:
# Compute the rolling averages separately for each team, then spot-check one team.
matches_rolling = matches.groupby("team").apply(rolling_averages, cols=cols, new_cols=new_cols)
matches_rolling.loc[matches_rolling['team'].eq('Liverpool')]

Unnamed: 0_level_0,Unnamed: 1_level_0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,opp_code,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,fk_rolling,pk_rolling,pkatt_rolling
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Liverpool,6,2020-10-04,19:15,Matchweek 4,Sun,Away,L,2.0,7.0,Aston Villa,1.8,...,1,19,6,3.0,1.333333,19.333333,6.0,0.333333,0.666667,0.666667
Liverpool,7,2020-10-17,12:30,Matchweek 5,Sat,Away,D,2.0,2.0,Everton,2.2,...,7,12,5,2.333333,2.666667,17.333333,7.333333,0.666667,0.0,0.0
Liverpool,9,2020-10-24,20:00,Matchweek 6,Sat,Home,W,2.0,1.0,Sheffield Utd,2.5,...,16,20,5,2.333333,3.333333,19.0,8.333333,0.666667,0.0,0.0
Liverpool,11,2020-10-31,17:30,Matchweek 7,Sat,Home,W,2.0,1.0,West Ham,1.8,...,21,17,5,2.0,3.333333,17.666667,7.0,1.0,0.0,0.0
Liverpool,13,2020-11-08,16:30,Matchweek 8,Sun,Away,D,1.0,1.0,Manchester City,1.4,...,12,16,6,2.0,1.333333,15.666667,5.0,1.0,0.333333,0.333333
Liverpool,14,2020-11-22,19:15,Matchweek 9,Sun,Home,W,3.0,0.0,Leicester City,3.7,...,10,19,6,1.666667,1.0,11.333333,3.0,0.666667,0.666667,0.666667
Liverpool,16,2020-11-28,12:30,Matchweek 10,Sat,Away,D,1.0,1.0,Brighton,0.5,...,3,12,5,2.0,0.666667,13.666667,5.333333,0.333333,0.666667,0.666667
Liverpool,18,2020-12-06,19:15,Matchweek 11,Sun,Home,W,4.0,0.0,Wolves,1.2,...,22,19,6,1.666667,0.666667,13.0,5.333333,0.0,0.333333,0.333333
Liverpool,20,2020-12-13,16:30,Matchweek 12,Sun,Away,D,1.0,1.0,Fulham,2.1,...,8,16,6,2.666667,0.333333,13.666667,6.666667,0.333333,0.0,0.0
Liverpool,21,2020-12-16,20:00,Matchweek 13,Wed,Home,W,2.0,1.0,Tottenham,1.2,...,18,20,2,2.0,0.666667,9.333333,4.333333,0.666667,0.333333,0.333333


In [None]:
# Replace the (team, original index) MultiIndex left by groupby-apply with a plain
# 0..n-1 index. reset_index(drop=True) gives the same result as dropping the 'team'
# level and assigning a range, but stays valid if the cell is run again (the old
# droplevel('team') raised once the level had already been removed).
matches_rolling = matches_rolling.reset_index(drop=True)

In [None]:
# Display the frame with rolling-average columns to check its index and contents.
matches_rolling

Unnamed: 0,date,time,round,day,venue,result,gf,ga,opponent,xg,...,opp_code,hour,day_code,gf_rolling,ga_rolling,sh_rolling,sot_rolling,fk_rolling,pk_rolling,pkatt_rolling
0,2020-10-04,14:00,Matchweek 4,Sun,Home,W,2.0,1.0,Sheffield Utd,0.4,...,16,14,6,2.000000,1.333333,7.666667,3.666667,0.666667,0.000000,0.000000
1,2020-10-17,17:30,Matchweek 5,Sat,Away,L,0.0,1.0,Manchester City,0.9,...,12,17,5,1.666667,1.666667,5.333333,3.666667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Matchweek 6,Sun,Home,L,0.0,1.0,Leicester City,0.9,...,10,19,6,1.000000,1.666667,7.000000,3.666667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Matchweek 7,Sun,Away,W,1.0,0.0,Manchester Utd,1.1,...,13,16,6,0.666667,1.000000,9.666667,4.000000,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Matchweek 8,Sun,Home,L,0.0,3.0,Aston Villa,1.5,...,1,19,6,0.333333,0.666667,9.666667,2.666667,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1315,2022-03-13,14:00,Matchweek 29,Sun,Away,W,1.0,0.0,Everton,0.8,...,7,14,6,1.333333,1.000000,12.333333,3.666667,0.000000,0.000000,0.000000
1316,2022-03-18,20:00,Matchweek 30,Fri,Home,L,2.0,3.0,Leeds United,0.8,...,9,20,4,1.666667,0.666667,12.333333,4.333333,0.000000,0.000000,0.000000
1317,2022-04-02,15:00,Matchweek 31,Sat,Home,W,2.0,1.0,Aston Villa,1.2,...,1,15,5,2.333333,1.000000,13.000000,5.333333,0.000000,0.000000,0.000000
1318,2022-04-08,20:00,Matchweek 32,Fri,Away,L,0.0,1.0,Newcastle Utd,0.3,...,14,20,4,1.666667,1.333333,13.000000,5.000000,0.000000,0.000000,0.000000


In [None]:
def make_predictions(data, predictors, cutoff='2022-01-01', model=None):
    """Fit a classifier on rows before `cutoff` and score it on the remaining rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime "date" column, a "target" column and every
        column named in `predictors`.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2022-01-01'
        Rows dated strictly before this value are used for training; rows dated
        on or after it are used for testing.
    model : estimator with fit/predict, optional
        Defaults to the notebook-level `rf`. Note that the model is refit in place.

    Returns
    -------
    (pandas.DataFrame, float)
        A frame with "actual" and "predicted" columns indexed like the test rows,
        and the precision of the predictions on the test rows.
    """
    if model is None:
        model = rf  # fall back to the shared notebook-level classifier
    train = data[data["date"] < cutoff]
    # `>=` (not `>`) so rows dated exactly on the cutoff are evaluated instead of
    # being dropped from both the training and the test set.
    test = data[data["date"] >= cutoff]
    model.fit(train[predictors], train["target"])
    preds = model.predict(test[predictors])
    combined = pd.DataFrame(dict(actual=test["target"], predicted=preds), index=test.index)
    precision = precision_score(test["target"], preds)
    return combined, precision

In [None]:
# Re-evaluate with the rolling-average columns added to the baseline predictors.
feature_columns = [*predictors, *new_cols]
combined, precision = make_predictions(data=matches_rolling, predictors=feature_columns)
precision

0.5344827586206896

In [None]:
# Attach match details to each prediction by joining on the shared row index.
match_details = matches_rolling[["date", "team", "opponent", "result"]]
combined = pd.merge(combined, match_details, left_index=True, right_index=True)

In [None]:
# Predictions for a single team.
is_brentford = combined['team'].eq('Brentford')
dummy = combined.loc[is_brentford]
dummy

Unnamed: 0,actual,predicted,date,team,opponent,result
150,1,0,2022-01-02,Brentford,Aston Villa,W
151,0,0,2022-01-11,Brentford,Southampton,L
152,0,0,2022-01-16,Brentford,Liverpool,L
153,0,0,2022-01-19,Brentford,Manchester Utd,L
154,0,0,2022-01-22,Brentford,Wolves,L
155,0,0,2022-02-09,Brentford,Manchester City,L
156,0,0,2022-02-12,Brentford,Crystal Palace,D
157,0,0,2022-02-19,Brentford,Arsenal,L
158,0,0,2022-02-26,Brentford,Newcastle Utd,L
159,1,0,2022-03-05,Brentford,Norwich City,W


In [None]:
# Number of test rows for the selected team.
dummy.shape[0]

16

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the accuracy table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='accuracy.csv')

In [None]:
# Order rows from lowest to highest accuracy, then pull the two plotted columns out as lists.
df = df.sort_values(by='Accuracy Score')

accuracy = df['Accuracy Score'].tolist()
team = df['Team Name'].tolist()

In [None]:
# Horizontal bar chart: one bar per entry of `team`, bar length taken from `accuracy`.
# NOTE(review): `prop` is not defined in any cell shown in this notebook — presumably a
# matplotlib font-properties object created elsewhere; confirm it exists before Run All.
plt.figure(figsize=(12,10))
plt.barh(team,accuracy,edgecolor='white',linewidth=2)
plt.title('Accuracy of Model for each Premier League Team',fontproperties=prop,fontsize=18)
plt.xlabel('Accuracy Score(%)',fontproperties=prop,fontsize=20)
# Calling yticks/xticks without tick positions only restyles the existing tick labels.
plt.yticks(fontproperties=prop,rotation=0,fontsize=15)
plt.xticks(fontproperties=prop,fontsize=15)
plt.grid()

In [None]:
# Load the accuracy / league-position table from the CSV file next to the notebook.
df_corr = pd.read_csv(filepath_or_buffer='correlation.csv')

In [None]:
# Discard every row that has at least one missing value.
df_corr = df_corr.dropna(axis=0, how='any')

In [None]:
accuracy_score = df_corr['Accuracy Score']
league_position = df_corr['League Position']

In [None]:
plt.figure(figsize=(11,10))
plt.plot(accuracy_score,league_position,'o',color='white')
plt.title('Correlation between Accuracy Score and League Position from 21/22 Season',fontproperties=prop,fontsize=17)
plt.xlabel('Accuracy Score(%)',fontproperties=prop,fontsize=17)
plt.ylabel('League Position',fontproperties=prop,fontsize=17)
plt.yticks(fontproperties=prop,rotation=0,fontsize=15)
plt.xticks(fontproperties=prop,fontsize=15)
plt.yticks(np.arange(2,20,5),fontproperties=prop,fontsize=15)
plt.axvline(x=63.5, color="white", linestyle="--")
plt.text(60,18,'Median',fontproperties=prop,fontsize=16,color='white')
plt.axhline(y=9.5, color="white", linestyle="--")
plt.text(50,9,'Average',fontproperties=prop,fontsize=16,color='white')
plt.gca().invert_yaxis()
plt.scatter(50,1,s=350,color='#6cabdd')
plt.text(51,1,'Man City',fontproperties=prop,fontsize=15,color='#6cabdd')
plt.scatter(52,2,s=350,color='#c8102e')
plt.text(53,2.1,'Liverpool',fontproperties=prop,fontsize=15,color='#c8102e')
plt.scatter(58,3,s=350,color='#034694')
plt.text(59,3,'Chelsea',fontproperties=prop,fontsize=15,color='#034694')
plt.scatter(53,4,s=350,color='#ffffff',alpha=0.7)
plt.text(54,4.2,'Spurs',fontproperties=prop,fontsize=15,color='#ffffff')
plt.scatter(71,18,s=350,color='#6c1d45')
plt.text(71.8,18,'Burnley',fontproperties=prop,fontsize=15,color='#6c1d45')
plt.scatter(67,17,s=350,color='#ffffff',alpha=0.7)
plt.text(67,16.5,'Leeds Utd',fontproperties=prop,fontsize=15,color='#ffffff')
plt.scatter(73,16,s=350,color='#003399')
plt.text(73.5,16,'Everton',fontproperties=prop,fontsize=15,color='#003090')
plt.scatter(79,14,s=350,color='#6c1d45')
plt.text(74.5,14,'Aston Villa',fontproperties=prop,fontsize=15,color='#6c1d45')
plt.grid()
plt.savefig("CMSE202Plot.png",dpi=900)