In [1]:
import pandas as panda
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the previously scraped match data; the first CSV column becomes the index.
matchData = panda.read_csv(filepath_or_buffer="totalMatches.csv", index_col=0)

In [4]:
# Parse the date column into datetimes so it can be sorted and compared.
matchData["date"] = panda.to_datetime(matchData["date"])

# Encode the venue as a category code. Codes follow the sorted category
# labels, so 0 = away and 1 = home (assuming venue holds "Away"/"Home" -- TODO confirm).
matchData["homeAway"] = matchData["venue"].astype("category").cat.codes

# Give each opponent a numeric code (note: built from the "opponent" column).
matchData["teamCode"] = matchData["opponent"].astype("category").cat.codes

# Strip the colon(s) from the kick-off time and store it as an int the
# random forest can consume (e.g. "15:00" -> 1500).
matchData["hour"] = (
    matchData["time"]
    .str.replace(":+", "", regex=True)
    .astype("int")
)

# Day of the week as a number (Monday = 0 ... Sunday = 6).
matchData["dayCode"] = matchData["date"].dt.weekday

# Encode the already-known results as the prediction target:
# 2 = win ("W"), 1 = draw ("D"), 0 = anything else.
matchData["predict"] = (
    2 * (matchData["result"] == "W").astype("int")
    + (matchData["result"] == "D").astype("int")
)

In [9]:
# Stat columns that get a rolling average (presumably goals for/against,
# shots and shots on target -- verify against the scraped data).
cols = ["gf", "ga", "sh", "sot"]
# Each rolling-average column is named after its source column plus "-Rolling".
newCols = [stat + "-Rolling" for stat in cols]

In [10]:
#function to calculate rolling averages of stats from a group's earlier rows
def averages(group, cols, newCols, window=3):
    """Add rolling-mean columns computed from each row's preceding rows.

    Parameters
    ----------
    group : DataFrame
        Rows to process; must contain a "date" column and every column in ``cols``.
    cols : list of str
        Columns to average.
    newCols : list of str
        Names for the averaged columns, paired with ``cols`` by position.
    window : int, default 3
        Number of preceding rows that go into each average.

    Returns
    -------
    DataFrame
        A date-sorted copy of ``group`` with ``newCols`` added. Rows whose
        rolling value is NaN (for example the first ``window`` rows, which do
        not have enough history) are dropped.
    """
    # sort_values returns a new frame, so the caller's group is left untouched
    ordered = group.sort_values("date")
    # closed="left" keeps the current row out of its own window, so the
    # average is built only from earlier rows
    ordered[newCols] = ordered[cols].rolling(window, closed="left").mean()
    return ordered.dropna(subset=newCols)

In [11]:
# Compute the rolling averages separately for every team; the extra
# positional arguments are forwarded to averages() for each group.
matchesRolling = matchData.groupby("team").apply(averages, cols, newCols)

In [12]:
# Base feature columns fed to the model to predict the match result.
predictors = [
    "homeAway",
    "teamCode",
    "hour",
    "dayCode",
]

In [13]:
# Shared random forest model:
#   n_estimators -> number of decision trees in the forest
#   random_state -> fixed seed so repeated runs give the same result
randF = RandomForestClassifier(
    random_state=1,
    n_estimators=5000,
)

In [14]:
# Extra predictors taken from the newly calculated rolling averages.
# NOTE: this rebinds cols/newCols, so only these two rolling columns are
# appended to the predictors below.
cols = ["ga", "sh"]
newCols = [stat + "-Rolling" for stat in cols]

In [15]:
#function to predict the outcome of games, training on matches before a cutoff date
def makePrediction(matchesRolling, predictors, cutoff="2023-04-01"):
    """Fit the shared ``randF`` model on older matches and score it on newer ones.

    Parameters
    ----------
    matchesRolling : DataFrame
        Must contain a "date" column, a "predict" target column and every
        column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default "2023-04-01"
        Rows dated before ``cutoff`` are used for training; rows dated on or
        after it are used for testing.

    Returns
    -------
    tuple
        ``(combinedData, accuracy)`` where ``combinedData`` holds the actual
        and predicted target for each test row (indexed like the test rows)
        and ``accuracy`` is the value returned by ``accuracy_score``.

    Raises
    ------
    ValueError
        If either the training or the test split is empty.
    """
    matchDates = matchesRolling["date"]
    train = matchesRolling[matchDates < cutoff]
    # ">=" so rows dated exactly on the cutoff land in the test set instead of
    # being dropped from both splits
    test = matchesRolling[matchDates >= cutoff]
    if train.empty or test.empty:
        raise ValueError(
            f"cutoff {cutoff!r} leaves an empty split "
            f"(train rows: {len(train)}, test rows: {len(test)})"
        )
    # note: this fits the module-level randF in place
    randF.fit(train[predictors], train["predict"])
    preds = randF.predict(test[predictors])
    combinedData = panda.DataFrame(dict(actual=test["predict"], prediction=preds), index=test.index)
    # accuracy_score gives the fraction of correct predictions (accuracy, not precision)
    accuracy = accuracy_score(test["predict"], preds)
    return combinedData, accuracy

In [16]:
# Train and evaluate using the base predictors plus the rolling-average columns.
# `precision` receives the second value returned by makePrediction.
combinedData, precision = makePrediction(matchesRolling, [*predictors, *newCols])

In [17]:
# Express the score as a percentage.
precisionPercentage = 100 * precision

In [18]:
# Show the percentage followed by a percent sign.
print(precisionPercentage, "%", sep=" ")

40.74074074074074 %


In [19]:
# Save the actual vs. predicted results for the test matches.
combinedData.to_csv(path_or_buf="predictions.csv")