In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor

In [2]:

def retrieveEPL():
    """
    retrieves about 7800 match results from EPL and stores them in the dataframe data
    """
    prem = pd.read_csv("http://www.football-data.co.uk/mmz4281/9394/E0.csv") #read data
    prem = prem[['HomeTeam','AwayTeam','FTHG','FTAG']]                       #remove unnecessary columns
    prem = prem.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})   #rename columns to be more readable
    years = []                                                               #this is where our data will be stored
    for i in range(94, 118):                                                 #for every year we have data available, create a string to insert into the url
                                                                             #in order to retrieve the correct data
        if i < 99:
            year = str(i) + str(i+1)
        elif i == 99:
            year = "9900"
        elif i == 100:
            year = "0001"
        #data from 2002 to 2004 is in a different format, can't extract csv file
        elif 101 < i < 105:
            continue
        elif i < 109:
            year = "0" + str(i-100) + "0" + str(i + 1 -100)
        elif i < 110:
            year = "0" + str(i-100) + str(i + 1 -100)
        else:
            year = str(i-100) + str(i + 1 -100)
        #print(year)
        nex = pd.read_csv("http://www.football-data.co.uk/mmz4281/" + year + "/E0.csv") #read data into dataframe
        nex = nex[['HomeTeam','AwayTeam','FTHG','FTAG']]                            #remove unnecessary columns
        nex = nex.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})        #rename columns to be more readable
        years.append(nex)
    data = pd.concat(years)
    data = data[ (data['HomeTeam'].notnull()) ]           #remove null values
    return data


In [3]:

def last5(teamName, index, matches):
    """
    Takes a string (teamName) and int index and outputs an array of their last 5 results
    """
    arr = []
    prevResults = matches[0:index]
    ##use or operator on arr to find where teamName == hometeam or awayteam column
    prevTeamResults = prevResults[ (prevResults['HomeTeam'] == teamName) | (prevResults['AwayTeam'] == teamName) ]
    last = prevTeamResults[-5 : prevTeamResults.shape[0]]   #find last five results
    #check which team is teamName and use homegoals and away goals to identify result
    for i in range(0, last.shape[0]):
        hg = int(last[i:i+1]['HomeGoals'])
        ag = int(last[i:i+1]['AwayGoals'])
        goalDifference = hg-ag
        if goalDifference > 4:
            goalDifference = 4
        if goalDifference < -4:
            goalDifference = -4
        if (last[i:i+1]['HomeTeam'] == teamName).bool():    #put results in array arr 
            arr.append(goalDifference)
        else:
            arr.append(-goalDifference)
    return arr

In [4]:

def prevMeetings(index, matches):
    """
    Finds home team h and away team a at index
    """
    home = matches[index-1:index]['HomeTeam']                 #find home team and away team names at index
    away = matches[index-1:index]['AwayTeam']
    home = " ".join(pd.Series.to_string(home).split()[1:])    #remove index from value and change to string
    away = " ".join(pd.Series.to_string(away).split()[1:])
    arr = []
    prevResults = matches[0:index]                            #matches before index
    #use or operator on arr to find where teamName == hometeam or awayteam column
    prevTeamResults = prevResults[ (((prevResults['HomeTeam'] == home) | (prevResults['HomeTeam'] == away)) & ((prevResults['AwayTeam'] == home) | (prevResults['AwayTeam'] == away))) ]
    last = prevTeamResults[-2 : prevTeamResults.shape[0]]        #previous two results between the two teams
    ##check which team is home and use homegoals and away goals to identify result
    homeTeam1 = last[0:1]['HomeTeam']
    homeTeam1 = " ".join(pd.Series.to_string(homeTeam1).split()[1:])
    homeTeam2 = last[1:2]['HomeTeam']
    homeTeam2 = " ".join(pd.Series.to_string(homeTeam2).split()[1:]) 
    if homeTeam1 == home:
        gd1 = int(last[0:1]['HomeGoals']) - int(last[0:1]['AwayGoals'])
    else:
        gd1 = int(last[0:1]['AwayGoals']) - int(last[0:1]['HomeGoals'])
    if homeTeam2 == home:
        gd2 = int(last[1:2]['HomeGoals']) - int(last[1:2]['AwayGoals'])
    else:
        gd2 = int(last[1:2]['AwayGoals']) - int(last[1:2]['HomeGoals'])
    #change to values to conform to the range [-4, 4]
    if gd1 > 4:
        gd1 = 4
    if gd2 > 4:
        gd2 = 4
    if gd1 < -4:
        gd1 = -4
    if gd2 < -4:
        gd2 = -4
    #put results in arr
    arr.append(gd1)
    arr.append(gd2)
    return arr


In [5]:
def sampler(data):
    """
    Create the dataset, a dataframe that is comprised of z1-z4,
     or more specifically last5 of hometeam, last5 of away team, and the two prevMeetings.
    """
    samples = []
    for index in range(0, data.shape[0]):       
        #print(index+188)
        home = data[index:index+1]['HomeTeam']                 #find home team and away team names at index
        away = data[index:index+1]['AwayTeam']
        home = " ".join(pd.Series.to_string(home).split()[1:])    #remove index from value and change to string
        away = " ".join(pd.Series.to_string(away).split()[1:])
        try:
            z1arr = last5(home, index, data)
        except:
            z1arr = [-1,-1,-1,-1,-1]               #if the team has no past results, reflect a slightly
                                                    #negative expectation based on no experience in the league
        try:
            z2arr = last5(away, index, data)
        except:
            z2arr = [-1,-1,-1,-1,-1]
        try:
            prev = prevMeetings(index, data)
            z3 = prev[0]
            z4 = prev[1]
        except:
            #If there are no results between the two teams, record z3 and z4 as draws
            z3 = 0
            z4 = 0
        z1 = sum(z1arr)
        z2 = sum(z2arr)
        sample = [z1, z2, z3, z4]
        samples.append(sample)
    df = pd.DataFrame(samples, columns=['z1', 'z2', 'z3', 'z4'])
    return df


In [6]:
def results(data):
    """
    Finds the difference between home goals and away goals for the given dataframe
    """
    goalDifference = []
    for row in range(0, data.shape[0]):
        gd = int(data[row: row+1]['HomeGoals']) - int(data[row: row+1]['AwayGoals'])
        goalDifference.append(gd)
    return goalDifference

In [7]:
data = retrieveEPL()                    #retrieve data
X = sampler(data)                       #turn data into dataframe with columns [z1, z2, z3, z4]
X = X[250:]                             #don't include the first 250 values, which prevMeetings won't find results for
y = results(data[250:])                 #find actual results of matches in data


In [12]:
model = linear_model.LinearRegression()     #instantiate model
model.fit(X,y)                              #fit the model to the data
predictions = model.predict(X)              #use the fitted model to make predictions based on sample data
mean_squared_error(y, predictions)          #find the mean squared error
#model.coef_                                #find the coefficients for each variable


2.8679233043894681

In [13]:
#create the same model as above, but without z3 and z4, to show how little of a difference they make
df1 = X[['z1','z2']]                          
model2 = linear_model.LinearRegression()      #instantiate model
model2.fit(df1,y)                             #fit the model to the data
predictions2 = model2.predict(df1)            #use the fitted model to make predictions based on sample data
mean_squared_error(y, predictions2)           #find the mean squared error
#model2.coef_                                 #find the coefficients for each variable

2.8680144931333884

In [10]:
X_scaled = preprocessing.scale(X)              #scale the data in order to make each variable start at comparable weights
mlp = MLPRegressor(hidden_layer_sizes=50, random_state=1)      #instantiate the MLPRegressor with 50 layers
mlp.fit(X_scaled, y)                                           #fit the data to the regressor
predictions3 = mlp.predict(X_scaled)                           #make predictions based on the scaled data
mean_squared_error(y, predictions3)                            #find the mean squared error

2.8435549982430457

In [14]:
def evaluate(p):
    """
    Prints metrics to evaluate the performance of p against actual results y
    """
    print("\n")
    n = 0
    m = 0
    draws = 0
    hw = 0 
    hl = 0
    for i in range(0, len(p)):
        if abs( p[i] - y[i]) < 0.5:
            n+=1
        if abs( p[i] - y[i]) < 1:
            m+=1
        if p[i]<.5 and p[i] > -.5 and y[i]==0:
            draws +=1
        if p[i]>.5 and y[i]>0:
            hw +=1
        if p[i] < -.5 and y[i] < 0:
            hl +=1
    print("within .5: " + str( n / len(p)))
    print("within 1: " + str( m / len(p)))
    print("draws " + str( draws / len(p)))
    print("home wins: " + str( hw / len(p)))
    print("home losses: " + str( hl / len(p)))
    print("percent total correct predictions: " + str((draws + hw + hl) / len(p)))

In [15]:
evaluate(predictions)
evaluate(predictions2)
evaluate(predictions3)



within .5: 0.26391453442363494
within 1: 0.49129517277763124
draws 0.14468477974149302
home wins: 0.2542864679504089
home losses: 0.019651806911105248
percent total correct predictions: 0.41862305460300714


within .5: 0.264178317066737
within 1: 0.4908994988129781
draws 0.14442099709839093
home wins: 0.2550778158797151
home losses: 0.019651806911105248
percent total correct predictions: 0.41915061988921126


within .5: 0.2625956212081245
within 1: 0.48892112898971246
draws 0.1496966499604326
home wins: 0.24109733579530468
home losses: 0.022025850699024005
percent total correct predictions: 0.4128198364547613
