In [0]:
import pandas as pd
import numpy as np
import sys
import math
import csv
import urllib
import collections
import sklearn
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn import linear_model
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report

In [0]:
#Import Data Files
EPL_data = pd.read_csv('Training_Data.csv')
team_names = pd.read_csv('Team_Names.csv')
team_names.tail()
teamList = team_names['Team_Name'].tolist()
test_data = pd.read_csv('Test_Results.csv')


In [0]:
def getAnnualTeamData(teamName, year):
    
    annual_data = EPL_data[EPL_data['Year'] == year]
    
    # num goals scored in wins and losses
    gamesHome = annual_data[annual_data['HomeTeam'] == teamName] 
    totalGoalsScored = gamesHome['FTHG'].sum()
    gamesAway = annual_data[annual_data['AwayTeam'] == teamName]
    totalGames = gamesHome.append(gamesAway)
    numGames = len(totalGames.index)
    #total goals scored
    totalGoalsScored += gamesAway['FTAG'].sum()
    # total goals allowed
    totalGoalsAllowed = gamesHome['FTAG'].sum()
    totalGoalsAllowed += gamesAway['FTHG'].sum()
    
    #discipline: total red cards, total yellow cards
    totalYellowCards = gamesHome['HY'].sum()
    totalYellowCards += gamesAway['AY'].sum()
    totalRedCards = gamesHome['HR'].sum()
    totalRedCards += gamesAway['AR'].sum()
    
    #total fouls
    totalFouls = gamesHome['HF'].sum()
    totalFouls += gamesAway['AF'].sum()
    
    #total Corners
    totalCorners = gamesHome['HC'].sum()
    totalCorners += gamesAway['AC'].sum()

    #shots per game (spg)
    totalShots = gamesHome['HS'].sum()
    # avg shots per game
    totalShots += gamesAway['AS'].sum()
    if numGames != 0:
        spg = totalShots / numGames
    # avg shots allowed per game
    totalShotsAgainst = gamesHome['AS'].sum()
    totalShotsAgainst += gamesAway['HS'].sum()
    if numGames != 0:
        sag = totalShotsAgainst / numGames
    
    #Games Won Percentage
    gamesWon = annual_data[annual_data['Winner'] == teamName] 
    gamesLost = annual_data[annual_data['Loser'] == teamName] 
    numGamesWon = len(gamesWon.index)
    numGamesLost = len(gamesLost.index)
    if numGames != 0:
        gamesWonPercentage = numGamesWon / numGames
        #Goalie Saves
    totalShotsOnGoal = gamesHome['HST'].sum()
    totalShotsOnGoal += gamesAway['AST'].sum()
    goalieSaves = totalShotsOnGoal - totalGoalsAllowed    
        #Saves Percentage   
    if totalShotsOnGoal != 0:
        savesPercentage = goalieSaves / totalShotsOnGoal        
        #Saves Ratio  
    if goalieSaves != 0:
        savesRatio = totalShotsOnGoal / goalieSaves
        #Scoring Percentage
    if totalShots != 0:
        scoringPercentage = (totalShots - totalGoalsScored) / totalShots
       #Scoring Ratio
    if totalGoalsScored != 0:
        scoringRatio = totalShotsOnGoal / totalGoalsScored       
        
            
    if numGames == 0: 
        gamesWon = 0
        gamesLost = 0
        totalGoalsScored = 0
        totalGoalsAllowed = 0
        totalYellowCards = 0
        totalRedCards = 0
        totalFouls = 0
        totalCorners = 0
        spg = 0
        sag = 0
        gamesWonPercentage = 0
        goalieSaves = 0
        savesPercentage = 0
        savesRatio = 0
        scoringPercentage = 0
        scoringRatio = 0 
        
    return [totalGoalsScored, totalGoalsAllowed, totalYellowCards, totalRedCards,
        totalFouls,totalCorners, spg, sag, gamesWonPercentage, goalieSaves, savesPercentage, savesRatio,
        scoringPercentage, scoringRatio]

In [0]:
def createAnnualDict(year):
    annualDictionary = collections.defaultdict(list)
    for team in teamList:
        team_vector = getAnnualTeamData(team, year)
        annualDictionary[team] = team_vector
    return annualDictionary

In [0]:
def getTrainingData(years):
    totalNumGames = 0
    for year in years:
        annual = EPL_data[EPL_data['Year'] == year]
        totalNumGames += len(annual.index)
    numFeatures = len(getAnnualTeamData('Arsenal',2015))
    xTrain = np.zeros(( totalNumGames, numFeatures))
    yTrain = np.zeros(( totalNumGames ))
    indexCounter = 0
    for year in years:
        team_vectors = createAnnualDict(year)
        annual = EPL_data[EPL_data['Year'] == year]
        numGamesInYear = len(annual.index)
        xTrainAnnual = np.zeros(( numGamesInYear, numFeatures))
        yTrainAnnual = np.zeros(( numGamesInYear ))
        counter = 0
        for index, row in annual.iterrows():
            h_team = row['HomeTeam']
            h_vector = team_vectors[h_team]
            a_team = row['AwayTeam']
            a_vector = team_vectors[a_team]
            diff = [a - b for a, b in zip(h_vector, a_vector)]
            if (counter % 2 == 0):
                if len(diff) != 0:
                    xTrainAnnual[counter] = diff
                yTrainAnnual[counter] = 1
            else:
                if len(diff) != 0:
                    xTrainAnnual[counter] = [ -p for p in diff]
                yTrainAnnual[counter] = 0
            counter += 1
        xTrain[indexCounter:numGamesInYear+indexCounter] = xTrainAnnual
        yTrain[indexCounter:numGamesInYear+indexCounter] = yTrainAnnual
        indexCounter += numGamesInYear
    return xTrain, yTrain

In [0]:
# Create data library from 2012-2016
years = range(2012,2016)
xTrain, yTrain = getTrainingData(years)
np.save('xTrain', xTrain)
np.save('yTrain', yTrain)

In [0]:
from sklearn import linear_model
lm = linear_model.LinearRegression()
from sklearn.model_selection import train_test_split
xTrain, X_test, yTrain, y_test = train_test_split(xTrain, yTrain)
print(xTrain.shape, yTrain.shape)
print(X_test.shape, y_test.shape)
model2 = lm.fit(xTrain, yTrain)
predictions = lm.predict(X_test)
print(sum(predictions)/len(predictions))


(535, 14) (535,)
(179, 14) (179,)
0.5075471683289128


In [0]:
def createGamePrediction(team1_vector, team2_vector, xTrain, yTrain):
    xTrain, X_test, yTrain, Y_test = train_test_split(xTrain, yTrain)
    xTrain.shape, yTrain.shape
    X_test.shape, y_test.shape
    lm = linear_model.LinearRegression()
    model2 = lm.fit(xTrain, yTrain)
    diff = [a - b for a, b in zip(team1_vector, team2_vector)]
    diff = np.asarray(diff)
    prediction = lm.predict(diff.reshape(1,-1))
    return prediction

In [0]:
def formulatePredictions():
    probs = [[0 for x in range(2)] for x in range(len(test_data.index))]
    for index, row in test_data.iterrows():
        game_ID = row['Game_ID']
        year = row['Year'] - 1
        team1_Name = row['HomeTeam']
        team2_Name = row['AwayTeam']
        team1_vector = getAnnualTeamData(team1_Name, year)
        team2_vector = getAnnualTeamData(team2_Name, year)
        prediction = createGamePrediction(team1_vector, team2_vector,xTrain, yTrain)
        probs[index][0] = game_ID
        probs[index][1] = prediction
    probs = pd.np.array(probs)
    return probs

In [0]:
print(test_data)

   Game_ID  Year       Date     HomeTeam        AwayTeam
0        1  2017  11-Feb-17    Liverpool       Tottenham
1        2  2017  11-Feb-17   Man United         Watford
2        3  2017  11-Feb-17        Stoke  Crystal Palace
3        4  2017  11-Feb-17     West Ham       West Brom
4        5  2017  12-Feb-17      Burnley         Chelsea
5        6  2017  12-Feb-17      Swansea       Leicester
6        7  2017  13-Feb-17  Bournemouth        Man City


In [0]:
# Running prediction model on test data
formulatePredictions()

array([[1, array([0.41176738])],
       [2, array([0.5340199])],
       [3, array([0.48241013])],
       [4, array([0.46993713])],
       [5, array([0.82550101])],
       [6, array([0.53942104])],
       [7, array([0.62162685])]], dtype=object)