In [47]:
import pandas as pd
import numpy as np
import csv
import collections
from numpy.linalg import inv
from sklearn.cross_validation import train_test_split

#Fetching the Data from a csv file
EPL_data = pd.read_csv(r'C:\Users\jayap\Desktop\Machine Learning\Individual Project\Data and Ref Paper\Raw_data.csv')
EPL_data['Date'] = pd.to_datetime(EPL_data['Date'])
EPL_data['Year'] = EPL_data['Date'].dt.year
teams = set(EPL_data['HomeTeam'])
EPL_data.head()

Unnamed: 0,ID,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AF,HC,AC,HY,AY,HR,AR,Winner,Loser,Year
0,1,2005-08-13,Everton,Man United,0,2,A,0,1,A,...,14,8,6,3,1,0,0,Man United,Everton,2005
1,2,2005-08-13,Man City,West Brom,0,0,D,0,0,D,...,11,3,6,2,3,0,0,,,2005
2,3,2005-08-14,Arsenal,Newcastle,2,0,H,0,0,D,...,17,8,3,0,1,0,1,Arsenal,Newcastle,2005
3,4,2005-08-20,Newcastle,West Ham,0,0,D,0,0,D,...,11,10,2,1,1,0,1,,,2005
4,5,2005-08-21,Chelsea,Arsenal,1,0,H,0,0,D,...,21,3,7,2,3,0,0,Chelsea,Arsenal,2005


In [48]:
#Funtion to fetch the team stats based on where the team is playing
#
# E.g. Arsenal vs Liverpool, and Arsenal is playing at home
# We will fetch Home Stats for Arsenal and Away Stats for Liverpool
#
#The Stats being considered are Win_percentage, Scoring_Prob, Save_Prob
# Win_percentage is calculated based on Home or Away performance
# Scoring_prob gives team's Attack stats based on Home or Away performance
# Save_Prob gives the teams's Defense Stats based on Home or Away Performance

def get_HA_vector_data(teamName,year,place):
    
    stats = []
    #Team Stats when played at Home
    data = EPL_data[EPL_data['Year'] == year]
    if place == 'Home':
        games_played_home = data[data['HomeTeam'] == teamName]
        numGames = len(games_played_home.index)
        if numGames != 0:
            games_won_home = games_played_home[games_played_home['Winner'] == teamName]
            win_games = len(games_won_home.index)
            home_win_percentage = win_games/numGames

            #Attack Stats when played at home
            total_shots = games_played_home['HS'].sum()
            goals_scored = games_played_home['FTHG'].sum()
            scoring_prob = (total_shots - goals_scored)/total_shots

            #Defense Stats when played at Home
            total_shots_against = games_played_home['AS'].sum()
            goals_allowed = games_played_home['FTAG'].sum()
            save_prob = (total_shots_against - goals_allowed)/total_shots_against

            stats = [home_win_percentage,scoring_prob,save_prob]
#             stats = [scoring_prob,save_prob]
        else:
            stats = [0,0,0]
#             stats = [0,0]
        
    else:
        #Team Stats when played away
        games_played_away = EPL_data[EPL_data['AwayTeam'] == teamName]
        numGames = len(games_played_away.index)
        if numGames != 0:
            games_won_away = games_played_away[games_played_away['Winner'] == teamName]
            win_games = len(games_won_away.index)
            away_win_percentage = win_games/numGames

            #Attack Stats when played at away
            total_shots = games_played_away['AS'].sum()
            goals_scored = games_played_away['FTAG'].sum()
            scoring_prob = (total_shots - goals_scored)/total_shots

            #Defense Stats when played at away
            total_shots_against = games_played_away['HS'].sum()
            goals_allowed = games_played_away['FTHG'].sum()
            save_prob = (total_shots_against - goals_allowed)/total_shots_against

            stats = [away_win_percentage,scoring_prob,save_prob]
#             stats = [scoring_prob,save_prob]
        else:
            stats = [0,0,0]
#             stats = [0,0]
        
    return stats

In [49]:
#Function to fetch the training data for the mentioned range of years

def get_training_data(years):
    totalNumGames = 0
    for year in years:
        annual = EPL_data[EPL_data['Year'] == year]
        totalNumGames += len(annual.index)
    numFeatures = 7 
    xTrain = []
    yTrain = []
    
    for year in years:
        annual = EPL_data[EPL_data['Year'] == year]
        for index, row in annual.iterrows():
            game_vector = []
            h_team = row['HomeTeam']
            h_vector = get_HA_vector_data(h_team,year,'Home')
            a_team = row['AwayTeam']
            a_vector = get_HA_vector_data(a_team,year,'Away')
            if row['Winner'] == h_team:
                yTrain.append([1])
            else:
                yTrain.append([0])
                
            x0 = [1]
            game_vector = x0 + h_vector + a_vector
            xTrain.append(game_vector)
            
    return xTrain,yTrain

In [50]:
#Range of years of data to be fetched for training and testing
years = range(2009,2017)

xTrain, yTrain = get_training_data(years)

#Cinverting to Matrix form
xTrain = np.matrix(xTrain)
yTrain = np.matrix(yTrain)

#Splitting the data for Training and Testing
xTrain, xTest, yTrain, yTest = train_test_split(xTrain,yTrain, test_size = 0.25, random_state = 42)

In [51]:
# The model is trained using Logistic Regression with the optimization Algorithm as Newton Raphson
# The optimization model is used to calculate the Regression coefficients 

#Sigmoid Function which gives output between (0,1). If value > 0.5 it is considered as Home Win
def newton_raph(z):
        sigm = 1. / (1. + np.exp(-z))
        return sigm

# Initial Regression Coefficients
b = [[1.00],[0.0],[0.0],[0.0],[0.0],[0.0],[0.0]]
# b = [[1.00],[0.01],[0.01],[0.01],[0.01]]
b = np.matrix(b)

# Training the model to find the appropriate Regression Coefficients
for i,x in enumerate(xTrain):
    try:
        z = np.matmul(xTrain[i],b)
        p = newton_raph(z)
        pi = p*(1 - p)
        w = np.identity(1)
        w = pi*w
        j = np.matmul(np.transpose(xTrain[i]),w)
        k = np.matmul(j,xTrain[i])
        inv =  np.linalg.inv(k)
        l = yTrain[i] - p
        m = np.matmul(k,np.transpose(xTrain[i]))
        n = np.matmul(m,l)
        b_new = b + n
        b = b_new
    except:
        continue
print("Training the model \n")
print("The regression coefficients using Newton Raphson Algorithm are:")
print(b)

Training the model 

The regression coefficients using Newton Raphson Algorithm are:
[[-0.10215805]
 [11.43039189]
 [-1.96156366]
 [-0.24391118]
 [-4.894875  ]
 [-0.42067552]
 [-1.03250323]]


In [52]:
# Testing the Model to find the accuracy 

right = wrong = 0
for i,x in enumerate(xTest):
    z = np.matmul(xTest[i],b)
    p = newton_raph(z)
    y_new = 0
    
    if p > 0.5:
        y_new = 1
    else:
        y_new = 0
    
    if y_new == yTest[i]:
        right += 1
    else:
        wrong += 1

print("The model is trained to an accuracy of: " + str((right/(right + wrong))*100))   

The model is trained to an accuracy of: 68.5459940652819


In [57]:
years = range(2017,2018)

xTrain, yTrain = get_training_data(years)
xTrain = np.matrix(xTrain)
yTrain = np.matrix(yTrain)

right = wrong = 0
for i,x in enumerate(xTrain):
    z = np.matmul(xTrain[i],b)
    p = newton_raph(z)
    y_new = 0
    if p > 0.5:
        y_new = 1
    else:
        y_new = 0
    
    if y_new == yTrain[i]:
        right += 1
    else:
        wrong += 1


print("The model is performing on New Data with an accuracy of: " + str((right/(right + wrong))*100))

The model is performing on New Data with an accuracy of: 71.01449275362319


In [58]:
# Functon to Predict Results of a match using the Regression Coefficients obtained from the above traininf data
def predict_result(home_team,away_team,year,coeff):
    h_vector = get_HA_vector_data(home_team,2017,'Home')
    a_vector = get_HA_vector_data(away_team,2017,'Away')
    x0 = [1]
    game_vector = x0 + h_vector + a_vector
    game_vector = np.matrix(game_vector)
    z = np.matmul(game_vector,coeff)
    prediction = newton_raph(z)
    return (prediction.item((0,0))*100)
    

In [60]:
result =  predict_result('Arsenal','West Ham',2017,b)
print("The prob of Arsenal winning the match when playing at Home: " + str(result) + "\n")

result =  predict_result('Liverpool','Arsenal',2017,b)
print("The prob of Arsenal winning the match when playing Away: " + str(100 - result))

The prob of Arsenal winning the match when playing at Home: 98.68720266106439

The prob of Arsenal winning the match when playing Away: 50.096412356137805
