In [12]:
import requests
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Common leagues and their football-data.org competition codes (assign one to `league` below):
* Barclays Premier League: PL
* La Liga Santander: PD
* Bundesliga: BL1
* Liga NOS: PPL
* Ligue 1: FL1
* Serie A: SA

In [3]:
# Download every match of the chosen league for each season and let the user
# pick the team to analyse.
#
# Produces:
#   match_master_list - one list of match dicts per season, in request order
#   nr_games          - number of matches in the last season downloaded
#   teams_list        - sorted unique home-team names of that last season
#   team              - the team name typed by the user
match_master_list = []
league = 'PL'  # insert here the chosen abbreviation
headers = {'X-Auth-Token': 'INSERT YOUR TOKEN'}  # the same for every request

for year in range(2017, 2020):
    # https rather than http so the auth token is not sent in clear text.
    uri = 'https://api.football-data.org/v2/competitions/' + league + '/matches?season=' + str(year)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(uri, headers=headers, timeout=30)
    # Stop here on an HTTP error (e.g. a rejected token) instead of failing
    # later with an opaque KeyError on 'matches'.
    response.raise_for_status()
    match_list = response.json()['matches']
    match_master_list.append(match_list)

# After the loop `match_list` is the last season requested; the team menu is
# built from it, so changing the range above needs no further edits here.
nr_games = len(match_list)
teams_list = np.unique([match['homeTeam']['name'] for match in match_list])
print("List of teams: " ,teams_list)
team = input("Team name:").strip()

# A misspelt name would silently yield an empty dataset in the cells below.
if team not in teams_list.tolist():
    raise ValueError("Unknown team " + repr(team) + "; choose one of the names listed above.")

List of teams:  ['AFC Bournemouth' 'Arsenal FC' 'Aston Villa FC'
 'Brighton & Hove Albion FC' 'Burnley FC' 'Chelsea FC' 'Crystal Palace FC'
 'Everton FC' 'Leicester City FC' 'Liverpool FC' 'Manchester City FC'
 'Manchester United FC' 'Newcastle United FC' 'Norwich City FC'
 'Sheffield United FC' 'Southampton FC' 'Tottenham Hotspur FC'
 'Watford FC' 'West Ham United FC' 'Wolverhampton Wanderers FC']
Team name:Arsenal FC


In [4]:
# Build parallel per-match lists for the selected team across all seasons.
#
# Layout guaranteed at the end of this cell:
#   rows [0, played)  - FINISHED matches, in download order
#   rows [played, ..) - matches not finished yet, with placeholder 0 values
# Cells that slice with `played` rely on this layout.
team_matches=[]      # "Home vs Away" fixture strings
next_matches=[]      # fixture strings of the not-yet-finished matches
next_home_away=[]    # home(1)/away(0) flag of the not-yet-finished matches
score_list=[]        # "h - a" full-time score, 0 for unfinished matches
goals_list=[]        # total goals in the match, 0 for unfinished matches
home_away_list=[]    # 1 when `team` plays at home, else 0
winner=[]            # 1 when `team` won, else 0 (draws and losses are both 0)
scheduled=1

# Iterate over each season's own list: seasons need not all have the same
# number of matches, so a single shared length must not be used to index them.
for season_matches in match_master_list:
    for match in season_matches:
        home_name = match['homeTeam']['name']
        away_name = match['awayTeam']['name']

        if team != home_name and team != away_name:
            continue

        game = home_name + " vs " + away_name
        is_home = 1 if team == home_name else 0

        if match['status'] != 'FINISHED':
            # Held back and appended after the loop, so unfinished matches
            # never end up inside the first `played` rows.
            next_matches.append(game)
            next_home_away.append(is_home)
            continue

        full_time = match['score']['fullTime']
        winning_side = 'HOME_TEAM' if is_home else 'AWAY_TEAM'

        team_matches.append(game)
        home_away_list.append(is_home)
        winner.append(1 if match['score']['winner'] == winning_side else 0)
        score_list.append(str(full_time['homeTeam']) + " - " + str(full_time['awayTeam']))
        goals_list.append(int(full_time['homeTeam']) + int(full_time['awayTeam']))

# Number of finished matches; always defined, even when there are none.
played = len(team_matches)

# Unfinished matches go last, with 0 placeholders for the unknown outcome.
for game, is_home in zip(next_matches, next_home_away):
    team_matches.append(game)
    home_away_list.append(is_home)
    score_list.append(0)
    goals_list.append(0)
    winner.append(0)

In [5]:
# Assemble one row per match from the parallel lists built above.
columns = ['Match', 'Home/Away', 'Goals', 'Won']
records = zip(team_matches, home_away_list, goals_list, winner)
df = pd.DataFrame(list(records), columns=columns)

# Replace each fixture string with an integer code; `uniques` keeps the
# original strings so a code can be mapped back to its fixture.
labels, uniques = pd.factorize(df['Match'])
df['Match'] = labels

In [26]:
# Target selection: 'Won' predicts whether the team wins, 'Goals' predicts
# whether the match has more than `ngoals` goals.
mode='Goals'
ngoals=2.5  # goals threshold used when mode == 'Goals'

feature_cols = ['Match', 'Home/Away']
features = df[feature_cols]

# The first `played` rows are used for training/testing, the rest are the
# matches to predict. np.matrix keeps `next_g[0]` two-dimensional.
X = np.matrix(features[:played])
next_g = np.matrix(features[played:])
y = np.array(df[mode][:played])

# First 75% of the rows for training, the remainder for testing.
div = int(len(X) * 0.75)

if mode == 'Goals':
    # Binarise the goal count: 1 when above the threshold, else 0.
    y = (y > ngoals).astype(y.dtype)

X_train, X_test = X[:div], X[div:]
y_train, y_test = y[:div], y[div:]

In [27]:
# Train a gradient-boosted binary classifier and report its accuracy on the
# held-out rows. `xgb` comes from the import cell at the top of the notebook.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# Only the first row of `next_g` is scored later.
next_g_matrix = xgb.DMatrix(next_g[0])

param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.01,  # the training step for each iteration
    'verbosity': 0,  # quiet logging; replaces the deprecated 'silent' flag
    'objective': 'binary:logistic',  # binary classification, outputs probabilities
    'eval_metric': 'logloss'
}

num_round = 5000  # the number of training iterations

bst = xgb.train(param, dtrain, num_round)

# Predicted probabilities of the positive class for the test rows.
preds = bst.predict(dtest)
print(preds)

# Turn probabilities into hard 0/1 labels at the 0.5 cut-off, keeping the
# float dtype the array already has.
preds = (preds >= 0.5).astype(preds.dtype)

print ("Accuracy:", accuracy_score(y_test, preds))

[0.24165754 0.8478247  0.07041942 0.9549422  0.9020888  0.941284
 0.73667264 0.9702425  0.13963985 0.15818782 0.9702425  0.9689304
 0.7970919  0.5229887  0.9020888  0.7301381  0.9702425  0.9022018
 0.73667264 0.941284   0.7301381  0.9702425 ]
Accuracy: 0.5454545454545454


In [28]:
# Probability (in percent) that the model assigns to the positive class for
# the match stored in `next_g_matrix`.
next_prob_pct = bst.predict(next_g_matrix)[0]*100

if mode != 'Goals':
    print(team, 'has a probability of: ', next_prob_pct, '% to win the next match.')
else:
    print('The next match has a probability of: ', next_prob_pct, '% to have more than', ngoals, 'goals.')

The next match has a probability of:  63.249671459198 % to have more than 2.5 goals.
