Soccer wizard

In [1]:
import requests
import pandas as pd

from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# create functions
# get results
def request_data(league='', fixtures_teams=''):
    request = requests.get("https://api-football-v1.p.rapidapi.com/" + fixtures_teams + "/league/" + str(league) ,
           headers={
            "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com",
            "X-RapidAPI-Key": key
           })
    data = pd.read_json(request.content).loc[fixtures_teams][0]
    return data

# get points
def points(home, away):
    if home > away:
        return 3
    elif home < away:
        return 0
    else: return 1

# get rating
def get_rating(rank):
    if rank <= 3:
        return 'A'
    elif rank <= 6:
        return 'B'
    elif rank <= 15:
        return 'C'
    else:
        return 'D'

# get rating values
def get_rating_value(rating):
    rating_list = []
    for i in rating:
        if i == 'A':
            rating_list.append(4)
        elif i == 'B':
            rating_list.append(3)
        elif i == 'C':
            rating_list.append(2)
        else:
            rating_list.append(1)
    return rating_list

In [4]:
# get API key
key = open('rapid_api_key.txt', 'r').read()

# load data
fixtures = request_data(league=54, fixtures_teams='fixtures')
teams_info = request_data(league=54, fixtures_teams='teams')

In [5]:
#leagues['54']
def request_leagues():
    request = requests.get("https://api-football-v1.p.rapidapi.com/leagues/",
           headers={
            "X-RapidAPI-Host": "api-football-v1.p.rapidapi.com",
            "X-RapidAPI-Key": key
           })
    data = pd.read_json(request.content).loc['leagues',][0]
    return data
leagues = request_leagues()
[leagues[x] for x in leagues if leagues[x]['league_id'] == '54']

[{'league_id': '54',
  'name': 'Bundesliga 1',
  'country': 'Germany',
  'country_code': 'DE',
  'season': '2016',
  'season_start': '2016-08-26',
  'season_end': '2017-05-20',
  'logo': 'https://www.api-football.com/public/leagues/54.png',
  'flag': 'https://www.api-football.com/public/flags/de.svg',
  'standings': True}]

In [7]:
# get teams
teams = pd.DataFrame({'id': [], 'team': []})
for num in teams_info:
    teams = teams.append(pd.DataFrame({'id':[teams_info[str(num)]['team_id']], 'team': [teams_info[str(num)]['name']]}))
teams = teams.reset_index(drop=True)
teams.head()

Unnamed: 0,id,team
0,167,1899 Hoffenheim
1,170,FC Augsburg
2,157,Bayern Munich
3,160,SC Freiburg
4,165,Borussia Dortmund


In [8]:
# get results

results = pd.DataFrame()
for f in fixtures:
    results = results.append(pd.DataFrame({'round': [fixtures[f]['round']],
                                           'home': [fixtures[f]['homeTeam_id']],
                                           'away': [fixtures[f]['awayTeam_id']],
                                           'goals_home': [fixtures[f]['goalsHomeTeam']],
                                           'goals_away': [fixtures[f]['goalsAwayTeam']]}))
results = results.reset_index(drop=True)

# get points
results['points_home'] = results.apply(lambda x: points(x['goals_home'], x['goals_away']), axis=1)
results['points_away'] = results.apply(lambda x: points(x['goals_away'], x['goals_home']), axis=1)
results.head()

Unnamed: 0,round,home,away,goals_home,goals_away,points_home,points_away
0,Bundesliga - 1,157,162,6,0,3,0
1,Bundesliga - 1,165,164,2,1,3,0
2,Bundesliga - 1,169,174,1,0,3,0
3,Bundesliga - 1,170,161,0,2,0,3
4,Bundesliga - 1,192,181,2,0,3,0


In [9]:
rounds = results['round'].unique()
table = {r:{t:{'points':[], 'goals_shot':[], 'goals_received':[]} for t in teams['id']} for r in rounds}

In [10]:
# setup table
last_round = ''
for r in rounds:
    for t in teams['id']:
        try:
            points = int(results[(results['round'] == r) & (results['home'] == str(t))]['points_home'])
            goals_shot = int(results[(results['round'] == r) & (results['home'] == str(t))]['goals_home'])
            goals_received = int(results[(results['round'] == r) & (results['home'] == str(t))]['goals_away'])
        except:
            points = int(results[(results['round'] == r) & (results['away'] == str(t))]['points_away'])
            goals_shot = int(results[(results['round'] == r) & (results['away'] == str(t))]['goals_away'])
            goals_received = int(results[(results['round'] == r) & (results['away'] == str(t))]['goals_home'])
        if last_round == '':
            table[r][str(t)]['points'] = points
            table[r][str(t)]['points_cum'] = points
            table[r][str(t)]['goals_shot'] = goals_shot
            table[r][str(t)]['goals_received'] = goals_received
        else:
            table[r][str(t)]['points'] = points
            table[r][str(t)]['points_cum'] = table[last_round][str(t)]['points_cum'] + points
            table[r][str(t)]['goals_shot'] = table[last_round][str(t)]['goals_shot'] + goals_shot
            table[r][str(t)]['goals_received'] = table[last_round][str(t)]['goals_received'] + goals_received
        table[r][str(t)]['goal_difference'] = table[r][str(t)]['goals_shot']-table[r][str(t)]['goals_received']
    last_round = r
#table

In [28]:
# get rank & rating
for r in rounds:
    gameday = pd.DataFrame()
    for t in teams['id']:
        gameday = gameday.append(pd.DataFrame({'round': [r],
                                               'team': [t],
                                               'points_cum': table[r][str(t)]['points_cum'],
                                               'goal_difference': table[r][str(t)]['goal_difference']}))
    gameday = gameday.sort_values(['points_cum', 'goal_difference'], ascending = False)
    gameday['rank'] = [i for i in range(1,len(gameday)+1)]
    for t in teams['id']:
        table[r][str(t)]['rank'] = int(gameday[gameday['team'] == t]['rank'])
        table[r][str(t)]['rating'] = get_rating(int(table[r][str(t)]['rank']))

In [29]:
# get form
for t in teams['id']:
    form = []
    form_rating = []
    for r in rounds:
        if len(form) > 4:
            form.pop(0)
            form_rating.pop(0)
        form.append(table[r][t]['points'])
        form_rating.append(table[r][t]['rating'])
        table[r][t]['form'] = form.copy()
        table[r][t]['form_rating'] = form_rating.copy()

In [31]:
data = pd.DataFrame()
for i in range(len(results)):
    team_1 = results.loc[i]['home']
    team_2 = results.loc[i]['away']
    gameday = results.loc[i]['round']
    if gameday == 'Bundesliga - 1':
        last_gameday = gameday
        next
    form_ratings_1 = get_rating_value(table[last_gameday][team_1]['form_rating'])
    form_ratings_2 = get_rating_value(table[last_gameday][team_2]['form_rating'])
    form_weighted_1 = sum([table[last_gameday][team_1]['form'][i]*form_ratings_1[i] for i in range(len(form_ratings_1))])
    form_weighted_2 = sum([table[last_gameday][team_2]['form'][i]*form_ratings_2[i] for i in range(len(form_ratings_2))])
    if form_weighted_1 > 0:
        form_weighted_1 = form_weighted_1/len(form_ratings_1)
    if form_weighted_2 > 0:
        form_weighted_2 = form_weighted_1/len(form_ratings_2)
    data = data.append(pd.DataFrame({
                                    'round': gameday,
                                    'team_1': team_1,
                                    'goal_difference_1': table[last_gameday][team_1]['goal_difference'],
                                    'rating_1': table[last_gameday][team_1]['rating'],
                                    'form_1': sum(table[last_gameday][team_1]['form'])/len(table[last_gameday][team_1]['form']),
                                    'form_weighted_1': form_weighted_1,                            
                                    'team_2': team_2,
                                    'goal_difference_2': table[last_gameday][team_2]['goal_difference'],
                                    'rating_2': table[last_gameday][team_2]['rating'],
                                    'form_2': sum(table[last_gameday][team_2]['form'])/len(table[last_gameday][team_2]['form']),
                                    'form_weighted_2' : form_weighted_2,
                                    'home_away': [1],
                                    'target': [results.loc[i]['points_home']]
                                    }))
    data = data.append(pd.DataFrame({
                                'round': gameday,
                                'team_1': team_2,
                                'goal_difference_1': table[last_gameday][team_2]['goal_difference'],
                                'rating_1': table[last_gameday][team_2]['rating'],
                                'form_1': sum(table[last_gameday][team_2]['form'])/len(table[last_gameday][team_2]['form']),
                                'form_weighted_1' : form_weighted_2,                        
                                'team_2': team_1,
                                'goal_difference_2': table[last_gameday][team_1]['goal_difference'],
                                'rating_2': table[last_gameday][team_1]['rating'],    
                                'form_2': sum(table[last_gameday][team_1]['form'])/len(table[last_gameday][team_1]['form']),
                                'form_weighted_2': form_weighted_1,
                                'home_away': [0],
                                'target': [results.loc[i]['points_away']]
                                }))
    last_gameday = gameday
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,round,team_1,goal_difference_1,rating_1,form_1,form_weighted_1,team_2,goal_difference_2,rating_2,form_2,form_weighted_2,home_away,target
0,Bundesliga - 1,157,6,A,3.0,12.0,162,-6,D,0.0,0.0,1,3
1,Bundesliga - 1,162,-6,D,0.0,0.0,157,6,A,3.0,12.0,0,0
2,Bundesliga - 1,165,1,B,3.0,9.0,164,-1,C,0.0,0.0,1,3
3,Bundesliga - 1,164,-1,C,0.0,0.0,165,1,B,3.0,9.0,0,0
4,Bundesliga - 1,169,1,B,3.0,9.0,174,-1,C,0.0,0.0,1,3


In [32]:
team_1 = pd.get_dummies(data['team_1'], prefix = 'team_1_', drop_first = True)
team_2 = pd.get_dummies(data['team_2'], prefix = 'team_2_', drop_first = True)
rating_1 = pd.get_dummies(data['rating_1'], prefix = 'team_1', drop_first = True)
rating_2 = pd.get_dummies(data['rating_2'], prefix = 'team_2', drop_first = True)

In [33]:
data_ready = (data.join(rating_1)
                  .join(rating_2)
                  .drop(['team_1', 'team_2', 'rating_1', 'rating_2', 'round'], axis=1))

In [34]:
X = data_ready.drop(['target'], axis= 1)
y = data_ready['target']

## get train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1 , test_size=0.2)
X.head()

Unnamed: 0,goal_difference_1,form_1,form_weighted_1,goal_difference_2,form_2,form_weighted_2,home_away,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D
0,6,3.0,12.0,-6,0.0,0.0,1,0,0,0,0,0,1
1,-6,0.0,0.0,6,3.0,12.0,0,0,0,1,0,0,0
2,1,3.0,9.0,-1,0.0,0.0,1,1,0,0,0,1,0
3,-1,0.0,0.0,1,3.0,9.0,0,0,1,0,1,0,0
4,1,3.0,9.0,-1,0.0,0.0,1,1,0,0,0,1,0


In [35]:
# logistic regression
mdl_ridge = LogisticRegression(penalty='l2', solver='liblinear', max_iter=100, multi_class = 'auto') # Ridge

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_ridge, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True)
search.fit(X_train, y_train)
mdl_ridge = search.best_estimator_
search.best_score_

0.64212678936605316

In [36]:
X_train.columns
X_train.shape

(489, 13)

In [37]:
# k-Neighbors
mdl_knn = KNeighborsClassifier()  

# get features
#knn_features = ['goal_difference_1', 'goal_difference_2', 'home_away', 'form_1', 'form_2',
#                'team_1_B', 'team_1_C', 'team_1_D', 'team_2_B', 'team_2_C', 'team_2_D']

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_neighbors': [i for i in range(1,100)]
}]

search = GridSearchCV(mdl_knn, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True)

search.fit(X_train, y_train)
mdl_knn = search.best_estimator_

print(mdl_knn)
search.best_score_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=94, p=2,
           weights='uniform')




0.53578732106339466

In [38]:
# random forest classification
mdl_rf = RandomForestClassifier(min_samples_leaf=5, random_state=1, n_jobs=-1)

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_estimators': [500, 1000],
    'max_depth': [i for i in range(5,15)],
    'max_features': ['sqrt', 'log2']
}]

search = GridSearchCV(mdl_rf, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True)

search.fit(X_train, y_train)
mdl_rf = search.best_estimator_

print(mdl_rf)
search.best_score_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)


0.62576687116564422

In [42]:
pd.DataFrame({'features': X_train.columns, 'importance': mdl_rf.feature_importances_}).sort_values('importance', ascending=False)

Unnamed: 0,features,importance
4,form_2,0.23661
1,form_1,0.220347
5,form_weighted_2,0.144681
2,form_weighted_1,0.141406
0,goal_difference_1,0.082933
3,goal_difference_2,0.081501
6,home_away,0.028274
8,team_1_C,0.013587
11,team_2_C,0.013143
9,team_1_D,0.011681


In [43]:
# neural network

mdl_mlp = MLPClassifier(hidden_layer_sizes = (50, 50), activation='relu', alpha=0.0001, solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'hidden_layer_sizes': [(10,), (50,), (100,), (150,)],
    'alpha': [0.0001, 0.001, 0.01, 1, 10]
}]

search = GridSearchCV(mdl_mlp, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True)

search.fit(X_train, y_train)
mdl_mlp = search.best_estimator_

print(mdl_mlp)
search.best_score_

MLPClassifier(activation='relu', alpha=10, batch_size=20, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50,), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


0.6380368098159509

In [44]:
y_hat_ridge = mdl_ridge.predict_proba(X_train)
y_hat_knn = mdl_knn.predict_proba(X_train)
y_hat_rf = mdl_rf.predict_proba(X_train)
y_hat_mlp = mdl_mlp.predict_proba(X_train)

In [45]:
y_hat_rf = pd.DataFrame(y_hat_rf)
y_hat_rf.columns = ['rf_0', 'rf_1', 'rf_3']
y_hat_knn = pd.DataFrame(y_hat_knn)
y_hat_knn.columns = ['knn_0', 'knn_1', 'knn_3']
y_hat_ridge = pd.DataFrame(y_hat_ridge)
y_hat_ridge.columns = ['ridge_0', 'ridge_1', 'ridge_3']
y_hat_mlp = pd.DataFrame(y_hat_mlp)
y_hat_mlp.columns = ['mlp_0', 'mlp_1', 'mlp_3']

In [46]:
X_ensemble = y_hat_rf.join(y_hat_ridge).join(y_hat_knn).join(y_hat_mlp)
X_ensemble.head()

Unnamed: 0,rf_0,rf_1,rf_3,ridge_0,ridge_1,ridge_3,knn_0,knn_1,knn_3,mlp_0,mlp_1,mlp_3
0,0.275298,0.30542,0.419282,0.259046,0.289389,0.451565,0.5,0.234043,0.265957,0.126548,0.397517,0.475935
1,0.084057,0.109789,0.806154,0.037853,0.128049,0.834097,0.287234,0.234043,0.478723,0.035768,0.07922,0.885011
2,0.059508,0.173936,0.766556,0.084779,0.254438,0.660783,0.191489,0.319149,0.489362,0.022085,0.336953,0.640962
3,0.174612,0.286988,0.5384,0.156461,0.176081,0.667457,0.340426,0.297872,0.361702,0.028487,0.199396,0.772117
4,0.758187,0.151137,0.090676,0.683166,0.232437,0.084397,0.574468,0.212766,0.212766,0.754068,0.16504,0.080892


In [48]:
# logistic regression ensemble
mdl_lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100, multi_class = 'auto')

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [3000, 2000, 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_lasso, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True)

search.fit(X_ensemble, y_train.reset_index(drop=True))
mdl_ensemble = search.best_estimator_
search.best_score_

0.76278118609406953

In [49]:
# TEST
# rename!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
y_hat_ridge = mdl_ridge.predict_proba(X_test)
y_hat_knn = mdl_knn.predict_proba(X_test)
y_hat_rf = mdl_rf.predict_proba(X_test)
y_hat_mlp = mdl_mlp.predict_proba(X_test)

y_hat_rf = pd.DataFrame(y_hat_rf)
y_hat_rf.columns = ['rf_0', 'rf_1', 'rf_3']
y_hat_knn = pd.DataFrame(y_hat_knn)
y_hat_knn.columns = ['knn_0', 'knn_1', 'knn_3']
y_hat_ridge = pd.DataFrame(y_hat_ridge)
y_hat_ridge.columns = ['ridge_0', 'ridge_1', 'ridge_3']
y_hat_mlp = pd.DataFrame(y_hat_mlp)
y_hat_mlp.columns = ['mlp_0', 'mlp_1', 'mlp_3']

X_ensemble = y_hat_rf.join(y_hat_ridge).join(y_hat_knn).join(y_hat_mlp)
X_ensemble.head()

Unnamed: 0,rf_0,rf_1,rf_3,ridge_0,ridge_1,ridge_3,knn_0,knn_1,knn_3,mlp_0,mlp_1,mlp_3
0,0.30713,0.297087,0.395783,0.451525,0.226218,0.322257,0.191489,0.202128,0.606383,0.568624,0.102181,0.329195
1,0.156249,0.317049,0.526703,0.059173,0.305959,0.634868,0.170213,0.265957,0.56383,0.03375,0.349259,0.616992
2,0.213097,0.328871,0.458032,0.20374,0.301752,0.494508,0.287234,0.244681,0.468085,0.104376,0.218058,0.677566
3,0.414922,0.391052,0.194026,0.588097,0.265338,0.146565,0.276596,0.265957,0.457447,0.264922,0.635682,0.099396
4,0.21245,0.256901,0.530649,0.256349,0.33285,0.410802,0.170213,0.255319,0.574468,0.089498,0.359722,0.550779


In [50]:
mdl_ensemble.classes_

array([0, 1, 3], dtype=int64)

In [51]:
pd.DataFrame({'coefficients: ': list(X_ensemble.columns),
              'win': mdl_ensemble.coef_[0],
              'draw':mdl_ensemble.coef_[1],
              'loose': mdl_ensemble.coef_[2]})

Unnamed: 0,coefficients:,win,draw,loose
0,rf_0,7.733103,-4.117965,-8.119821
1,rf_1,-7.060366,18.301548,-11.235965
2,rf_3,-6.092629,-6.625362,9.430358
3,ridge_0,-5.703753,1.559145,7.178513
4,ridge_1,2.064495,-5.707566,6.98846
5,ridge_3,6.168872,2.371819,-6.066682
6,knn_0,-2.13186,1.48869,1.402013
7,knn_1,4.026383,-11.995862,6.62429
8,knn_3,1.628328,2.317386,-3.247934
9,mlp_0,3.587812,-2.196101,-4.365991


In [52]:
y_hat_ensemble = mdl_ensemble.predict(X_ensemble)

accuracy_score(y_test, y_hat_ensemble)

0.63414634146341464

In [53]:
print(confusion_matrix(y_test, y_hat_ensemble))  
print(classification_report(y_test, y_hat_ensemble)) 

[[33 13  8]
 [ 5 15  8]
 [ 6  5 30]]
              precision    recall  f1-score   support

           0       0.75      0.61      0.67        54
           1       0.45      0.54      0.49        28
           3       0.65      0.73      0.69        41

   micro avg       0.63      0.63      0.63       123
   macro avg       0.62      0.63      0.62       123
weighted avg       0.65      0.63      0.64       123



In [54]:
# get accuracy of bets

y_hat_proba = mdl_ensemble.predict_proba(X_ensemble)
bet = [any(y_hat_proba[i] > 0.7) for i in range(len(y_hat_proba))]

data_test = data.loc[X_test.index,]
data_test['predict'] = y_hat_ensemble
data_test['bet'] = bet

print('number of bets = ', len(data_test[bet]), '\n',
      'accuracy = ', accuracy_score(data_test[bet]['target'], data_test[bet]['predict']), '\n',
      'confusiont matrix = ', '\n','\n',
     confusion_matrix(data_test[bet]['target'], data_test[bet]['predict']))
data_test.head()

number of bets =  79 
 accuracy =  0.746835443038 
 confusiont matrix =  
 
 [[30  5  3]
 [ 2  6  5]
 [ 3  2 23]]


Unnamed: 0,round,team_1,goal_difference_1,rating_1,form_1,form_weighted_1,team_2,goal_difference_2,rating_2,form_2,form_weighted_2,home_away,target,predict,bet
493,Bundesliga - 28,167,23,A,2.0,0.64,175,-23,C,2.0,3.2,0,0,0,False
472,Bundesliga - 27,192,8,B,1.4,4.0,169,-2,C,0.4,0.8,1,3,3,False
107,Bundesliga - 6,163,0,C,1.4,0.12,174,-4,D,0.6,0.6,0,0,3,False
558,Bundesliga - 32,192,6,C,1.0,2.6,162,1,B,2.6,0.52,1,3,1,True
535,Bundesliga - 30,165,30,A,2.0,0.56,163,-5,C,1.4,2.8,0,3,3,True
