In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import pickle

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import RepeatedKFold,train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization,Dropout
from tensorflow.keras.models import Sequential
from sklearn.metrics import accuracy_score

import api.util
from api.predictions_converter import PredictionsConverter
from api.sofa_dp import SofaDataProvider

from IPython.display import display
pd.options.display.max_columns = None
%load_ext autoreload
%autoreload 2

In [2]:
dp=SofaDataProvider(load=False)
data, labels, info, df=dp.provide_data()
#df=dp._load_data()

In [6]:
df.columns

Index(['awayScoreHT', 'country', 'country_id', 'ds', 'homeScoreHT', 'liga',
       'mid', 'round', 'sc1', 'sc2', 't1', 't2', 'tid1', 'tid2', 'winner',
       'formation_h', 'formation_a', 'home_formation', 'away_formation',
       'vote_home', 'vote_draw', 'vote_away', 'pop_r'],
      dtype='object')

In [3]:
data[0]

array([0.49217639, 0.23613087, 0.27169275, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [4]:
labels[0]

array([1., 0., 0.])

# Analysis

In [3]:
data_train, data_test, labels_train, labels_test, info_train, info_test = train_test_split(data, labels, info, test_size=0.2, random_state=42)
print(data_train.shape, data_test.shape)

(69716, 156) (17429, 156)


df.isnull().any()

In [4]:
def get_model(n_inputs, n_outputs):
    model = Sequential()
    model.add(Dense(1024, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(512, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    #model.add(Dropout(0.2))
    model.add(Dense(64, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(16, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

def evaluate_model(X, y):
    results = list()
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    # define evaluation procedure
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    # enumerate folds
    for train_ix, test_ix in cv.split(X):
        # prepare data
        X_train, X_test = X[train_ix], X[test_ix]
        y_train, y_test = y[train_ix], y[test_ix]
        # define model
        model = get_model(n_inputs, n_outputs)
        # fit model
        model.fit(X_train, y_train, epochs=10)
        # make a prediction on the test set
        yhat = model.predict(X_test)
        # round probabilities to class labels
        yhat = yhat.round()
        # calculate accuracy
        acc = accuracy_score(y_test, yhat)
        # store result
        print('>%.3f' % acc)
        results.append(acc)
        break
    return results, model

In [5]:
results, model = evaluate_model(data_train, labels_train)
# summarize performance
print('Accuracy: %.3f (%.3f)' % (np.mean(results), np.std(results)))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
>0.500
Accuracy: 0.500 (0.000)


In [10]:
yhat = model.predict(data_test)

In [14]:
def odds2prob(df):
    df['odds_away']=1/df['odds_away']
    df['odds_draw']=1/df['odds_draw']
    df['odds_home']=1/df['odds_home']
    df['margin']=df[['odds_away','odds_draw','odds_home']].sum(axis=1)
    df['odds_away']=df['odds_away']/df['margin']
    df['odds_draw']=df['odds_draw']/df['margin']
    df['odds_home']=df['odds_home']/df['margin']
    return df[['odds_away','odds_draw','odds_home']]

In [18]:
info_test

Unnamed: 0,mid,ts,country,tournament,home_tid,away_tid,homeTeamShort,homeScoreHT,homeScoreFT,awayTeamShort,awayScoreHT,awayScoreFT,winner
10600,6897580,2015-11-07 14:00:00+00:00,italy,serie-b,1212,419,Latina Calcio 1932,0.0,1.0,Cesena,0.0,0.0,home
47569,7895571,2018-11-10 15:15:00+00:00,spain,laliga,925,2106,Getafe,0.0,0.0,Valencia,0.0,1.0,away
17246,6834033,2016-10-07 00:45:00+00:00,south-america,world-cup-qualification-conmebol,324,302,Brazil,4.0,5.0,Bolivia,0.0,0.0,home
71236,8747958,2020-08-23 19:00:00+00:00,brazil,brasileiro-serie-a,2119,961,Vasco,0.0,0.0,Grêmio,0.0,0.0,draw
64437,8246175,2020-02-15 19:00:00+00:00,france,ligue-1,2017,1490,Toulouse,0.0,0.0,Nice,1.0,2.0,away
...,...,...,...,...,...,...,...,...,...,...,...,...,...
64995,8247260,2020-02-28 19:00:00+00:00,france,ligue-2,1883,1694,Sochaux,0.0,1.0,Rodez,0.0,1.0,draw
40229,7471896,2017-09-30 17:30:00+00:00,romania,liga-i,582,169,Dinamo B.,0.0,1.0,Astra,0.0,1.0,draw
80026,8747900,2020-09-05 22:00:00+00:00,brazil,brasileiro-serie-a,493,316,Corinthians,1.0,2.0,Botafogo,1.0,2.0,draw
80557,9030877,2020-09-27 23:30:00+00:00,paraguay,primera-division-apertura,912,966,General Díaz,0.0,0.0,Guaraní,0.0,1.0,away


In [30]:
from api.predictions_converter import PredictionsConverter
#conv_bookies=PredictionsConverter('op', api.util.odds2prob(info_test.copy()).values, labels_test, info_test.copy(), odds=False)
#conv_bookies.make_df()
conv=PredictionsConverter('op', yhat, labels_test, info_test.copy(), odds=False)
conv.make_df()

#conv_bookies.profit()
#conv.profit()
#conv_bookies.performance_metrics()
conv.performance_metrics()

Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,4686,4837,2930,1898,0.664,0.459,0.712,0.623,0.615,0.718,0.667,0.66,0.5
1,DRAW,1446,8871,1944,2090,0.719,0.246,0.409,0.82,0.427,0.809,0.615,0.418,0.5
2,AWAY,1930,8674,1446,2301,0.739,0.295,0.456,0.857,0.572,0.79,0.657,0.507,0.5


In [31]:
#conv_bookies1=PredictionsConverter('op', odds2prob(info_test.copy()).values, labels_test, info_test.copy())
#conv_bookies1.make_df(threshold='max')
conv1=PredictionsConverter('op', yhat, labels_test, info_test.copy(), odds=False)
conv1.make_df(threshold='max')

#conv_bookies1.profit()
#conv1.profit()
#conv_bookies1.performance_metrics()
conv1.performance_metrics()

Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,5201,5544,3525,2373,0.646,0.455,0.687,0.611,0.596,0.7,0.649,0.638,0.5
1,DRAW,1620,10126,2353,2544,0.706,0.25,0.389,0.811,0.408,0.799,0.6,0.398,0.5
2,AWAY,2137,9931,1807,2768,0.725,0.295,0.436,0.846,0.542,0.782,0.641,0.483,0.5
