In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import pickle

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler
from sklearn.model_selection import RepeatedKFold,train_test_split
import shap
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend 
from tensorflow.keras.layers import Dense, BatchNormalization,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score


import api.util
from api.predictions_converter import PredictionsConverter
from api.sofa_dp import SofaDataProvider

from IPython.display import display
pd.options.display.max_columns = None
%load_ext autoreload
%autoreload 2

In [50]:
def checkpoint(name, monitor='val_accuracy'):
    """Build a ModelCheckpoint callback that keeps only the best model so far.

    Args:
        name: Suffix inserted into the checkpoint path
            ``checkpoints/model_{name}.hdf5``.
        monitor: Logged quantity to maximise. Defaults to ``'val_accuracy'``,
            which is the key tf.keras 2.x logs for ``metrics=['accuracy']``
            (the metric name used when the model is compiled in this
            notebook) once validation data is passed to ``fit``. The previous
            hard-coded ``'val_acc'`` is not logged under that metric name, so
            the callback would have had nothing to compare and skipped saving.

    Returns:
        A ``ModelCheckpoint`` configured with ``save_best_only=True`` and
        ``mode='max'``.
    """
    return ModelCheckpoint(f'checkpoints/model_{name}.hdf5', monitor=monitor, verbose=0, save_best_only=True, mode='max')

In [2]:
# Load the generated statistics table; every later cell derives from df_all.
df_all = pd.read_csv('data/stats_generated.csv', index_col=None)

# vop* columns: each vote* column minus its matching oddsprob_* column.
df_all['vop1'] = df_all['vote1'].sub(df_all['oddsprob_home'])
df_all['vopx'] = df_all['votex'].sub(df_all['oddsprob_draw'])
df_all['vop2'] = df_all['vote2'].sub(df_all['oddsprob_away'])


list(df_all.columns)

In [3]:
# Base columns that are always kept; they are appended last, so they end up at
# the tail of `cols`.
includes=['vote1', 'votex', 'vote2','elo1','elo2','drift_home', 'drift_away', 'drift_draw','oddsprob_home', 'oddsprob_draw', 'oddsprob_away']

# Keep a column only if it is not one of the duplicated 'round.1'/'ds.1'
# columns and its name contains none of the excluded fragments. The `includes`
# names are fragments too, so every column derived from them is dropped before
# the base columns themselves are added back.
cols = [
    name
    for name in df_all.columns
    if name not in ('round.1', 'ds.1')
    and not any(
        fragment in name
        for fragment in (*includes, 'ht1', 'ht2', 'ps_ht', 'tar_', 'opp_')
    )
]
cols += includes


list(cols)

In [95]:
# Show the columns from position `start` onward (the slice the preprocessing
# cell passes to MinMaxScaler).
# NOTE(review): `df` and `start` are assigned in a later cell, so this cell
# fails on a fresh Restart & Run All.
df.columns[start:]

Index(['tar_w2_tt_avg', 'tar_ft1_tt_avg', 'tar_ft2_tt_avg', 'tar_ps_ft_tt_avg',
       'tar_oddsprob_home_tt_avg', 'tar_oddsprob_draw_tt_avg',
       'tar_oddsprob_away_tt_avg', 'tar_w1_tt_form', 'tar_wx_tt_form',
       'tar_w2_tt_form',
       ...
       'vote1', 'votex', 'vote2', 'elo1', 'elo2', 'drift_home', 'drift_away',
       'drift_draw', 'sch', 'sca'],
      dtype='object', length=121)

In [96]:
# Count NaNs per column of `df` and list the columns with more than 10000.
# NOTE(review): `df` is assigned in a later cell; this cell also rebinds the
# `nulls` name that the preprocessing cell uses.
nulls=pd.DataFrame(df.isna().sum(), columns=['n'])
#nulls[nulls.n>10000].to_csv('data/nulls.csv')
nulls[nulls.n>10000].index

Index([], dtype='object')

# Display the current contents of the `cols` list.
cols

# Ad-hoc debugging exports: rows where tid1 is 594 or 1499, and rows where
# mid is 9270007, each written to its own CSV under data/.
df[df.tid1==594].to_csv('data/ttt1.csv', index=False)
df[df.tid1==1499].to_csv('data/ttt2.csv', index=False)
df[df.mid==9270007].to_csv('data/ttt.csv', index=False)

In [47]:
# Column groups. Only COL_INF is referenced in this cell; the others are kept
# because other cells may rely on them.
COL_CUR=['side', 'country_id', 'round', 'ds', 'de', 'form1', 'form2', 'vote1', 'votex', 'vote2', 'pop_r', 'elo1', 'elo2']
COL_PREV=['w1', 'wx', 'w2',  'ht1', 'ht2', 'ft1', 'ft2', 'ps_ht', 'ps_ft']
COL_CAT=['country_id','form1', 'form2']
COL_BIN=['side']
COL_INF=['country', 'liga', 'mid', 'round', 'ds', 't1', 't2','tid1', 'tid2', 'w1', 'wx', 'w2',  'ft1', 'ft2','winner','odds_away','odds_draw','odds_home']


#df=pd.read_csv('data/stats_generated.csv', index_col=None)
# Position of the first column that is min-max scaled; columns before it are
# carried over unscaled.
# NOTE(review): magic number tied to the column order of `cols` - re-check it
# whenever the column filter changes.
start=29
# Take an explicit copy so the writes below go to an independent frame rather
# than to a slice of df_all.
df=df_all[cols].copy()
# Fill missing elo1/elo2 with the column mean. The result is assigned back
# instead of calling fillna(inplace=True) on df['elo1']: that chained in-place
# form acts on a temporary Series under pandas copy-on-write, which would leave
# df untouched and let the fillna(0) below zero-fill these columns instead.
df['elo1']=df['elo1'].fillna(df['elo1'].mean())
df['elo2']=df['elo2'].fillna(df['elo2'].mean())
nulls=pd.DataFrame(df.isna().sum(), columns=['n'])
cols_null=[x for x in nulls[nulls.n>60000].index if x not in COL_INF]
# NOTE(review): `cols` is narrowed here, but `df` was already built from the
# previous list, so columns with >60000 NaNs are still present in `df` for this
# run (they only disappear if the cell is executed again) - confirm intent.
cols=[x for x in cols if x not in cols_null]


for col in COL_INF:
    df.loc[df[col].isnull(),col]=0

#df=df.dropna()
# Every remaining NaN becomes 0.
df=df.fillna(0)
#[df[col].fillna(df[col].mean(), inplace=True) for col in df.columns[start:]]

# Scale columns [start:] to [0, 1] and stitch them back next to the unscaled
# leading columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell.
scaler=MinMaxScaler()
nums=scaler.fit_transform(df[df.columns[start:]].values)
nums_df=pd.DataFrame(nums, columns=df.columns[start:])
df.reset_index(drop=True, inplace=True)
df=pd.concat([df[df.columns[:start]],nums_df], axis=1)

# NOTE(review): vote1/votex/vote2 sit at the tail of `cols`, so they are
# presumably already inside `nums`; stacking `nums1` as well would duplicate
# them in `data` - confirm.
nums1=df[['vote1', 'votex', 'vote2',]].values

# Informational columns, split alongside the features and labels later on.
df_info=df[COL_INF]

# One-hot encode the categorical inputs (the encoder is refitted per column).
encoder = OneHotEncoder()
pop_r=encoder.fit_transform(df[['pop_r']]).toarray()
#side=df[['side']].values
rounds=encoder.fit_transform(df[['round']]).toarray()
countries=encoder.fit_transform(df[['country_id']]).toarray()
encoder = OneHotEncoder()
form1=encoder.fit_transform(df[['form1']]).toarray()
encoder = OneHotEncoder()
form2=encoder.fit_transform(df[['form2']]).toarray()
side=df[['side']].values

# Feature matrix: scaled numerics, vote columns, side flag, one-hot blocks.
#data=np.hstack([nums,nums1,pop_r,rounds,countries,form1,form2])
data=np.hstack([nums,nums1,side,pop_r,rounds,countries,form1,form2])

# Goal difference ft1 - ft2, clipped into the classes -6..6.
df['gd']=df['ft1']-df['ft2']
df['gd']=np.where(df['gd']>5,6,df['gd'])
df['gd']=np.where(df['gd']<-5,-6,df['gd'])
scgd=pd.get_dummies(df['gd'], prefix='gd')

# ft1 / ft2 capped at 6, one class per value.
df['sch']=np.where(df['ft1']>5,6,df['ft1'])
df['sca']=np.where(df['ft2']>5,6,df['ft2'])
sch=pd.get_dummies(df['sch'], prefix='sch')
sca=pd.get_dummies(df['sca'], prefix='sca')
# Label matrix: [w1, wx, w2] followed by the sch, sca and gd one-hot groups.
labels=np.hstack([df[['w1', 'wx', 'w2']].values,sch.values,sca.values,scgd.values])
#labels=np.hstack([scgd.values])
#labels=df[['w1', 'wx', 'w2']].values

data.shape,labels.shape

((135580, 182), (135580, 30))

# Analysis

In [48]:
# Hold out 20% of the rows; features, labels and the informational frame are
# split together so their rows stay aligned. random_state fixes the shuffle.
data_train, data_test, labels_train, labels_test, info_train, info_test = train_test_split(data, labels, df_info, test_size=0.2, random_state=42)
print(data_train.shape, data_test.shape)

(108464, 182) (27116, 182)


In [168]:
# Display the training label matrix.
labels_train

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

# Per-column flag: does `df` still contain any NaN?
df.isnull().any()

In [116]:
def get_model(n_inputs, n_outputs, hidden_units=(512, 1024, 512)):
    """Build and compile the fully connected multi-output classifier.

    The network is a stack of ReLU ``Dense`` layers (He-uniform initialised)
    followed by a sigmoid output layer, compiled with binary cross-entropy and
    the Adam optimiser. With sigmoid outputs and binary cross-entropy, each
    output column is scored as an independent binary target.

    Args:
        n_inputs: Number of input features.
        n_outputs: Number of output units (label columns).
        hidden_units: Width of each hidden layer, in order. The default
            reproduces the previous hard-coded 512-1024-512 stack.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    # Only the first layer needs the input size; later layers infer theirs from
    # the layer before. Passing input_dim to every layer was redundant and
    # misstated the real input width of the hidden layers.
    first_layer_kwargs = {'input_dim': n_inputs}
    for units in hidden_units:
        model.add(Dense(units, kernel_initializer='he_uniform', activation='relu', **first_layer_kwargs))
        first_layer_kwargs = {}
    # If no hidden layers were requested, the output layer is the first layer.
    model.add(Dense(n_outputs, activation='sigmoid', **first_layer_kwargs))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return model

def evaluate_model(X, y, bs=64, epochs=30, n_splits=3, n_repeats=2, random_state=1):
    """Cross-validate a freshly built model with repeated K-fold splitting.

    For every fold a new model is created with ``get_model``, fitted on the
    training indices and scored on the held-out validation indices.

    The score is ``accuracy_score`` on the rounded predictions. For a 2-D
    indicator target, scikit-learn computes subset accuracy: a row counts as
    correct only if every label column matches. Expect values far lower than
    per-column accuracy when ``y`` has many columns.

    Args:
        X: 2-D feature array, indexable by integer index arrays.
        y: 2-D label array with one row per row of ``X``.
        bs: Batch size passed to ``model.fit``.
        epochs: Training epochs per fold (previously hard-coded to 30).
        n_splits: Folds per repetition (previously hard-coded to 3).
        n_repeats: Number of repetitions (previously hard-coded to 2).
        random_state: Seed for the fold shuffling (previously hard-coded to 1).

    Returns:
        ``(results, model)``: the list of per-fold scores and the model fitted
        on the last fold only.
    """
    results = []
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
    model = None
    for train_ix, val_ix in cv.split(X):
        X_train, X_val = X[train_ix], X[val_ix]
        y_train, y_val = y[train_ix], y[val_ix]
        # Release the Keras global state left by the previous fold's model;
        # otherwise memory grows with every model built in this loop.
        tf.keras.backend.clear_session()
        model = get_model(n_inputs, n_outputs)
        model.fit(X_train, y_train, batch_size=bs, epochs=epochs)
        # Predict on the validation fold and round sigmoid outputs to 0/1.
        yhat = model.predict(X_val).round()
        acc = accuracy_score(y_val, yhat)
        print('>%.3f' % acc)
        results.append(acc)
    return results, model

# NOTE(review): `model` is only assigned by the evaluate_model call in a later
# cell, so this line fails on a fresh top-to-bottom run. The file name lists
# four layer widths while get_model builds three hidden layers by default -
# confirm which architecture this file is meant to hold.
model.save('models/512-1024-1024-512.keras')

In [118]:
# Cross-validate on the training split. `model` is the network fitted on the
# last fold and is what the later predict/explain cells use.
results, model = evaluate_model(data_train, labels_train, bs=64)
# summarize performance
print('Accuracy: %.3f (%.3f)' % (np.mean(results), np.std(results)))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
>0.024
Epoch 1/30

KeyboardInterrupt: 

# Draw 100 distinct training rows as the SHAP background set. A seeded
# generator is used so the same rows are drawn on every run.
background_rng = np.random.default_rng(42)
background = data_train[background_rng.choice(data_train.shape[0], 100, replace=False)]
explainer = shap.DeepExplainer(model,  background)

In [None]:
# Predict on the held-out test features.
yhat = model.predict(data_test)

In [11]:
def odds2prob(df):
    """Convert the three odds columns into probabilities that sum to 1 per row.

    Each of ``odds_away``, ``odds_draw`` and ``odds_home`` is inverted, and the
    inverses are divided by their row sum.

    The input frame is left untouched. Earlier versions overwrote the odds
    columns in place and added a ``margin`` column to the caller's frame.

    Rows containing a zero in any odds column produce NaN in all three outputs
    (1/0 gives inf, and inf/inf is NaN).

    NOTE(review): this notebook also calls ``api.util.odds2prob``; confirm the
    two implementations agree.

    Args:
        df: DataFrame with ``odds_away``, ``odds_draw`` and ``odds_home``.

    Returns:
        A new DataFrame with columns ``['odds_away', 'odds_draw', 'odds_home']``
        holding the normalised probabilities, indexed like ``df``.
    """
    odds_cols = ['odds_away', 'odds_draw', 'odds_home']
    implied = 1 / df[odds_cols]
    # Row sum of the inverse odds; dividing by it makes each row sum to 1.
    margin = implied.sum(axis=1)
    return implied.div(margin, axis=0)


def softmax(df, columns):
    """Rescale the given columns so they sum to 1 within each row.

    Despite the name, no exponentiation is applied: every listed column is
    divided by the row sum of the listed columns.

    The input frame is left untouched. Earlier versions rescaled the columns
    of the caller's frame in place and added a ``margin`` column to it.

    Args:
        df: Source DataFrame.
        columns: Names of the columns to normalise together.

    Returns:
        A new DataFrame with the listed columns normalised and all other
        columns unchanged. As before, the result never contains a ``margin``
        column, even if the input had one.
    """
    columns = list(columns)
    # Row totals are taken from the original values, once, before any rescale.
    total = df[columns].sum(axis=1)
    normalised = df.drop(columns=['margin'], errors='ignore')
    for col in columns:
        normalised[col] = df[col] / total
    return normalised

In [132]:
# Inspect the raw prediction vector for the third test row.
yhat[2]

array([4.8954438e-02, 1.9956774e-01, 7.9421538e-01, 4.9211684e-01,
       3.7973702e-01, 1.0061966e-01, 1.4319265e-02, 9.1202662e-04,
       1.7471502e-04, 4.5027200e-06, 7.3556311e-02, 2.7749127e-01,
       3.0257180e-01, 1.8635319e-01, 8.8486604e-02, 3.7683818e-02,
       1.7281987e-02], dtype=float32)

In [108]:
# Label column names, in the same order in which `labels` was stacked
# (w1/wx/w2, then the sch, sca and gd one-hot groups); `colyp` are the
# matching prediction column names.
coly=np.hstack([['w1','wx','w2'],sch.columns,sca.columns,scgd.columns])
#coly=scgd.columns
colyp=[x+'_p' for x in coly]
# The first three label columns are skipped here because info_test already
# carries w1/wx/w2 (they are part of COL_INF).
df_y=pd.DataFrame(data=labels_test[:,3:], columns=coly[3:])
#df_y=pd.DataFrame(data=labels_test, columns=coly)
df_yhat=pd.DataFrame(data=yhat, columns=colyp)
info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
# Drop the shuffled index so the three frames align row by row in the concat.
info_test=info_test.reset_index(drop=True)
df_preds=pd.concat([info_test,df_y,df_yhat], axis=1)
#df_preds=softmax(df_preds,['w1_p','wx_p','w2_p'])
#df_preds=softmax(df_preds,[x+'_p' for x in sch.columns])
#df_preds=softmax(df_preds,[x+'_p' for x in sca.columns])
#df_preds=softmax(df_preds,[x+'_p' for x in scgd.columns])
# Collapse the goal-difference outputs into three scores: sum of the positive
# classes, the zero class, and sum of the negative classes (gd = ft1 - ft2).
# NOTE(review): the 'gd_6.0_p'-style names assume get_dummies produced
# float-formatted class labels - confirm against scgd.columns.
df_preds['w1_gd']=df_preds[['gd_6.0_p','gd_5.0_p','gd_4.0_p','gd_3.0_p','gd_2.0_p','gd_1.0_p']].sum(axis=1)
df_preds['wx_gd']=df_preds['gd_0.0_p']
df_preds['w2_gd']=df_preds[['gd_-6.0_p','gd_-5.0_p','gd_-4.0_p','gd_-3.0_p','gd_-2.0_p','gd_-1.0_p']].sum(axis=1)

#df_preds

In [109]:

# Evaluate the direct w1/wx/w2 outputs against the true w1/wx/w2 columns.
# PredictionsConverter is a project helper; what 'op', make_df() and profit()
# do is defined in api.predictions_converter and is not visible here.
conv=PredictionsConverter('op', df_preds[['w1_p','wx_p','w2_p']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), odds=True)
conv.make_df()
conv.profit()


WAG:12027; ACC: 0.4330256921925667; PRF: -246.04000000000002; ROI: -0.020457304398436852


In [110]:
#info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
# Same evaluation, using the scores aggregated from the goal-difference
# outputs (w1_gd/wx_gd/w2_gd) and make_df(threshold='max').
# NOTE(review): `conv1` is rebound by a later cell.
conv1=PredictionsConverter('op', df_preds[['w1_gd','wx_gd','w2_gd']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), odds=True)
conv1.make_df(threshold='max')
conv1.profit()

WAG:12706; ACC: 0.42184794585235325; PRF: -509.78; ROI: -0.04012120258145758


In [111]:
# Metrics table for whichever converter `conv` currently refers to.
conv.performance_metrics()

Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,4619,10751,5302,4982,0.599,0.374,0.481,0.67,0.466,0.683,0.575,0.473,0.5
1,DRAW,1721,14498,4609,4826,0.632,0.255,0.263,0.759,0.272,0.75,0.511,0.267,0.5
2,AWAY,4604,10881,5267,4902,0.604,0.371,0.484,0.674,0.466,0.689,0.579,0.475,0.5


In [160]:
# Side-by-side view: true outcome, direct outputs, goal-difference scores.
df_preds[['w1','wx','w2','w1_p','wx_p','w2_p','w1_gd','wx_gd','w2_gd']]

Unnamed: 0,w1,wx,w2,w1_p,wx_p,w2_p,w1_gd,wx_gd,w2_gd
0,0,0,1,0.153697,0.409941,0.436362,0.193199,0.416397,0.390404
1,0,0,1,0.389200,0.324374,0.286426,0.375709,0.338702,0.285590
2,1,0,0,0.255113,0.255719,0.489168,0.257906,0.263558,0.478536
3,1,0,0,0.367066,0.345683,0.287250,0.380905,0.317920,0.301175
4,1,0,0,0.551421,0.274133,0.174446,0.545328,0.263596,0.191075
...,...,...,...,...,...,...,...,...,...
27111,0,0,1,0.218319,0.248316,0.533365,0.226258,0.287219,0.486522
27112,1,0,0,0.292947,0.343484,0.363569,0.297586,0.360643,0.341771
27113,1,0,0,0.894143,0.081727,0.024130,0.913940,0.067083,0.018978
27114,0,0,1,0.158910,0.196134,0.644956,0.163151,0.217454,0.619394


In [122]:
# Compare odds-derived probabilities with the model's predictions.
# NOTE(review): this import repeats the one in the first cell, and the rename
# below repeats the one done when df_preds is built.
from api.predictions_converter import PredictionsConverter
info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
# NOTE(review): the full `yhat` and `labels_test` arrays are passed here, while
# the converter cells above pass only the three w1/wx/w2 columns - confirm the
# shape PredictionsConverter expects.
conv_bookies=PredictionsConverter('op', api.util.odds2prob(info_test.copy()).values, labels_test, info_test.copy(), odds=True)
conv_bookies.make_df()
conv=PredictionsConverter('op', yhat, labels_test, info_test.copy(), odds=True)
conv.make_df()

conv_bookies.profit()
conv.profit()
conv_bookies.performance_metrics()
conv.performance_metrics()

WAG:5734; ACC: 0.14318102546215555; PRF: -721.0699999999999; ROI: -0.1257534007673526
WAG:8301; ACC: 0.5372846644982532; PRF: -215.48000000000002; ROI: -0.02595831827490664


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,385,1042,2445,1862,0.249,0.392,0.171,0.299,0.136,0.359,0.235,0.152,0.5
1,DRAW,1,4511,1,1221,0.787,0.213,0.001,1.0,0.5,0.787,0.5,0.002,0.5
2,AWAY,435,1002,2467,1830,0.251,0.395,0.192,0.289,0.15,0.354,0.24,0.168,0.5


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,3874,7598,2999,2466,0.677,0.374,0.611,0.717,0.564,0.755,0.664,0.586,0.5
1,DRAW,374,12079,818,3666,0.735,0.239,0.093,0.937,0.314,0.767,0.515,0.143,0.5
2,AWAY,4656,6143,4237,1901,0.638,0.387,0.71,0.592,0.524,0.764,0.651,0.603,0.5


In [123]:
# Same comparison with make_df(threshold='max').
# NOTE(review): this cell calls the notebook-local odds2prob, while the
# previous comparison cell calls api.util.odds2prob - confirm they agree.
conv_bookies1=PredictionsConverter('op', odds2prob(info_test.copy()).values, labels_test, info_test.copy(), odds=True)
conv_bookies1.make_df(threshold='max')
conv1=PredictionsConverter('op', yhat, labels_test, info_test.copy(), odds=True)
conv1.make_df(threshold='max')

conv_bookies1.profit()
conv1.profit()
conv_bookies1.performance_metrics()
conv1.performance_metrics()

WAG:12706; ACC: 0.21611836927435857; PRF: -1422.03; ROI: -0.1119179915000787
WAG:12706; ACC: 0.48504643475523374; PRF: -314.58; ROI: -0.024758381866834565


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,1340,3023,4935,3408,0.343,0.374,0.282,0.38,0.214,0.47,0.331,0.243,0.5
1,DRAW,9,9467,25,3205,0.746,0.253,0.003,0.997,0.265,0.747,0.5,0.006,0.5
2,AWAY,1397,2945,5017,3347,0.342,0.373,0.294,0.37,0.218,0.468,0.332,0.25,0.5


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,5525,11713,5265,4613,0.636,0.374,0.545,0.69,0.512,0.717,0.617,0.528,0.5
1,DRAW,873,18171,2029,6043,0.702,0.255,0.126,0.9,0.301,0.75,0.513,0.178,0.5
2,AWAY,6314,9944,7110,3748,0.6,0.371,0.628,0.583,0.47,0.726,0.605,0.538,0.5


In [87]:
# Display the frame built by conv.make_df().
conv.DF

Unnamed: 0,ds,country,liga,t1,t2,sc1,sc2,odds_home,odds_draw,odds_away,winner_home,winner_draw,winner_away,pred_home,pred_draw,pred_away,prob_home,prob_draw,prob_away,win,prf
0,2017-10-28 17:00:00+00:00,argentina,liga-profesional-de-futbol,ca huracan,lanus,4.0,0.0,1.84,3.21,4.84,1,0,0,0,1,0,0.419015,0.785230,0.177421,0,-1.00
1,2018-08-11 14:00:00+00:00,england,championship,aston villa,wigan athletic,3.0,2.0,0.00,0.00,0.00,1,0,0,1,0,0,0.916121,0.320084,0.033951,1,-1.00
2,2018-12-01 19:30:00+00:00,italy,serie-a,sampdoria,bologna,4.0,1.0,2.02,3.26,4.17,1,0,0,0,1,0,0.416682,0.878468,0.090711,0,-1.00
3,2018-08-26 12:30:00+00:00,netherlands,eredivisie,fc utrecht,vvvvenlo,1.0,1.0,0.00,0.00,0.00,0,1,0,1,0,0,0.908112,0.472075,0.014991,0,0.00
4,2017-05-14 18:00:00+00:00,spain,laliga,athletic bilbao,leganes,1.0,1.0,1.47,4.43,7.41,0,1,0,1,1,0,0.645087,0.798480,0.056598,1,3.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13553,2018-08-14 18:45:00+00:00,england,efl-cup,yeovil town,aston villa,0.0,1.0,0.00,0.00,0.00,0,0,1,1,1,0,0.738303,0.974766,0.002282,0,0.00
13554,2018-04-07 14:00:00+00:00,ukraine,premier-league-relegation-round,oleksandria,pfc feniks bucha,2.0,0.0,1.82,3.25,4.58,1,0,0,1,1,0,0.570895,0.614656,0.238248,1,2.25
13555,2015-12-12 17:30:00+00:00,austria,bundesliga,sv ried,wolfsberger ac,1.0,0.0,0.00,0.00,0.00,1,0,0,0,0,1,0.429426,0.001383,0.962793,0,0.00
13556,2016-09-21 15:30:00+00:00,finland,veikkausliiga,ifk mariehamn,inter turku,1.0,1.0,1.90,3.37,4.13,0,1,0,1,0,0,0.567031,0.176261,0.496240,0,-1.00


In [126]:
# Mean of the `win` column over the rows whose odds_home is greater than 0.
conv.DF.loc[conv.DF['odds_home']>0].win.mean()

0.5372846644982532

In [124]:
# Keep only the rows whose odds_home is greater than 0.
res=conv.DF.loc[conv.DF['odds_home']>0]

In [125]:
# Sum of `prf` over the rows flagged pred_home, pred_draw and pred_away.
res[res['pred_home']==1].prf.sum(),res[res['pred_draw']==1].prf.sum(),res[res['pred_away']==1].prf.sum()

(-110.32000000000002, 26.24999999999999, -121.71000000000002)

In [91]:
# Write the filtered prediction rows to data/pred.csv.
res.to_csv('data/pred.csv', index=False)

# sdef
$ \frac{1}{2} $