In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import pickle

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,MinMaxScaler,MultiLabelBinarizer
from sklearn.model_selection import RepeatedKFold,train_test_split
import shap
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.backend 
from tensorflow.keras.layers import Dense, BatchNormalization,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import accuracy_score


import api.util
from api.predictions_converter import PredictionsConverter
from api.sofa_dp import SofaDataProvider

from IPython.display import display
pd.options.display.max_columns = None
%load_ext autoreload
%autoreload 2

In [3]:
def checkpoint(name):
    return ModelCheckpoint(f'checkpoints/model_{name}.hdf5', monitor='val_acc', verbose=0, save_best_only=True, mode='max')

In [2]:
INF=['country', 'liga', 'mid', 'round', 'ds', 't1', 't2','tid1', 'tid2', 'w1', 'wx', 'w2',  'ft1', 'ft2','winner','gd','sch','sca','odds_away','odds_draw','odds_home']

df_all=pd.read_csv('data/stats_generated.csv', index_col=None)
df_all=df_all.dropna(subset=['ft1','ft2'])
df_all['vop1']=df_all['vote1']-df_all['oddsprob1']
df_all['vopx']=df_all['votex']-df_all['oddsprobx']
df_all['vop2']=df_all['vote2']-df_all['oddsprob2']
df_all['elo1'].fillna((df_all['elo1'].mean()), inplace=True)
df_all['elo2'].fillna((df_all['elo2'].mean()), inplace=True)
df_all['gd']=df_all['ft1']-df_all['ft2']
df_all['gd']=df_all['gd'].astype(int)
df_all['gd']=np.where(df_all['gd']>5,6,df_all['gd'])
df_all['gd']=np.where(df_all['gd']<-5,-6,df_all['gd'])
df_all['gd']=df_all['gd'].astype(int)
df_all['sch']=np.where(df_all['ft1']>5,6,df_all['ft1'])
df_all['sch']=df_all['sch'].astype(int)
df_all['sca']=np.where(df_all['ft2']>5,6,df_all['ft2'])
df_all['sca']=df_all['sca'].astype(int)
for col in INF:
    df_all.loc[df_all[col].isnull(),col]=0


list(df_all.columns)

In [3]:
def slice_1x2(pattern,base):
    res=[]
    for side in ['1','x','2']:
        col=pattern.replace(f'{base}_', f'{base}{side}_')
        #print(col)
        if col in ALL:
            subcolumns=[x for x in ALL if col == x]
            subcolumns=[x for x in subcolumns if not x in DIRT]
            res=np.hstack([res,subcolumns])
        #break
    #if res==[]:
    #    print('EMPRY: '+pattern)
    return res

def slice_diff(pattern,base):
    res=[]
    if pattern in ALL:
        res=[x for x in ALL if pattern in x]
        res=[x for x in res if not x in DIRT]
    return res

def generate_cols(base):
    subcolumns=[]
    for scope in ['tt','ts']:
        for typ in ['avg','form']:
            # tar and opp columns
            tar=slice_1x2(f'tar_{base}_{scope}_{typ}',base)
            opp=slice_1x2(f'opp_{base}_{scope}_{typ}',base)
            if tar!=[] and opp!=[]:
                subcolumns.append(np.hstack([tar,opp]))
            # resulted columns like ft1_tt_form
            tar=slice_1x2(f'{base}_{scope}_{typ}',base)
            if tar!=[]:
                subcolumns.append(tar)
            # resulted diffs columns like diff_ft_tt_avg
            tar=slice_diff(f'diff_{base}_{scope}_{typ}',base)
            if tar!=[]:
                subcolumns.append(tar)
            #break
        #break

    return subcolumns


#COL_NUM=['vote1', 'votex', 'vote2', 'elo1', 'elo2']
DIRT=['w1', 'wx', 'w2',  'ht1', 'ht2', 'ft1', 'ft2', 'ps_ht', 'ps_ft','winner']
CATEGORICAL=['side', 'country_id','form1', 'form2', 'pop_r']
SINGLES=['vop','diff_vote', 'diff_elo', 'diff_op','oddsprob']
BASES = ['w',  'ht', 'ft', 'vop', 'vote', 'elo', 'drift', 'oddsprob', 'graph', 'possession', 'shont', 'shofft', 'corners', 'offsides', 'fouls','cards', 'gksaves','precision']
MINIMUM=['vote1', 'votex', 'vote2','elo1','elo2']
ALL=df_all.columns
COLUMNS=[[]]
for base in BASES:
    #cols=INF+MINIMUM
    COLUMNS+=generate_cols(base)
    #break
#COLUMNS


In [7]:
classes=['w1', 'wx', 'w2','sch0','sch1','sch2','sch3','sch4','sch5','sch6', 'sca0','sca1','sca2','sca3','sca4','sca5','sca6','gd-6','gd-5','gd-4','gd-3','gd-2','gd-1','gd0','gd1','gd2','gd3','gd4','gd5','gd6']
runs=[]
start=23
c=0
for cols in COLUMNS:
    current_run={'n':c, 'cols':cols}
    #if c<=20: # or c>53:
    #    c+=1
    #    continue
    print(c, cols)
    df=df_all[np.hstack([INF,CATEGORICAL,MINIMUM,cols])]
    #print(len(df.index))
    df_info, data, labels = prepare_df(df, na_way='MEAN')
    current_run=evaluate_feature(data, labels, df_info, current_run)
    runs.append(current_run)
    pd.DataFrame(runs).to_csv('data/features_evaluated.csv', index=False)
    c+=1
    #break
#runs
#df=df_all[cols]

5308130907; PRF: -326.76; ROI: -0.025522143247676325
122 ['tar_offsides1_ts_form' 'tar_offsides2_ts_form' 'opp_offsides1_ts_form'
 'opp_offsides2_ts_form']
>0.025
WAG:9105; ACC: 0.5060955518945635; PRF: 123.81999999999996; ROI: 0.013599121361889068
WAG:8154; ACC: 0.5068677949472652; PRF: 99.40999999999998; ROI: 0.0121915624233505
WAG:12803; ACC: 0.4650472545497149; PRF: -332.62; ROI: -0.025979848473014138
WAG:12803; ACC: 0.46262594704366167; PRF: -365.27; ROI: -0.028530032023744434
123 ['offsides1_ts_form' 'offsides2_ts_form']
>0.025
WAG:9213; ACC: 0.5068924346032779; PRF: 80.51999999999995; ROI: 0.008739824161510904
WAG:8100; ACC: 0.4982716049382716; PRF: -74.68; ROI: -0.009219753086419754
WAG:12803; ACC: 0.46465672108099665; PRF: -395.52; ROI: -0.030892759509489963
WAG:12803; ACC: 0.46567210809966414; PRF: -323.53000000000003; ROI: -0.025269858626884325
124 ['diff_offsides_ts_form']
>0.026
WAG:9119; ACC: 0.4969843184559711; PRF: 62.52999999999999; ROI: 0.0068571115253865545
WAG:8648;

In [198]:
labels

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [4]:
def prepare_df(df_in, na_way='drop'):
    if na_way=='drop':
        df=df_in.dropna()
    elif na_way=='zero':
        df=df_in.fillna(0)
    else:
        df=df_in.copy()
        [df[col].fillna(df[col].mean(), inplace=True) for col in df.columns[start:]]
    
    # Categorical values
    encoder = OneHotEncoder()
    pop_r=encoder.fit_transform(df[['pop_r']]).toarray()
    rounds=encoder.fit_transform(df[['round']]).toarray()
    countries=encoder.fit_transform(df[['country_id']]).toarray()
    encoder = OneHotEncoder()
    form1=encoder.fit_transform(df[['form1']]).toarray()
    encoder = OneHotEncoder()
    form2=encoder.fit_transform(df[['form2']]).toarray()
    side=df[['side']].values

    # Numerical values
    scaler=MinMaxScaler()
    nums=scaler.fit_transform(df[df.columns[start:]].values)
    nums1=df[['vote1', 'votex', 'vote2',]].values

    # Union data
    data=np.hstack([side,pop_r,rounds,countries,form1,form2,nums,nums1])

    # Labels
    ohe=OneHotEncoder()
    folder='prerequisites/'
    with open(os.path.join(folder, 'ohe_gd'), mode='rb') as f:
        ohe=pickle.load(f)
    scgd=ohe.transform(df[['gd']]).toarray()
    with open(os.path.join(folder, 'ohe_sch'), mode='rb') as f:
        ohe=pickle.load(f)
    sch=ohe.transform(df[['sch']]).toarray()
    with open(os.path.join(folder, 'ohe_sca'), mode='rb') as f:
        ohe=pickle.load(f)
    sca=ohe.transform(df[['sca']]).toarray()
    labels=np.hstack([df[['w1', 'wx', 'w2']].values,sch,sca,scgd])
    
    return df[INF], data, labels


In [43]:
df.columns[18:]

Index(['vote1', 'votex', 'vote2', 'elo1', 'elo2'], dtype='object')

# Analysis

In [5]:
def calc_prf(res, yhat, y, info, prefix, threshold=0.5):
    conv=PredictionsConverter('op', yhat, y, info, odds=True)
    conv.make_df(threshold=threshold)
    wag, acc, prf, roi=conv.profit()
    res[prefix+'_WAG']=wag
    res[prefix+'_ACC']=acc
    res[prefix+'_PRF']=prf
    res[prefix+'_ROI']=roi
    dfr=conv.performance_metrics()
    res[prefix+'_TP1'],res[prefix+'_TPX'],res[prefix+'_TP2']=dfr['TP'].values
    res[prefix+'_FP1'],res[prefix+'_FPX'],res[prefix+'_FP2']=dfr['FP'].values
    res[prefix+'_PPV1'],res[prefix+'_PPVX'],res[prefix+'_PPV2']=dfr['PPV'].values
    res[prefix+'_AUC1'],res[prefix+'_AUCX'],res[prefix+'_AUC2']=dfr['AUC'].values
    res[prefix+'_F11'],res[prefix+'_F1X'],res[prefix+'_F12']=dfr['F1'].values
    return res

def evaluate_feature(data, labels, df_info, res):
    data_train, data_test, labels_train, labels_test, info_train, info_test = train_test_split(data, labels, df_info, test_size=0.2, random_state=24)
    results, model = evaluate_model(data_train, labels_train, bs=64)
    yhat = model.predict(data_test)
    res['ACC']=np.mean(results)
    res['STD']=np.std(results)
    colyp=[x+'_p' for x in classes]
    df_y=pd.DataFrame(data=labels_test[:,3:], columns=classes[3:])
    df_yhat=pd.DataFrame(data=yhat, columns=colyp)
    info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
    info_test=info_test.reset_index(drop=True)
    df_preds=pd.concat([info_test,df_y,df_yhat], axis=1)
    df_preds['w1_gd']=df_preds[['gd6_p','gd5_p','gd4_p','gd3_p','gd2_p','gd1_p']].sum(axis=1)
    df_preds['wx_gd']=df_preds['gd0_p']
    df_preds['w2_gd']=df_preds[['gd-6_p','gd-5_p','gd-4_p','gd-3_p','gd-2_p','gd-1_p']].sum(axis=1)
    
    res=calc_prf(res, df_preds[['w1_p','wx_p','w2_p']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), 'ML', threshold=0.5)
    res=calc_prf(res, df_preds[['w1_gd','wx_gd','w2_gd']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), 'GD', threshold=0.5)
    res=calc_prf(res, df_preds[['w1_p','wx_p','w2_p']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), 'MLMAX', threshold='max')
    res=calc_prf(res, df_preds[['w1_gd','wx_gd','w2_gd']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), 'GDMAX', threshold='max')
    return res


In [209]:
data_train, data_test, labels_train, labels_test, info_train, info_test = train_test_split(data, labels, df_info, test_size=0.2, random_state=24)
print(data_train.shape, data_test.shape)

(108796, 171) (27200, 171)


df.isnull().any()

In [6]:
def get_model(n_inputs, n_outputs):
    model = Sequential()
    #model.add(Dense(1024, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(2048, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dense(4096, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    #model.add(Dense(4096, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    #model.add(BatchNormalization())
    #model.add(Dropout(0.4))
    #model.add(Dense(512, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    #model.add(Dropout(0.4))
    #model.add(Dense(64, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
    #model.add(Dense(n_outputs, activation='softmax'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    #model.compile(loss='mean_squared_error', optimizer='adam', metrics = ['accuracy'])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
    #model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return model

def evaluate_model(X, y, bs=64):
    results = list()
    n_inputs, n_outputs = X.shape[1], y.shape[1]
    # define evaluation procedure
    cv = RepeatedKFold(n_splits=3, n_repeats=2, random_state=1)
    # enumerate folds
    for train_ix, val_ix in cv.split(X):
        # prepare data
        X_train, X_val = X[train_ix], X[val_ix]
        y_train, y_val = y[train_ix], y[val_ix]
        # define model
        model = get_model(n_inputs, n_outputs)
        # fit model
        model.fit(X_train, y_train, batch_size = bs, epochs=7, verbose=0)
        # make a prediction on the test set
        yhat = model.predict(X_val)
        # round probabilities to class labels
        yhat = yhat.round()
        # calculate accuracy
        acc = accuracy_score(y_val, yhat)
        # store result
        print('>%.3f' % acc)
        results.append(acc)
        break
    return results, model

model.save('models/512-1024-8roi-welltrained.keras')

In [211]:
results, model = evaluate_model(data_train, labels_train, bs=64)
# summarize performance
print('Accuracy: %.3f (%.3f)' % (np.mean(results), np.std(results)))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
>0.021
Accuracy: 0.021 (0.000)


In [212]:
yhat = model.predict(data_test)

In [156]:
def odds2prob(df):
    df['odds_away']=1/df['odds_away']
    df['odds_draw']=1/df['odds_draw']
    df['odds_home']=1/df['odds_home']
    df['margin']=df[['odds_away','odds_draw','odds_home']].sum(axis=1)
    df['odds_away']=df['odds_away']/df['margin']
    df['odds_draw']=df['odds_draw']/df['margin']
    df['odds_home']=df['odds_home']/df['margin']
    return df[['odds_away','odds_draw','odds_home']]


def softmax(df, columns):
    df['margin']=df[columns].sum(axis=1)
    for x in columns:
        df[x]=df[x]/df['margin']
    df=df.drop(columns=['margin'])
    return df

In [213]:
#coly=np.hstack([['w1','wx','w2'],sch.columns,sca.columns,scgd.columns])
#coly=scgd.columns
colyp=[x+'_p' for x in classes]
df_y=pd.DataFrame(data=labels_test[:,3:], columns=classes[3:])
#df_y=pd.DataFrame(data=labels_test, columns=classes)
df_yhat=pd.DataFrame(data=yhat, columns=colyp)
info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
info_test=info_test.reset_index(drop=True)
df_preds=pd.concat([info_test,df_y,df_yhat], axis=1)
#df_preds=softmax(df_preds,['w1_p','wx_p','w2_p'])
#df_preds=softmax(df_preds,[x+'_p' for x in sch.columns])
#df_preds=softmax(df_preds,[x+'_p' for x in sca.columns])
#df_preds=softmax(df_preds,[x+'_p' for x in scgd.columns])
df_preds['w1_gd']=df_preds[['gd6_p','gd5_p','gd4_p','gd3_p','gd2_p','gd1_p']].sum(axis=1)
df_preds['wx_gd']=df_preds['gd0_p']
df_preds['w2_gd']=df_preds[['gd-6_p','gd-5_p','gd-4_p','gd-3_p','gd-2_p','gd-1_p']].sum(axis=1)

#df_preds

In [214]:
calc_prf({}, df_preds[['w1_p','wx_p','w2_p']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), 'ML', threshold=0.5)

WAG:7109; ACC: 0.5490223660149107; PRF: -139.97000000000003; ROI: -0.019689126459417644


{'ML_WAG': 7109,
 'ML_ACC': 0.5490223660149107,
 'ML_PRF': -139.97000000000003,
 'ML_ROI': -0.019689126459417644,
 'ML_TP1': 4330,
 'ML_TPX': 365,
 'ML_TP2': 3609,
 'ML_FP1': 3684,
 'ML_FPX': 692,
 'ML_FP2': 2574,
 'ML_PPV1': 0.54,
 'ML_PPVX': 0.345,
 'ML_PPV2': 0.584,
 'ML_AUC1': 0.671,
 'ML_AUCX': 0.521,
 'ML_AUC2': 0.679,
 'ML_F11': 0.624,
 'ML_F1X': 0.157,
 'ML_F12': 0.606}

In [215]:
conv=PredictionsConverter('op', df_preds[['w1_p','wx_p','w2_p']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), odds=True)
conv.make_df()
conv.profit()


WAG:7109; ACC: 0.5490223660149107; PRF: -139.97000000000003; ROI: -0.019689126459417644


(7109, 0.5490223660149107, -139.97000000000003, -0.019689126459417644)

In [159]:
#info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
conv1=PredictionsConverter('op', df_preds[['w1_gd','wx_gd','w2_gd']].values, df_preds[['w1','wx','w2']].values, info_test.copy(), odds=True)
conv1.make_df()
conv1.profit()

WAG:8573; ACC: 0.465531319258136; PRF: 105.81999999999996; ROI: 0.012343403709319955


(8573, 0.465531319258136, 105.81999999999996, 0.012343403709319955)

In [111]:
conv.performance_metrics()

Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,4472,8034,4474,2994,0.626,0.374,0.599,0.642,0.5,0.729,0.621,0.545,0.5
1,DRAW,1101,12509,2524,3840,0.681,0.247,0.223,0.832,0.304,0.765,0.527,0.257,0.5
2,AWAY,4406,8342,4065,3161,0.638,0.379,0.582,0.672,0.52,0.725,0.627,0.549,0.5


In [160]:
df_preds[['w1','wx','w2','w1_p','wx_p','w2_p','w1_gd','wx_gd','w2_gd']]

Unnamed: 0,w1,wx,w2,w1_p,wx_p,w2_p,w1_gd,wx_gd,w2_gd
0,0,0,1,0.153697,0.409941,0.436362,0.193199,0.416397,0.390404
1,0,0,1,0.389200,0.324374,0.286426,0.375709,0.338702,0.285590
2,1,0,0,0.255113,0.255719,0.489168,0.257906,0.263558,0.478536
3,1,0,0,0.367066,0.345683,0.287250,0.380905,0.317920,0.301175
4,1,0,0,0.551421,0.274133,0.174446,0.545328,0.263596,0.191075
...,...,...,...,...,...,...,...,...,...
27111,0,0,1,0.218319,0.248316,0.533365,0.226258,0.287219,0.486522
27112,1,0,0,0.292947,0.343484,0.363569,0.297586,0.360643,0.341771
27113,1,0,0,0.894143,0.081727,0.024130,0.913940,0.067083,0.018978
27114,0,0,1,0.158910,0.196134,0.644956,0.163151,0.217454,0.619394


In [122]:
from api.predictions_converter import PredictionsConverter
info_test=info_test.rename(columns={'ft1':'sc1','ft2':'sc2'})
conv_bookies=PredictionsConverter('op', api.util.odds2prob(info_test.copy()).values, labels_test, info_test.copy(), odds=True)
conv_bookies.make_df()
conv=PredictionsConverter('op', yhat, labels_test, info_test.copy(), odds=True)
conv.make_df()

conv_bookies.profit()
conv.profit()
conv_bookies.performance_metrics()
conv.performance_metrics()

WAG:5734; ACC: 0.14318102546215555; PRF: -721.0699999999999; ROI: -0.1257534007673526
WAG:8301; ACC: 0.5372846644982532; PRF: -215.48000000000002; ROI: -0.02595831827490664


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,385,1042,2445,1862,0.249,0.392,0.171,0.299,0.136,0.359,0.235,0.152,0.5
1,DRAW,1,4511,1,1221,0.787,0.213,0.001,1.0,0.5,0.787,0.5,0.002,0.5
2,AWAY,435,1002,2467,1830,0.251,0.395,0.192,0.289,0.15,0.354,0.24,0.168,0.5


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,3874,7598,2999,2466,0.677,0.374,0.611,0.717,0.564,0.755,0.664,0.586,0.5
1,DRAW,374,12079,818,3666,0.735,0.239,0.093,0.937,0.314,0.767,0.515,0.143,0.5
2,AWAY,4656,6143,4237,1901,0.638,0.387,0.71,0.592,0.524,0.764,0.651,0.603,0.5


In [123]:
conv_bookies1=PredictionsConverter('op', odds2prob(info_test.copy()).values, labels_test, info_test.copy(), odds=True)
conv_bookies1.make_df(threshold='max')
conv1=PredictionsConverter('op', yhat, labels_test, info_test.copy(), odds=True)
conv1.make_df(threshold='max')

conv_bookies1.profit()
conv1.profit()
conv_bookies1.performance_metrics()
conv1.performance_metrics()

WAG:12706; ACC: 0.21611836927435857; PRF: -1422.03; ROI: -0.1119179915000787
WAG:12706; ACC: 0.48504643475523374; PRF: -314.58; ROI: -0.024758381866834565


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,1340,3023,4935,3408,0.343,0.374,0.282,0.38,0.214,0.47,0.331,0.243,0.5
1,DRAW,9,9467,25,3205,0.746,0.253,0.003,0.997,0.265,0.747,0.5,0.006,0.5
2,AWAY,1397,2945,5017,3347,0.342,0.373,0.294,0.37,0.218,0.468,0.332,0.25,0.5


Unnamed: 0,Name,TP,TN,FP,FN,Accuracy,Prevalence,Sensitivity,Specificity,PPV,NPV,AUC,F1,Threshold
0,HOME,5525,11713,5265,4613,0.636,0.374,0.545,0.69,0.512,0.717,0.617,0.528,0.5
1,DRAW,873,18171,2029,6043,0.702,0.255,0.126,0.9,0.301,0.75,0.513,0.178,0.5
2,AWAY,6314,9944,7110,3748,0.6,0.371,0.628,0.583,0.47,0.726,0.605,0.538,0.5


#df=pd.read_csv('data/stats_generated.csv', index_col=None)
start=29
df=df_all[cols]
df['elo1'].fillna((df['elo1'].mean()), inplace=True)
df['elo2'].fillna((df['elo2'].mean()), inplace=True)
nulls=pd.DataFrame(df.isna().sum(), columns=['n'])
cols_null=[x for x in nulls[nulls.n>60000].index if x not in COL_INF]
cols=[x for x in cols if x not in cols_null]


for col in COL_INF:
    df.loc[df[col].isnull(),col]=0

#df=df.dropna()
df=df.fillna(0)
#[df[col].fillna(df[col].mean(), inplace=True) for col in df.columns[start:]]

scaler=MinMaxScaler()
nums=scaler.fit_transform(df[df.columns[start:]].values)
nums_df=pd.DataFrame(nums, columns=df.columns[start:])
df.reset_index(drop=True, inplace=True)
df=pd.concat([df[df.columns[:start]],nums_df], axis=1)

nums1=df[['vote1', 'votex', 'vote2',]].values

df_info=df[COL_INF]

encoder = OneHotEncoder()
pop_r=encoder.fit_transform(df[['pop_r']]).toarray()
rounds=encoder.fit_transform(df[['round']]).toarray()
countries=encoder.fit_transform(df[['country_id']]).toarray()
encoder = OneHotEncoder()
form1=encoder.fit_transform(df[['form1']]).toarray()
encoder = OneHotEncoder()
form2=encoder.fit_transform(df[['form2']]).toarray()
side=df[['side']].values

#data=np.hstack([nums,nums1,side,rounds,countries,form1,form2])
data=np.hstack([nums,nums1,side,pop_r,rounds,countries,form1,form2])

df['gd']=df['ft1']-df['ft2']
df['gd']=np.where(df['gd']>5,6,df['gd'])
df['gd']=np.where(df['gd']<-5,-6,df['gd'])
scgd=pd.get_dummies(df['gd'], prefix='gd')

df['sch']=np.where(df['ft1']>5,6,df['ft1'])
df['sca']=np.where(df['ft2']>5,6,df['ft2'])
sch=pd.get_dummies(df['sch'], prefix='sch')
sca=pd.get_dummies(df['sca'], prefix='sca')
labels=np.hstack([df[['w1', 'wx', 'w2']].values,sch.values,sca.values,scgd.values])
#labels=np.hstack([scgd.values])
#labels=df[['w1', 'wx', 'w2']].values

data.shape,labels.shape

In [87]:
conv.DF

Unnamed: 0,ds,country,liga,t1,t2,sc1,sc2,odds_home,odds_draw,odds_away,winner_home,winner_draw,winner_away,pred_home,pred_draw,pred_away,prob_home,prob_draw,prob_away,win,prf
0,2017-10-28 17:00:00+00:00,argentina,liga-profesional-de-futbol,ca huracan,lanus,4.0,0.0,1.84,3.21,4.84,1,0,0,0,1,0,0.419015,0.785230,0.177421,0,-1.00
1,2018-08-11 14:00:00+00:00,england,championship,aston villa,wigan athletic,3.0,2.0,0.00,0.00,0.00,1,0,0,1,0,0,0.916121,0.320084,0.033951,1,-1.00
2,2018-12-01 19:30:00+00:00,italy,serie-a,sampdoria,bologna,4.0,1.0,2.02,3.26,4.17,1,0,0,0,1,0,0.416682,0.878468,0.090711,0,-1.00
3,2018-08-26 12:30:00+00:00,netherlands,eredivisie,fc utrecht,vvvvenlo,1.0,1.0,0.00,0.00,0.00,0,1,0,1,0,0,0.908112,0.472075,0.014991,0,0.00
4,2017-05-14 18:00:00+00:00,spain,laliga,athletic bilbao,leganes,1.0,1.0,1.47,4.43,7.41,0,1,0,1,1,0,0.645087,0.798480,0.056598,1,3.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13553,2018-08-14 18:45:00+00:00,england,efl-cup,yeovil town,aston villa,0.0,1.0,0.00,0.00,0.00,0,0,1,1,1,0,0.738303,0.974766,0.002282,0,0.00
13554,2018-04-07 14:00:00+00:00,ukraine,premier-league-relegation-round,oleksandria,pfc feniks bucha,2.0,0.0,1.82,3.25,4.58,1,0,0,1,1,0,0.570895,0.614656,0.238248,1,2.25
13555,2015-12-12 17:30:00+00:00,austria,bundesliga,sv ried,wolfsberger ac,1.0,0.0,0.00,0.00,0.00,1,0,0,0,0,1,0.429426,0.001383,0.962793,0,0.00
13556,2016-09-21 15:30:00+00:00,finland,veikkausliiga,ifk mariehamn,inter turku,1.0,1.0,1.90,3.37,4.13,0,1,0,1,0,0,0.567031,0.176261,0.496240,0,-1.00


In [126]:
conv.DF.loc[conv.DF['odds_home']>0].win.mean()

0.5372846644982532

In [124]:
res=conv.DF.loc[conv.DF['odds_home']>0]

includes=['vote1', 'votex', 'vote2','elo1','elo2','drift_home', 'drift_away', 'drift_draw','oddsprob_home', 'oddsprob_draw', 'oddsprob_away']
cols=[x for x in df_all.columns if x!='round.1' and x!='ds.1']
cols=[x for x in cols if not any(i in x for i in includes)]
cols=[x for x in cols if  not 'ht1' in x and not 'ht2' in x and not 'psht' in x]
cols=[x for x in cols if not 'tar_' in x and not 'opp_' in x]
#cols1=[x for x in df_all.columns if '_form' in x]
cols=cols+includes


In [125]:
res[res['pred_home']==1].prf.sum(),res[res['pred_draw']==1].prf.sum(),res[res['pred_away']==1].prf.sum()

(-110.32000000000002, 26.24999999999999, -121.71000000000002)

In [91]:
res.to_csv('data/pred.csv', index=False)

# sdef
$ \frac{1}{2} $