In [None]:
# Import the necessary libraries.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import scipy as scipy
from IPython.display import display
import matplotlib.pyplot as plt
import importlib as il
import nbimporter

import data_functions as df
il.reload(df)
import model_functions as mf
il.reload(mf)

%matplotlib inline

## Training and Evaluating Models

In [None]:
# Season window used to build the training set: `num_seasons` seasons starting at
# index `start_season`, as interpreted by df.get_seasons.
# NOTE(review): the inline hints suggest seasons are indexed 0..16. The prediction
# cell further down requests season `start_season + num_seasons` (= 17 with these
# values) -- confirm that index exists in the dataset.
start_season = 11    #0...16
num_seasons = 6    #0...16


# Columns removed by df.drop_basic_columns before modelling. The commented-out
# entries ('MW', 'DiffPts') are therefore kept as features.
drop_columns = ['Unnamed: 0','HomeTeam', 'AwayTeam', 'Date',  'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HomeTeamLP', 'AwayTeamLP','HTFormPts','ATFormPts',
           'HTLossStreak5','ATLossStreak5','HTWinStreak5','ATWinStreak5',
           'HTWinStreak3','HTLossStreak3','ATWinStreak3','ATLossStreak3',
                'HM4','HM5','AM4','AM5',
                #'MW',
                #'DiffPts',
                'HTGS', 'ATGS', 'HTGC', 'ATGC',
               ]

# Columns handed to df.scale_features.
cols_to_scale = ['HTGD','ATGD','HTP','ATP','DiffLP']

# Load the dataset and run the preprocessing helpers in order. Lines that are
# commented out are optional steps that are currently disabled.
data = pd.read_csv('./Datasets/final_dataset.csv')
data = df.get_seasons(data, start_season, num_seasons)
data = df.delete_first_3_weeks(data)
#data = data[350*s_num_temporadas:len(data)-350*num_temporadas]
#display(data.tail())
data = df.drop_basic_columns(data, drop_columns)
#data = df.drop_teams_onehot(data)
#data = df.odds_to_prob(data)
#data = df.explore_data(data)
#data = df.scatter(data)
#data = df.extract_pca(data)
# `scaler` is kept so the same object can be passed back to df.scale_features
# when the prediction season is preprocessed later in the notebook.
data, scaler = df.scale_features(data,cols_to_scale)
#data = df.fill_nan(data)
#data = data.dropna()
data = df.form_to_str(data)
data = df.preprocess_features(data)

# Show the feature information by printing the first five rows
#print("\nFeature values:")
#display(data.head())
 

# Tuning the parameters of XGBoost.

# Fitting the model on the whole dataset for future predictions.

In [None]:
# Feature matrix: every column except the full-time result target 'FTR'.
# `axis` is passed by keyword because recent pandas releases no longer accept it
# positionally in DataFrame.drop.
X_all = data.drop(['FTR'], axis=1)

# Parameter grid handed to mf.get_model. Every entry holds a single candidate,
# so the "grid" pins one configuration.
parameters_xg = {
    'learning_rate': [0.03],
    'n_estimators': [100],
    'max_depth': [8],   # 6 was tried previously
    'min_child_weight': [5],
    'gamma': [0.2],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'scale_pos_weight': [1],
    'reg_alpha': [1e-2],
}

# Home-win-vs-rest model built through the project helper.
# The draw ('D') and away ('A') variants used the same call with the label swapped
# and 0 as the last argument; they are currently disabled.
clf_H = mf.get_model(xgb.XGBClassifier(seed=2),
                     X_all,
                     df.binarize_FTR(data['FTR'], label='H'),
                     parameters_xg, 'H', 1)

In [None]:
# Feature matrix: every column except the target. `axis` is passed by keyword
# because recent pandas releases reject a positional axis in DataFrame.drop.
X_all = data.drop(['FTR'], axis=1)

# Position-based sample weights: rows 0-349 get weight 1, rows 350-699 get 2, ...
# NOTE(review): 350 presumably approximates the matches kept per season and the
# rows are presumably in chronological order -- confirm both.
weights = [idx // 350 + 1 for idx in range(len(X_all))]

# Home-win-vs-rest classifier with the hyper-parameters chosen during tuning.
clf_H = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.2, learning_rate=0.03, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0.01, reg_lambda=1,
       scale_pos_weight=1, seed=2, silent=True, subsample=0.8)

clf_H.fit(X_all, df.binarize_FTR(data['FTR'], label='H'), sample_weight=weights)
display(clf_H)

# Scores are computed on the same rows the model was fitted on (training scores).
f1, acc = mf.predict_labels(clf_H, X_all, df.binarize_FTR(data['FTR'], label='H'), 'H')
display("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

In [None]:
# Imbalance ratio for the draw-vs-rest target: number of rows labelled 'N' per row
# labelled 'D' (presumably 'N' marks every non-draw match -- see df.binarize_FTR).
not_draw_count = df.binarize_FTR(data['FTR'], label='D').value_counts()['N']
draw_count = df.binarize_FTR(data['FTR'], label='D').value_counts()['D']
weight = not_draw_count / draw_count

# Same hyper-parameters as the home-win model, except scale_pos_weight uses the ratio.
clf_D = xgb.XGBClassifier(
    objective='binary:logistic',
    base_score=0.5,
    learning_rate=0.03,
    n_estimators=100,
    max_depth=8,
    min_child_weight=5,
    max_delta_step=0,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    colsample_bylevel=1,
    reg_alpha=0.01,
    reg_lambda=1,
    scale_pos_weight=weight,
    missing=None,
    nthread=-1,
    seed=2,
    silent=True,
)

# Reuses X_all and the per-row `weights` defined in an earlier cell.
clf_D.fit(X_all, df.binarize_FTR(data['FTR'], label='D'), sample_weight=weights)
display(clf_D)

# Training-set scores: the model is evaluated on the rows it was fitted on.
f1, acc = mf.predict_labels(clf_D, X_all, df.binarize_FTR(data['FTR'], label='D'), 'D')
display("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

In [None]:
# Imbalance ratio for the away-win-vs-rest target: number of rows labelled 'N' per
# row labelled 'A' (presumably 'N' marks every other match -- see df.binarize_FTR).
not_away_count = df.binarize_FTR(data['FTR'], label='A').value_counts()['N']
away_count = df.binarize_FTR(data['FTR'], label='A').value_counts()['A']
weight = not_away_count / away_count

# Same hyper-parameters as the home-win model, except scale_pos_weight uses the ratio.
clf_A = xgb.XGBClassifier(
    objective='binary:logistic',
    base_score=0.5,
    learning_rate=0.03,
    n_estimators=100,
    max_depth=8,
    min_child_weight=5,
    max_delta_step=0,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    colsample_bylevel=1,
    reg_alpha=0.01,
    reg_lambda=1,
    scale_pos_weight=weight,
    missing=None,
    nthread=-1,
    seed=2,
    silent=True,
)

# Reuses X_all and the per-row `weights` defined in an earlier cell.
clf_A.fit(X_all, df.binarize_FTR(data['FTR'], label='A'), sample_weight=weights)
display(clf_A)

# Training-set scores: the model is evaluated on the rows it was fitted on.
f1, acc = mf.predict_labels(clf_A, X_all, df.binarize_FTR(data['FTR'], label='A'), 'A')
display("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))

# Multiclass

In [None]:
# Per-class imbalance ratios in the order [H, D, A]: rows labelled 'N' divided by
# rows carrying the class label in the corresponding one-vs-rest target.
weight = [
    df.binarize_FTR(data['FTR'],label='H').value_counts()['N']/df.binarize_FTR(data['FTR'],label='H').value_counts()['H'],
    df.binarize_FTR(data['FTR'],label='D').value_counts()['N']/df.binarize_FTR(data['FTR'],label='D').value_counts()['D'],
    df.binarize_FTR(data['FTR'],label='A').value_counts()['N']/df.binarize_FTR(data['FTR'],label='A').value_counts()['A']
]
label_to_ratio = {'H': weight[0], 'D': weight[1], 'A': weight[2]}

# One weight per row: the class ratio times a position factor that grows by 0.2
# for every block of 350 rows (0.2, 0.4, 0.6, ...).
# NOTE(review): 350 presumably approximates the matches kept per season -- confirm.
weights = []
for counter, ftr in enumerate(data['FTR']):
    if ftr not in label_to_ratio:
        # The previous if/elif chain left the weight undefined (first row) or
        # silently reused the previous row's value for unexpected labels.
        raise ValueError("Unexpected FTR value {!r} at row position {}".format(ftr, counter))
    position_factor = (int(counter / 350)+1)*2*0.1
    weights.append(label_to_ratio[ftr] * position_factor)

In [None]:
from sklearn.metrics import confusion_matrix

# Feature matrix: every column except the target. `axis` is passed by keyword
# because recent pandas releases reject a positional axis in DataFrame.drop.
X_all = data.drop(['FTR'], axis=1)

# Three-way (H / D / A) classifier; same tree hyper-parameters as the binary models
# but with the multi-class softmax objective.
clf_M = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.2, learning_rate=0.03, max_delta_step=0, max_depth=8,
       min_child_weight=5, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0.01, reg_lambda=1,
       seed=2, silent=True, subsample=0.8)

# `weights` is the per-row list built in the previous cell.
clf_M.fit(X_all, data['FTR'], sample_weight=weights)
display(clf_M)

# Training-set diagnostics: predictions are made on the rows used for fitting.
y_pred = clf_M.predict(X_all)

print(confusion_matrix(data['FTR'], y_pred))
acc = sum(data['FTR'] == y_pred) / float(len(y_pred))
print(acc)

In [None]:
# Hand the multiclass model fitted above to the project helper that reports
# feature importances.
mf.show_features_importances(clf_M)

# Keras

In [None]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
# train_test_split is imported from sklearn.model_selection (the module this notebook
# already uses for KFold and cross_val_score); the old sklearn.cross_validation
# module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

class EarlyStoppingByLossVal(keras.callbacks.Callback):
    """Stop training once a monitored metric falls below a fixed threshold.

    Parameters
    ----------
    monitor : str
        Key looked up in the ``logs`` dict passed to ``on_epoch_end``.
    value : float
        Training stops when the monitored quantity is strictly below this value.
    verbose : int
        When greater than 0, a message is printed at the stopping epoch.
    """

    def __init__(self, monitor='val_loss', value=0.00001, verbose=0):
        # Name this class (not the base class) so Callback.__init__ actually runs.
        super(EarlyStoppingByLossVal, self).__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        """Set ``self.model.stop_training`` when the monitored value is below the threshold."""
        # Imported locally: `warnings` is not imported in the notebook cells shown.
        import warnings

        current = (logs or {}).get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare against; comparing None with a float would raise.
            return
        if current < self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True
        
# Callbacks for the cross-validated Keras run. Only TensorBoard logging is active;
# the early-stopping and checkpoint callbacks that were tried here are disabled.
tensorboard_logger = keras.callbacks.TensorBoard(log_dir='./tf_model_full')
callbackss = [tensorboard_logger]

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Feature matrix without the target, NaNs handled by the project helper.
# `axis` is passed by keyword because recent pandas releases reject a positional
# axis in DataFrame.drop.
X_all = data.drop(['FTR'], axis=1)
X_all = df.fill_nan(X_all)
y_all = data['FTR']

# Encode the string labels as integers, then as one-hot rows for the softmax output.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y_all)
dummy_y = np_utils.to_categorical(encoded_Y)

# Stratified 85/15 split (stratified on the original string labels).
X_train, X_test, y_train, y_test = train_test_split(X_all, dummy_y,
                                                    test_size=0.15,
                                                    random_state=2,
                                                    stratify=y_all)
                                                    
def create_model():
    """Build and compile the three-class feed-forward network.

    The input width is read from the module-level ``data`` frame (all columns
    except one, i.e. the target). Architecture: Dense 40 -> 20 -> 10 with ReLU,
    each followed by 20% dropout, then a 3-unit softmax output.

    Returns
    -------
    A compiled ``Sequential`` model (categorical cross-entropy, Adagrad, accuracy).
    """
    # `data` is only read here, so no `global` declaration is needed.
    n_features = data.shape[1] - 1

    model = Sequential()
    model.add(Dense(40, activation='relu', kernel_initializer='normal', input_shape=(n_features,)))
    model.add(Dropout(0.2))
    model.add(Dense(20, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.2))
    model.add(Dense(3, activation='softmax', kernel_initializer='normal'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adagrad(lr=0.1, epsilon=1e-08, decay=0.0),
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it is called directly
    # instead of being wrapped in display(), which would render a stray "None".
    model.summary()
    return model
    
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import math

# Per-class imbalance ratios in the order [H, D, A]: rows labelled 'N' divided by
# rows carrying the class label in the corresponding one-vs-rest target.
weight = [
    df.binarize_FTR(data['FTR'],label='H').value_counts()['N']/df.binarize_FTR(data['FTR'],label='H').value_counts()['H'],
    df.binarize_FTR(data['FTR'],label='D').value_counts()['N']/df.binarize_FTR(data['FTR'],label='D').value_counts()['D'],
    df.binarize_FTR(data['FTR'],label='A').value_counts()['N']/df.binarize_FTR(data['FTR'],label='A').value_counts()['A']
]

# Log-damped class weights keyed by encoded class index.
# NOTE(review): the mapping 0->A, 1->D, 2->H assumes the label encoder orders the
# classes alphabetically -- confirm. `class_weight` is currently unused because the
# corresponding fit_params entry below is commented out.
class_weight = {0 : math.log(weight[2]),
    1: math.log(weight[1]),
    2: math.log(weight[0])}

# scikit-learn wrapper around create_model so it can be scored with cross_val_score.
estimator = KerasClassifier(build_fn=create_model, epochs=500, batch_size=320, verbose=1)
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Targets are the one-hot rows built earlier in this cell.
y_keras = dummy_y
#y_keras = pd.DataFrame(dummy_y,index=X_all.index).join(X_all).values
# Cross-validation runs on the full X_all; the X_train/X_test split created above
# is not used here. Only the callbacks are forwarded to fit().
results = cross_val_score(estimator, X_all.values, y_keras, cv=kfold,  fit_params={
        #'sample_weight': np.array(weights),
        #'class_weight': class_weight,
        'callbacks': callbackss
            }
                         )
# Mean and standard deviation of the per-fold scores, as percentages.
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# BASE LINE
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop
# train_test_split is imported from sklearn.model_selection (the module this notebook
# already uses for KFold and cross_val_score); the old sklearn.cross_validation
# module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split

    
# Feature matrix without the target, NaNs handled by the project helper.
# `axis` is passed by keyword because recent pandas releases reject a positional
# axis in DataFrame.drop.
X_all = data.drop(['FTR'], axis=1)
X_all = df.fill_nan(X_all)
y_all = data['FTR']


def only_hw(string):
    """Return 1 when ``string`` equals 'A', otherwise 0.

    Used to turn the full-time result into a binary target. Note that, despite
    the name, the positive class is 'A'.
    """
    return 1 if string == 'A' else 0
    
# Binary target produced by only_hw; it is also used as the stratification key
# for the 85/15 split.
binary_target = y_all.apply(only_hw)
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    binary_target,
    test_size=0.15,
    random_state=2,
    stratify=binary_target,
)

# Binary baseline network: Dense 20 -> 10 -> 5 with ReLU, each followed by 20%
# dropout, then a single sigmoid output unit.
model = Sequential()
model.add(Dense(20, activation='relu', kernel_initializer='normal', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu', kernel_initializer='normal'))
model.add(Dropout(0.2))
model.add(Dense(5, activation='relu', kernel_initializer='normal'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid', kernel_initializer='normal'))
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in display(), which would render a stray "None".
model.summary()


model.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adagrad(lr=0.1, epsilon=1e-08, decay=0.0),
              metrics=['binary_accuracy'])

class EarlyStoppingByLossVal(keras.callbacks.Callback):
    """Stop training once a monitored metric falls below a fixed threshold.

    Parameters
    ----------
    monitor : str
        Key looked up in the ``logs`` dict passed to ``on_epoch_end``.
    value : float
        Training stops when the monitored quantity is strictly below this value.
    verbose : int
        When greater than 0, a message is printed at the stopping epoch.
    """

    def __init__(self, monitor='val_loss', value=0.00001, verbose=0):
        # Name this class (not the base class) so Callback.__init__ actually runs.
        super(EarlyStoppingByLossVal, self).__init__()
        self.monitor = monitor
        self.value = value
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs=None):
        """Set ``self.model.stop_training`` when the monitored value is below the threshold."""
        # Imported locally: `warnings` is not imported in the notebook cells shown.
        import warnings

        current = (logs or {}).get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare against; comparing None with a float would raise.
            return
        if current < self.value:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True
        
# Callbacks for the baseline fit: stop once val_loss drops below 0.16, and log to
# TensorBoard. The patience-based early stopping and checkpoint callbacks that were
# tried here are disabled.
loss_threshold_stop = EarlyStoppingByLossVal(monitor='val_loss', value=0.16, verbose=1)
tensorboard_logger = keras.callbacks.TensorBoard(log_dir='./tf_model_full')
callbacks = [loss_threshold_stop, tensorboard_logger]

# Train the baseline network. The held-out split doubles as validation data, so
# the val_loss watched by the callbacks is computed on X_test / y_test.
holdout_pair = (X_test.values, y_test.values)
history = model.fit(
    X_train.values,
    y_train.values,
    epochs=1000,
    batch_size=32,
    shuffle=True,
    validation_data=holdout_pair,
    callbacks=callbacks,
    verbose=2,
)

In [None]:
# Evaluate on the held-out split; the first two entries of `score` are reported
# as loss and accuracy.
score = model.evaluate(X_test.values, y_test.values, verbose=0)
test_loss = score[0]
test_accuracy = score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

In [None]:
# Model outputs for the held-out feature rows.
val_df = model.predict(X_test.values)

In [None]:
# Show the predictions computed in the previous cell (bare expression => cell output).
val_df

# NEAT

In [None]:
#http://neat-python.readthedocs.io/en/latest/index.html
import neat
import visualize
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import random
# Seed Python's RNG so the run below starts from a fixed random state.
random.seed(42)

# Feature matrix for NEAT: target removed, team one-hot columns dropped and NaNs
# handled by the project helpers. `axis` is passed by keyword because recent
# pandas releases reject a positional axis in DataFrame.drop.
X_all = data.drop(['FTR'], axis=1)
X_all = df.drop_teams_onehot(X_all)
X_all = df.fill_nan(X_all)

y_all = data['FTR']

# Encode the string labels as integers, then as one-hot rows.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y_all)
dummy_y = np_utils.to_categorical(encoded_Y)

# 2-input XOR inputs and expected outputs.
#xor_inputs = [(0.0, 0.0), (0.0, 1.0), (1.0, 0.0), (1.0, 1.0)]
#xor_outputs = [   (0.0,),     (1.0,),     (1.0,),     (0.0,)]


def eval_genomes(genomes, config):
    """Score each genome by its hit rate on the module-level training data.

    A sample counts as a hit when exactly one of the first three network outputs
    is >= 0.5 (the other two < 0.5) and the one-hot target has a 1 at that same
    position. The genome's fitness is hits / number of samples. One '.' is printed
    per evaluated genome as a progress marker.
    """
    class_positions = (0, 1, 2)
    for _genome_id, genome in genomes:
        genome.fitness = 0.0
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        for features, target in zip(X_all.values, dummy_y):
            output = net.activate(features)
            for pos in class_positions:
                fires = output[pos] >= 0.5
                others_quiet = all(output[other] < 0.5
                                   for other in class_positions if other != pos)
                if fires and others_quiet and target[pos] == 1:
                    genome.fitness += 1
                    # The three cases are mutually exclusive; stop at the first hit.
                    break
        genome.fitness /= len(X_all.values)
        print('.', end='')
    
def eval_genomes_balance(genomes, config):
    """Score each genome by a stake/payout balance over the module-level training data.

    For every sample and every output that is >= 0.5, the fitness is reduced by 1;
    if the paired one-hot target entry is 1, the value of the paired feature column
    is added back. The fitness is the running total (not normalised). One '.' is
    printed per evaluated genome as a progress marker.
    """
    # (network output index, one-hot target index, feature column added on a hit)
    # NOTE(review): feature columns 20/21/22 are presumably the three odds columns
    # and the target indices presumably follow the encoder's class order -- confirm
    # both against X_all's column layout.
    payout_rules = (
        (0, 2, 20),
        (1, 1, 21),
        (2, 0, 22),
    )
    for _genome_id, genome in genomes:
        genome.fitness = 0.0
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        for xi, xo in zip(X_all.values, dummy_y):
            output = net.activate(xi)
            for out_idx, target_idx, payout_col in payout_rules:
                if output[out_idx] >= 0.5:
                    genome.fitness -= 1
                    if xo[target_idx] == 1.0:
                        genome.fitness += xi[payout_col]
        print('.', end='')

def run(config_file):
    """Evolve a NEAT population and return the winning feed-forward network.

    Parameters
    ----------
    config_file :
        Passed to ``neat.Config`` as the configuration source.

    Returns
    -------
    The ``neat.nn.FeedForwardNetwork`` created from the winning genome.

    Side effects: prints progress and the best genome, and plots the run
    statistics via ``visualize.plot_stats``.
    """
    # Load configuration.
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_file)

    # Create the population, which is the top-level object for a NEAT run.
    p = neat.Population(config)

    # Add a stdout reporter to show progress in the terminal, plus a statistics
    # reporter whose data is plotted at the end.
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    #p.add_reporter(neat.Checkpointer(5))

    # Run with the balance-based fitness function, passing 100 as the generation
    # limit (the accuracy-based variant with 300 is disabled).
    #winner = p.run(eval_genomes, 300)
    winner = p.run(eval_genomes_balance, 100)
    #pe = neat.ThreadedEvaluator(3, eval_genome)
    #winner = p.run(pe.evaluate, 300)
    #pe.stop()

    # Display the winning genome.
    print('\nBest genome:\n{!s}'.format(winner))

    # Build the network for the most fit genome.
    print('\nOutput:')
    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)
    # NOTE(review): this loop activates the network on every training row but the
    # result is discarded (the print below is commented out) -- candidate for removal.
    for xi, xo in zip(X_all.values, dummy_y):
        output = winner_net.activate(xi)
        #print("input {!r}, expected output {!r}, got {!r}".format(xi, xo, output))

    #node_names = {-1:'A', -2: 'B', 0:'A XOR B'}
    #visualize.draw_net(config, winner, True, node_names=node_names)
    visualize.plot_stats(stats, ylog=False, view=True)
    #visualize.plot_species(stats, view=True)

    #p = neat.Checkpointer.restore_checkpoint('neat-checkpoint-4')
    #p.run(eval_genomes, 10)
    return winner_net

# Run the NEAT evolution with the 'config-feedforward' configuration.
# Note: this rebinds `model`, which earlier cells used for the Keras network.
model = run('config-feedforward')   

In [None]:
import pickle

# pickle.dump expects a writable binary file object as its second argument, not a
# path string; open the target file and let the context manager close it.
with open('neat_balance.model', 'wb') as model_file:
    pickle.dump(model, model_file)

# Predict the last season

In [None]:

# Reload the dataset and keep the single season that follows the training window.
# NOTE(review): with start_season=11 and num_seasons=6 this requests season index
# 17 -- confirm that df.get_seasons has data for it.
tdata = pd.read_csv('./Datasets/final_dataset.csv')

tdata = df.get_seasons(tdata, start_season+num_seasons, 1)
tdata = df.delete_first_3_weeks(tdata)

# Team names and odds kept for the report. An explicit copy is taken because new
# columns are assigned to `pred_labels` later in this cell; assigning to a slice
# of `tdata` triggers pandas' SettingWithCopyWarning.
pred_labels = tdata[['HomeTeam','AwayTeam','IWH','IWD','IWA']].copy()

# Same preprocessing steps as the training data, passing in the `scaler` object
# returned by the training-time df.scale_features call.
tdata = df.drop_basic_columns(tdata, drop_columns)
tdata, scaler = df.scale_features(tdata, cols_to_scale, scaler)
tdata = df.form_to_str(tdata)
tdata = df.preprocess_features(tdata)

# Show the feature information by printing the first five rows
#print("\nFeature values:")
#display(data.head())


# Hold-out feature matrix: every column except the target. `axis` is passed by
# keyword because recent pandas releases reject a positional axis in DataFrame.drop.
X_last = tdata.drop(['FTR'], axis=1)

# Actual result next to the three odds columns, used by the betting simulation.
y_last_bets = pd.DataFrame(tdata['FTR']).join(X_last[['IWH','IWD','IWA']])

#y_pred_H, y_pred_prob_H = mf.simulate_predict(clf_H, X_last, df.binarize_FTR(tdata['FTR'],label='H'), 'H')
#y_pred_D, y_pred_prob_D = mf.simulate_predict(clf_D, X_last, df.binarize_FTR(tdata['FTR'],label='D'), 'D')
#y_pred_A, y_pred_prob_A = mf.simulate_predict(clf_A, X_last, df.binarize_FTR(tdata['FTR'],label='A'), 'A')

#lbalance_H = mf.simulate_bets(y_pred_H, y_pred_prob_H, y_last_bets, 'H','IWH',0.5, 1.15)
#lbalance_D = mf.simulate_bets(y_pred_D, y_pred_prob_D, y_last_bets, 'D','IWD',0.3, 1.15)
#lbalance_A = mf.simulate_bets(y_pred_A, y_pred_prob_A, y_last_bets, 'A','IWA',0.6, 1.15)

# Reciprocal of each odds column (the bookmaker-implied probability, assuming the
# IW* columns hold decimal odds). Each '1IWx' column is derived from its own 'IWx'
# column; previously '1IWA' and '1IWD' were computed from each other's odds.
# The column creation order (H, A, D) is kept unchanged.
pred_labels['1IWH'] = 1/pred_labels['IWH']
pred_labels['1IWA'] = 1/pred_labels['IWA']
pred_labels['1IWD'] = 1/pred_labels['IWD']

#display(pd.DataFrame(pred_labels)      
#      .join(pd.DataFrame(y_pred_prob_H,index=pred_labels.index,columns=['PH','NH']))
#      .join(pd.DataFrame(y_pred_prob_D,index=pred_labels.index,columns=['PD','ND']))
#      .join(pd.DataFrame(y_pred_prob_A,index=pred_labels.index,columns=['PA','NA']))
#     )

# Predicted labels and per-class probabilities for the hold-out season.
y_pred = clf_M.predict(X_last)
y_pred_prob = clf_M.predict_proba(X_last)

# Share of hold-out rows whose predicted label matches the actual result.
correct_count = sum(tdata['FTR'] == y_pred)
acc = correct_count / float(len(y_pred))
print(acc)

# Report table: the label/odds frame with the predicted result appended as 'PFTR'.
predicted_column = pd.DataFrame(y_pred, index=pred_labels.index, columns=['PFTR'])
display(pd.DataFrame(pred_labels).join(predicted_column))
display(y_pred_prob)

In [None]:
#
import math
# Flat-stake betting simulation over the hold-out season.
balance = 100
bet = balance/20   # stake is fixed up front at 1/20 of the starting balance

wins = 0
skipped = 0
lbalance = []      # balance recorded before each row is processed

# predicted label -> (index into the probability row, minimum probability, odds column)
# NOTE(review): probability indices 2/0/1 for H/A/D assume the classifier orders
# its classes alphabetically (A, D, H) -- confirm against clf_M.classes_.
bet_rules = {
    'H': (2, 0.34, 'IWH'),
    'A': (0, 0.7, 'IWA'),
    'D': (1, 0.45, 'IWD'),
}
min_odds = 1.15

bet_line = "{:.0f}\t{}\t[{}\t{:.3f}\t{:.3f}\t{:.3f}]\t<{:.0f}>\t[{:.3f}\t{:.3f}\t{:.3f}]\t[{:.3f}\t{:.3f}\t{:.3f}]\t{:.0f} "
skip_line = "{:.0f}\t{}\t[{}\t{:.3f}\t{:.3f}\t{:.3f}]\t<{:.0f}>\t[{:.3f}\t{:.3f}\t{:.3f}]\t[{:.3f}\t{:.3f}\t{:.3f}]\t{:.0f} skip"

for position, (index, row) in enumerate(y_last_bets.iterrows()):
    ftr = row['FTR']
    lbalance.append(balance)

    # Rows without home odds or without a valid result are skipped silently.
    if math.isnan(row['IWH']) or ftr not in ('H', 'D', 'A'):
        skipped = skipped + 1
        continue

    predicted = y_pred[position]
    probs = y_pred_prob[position]

    # A bet is placed only when the predicted label has a rule, its probability
    # exceeds the rule's threshold, and the matching odds exceed the minimum.
    rule = bet_rules.get(predicted)
    place_bet = False
    if rule is not None:
        prob_idx, min_prob, odds_col = rule
        place_bet = probs[prob_idx] > min_prob and row[odds_col] > min_odds

    if not place_bet:
        skipped = skipped + 1
        print(skip_line.format(position, ftr, predicted, probs[2], probs[1], probs[0], index,
                               1/row['IWH'], 1/row['IWD'], 1/row['IWA'],
                               row['IWH'], row['IWD'], row['IWA'], balance))
        continue

    # Pay the stake; on a correct prediction collect stake * odds.
    balance = balance - bet
    if ftr == predicted:
        wins = wins+1
        balance = balance + (bet*row[odds_col])

    print(bet_line.format(position, ftr, predicted, probs[2], probs[1], probs[0], index,
                          1/row['IWH'], 1/row['IWD'], 1/row['IWA'],
                          row['IWH'], row['IWD'], row['IWA'], balance))


# Number of bets actually placed; guarded against zero for the ratio below.
total = len(y_pred)-skipped
if total == 0:
    total = 1
print(len(y_pred)-skipped, wins)
print("Balance and accuracy score for training set: {:.4f} , {:.4f}.".format(balance , (wins/total)))


In [None]:
# Balance recorded before each hold-out row, in iteration order.
fig, ax = plt.subplots()
ax.plot(lbalance)
plt.show()