In [1]:
import json
import csv
import joblib
from os import path
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from datetime import datetime, timedelta, date
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from xgboost import XGBClassifier
from sklearn import svm
from sklearn import gaussian_process
from sklearn import naive_bayes
from sklearn import neural_network
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score,  f1_score, log_loss
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', 500)

In [2]:
#ds = datetime.strptime('2017-01-01', '%Y-%m-%d')
df=pd.read_csv('data/to_train.csv')
df['marketTime']=pd.to_datetime(df['marketTime'])
df.sort_values(by='marketTime', inplace=True)
df['rid']=df.course.str.lower().replace(regex=True,to_replace=r'\\W|\s',value=r'')+df.marketTime.dt.strftime('%Y%m%d%H%M')
df=df.fillna(0)

df_pred=pd.read_csv('data/topred.csv')
df_pred=df_pred.fillna(0)


In [3]:
df_pred=pd.read_csv('data/topred.csv')
df_pred=df_pred.fillna(0)
cols=['rid','course', 'marketTime', 'horseName', 'position','res_win', 'res_place', 'runners', 'ncond', 'metric', 'class','decimalPrice', 'age', 'RPR', 'TR', 'OR', 'weight','age_rank', 'decimalPrice_rank','weight_rank', 'RPR_rank', 'TR_rank','OR_rank', 'res_win_h_avg_rank', 'res_place_h_avg_rank','decimalPrice_diff_h_avg_rank', 'position_diff_h_avg_rank','res_win_t_avg_rank', 'res_place_t_avg_rank','decimalPrice_diff_t_avg_rank', 'position_diff_t_avg_rank','res_win_j_avg_rank', 'res_place_j_avg_rank','decimalPrice_diff_j_avg_rank', 'position_diff_j_avg_rank','metric_h_avg', 'res_win_h_avg','res_place_h_avg','decimalPrice_diff_h_avg', 'RPR_diff_h_avg', 'TR_diff_h_avg','OR_diff_h_avg', 'position_diff_h_avg', 'metric_t_avg', 'res_win_t_avg','res_place_t_avg', 'decimalPrice_diff_t_avg', 'RPR_diff_t_avg','TR_diff_t_avg', 'OR_diff_t_avg', 'position_diff_t_avg', 'metric_j_avg','res_win_j_avg', 'res_place_j_avg', 'decimalPrice_diff_j_avg','RPR_diff_j_avg', 'TR_diff_j_avg', 'OR_diff_j_avg','position_diff_j_avg', 'res_win_h_avg_diff', 'res_place_h_avg_diff','decimalPrice_diff_h_avg_diff', 'position_diff_h_avg_diff','res_win_t_avg_diff', 'res_place_t_avg_diff','decimalPrice_diff_t_avg_diff', 'position_diff_t_avg_diff','res_win_j_avg_diff', 'res_place_j_avg_diff','decimalPrice_diff_j_avg_diff', 'position_diff_j_avg_diff', 'age_diff','decimalPrice_diff', 'weight_diff', 'RPR_diff', 'TR_diff', 'OR_diff']
df=df[cols]
df_pred=df_pred[['date']+cols]


# Get all RIDs
rids=df.rid.unique()
#validate=rids[-10000:]
#rids=rids[:-10000]
np.random.shuffle(rids)
# Get three RIDs lists
#train, test, validate = np.split(rids, [int(.7*len(rids)), int(.85*len(rids))]) 
train, test = np.split(rids, [int(.9*len(rids))]) 
# Split dataframe on parts
train_df=df[df['rid'].isin(train)]
test_df=df[df['rid'].isin(test)]
#validate_df=df[df['rid'].isin(validate)]
validate_df=df_pred

# Catogorical are columns with rank and three others
cols_categorical=[col for col in df.columns if '_rank' in col]+['ncond', 'class']

# Filter out cols_categorical
cols=[col for col in df.columns if not col in cols_categorical]
# Numerical are columns with avg and diff
cols_numerical=[col for col in cols if ('_avg' in col) or ('_diff' in col)]
# ...and some others
#cols_numerical=cols_numerical+['RPR', 'TR', 'OR', 'win_drift']
cols_numerical=cols_numerical+ ['RPR', 'TR','OR','decimalPrice']

# The rest columns 
cols=[col for col in cols if not col in cols_numerical]

cols_descr=['rid','course','marketTime', 'horseName', 'runners','decimalPrice','res_win', 'res_place']

# Prepare data
train_x=train_df[cols_categorical+cols_numerical].values
train_y=train_df['res_win'].astype(int).values
test_x=test_df[cols_categorical+cols_numerical].values
test_y=test_df['res_win'].astype(int).values
validate_x=validate_df[cols_categorical+cols_numerical].values
validate_y=validate_df['res_win'].astype(int).values

In [4]:
clfs = {}
clfs['gbc'] = {'clf': ensemble.GradientBoostingClassifier(), 'name': 'GradientBoostingClassifier'}

clfs['xgb'] = {'clf': XGBClassifier(), 'name': "XGBClassifier"}
clfs['GPC'] = {'clf': gaussian_process.GaussianProcessClassifier(), 'name': 'GaussianProcess'}
clfs['cb'] = {'clf': CatBoostClassifier(), 'name': 'CBC'}
clfs['lr'] = {'clf': linear_model.LogisticRegression(), 'name': 'LogisticRegression'}

parameters = {'C':[1],'tol':[0.0001],'solver': ['newton-cg'], 'multi_class': ['multinomial']}
clfs['lrgrid'] = {'clf': GridSearchCV(linear_model.LogisticRegression(), parameters), 'name': 'LogisticRegression'}

parameters = {'n_estimators':np.arange(64, 1024, step=64)}
clfs['rfgrid'] = {'clf': GridSearchCV(ensemble.RandomForestClassifier(), parameters), 'name': 'Random Forest'}

parameters = {'kernel':['linear', 'sigmoid', 'poly', 'rbf'], 'gamma':np.linspace(0.0,2.0,num=21),'C': np.linspace(0.5,1.5,num=11)}
clfs['svcgrid'] = {'clf': GridSearchCV(svm.SVC(), parameters), 'name': 'SVC with GridSearch'}
clfs['svc'] = {'clf': svm.SVC(probability=True), 'name': 'SVC'}
parameters = {'n_estimators':np.arange(3, 11, step=2)}
clfs['adagrid'] = {'clf': GridSearchCV(ensemble.AdaBoostClassifier(), parameters), 'name': 'AdaBoost'}

#clfs['nb'] = {'clf': naive_bayes.GaussianNB(), 'name':'GaussianNaiveBayes'}
clfs['mlp'] = {'clf': neural_network.MLPClassifier(), 'name': 'MLP'}
#clfs['knngrid'] = {'clf': neighbors.KNeighborsClassifier(), 'name': 'KNN'}
clfs['tr'] = {'clf': tree.DecisionTreeClassifier(), 'name':'DecisionTree'}
#clfs['extr'] = {'clf': ensemble.ExtraTreesClassifier(), 'name':'ExtraTree'}

In [6]:
scoring = ['accuracy', 'neg_log_loss']
def process_clf(clf):
    model=clfs[clf]['clf']
    scores = cross_validate(model, test_x, test_y, scoring=scoring, cv=10)
    sorted(scores.keys())
    model=model.fit(test_x, test_y)
    
    pred=model.predict(test_x)
    prob=model.predict_proba(test_x)[:,1]
    acc=accuracy_score(test_y, pred)
    loss=log_loss(test_y,prob)

    predv=model.predict(validate_x)
    probv=model.predict_proba(validate_x)[:,1]
    accv=accuracy_score(validate_y, predv)
    lossv=log_loss(validate_y,probv)
    print('{}: train {:0.4f}, logloss:{:0.4f}; test {:0.4f}, logloss:{:0.4f}; validate {:0.4f}, logloss:{:0.4f}'.format(clfs[clf]['name'],scores['test_accuracy'].mean(),scores['test_neg_log_loss'].mean(),acc,loss,accv,lossv))
    descr=test_df[cols_descr].reset_index(drop=True)
    calc_profit(clf,1,descr,prob)
    descr=validate_df[cols_descr].reset_index(drop=True)
    calc_profit(clf,2,descr,probv)
    return prob, probv

scoring = ['accuracy', 'neg_log_loss']
def pred_clf(clf):
    #scores = cross_validate(model, test_x, test_y, scoring=scoring, cv=10)
    #sorted(scores.keys())
    fn=f'models/{clf}.sav'
    if path.exists(fn):
        print('Using the saved model')
        model = joblib.load(fn)
    else:
        model=clfs[clf]['clf']
        print('Have to train a model')
        model=model.fit(test_x, test_y)
        joblib.dump(model, fn)
    pred=model.predict(test_x)
    prob=model.predict_proba(test_x)[:,1]
    acc=accuracy_score(test_y, pred)
    loss=log_loss(test_y,prob)

    predv=model.predict(validate_x)
    probv=model.predict_proba(validate_x)[:,1]
    print('{}: test {:0.4f}, logloss:{:0.4f};'.format(clfs[clf]['name'],acc,loss))
    descr=test_df[cols_descr].reset_index(drop=True)
    calc_profit(clf,1,descr,prob)
    descr=validate_df[['date']+cols_descr].reset_index(drop=True)
    calc_pred(clf,descr,probv)
    return prob, probv

def calc_profit(clf,var,descr,prob):
    descr=pd.concat([descr,pd.DataFrame(prob, columns=['prob'])], axis=1)
    descr['probsum']=descr.groupby(['rid'])['prob'].transform('sum')
    descr['prob']=descr['prob']/descr['probsum']
    descr['diff']=(descr['prob']-descr['decimalPrice'])*10
    descr['odds']=1/descr['decimalPrice']
    descr['C']=1
    descr['prf_win']=np.where(descr['res_win']==1,descr['odds']-1,-1)
    descr['prf_place']=np.where(descr['res_place']==1,(descr['odds']-1)/5,-1)
    descr['prf_ew']=descr['prf_win']/2+descr['prf_place']/2
    descr.to_csv(f'data/try_{clf}{var}.csv')

def calc_pred(clf,descr,prob):
    descr=pd.concat([descr,pd.DataFrame(prob, columns=['prob'])], axis=1)
    descr['probsum']=descr.groupby(['rid'])['prob'].transform('sum')
    descr['prob']=descr['prob']/descr['probsum']
    descr['diff']=(descr['prob']-descr['decimalPrice'])*10
    descr['odds']=1/descr['decimalPrice']
    descr['C']=1
    descr['prf_win']=np.where(descr['res_win']==1,descr['odds']-1,-1)
    descr['prf_place']=np.where(descr['res_place']==1,(descr['odds']-1)/5,-1)
    descr['prf_ew']=descr['prf_win']/2+descr['prf_place']/2
    descr.to_csv(f'data/today_{clf}.csv')

In [7]:
prob,probv=pred_clf('gbc') 

Using the saved model
GradientBoostingClassifier: test 0.9449, logloss:0.1347;


In [6]:
prob,probv=pred_clf('xgb') 

Have to train a model
XGBClassifier: test 0.9798, logloss:0.0637;


In [6]:
df_pred=pd.read_csv('data/topred.csv')
df_pred=df_pred.fillna(0)
validate_df=df_pred

# Catogorical are columns with rank and three others
cols_categorical=[col for col in validate_df.columns if '_rank' in col]+['ncond', 'class']

validate_df[cols_categorical]=validate_df[cols_categorical].astype(int)

# Filter out cols_categorical
cols=[col for col in validate_df.columns if not col in cols_categorical]
# Numerical are columns with avg and diff
cols_numerical=[col for col in cols if ('_avg' in col) or ('_diff' in col)]
# ...and some others
#cols_numerical=cols_numerical+['RPR', 'TR', 'OR', 'win_drift']
cols_numerical=cols_numerical+ ['RPR', 'TR','OR','decimalPrice']

# The rest columns 
cols=[col for col in cols if not col in cols_numerical]

cols_descr=['rid','course','marketTime', 'horseName', 'runners','decimalPrice','res_win', 'res_place']

# Prepare data
validate_x=validate_df[cols_categorical+cols_numerical]

In [7]:
def pred_clf(clf):
    #scores = cross_validate(model, test_x, test_y, scoring=scoring, cv=10)
    #sorted(scores.keys())
    if clf=='cb':
        fn='models/horses007.cbm'
        model=CatBoostClassifier()
        model.load_model(fn)
    else:
        fn=f'models/{clf}.sav'
        model = joblib.load(fn)
    predv=model.predict(validate_x)
    probv=model.predict_proba(validate_x)[:,1]
    descr=validate_df[cols_descr].reset_index(drop=True)
    calc_pred(clf,descr,probv)
    return probv

In [8]:
#prob,probv=process_clf('lr') 
probv=pred_clf('cb') 

In [63]:
df_gbc=pd.read_csv('data/today_gbc.csv')
df_xgb=pd.read_csv('data/today_xgb.csv')
df_cb=pd.read_csv('data/today_cb.csv')
df_gbc=df_gbc.drop_duplicates(subset=['course','marketTime','horseName'], keep='last')
df_xgb=df_xgb.drop_duplicates(subset=['course','marketTime','horseName'], keep='last')
df_cb=df_cb.drop_duplicates(subset=['course','marketTime','horseName'], keep='last')
df_gbc.rename(columns={'prob':'prob_gbc','diff':'diff_gbc'}, inplace=True)
df_xgb.rename(columns={'prob':'prob_xgb','diff':'diff_xgb'}, inplace=True)
df_cb.rename(columns={'prob':'prob_cb','diff':'diff_cb'}, inplace=True)
dfr=pd.concat([df_gbc[['date','course', 'marketTime', 'horseName', 'runners', 'decimalPrice', 'res_win', 'res_place', 'odds', 'C', 'prob_gbc', 'diff_gbc']],
           df_xgb[['prob_xgb', 'diff_xgb']],df_cb[['prob_cb', 'diff_cb']] ], axis=1)
dfr['wag_gbc']=np.where(dfr['diff_gbc']>=0.3,1,0)
dfr['wag_xgb']=np.where(dfr['diff_xgb']>=1.5,1,0)
dfr['wag_cb']=np.where(dfr['diff_cb']>=1.5,1,0)
dfr=dfr.loc[~((dfr['diff_gbc']<0.2) & (dfr['diff_xgb']<1) & (dfr['diff_cb']<1))]
dfr.to_csv(f'data/today_res.csv')

# TF

In [17]:
gb = joblib.load('models/gbc.sav')
gb.feature_importances_
nums=gb.feature_importances_.reshape(-1,1)
features=np.array(cols_numerical+cols_categorical).reshape(-1,1)

In [18]:
df_features=pd.DataFrame(np.hstack([features,nums]), columns=['features','importance'])
df_features.sort_values(by='importance', ascending=False)

Unnamed: 0,features,importance
17,res_win_j_avg,9.612719740077693e-05
32,res_win_j_avg_diff,9.52135932980455e-05
41,OR_diff,7.800700995077393e-05
23,position_diff_j_avg,7.199990191130053e-05
54,decimalPrice_diff_h_avg_rank,6.554663689137319e-05
...,...,...
37,decimalPrice_diff,0.0
34,decimalPrice_diff_j_avg_diff,0.0
1,res_win_h_avg,0.0
22,OR_diff_j_avg,0.0


In [19]:
import matplotlib.pyplot as plt
plt.bar(range(X_train.shape[1]), df_features.importance)
plt.xticks(range(X_train.shape[1]), df_features.features)
plt.show()

NameError: name 'X_train' is not defined

In [17]:
validate_y.shape

(91,)

In [15]:
reset_graph()

num_examples, n_inputs  = train_x.shape
n_hidden1 = 100 
n_hidden2 = 100 
n_outputs = 2 
learning_rate = 0.01
n_epochs=5
batch_size=64
batch_norm_momentum = 0.9


X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
y = tf.placeholder(tf.int32, shape=(None), name="y")
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    he_init = tf.variance_scaling_initializer()

    my_batch_norm_layer = partial(
            tf.layers.batch_normalization,
            training=training,
            momentum=batch_norm_momentum)

    my_dense_layer = partial(
            tf.layers.dense,
            kernel_initializer=he_init)

    hidden1 = my_dense_layer(X, n_hidden1, name="hidden1")
    bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))
    hidden2 = my_dense_layer(bn1, n_hidden2, name="hidden2")
    bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))
    logits_before_bn = my_dense_layer(bn2, n_outputs, name="outputs")
    logits = my_batch_norm_layer(logits_before_bn)

with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
init = tf.global_variables_initializer()
saver = tf.train.Saver()

extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess: 
    init.run() 
    for epoch in range (n_epochs): 
        for X_batch, y_batch in shuffle_batch(train_x, train_y, batch_size): 
            #X_batch_scaled = (X_batch - means) / stds
            sess.run([training_op, extra_update_ops], feed_dict= {training: True, X: X_batch, y: y_batch } ) 
        if epoch % 5 == 0:
            accuracy_test = accuracy.eval(feed_dict={X: test_x, y: test_y } ) 
            accuracy_val = accuracy.eval(feed_dict={X: validate_x, y: validate_y } ) 
            print (epoch, "TEST ACC: ", accuracy_test,"VAL ACC: ", accuracy_val ) 
    save_path = saver.save(sess, "./models/my_model_final.ckpt" ) 


0 TEST ACC:  0.90847963 VAL ACC:  1.0
