In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn import neighbors
from sklearn import ensemble
from sklearn import svm
from sklearn import gaussian_process
from sklearn import naive_bayes
from sklearn import neural_network
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score,  f1_score, log_loss
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler

In [2]:
import csv
class Logger():
    """Minimal file logger that writes either CSV rows or raw text.

    The target file is ``f'{path}{name}.{typ}'``; it is truncated when the
    logger is constructed (see ``reset``).

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to the file name, so it must include the
        trailing separator (default ``'log/'``). The directory part of the
        resulting file name is created if it does not exist yet.
    name : str
        Base file name without extension.
    typ : str
        ``'csv'`` selects CSV mode: ``reset`` writes ``self.header`` and
        ``log`` expects a sequence of fields. Any other value selects text
        mode, where ``log`` expects a string that is written as-is (no
        newline is appended).
    verbose : bool
        When true, every logged record is also echoed with ``print``.
    """

    def __init__(self, path='log/', name='execution',typ='txt', verbose=False):
        import os  # local import keeps this cell self-contained

        self.fn=f'{path}{name}.{typ}'
        self.type = typ
        self.verbose = verbose
        self.header = ['note', 'clf', 'train_acc', 'train_loss', 'test_acc', 'test_loss', 'val_acc', 'val_loss']
        # open() does not create directories, so make sure the target one exists.
        directory = os.path.dirname(self.fn)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.reset()

    def _open(self, mode):
        """Open the log file with the settings shared by ``reset`` and ``log``."""
        # newline='' is what the csv module expects for files it writes to.
        return open(self.fn, mode=mode, newline='', encoding='utf8')

    def reset(self):
        """Truncate the log file; in CSV mode also write the header row."""
        with self._open('w') as f:
            if self.type == 'csv':
                csv.writer(f, delimiter=',').writerow(self.header)

    def log(self, data):
        """Append one record: a row of fields in CSV mode, a string otherwise."""
        with self._open('a+') as f:
            if self.type == 'csv':
                csv.writer(f, delimiter=',').writerow(data)
            else:
                f.write(data)
        if self.verbose:
            print(data)

In [3]:
# Registry of candidate classifiers: short key -> {'clf': estimator, 'name': display label}.
# process_clf calls predict_proba and scores with 'neg_log_loss', so every
# estimator here has to expose predict_proba. SVC and NuSVC only do so when
# constructed with probability=True.
clfs = {
    'lr': {'clf': linear_model.LogisticRegression(), 'name': 'LogisticRegression'},
    'gbc': {'clf': ensemble.GradientBoostingClassifier(), 'name': 'GradientBoostingClassifier'},
    'bag': {
        'clf': ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(), max_samples=0.5, max_features=0.5),
        'name': "BaggingClassifier",
    },
    'GPC': {'clf': gaussian_process.GaussianProcessClassifier(), 'name': 'GaussianProcess'},
    'nusvc': {'clf': svm.NuSVC(probability=True), 'name': 'NuSVC'},
    'rf': {'clf': ensemble.RandomForestClassifier(), 'name': 'Random Forest'},
    'svc': {'clf': svm.SVC(probability=True), 'name': 'SVC'},
    'ada': {'clf': ensemble.AdaBoostClassifier(), 'name': 'AdaBoost'},
    'nb': {'clf': naive_bayes.GaussianNB(), 'name':'GaussianNaiveBayes'},
    'mlp': {'clf': neural_network.MLPClassifier(), 'name': 'MLP'},
    'knn': {'clf': neighbors.KNeighborsClassifier(), 'name': 'KNN'},
    'tr': {'clf': tree.DecisionTreeClassifier(), 'name':'DecisionTree'},
    'extr': {'clf': ensemble.ExtraTreesClassifier(), 'name':'ExtraTree'},
}

In [9]:
# Metrics computed by cross_validate inside process_clf.
scoring = ['accuracy', 'neg_log_loss']


def _holdout_metrics(model, X, y):
    """Return ``(accuracy, log_loss)`` of an already fitted ``model`` on ``X, y``."""
    labels = model.predict(X)
    # Column 1 of predict_proba is passed to log_loss as the probability of
    # the positive class (assumes a binary target -- TODO confirm).
    positive_prob = model.predict_proba(X)[:, 1]
    return accuracy_score(y, labels), log_loss(y, positive_prob)


def process_clf(clf):
    """Cross-validate, fit and evaluate one classifier from ``clfs``.

    Reads the notebook-level names ``X_train``/``y_train``, ``X_test``/``y_test``,
    ``X_validate``/``y_validate``, ``logger`` and ``note``.

    Parameters
    ----------
    clf : str
        Key into the ``clfs`` registry.

    Returns
    -------
    The registry estimator after fitting on the full training split.

    Side effects
    ------------
    Appends one row to ``logger`` (note, name, CV accuracy, CV log-loss,
    test accuracy, test log-loss, validate accuracy, validate log-loss) and
    prints the same figures.
    """
    entry = clfs[clf]
    model = entry['clf']

    # 10-fold cross-validation on the training split only.
    scores = cross_validate(model, X_train, y_train, scoring=scoring, cv=10)
    cv_acc = scores['test_accuracy'].mean()
    # 'neg_log_loss' is the negated log-loss; flip the sign so the value is
    # directly comparable with the positive log_loss figures reported below.
    cv_loss = -scores['test_neg_log_loss'].mean()

    model = model.fit(X_train, y_train)
    acc, loss = _holdout_metrics(model, X_test, y_test)
    acc_validate, loss_validate = _holdout_metrics(model, X_validate, y_validate)

    logger.log([note, entry['name'], cv_acc, cv_loss, acc, loss, acc_validate, loss_validate])
    print('{}: train {:0.4f}, logloss:{:0.4f}; test {:0.4f}, logloss:{:0.4f}; validate {:0.4f}, logloss:{:0.4f}'.format(entry['name'], cv_acc, cv_loss, acc, loss, acc_validate, loss_validate))
    return model

# Load the training table and order it chronologically by market time.
df=pd.read_csv('../input/horsesstats/to_train.csv')
df['marketTime']=pd.to_datetime(df['marketTime'])
df=df.sort_values(by='marketTime')

cols=['course', 'marketTime', 'horseName', 'position','res_win', 'res_place', 'runners', 'ncond', 'metric', 'class','decimalPrice', 'age', 'RPR', 'TR', 'OR', 'weight','age_rank', 'decimalPrice_rank','weight_rank', 'RPR_rank', 'TR_rank','OR_rank', 'res_win_h_avg_rank', 'res_place_h_avg_rank','decimalPrice_diff_h_avg_rank', 'position_diff_h_avg_rank','res_win_t_avg_rank', 'res_place_t_avg_rank','decimalPrice_diff_t_avg_rank', 'position_diff_t_avg_rank','res_win_j_avg_rank', 'res_place_j_avg_rank','decimalPrice_diff_j_avg_rank', 'position_diff_j_avg_rank','metric_h_avg', 'res_win_h_avg','res_place_h_avg','decimalPrice_diff_h_avg', 'RPR_diff_h_avg', 'TR_diff_h_avg','OR_diff_h_avg', 'position_diff_h_avg', 'metric_t_avg', 'res_win_t_avg','res_place_t_avg', 'decimalPrice_diff_t_avg', 'RPR_diff_t_avg','TR_diff_t_avg', 'OR_diff_t_avg', 'position_diff_t_avg', 'metric_j_avg','res_win_j_avg', 'res_place_j_avg', 'decimalPrice_diff_j_avg','RPR_diff_j_avg', 'TR_diff_j_avg', 'OR_diff_j_avg','position_diff_j_avg', 'res_win_h_avg_diff', 'res_place_h_avg_diff','decimalPrice_diff_h_avg_diff', 'position_diff_h_avg_diff','res_win_t_avg_diff', 'res_place_t_avg_diff','decimalPrice_diff_t_avg_diff', 'position_diff_t_avg_diff','res_win_j_avg_diff', 'res_place_j_avg_diff','decimalPrice_diff_j_avg_diff', 'position_diff_j_avg_diff', 'age_diff','decimalPrice_diff', 'weight_diff', 'RPR_diff', 'TR_diff', 'OR_diff']
# .copy() so that adding 'rid' below writes to an independent frame, not to a
# selection of the frame that was read from disk.
df=df[cols].copy()
# Race id: lower-cased course name with non-word characters and whitespace
# removed, followed by the market time formatted as YYYYmmddHHMM.
# The pattern must be r'\W' -- in a raw string r'\\W' would only match a
# literal backslash followed by 'W'.
df['rid']=df.course.str.lower().replace(regex=True,to_replace=r'\W|\s',value=r'')+df.marketTime.dt.strftime('%Y%m%d%H%M')
df=df.fillna(0)

In [5]:
# Load the training table from the local data directory and order it
# chronologically by market time.
df=pd.read_csv('data/to_train.csv')
df['marketTime']=pd.to_datetime(df['marketTime'])
df=df.sort_values(by='marketTime')
# Race id: lower-cased course name with non-word characters and whitespace
# removed, followed by the market time formatted as YYYYmmddHHMM.
# The pattern must be r'\W' -- in a raw string r'\\W' would only match a
# literal backslash followed by 'W'.
df['rid']=df.course.str.lower().replace(regex=True,to_replace=r'\W|\s',value=r'')+df.marketTime.dt.strftime('%Y%m%d%H%M')
df=df.fillna(0)

In [6]:
def data_prepare(isCat=True, isScale=True, n_validate=10000, train_frac=0.85):
    """Split the notebook-level ``df`` by race id and build feature/target arrays.

    Race ids are taken in order of first appearance in ``df``. The last
    ``n_validate`` ids form the validation split; the remaining ids are cut
    at ``train_frac`` into train (first part) and test (second part).

    Parameters
    ----------
    isCat : bool
        Passed to ``df2np``; when true the categorical columns are appended
        to the numerical ones.
    isScale : bool
        When true a ``StandardScaler`` is fitted on the numerical columns of
        the training split and applied by ``df2np`` to every split.
    n_validate : int
        Number of trailing race ids reserved for validation.
    train_frac : float
        Fraction (0..1) of the non-validation race ids used for training.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y, validate_x, validate_y)`` where
        each ``*_y`` is the ``res_win`` column cast to int.

    Raises
    ------
    ValueError
        If ``n_validate`` would leave no race ids for train/test, or if
        ``train_frac`` is outside ``[0, 1]``.

    Side effects
    ------------
    Sets the globals ``cols_categorical``, ``cols_numerical`` and (when
    ``isScale``) ``scaler``, and casts the categorical columns of the global
    ``df`` to int in place.
    """
    global scaler,cols_categorical,cols_numerical
    # Get all RIDs
    rids=df.rid.unique()
    if not 0 <= n_validate < len(rids):
        raise ValueError(
            f'n_validate={n_validate} must be in [0, {len(rids)}): '
            f'only {len(rids)} race ids are available'
        )
    if not 0 <= train_frac <= 1:
        raise ValueError(f'train_frac={train_frac} must be within [0, 1]')

    # Slice from an explicit index: rids[-0:] would select everything.
    holdout_start=len(rids)-n_validate
    validate=rids[holdout_start:]
    rids=rids[:holdout_start]
    train, test = np.split(rids, [int(train_frac*len(rids))])

    # Categorical are columns with rank and two others
    cols_categorical=[col for col in df.columns if '_rank' in col]+['ncond', 'class']
    df[cols_categorical]=df[cols_categorical].astype(int)

    # Split dataframe on parts
    train_df=df[df['rid'].isin(train)]
    test_df=df[df['rid'].isin(test)]
    validate_df=df[df['rid'].isin(validate)]

    # Numerical are the non-categorical columns with avg or diff in the name...
    non_categorical=[col for col in df.columns if col not in cols_categorical]
    cols_numerical=[col for col in non_categorical if ('_avg' in col) or ('_diff' in col)]

    # ...and some others
    cols_numerical=cols_numerical+ ['RPR', 'TR','OR','decimalPrice']

    if isScale:
        # Fit on the training split only; df2np applies it to every split.
        scaler=StandardScaler().fit(train_df[cols_numerical].values)

    # Prepare data
    train_x=df2np(train_df, isCat,isScale)
    train_y=train_df['res_win'].astype(int).values
    test_x=df2np(test_df, isCat,isScale)
    test_y=test_df['res_win'].astype(int).values
    validate_x=df2np(validate_df, isCat,isScale)
    validate_y=validate_df['res_win'].astype(int).values
    return train_x,train_y,test_x,test_y,validate_x,validate_y

def df2np(df, isCat,isScale):
    """Turn a frame into a 2-D feature array.

    Uses the notebook-level ``cols_numerical`` / ``cols_categorical`` lists
    and, when ``isScale`` is true, the notebook-level fitted ``scaler``.

    Parameters
    ----------
    df : DataFrame
        Frame holding at least the numerical (and, if ``isCat``, the
        categorical) columns.
    isCat : bool
        Append the categorical columns after the numerical ones.
    isScale : bool
        Pass the numerical values through ``scaler.transform`` first.

    Returns
    -------
    ndarray
        Numerical columns, optionally followed by the categorical columns.
    """
    numeric_part = df[cols_numerical].values
    if isScale:
        numeric_part = scaler.transform(numeric_part)

    if isCat:
        categorical_part = df[cols_categorical].values
        return np.hstack([numeric_part, categorical_part])
    return numeric_part


In [7]:
# CSV logger; constructing it truncates the file and writes the header row.
logger = Logger(typ='csv')

# Build the three splits with categorical columns included and numerical
# columns standardised.
(
    X_train, y_train,
    X_test, y_test,
    X_validate, y_validate,
) = data_prepare(isCat=True, isScale=True)

In [12]:
# Feature names in the column order df2np produces when isCat is true:
# numerical columns first, then the categorical ones.
hdrs = [*cols_numerical, *cols_categorical]

In [13]:
# Label that process_clf writes into the first column of each log row.
note = 'Cat_Scale'

# Cross-validate, fit and evaluate the 'gbc' registry entry; keep the fitted model.
clf = process_clf('gbc')


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the feature importances of the fitted gradient-boosting model.
# `clf` is the model returned by process_clf('gbc'); `hdrs` holds the feature
# names in the same column order as X_train.
positions = range(X_train.shape[1])
plt.bar(positions, clf.feature_importances_)
# Many long feature names: rotate the labels so they do not overlap.
plt.xticks(positions, hdrs, rotation=90)
plt.ylabel('feature importance')
plt.title('GradientBoostingClassifier feature importances')
plt.show()

In [45]:
# Print the display name of every registered classifier, in registry order.
# Iterating over the registry values (instead of a hand-copied key list with
# `clf` as loop variable) keeps the listing in sync with `clfs` and does not
# rebind the notebook-level name `clf`, which holds a fitted model.
for entry in clfs.values():
    print(entry['name'])

LogisticRegression
GradientBoostingClassifier
BaggingClassifier
GaussianProcess
NuSVC
Random Forest
SVC
AdaBoost
GaussianNaiveBayes
MLP
KNN
DecisionTree
ExtraTree
