In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, f1_score
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

## for feature experiment

In [4]:
##### Default values for the sklearn.svm.SVC parameters used throughout the notebook
C = 1.0   # candidates tried: [10000,5000,3000,1000,500,100,10,1,0.1]
kernel = 'rbf'   # candidates: ['rbf','poly','linear']
degree = 3
# Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
# 'auto' means gamma = 1 / n_features.  The former value 'auto_deprecated' was a
# transitional sentinel that scikit-learn only accepted in 0.20/0.21 (where it
# resolved to the same 1 / n_features coefficient); later releases reject it.
gamma = 'auto'
coef0 = 0.0
shrinking = True
probability = False  # enable probability estimates
tol = 1e-3
cache_size = 200
class_weight = None  # Set the parameter C of class i to class_weight[i]*C for SVC.
verbose = False
max_iter = -1
decision_function_shape = 'ovr'
random_state = None

##### column selection
# Columns kept from each raw table; 'date' plus the entity name are the merge keys.
used_column_name = {
    'race_result': ['date', 'race_num', 'rank', 'lane', 'horse', 'jockey', 'trainer', 'owner'],
    'jockey': ['date', 'jockey', 'weight'],
    'owner': ['date', 'owner', 'owner_money'],
    'trainer': ['date', 'trainer', 'race_count'],
    'horse': ['date', 'horse', 'age', '1yr_first', '1yr_second', 'price'],
}

##### choose cleaning or filling missing value
# The strategy currently applied inside get_data() is to fill missing values with 0:
# training_data = data_set[~data_set['date'].isin(test_day)].fillna(0)
# test_data = data_set[data_set['date'].isin(test_day)].fillna(0)

##### feature engineering  (make your input feature columns)

##### choose test_day
# Race days held out for evaluation; every other date is used for training.
test_day = ['2019-04-13', '2019-04-14', '2019-04-20', '2019-04-21']


In [61]:
# Root directory of the raw data; each data type is read from its own sub-directory.
DATA_PATH = 'data/'

# Column names assigned, in file order, to the header-less CSV files of each data type.
column_name = dict(
    race_result=[
        'date', 'race_num', 'track_length', 'track_state', 'weather', 'rank',
        'lane', 'horse', 'home', 'gender', 'age', 'weight',
        'rating', 'jockey', 'trainer', 'owner', 'single_odds', 'double_odds',
    ],
    jockey=[
        'jockey', 'group', 'birth', 'age', 'debut', 'weight', 'weight_2',
        'race_count', 'first', 'second',
        '1yr_count', '1yr_first', '1yr_second',
    ],
    owner=[
        'owner', 'reg_horse', 'unreg_horse', 'owned_horse', 'reg_date',
        '1yr_count', '1yr_first', '1yr_second', '1yr_third', '1yr_money',
        'race_count', 'first', 'second', 'third', 'owner_money',
    ],
    trainer=[
        'trainer', 'group', 'birth', 'age', 'debut',
        'race_count', 'first', 'second',
        '1yr_count', '1yr_first', '1yr_second',
    ],
    horse=[
        'horse', 'home', 'gender', 'birth', 'age', 'class', 'group',
        'trainer', 'owner', 'father', 'mother',
        'race_count', 'first', 'second',
        '1yr_count', '1yr_first', '1yr_second',
        'horse_money', 'rating', 'price',
    ],
)


def load_data():
    """Read every raw CSV under DATA_PATH and merge them into one frame of race entries.

    For each data type ('horse', 'jockey', 'owner', 'trainer', 'race_result') all
    files in ``DATA_PATH/<data_type>`` are read as header-less, cp949-encoded CSVs
    whose columns are named by ``column_name[data_type]``.  For every type except
    'race_result' the ``date`` column is built from the file name (the part before
    the first dot: first 4 characters, next 2, last 2), and each row is emitted
    twice, with the date shifted by +2 and +3 days, so it can be joined to the
    race results on ``date``.

    Returns:
        pandas.DataFrame: the race results (columns from ``used_column_name``)
        with rows lacking a rank removed, ``rank`` cast to int, a binary ``win``
        column (1 when rank < 4, else 0), left-joined with the selected horse,
        jockey, owner and trainer columns.  The join keys 'horse', 'jockey',
        'trainer' and 'owner' are dropped from the result.

    Raises:
        FileNotFoundError: if a data-type directory contains no files.
        ValueError: if a non-blank rank value cannot be converted to a number.
    """
    df_dict = dict()  # key: data type(e.g. jockey, trainer, ...), value: corresponding dataframe

    for data_type in ['horse', 'jockey', 'owner', 'trainer', 'race_result']:
        data_dir = os.path.join(DATA_PATH, data_type)

        # read all text files in the directory, then concatenate them once
        frames = []
        for fname in sorted(os.listdir(data_dir)):
            tmp = pd.read_csv(os.path.join(data_dir, fname), header=None, sep=",",
                              encoding='cp949', names=column_name[data_type])

            if data_type != 'race_result':
                date = fname.split('.')[0]
                tmp['date'] = date[:4] + "-" + date[4:6] + "-" + date[-2:]

            frames.append(tmp)

        if not frames:
            raise FileNotFoundError('no data files found in ' + data_dir)

        df = pd.concat(frames)

        # cast date column to dtype datetime
        df['date'] = df['date'].astype('datetime64[ns]')

        # append date offset to synchronize date with date of race_result data
        # (+2 days -> saturday, +3 days -> sunday, per the original author's note)
        if data_type != 'race_result':
            df = pd.concat([df.assign(date=df['date'] + pd.DateOffset(days=offset))
                            for offset in (2, 3)])

        # select columns to use; copy so that later edits never touch a view
        df_dict[data_type] = df[used_column_name[data_type]].copy()

    # Everything below is aligned to the dates present in race_result.
    race = df_dict['race_result']

    # a single blank (' ') marks a missing rank; drop those rows together with real NaNs
    has_rank = race['rank'].replace(' ', np.nan).notna().values
    race = race.loc[has_rank].copy()

    # remaining ranks may be a mix of numbers and numeric strings -> plain ints
    race['rank'] = pd.to_numeric(race['rank']).astype('int')

    # make a column 'win' that indicates whether a horse ranked within the 3rd place
    race['win'] = (race['rank'] < 4).astype('int')

    # drop duplicated rows
    # NOTE(review): the horse table is not de-duplicated here (as in the original);
    # confirm whether that is intentional.
    for key in ('jockey', 'owner', 'trainer'):
        df_dict[key] = df_dict[key].drop_duplicates(subset=['date', key])

    # merge dataframes
    df = race.merge(df_dict['horse'], on=['date', 'horse'], how='left')
    df = df.merge(df_dict['jockey'], on=['date', 'jockey'], how='left')
    df = df.merge(df_dict['owner'], on=['date', 'owner'], how='left')
    df = df.merge(df_dict['trainer'], on=['date', 'trainer'], how='left')

    # drop unnecessary columns which are used only for merging dataframes
    return df.drop(['horse', 'jockey', 'trainer', 'owner'], axis=1)

# Default location of the merged-data cache shared by cache_data() and load_cache().
CACHE_PATH = 'cache.csv'


def cache_data(df, path=CACHE_PATH):
    """Write ``df`` to ``path`` as CSV without the index column.

    Args:
        df: DataFrame to persist.
        path: destination file; defaults to CACHE_PATH ('cache.csv').
    """
    df.to_csv(path, index=False)


def load_cache(path=CACHE_PATH):
    """Read the CSV at ``path`` (default CACHE_PATH) and return it as a DataFrame.

    No dtype conversion is requested, so the types are whatever
    ``pandas.read_csv`` infers from the file.
    """
    return pd.read_csv(path)

def get_data(test_day, is_training, df=None):
    """Split the data set by date and return either the training or the test pair.

    Args:
        test_day: a date string such as '2019-04-13', or a list of them.  Rows
            whose ``date`` matches one of them form the test set; all other rows
            form the training set.
        is_training: if true return ``(training_x, training_y)``, otherwise
            ``(test_x, test_y)``.
        df: data set to split; when None it is read with ``load_cache()``.

    Returns:
        tuple: ``(x, y)`` where ``y`` is the 'win' column and ``x`` is the frame
        without the 'win', 'date', 'race_num', 'rank' and 'lane' columns.
        Missing values are filled with 0 before the split is returned.

    Raises:
        AssertionError: raised by ``inspect_test_data`` when the number of test
            rows does not match the raw race-result files (checked on every call).
    """
    data_set = df if df is not None else load_cache()

    # accept a single day as well as a list of days
    if isinstance(test_day, str):
        test_day = [test_day]

    # Compare as timestamps so the split behaves the same whether 'date' holds
    # strings (as read back from the CSV cache) or datetime64 values (as built by
    # load_data()), without relying on pandas implicitly casting the strings.
    is_test = pd.to_datetime(data_set['date']).isin(pd.to_datetime(list(test_day)))

    # select training and test data by test day
    # TODO : cleaning or filling missing value
    training_data = data_set[~is_test].fillna(0)
    test_data = data_set[is_test].fillna(0)

    # TODO : make your input feature columns

    # Columns that are labels or identifiers, not model inputs.  'lane' could be
    # added to the features if the starting lane matters; it is excluded for now.
    non_feature_columns = ['win', 'date', 'race_num', 'rank', 'lane']

    # select training x and y
    training_y = training_data['win']
    training_x = training_data.drop(non_feature_columns, axis=1)

    # select test x and y
    test_y = test_data['win']
    test_x = test_data.drop(non_feature_columns, axis=1)

    inspect_test_data(test_x, test_day)

    return (training_x, training_y) if is_training else (test_x, test_y)

def inspect_test_data(test_x, test_days):
    """Assert that ``test_x`` has as many rows as the raw race results of ``test_days``.

    For each day the file ``DATA_PATH/race_result/<day without dashes>.csv`` is
    read, single-blank cells are treated as missing, and rows without a rank
    are discarded; the surviving row counts are summed and compared with
    ``test_x.shape[0]``.

    Raises:
        AssertionError: when the two row counts differ.
    """
    expected_rows = 0

    for day in test_days:
        csv_path = os.path.join(DATA_PATH, 'race_result', day.replace('-', '') + '.csv')
        raw = pd.read_csv(csv_path, header=None, sep=",",
                          encoding='cp949', names=column_name['race_result'])
        ranked = raw.replace(' ', np.nan).dropna(subset=['rank'])
        expected_rows += ranked.shape[0]

    assert test_x.shape[0] == expected_rows, 'your test data is wrong!'
    

In [64]:
df = load_cache()

# Per-race mean and standard deviation of every remaining column, broadcast back to
# one row per entry (same row order as `df`).  The keys must be a list: a tuple is
# interpreted by pandas as ONE key named ('date', 'race_num').
RACE_KEYS = ['date', 'race_num']
means_df = df[RACE_KEYS].merge(
    df.groupby(RACE_KEYS).mean().reset_index(),
    on=RACE_KEYS, how='left')
std_df = df[RACE_KEYS].merge(
    df.groupby(RACE_KEYS).std().reset_index(),
    on=RACE_KEYS, how='left')

# Optional within-race standardisation (disabled): z-score where std is non-zero, else 0.
#df['1yr_first'] = np.where(std_df['1yr_first'], (df['1yr_first'] - means_df['1yr_first']) / std_df['1yr_first'], 0)
#df['1yr_second'] = np.where(std_df['1yr_second'], (df['1yr_second'] - means_df['1yr_second']) / std_df['1yr_second'], 0)
df.head(10)

  This is separate from the ipykernel package so we can avoid doing imports until
  


Unnamed: 0,date,race_num,rank,lane,win,age,1yr_first,1yr_second,price,weight,owner_money,race_count
0,2016-01-02,1,1,5,1,2.0,0.0,0.0,25000.0,,3.834330e+08,
1,2016-01-02,1,2,7,1,2.0,0.0,0.0,70000.0,,,
2,2016-01-02,1,3,4,1,2.0,0.0,1.0,40000.0,,1.755043e+09,
3,2016-01-02,1,4,3,0,2.0,0.0,0.0,33000.0,,1.680000e+07,
4,2016-01-02,1,5,9,0,2.0,0.0,0.0,40000.0,,2.525551e+09,
5,2016-01-02,1,6,10,0,2.0,0.0,0.0,35000.0,,2.307075e+09,
6,2016-01-02,1,7,6,0,2.0,0.0,1.0,30000.0,,2.056380e+09,
7,2016-01-02,1,8,1,0,2.0,0.0,0.0,40000.0,,1.748165e+08,
8,2016-01-02,1,9,8,0,2.0,0.0,0.0,20000.0,,1.794022e+09,
9,2016-01-02,1,10,2,0,2.0,0.0,0.0,30000.0,,2.880213e+09,


In [65]:
# Train one SVC on every race outside `test_day`, using the configured defaults.
training_x, training_y = get_data(test_day, is_training=True, df=df)

model = svm.SVC(
    C=C,
    kernel=kernel,
    degree=degree,
    gamma=gamma,
    coef0=coef0,
    shrinking=shrinking,
    probability=probability,
    tol=tol,
    cache_size=cache_size,
    class_weight=class_weight,
    verbose=verbose,
    max_iter=max_iter,
    decision_function_shape=decision_function_shape,
    random_state=random_state,
)
model.fit(training_x, training_y)

# Score the fitted model on the held-out race days.
test_x, test_y = get_data(test_day, is_training=False, df=df)
pred_y = model.predict(test_x)

print(f"for {test_day}")
print('accuracy: {}'.format(accuracy_score(test_y, pred_y)))
print('recall: {}'.format(recall_score(test_y, pred_y)))
print('f1-score: {}'.format(f1_score(test_y, pred_y)))



for ['2019-04-13', '2019-04-14', '2019-04-20', '2019-04-21']
accuracy: 0.7154989384288747
recall: 0.007575757575757576
f1-score: 0.014705882352941178


In [19]:
#cache_data(load_data())

In [None]:
def k_f_score(test_day, C, kernel, degree, gamma,
                 coef0, shrinking, probability=False,
                 tol=1e-3, cache_size=200, class_weight=None,
                 verbose=False, max_iter=-1, decision_function_shape='ovr',
                 random_state=None):
    """Mean F1 score over one train/test split per entry of ``test_day``.

    For each day an ``svm.SVC`` built from the given hyper-parameters is fitted
    on all rows not belonging to that day and evaluated on that day's rows;
    accuracy, recall and F1 are printed for every day.

    Args:
        test_day: iterable of date strings; each one is held out in turn.
        C, kernel, degree, gamma, coef0, shrinking, probability, tol, cache_size,
        class_weight, verbose, max_iter, decision_function_shape, random_state:
            forwarded unchanged to ``svm.SVC``.

    Returns:
        The mean of the per-day F1 scores (``numpy.mean`` result).

    Raises:
        ValueError: if ``test_day`` is empty.
    """
    days = list(test_day)
    if not days:
        raise ValueError('test_day must contain at least one day')

    # read the cache once instead of twice per fold
    data_set = load_cache()

    f_scores = []
    for held_out in days:
        day = [held_out]

        training_x, training_y = get_data(day, is_training=True, df=data_set)
        # use the hyper-parameters supplied by the caller, not fixed defaults
        model = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
                        coef0=coef0, shrinking=shrinking, probability=probability,
                        tol=tol, cache_size=cache_size, class_weight=class_weight,
                        verbose=verbose, max_iter=max_iter,
                        decision_function_shape=decision_function_shape,
                        random_state=random_state)
        model.fit(training_x, training_y)

        test_x, test_y = get_data(day, is_training=False, df=data_set)

        pred_y = model.predict(test_x)
        fold_f1 = f1_score(test_y, pred_y)
        print(f"for {day}")
        print('accuracy: {}'.format(accuracy_score(test_y, pred_y)))
        print('recall: {}'.format(recall_score(test_y, pred_y)))
        print('f1-score: {}'.format(fold_f1))

        f_scores.append(fold_f1)

    return np.mean(f_scores)

In [None]:
# Baseline: mean per-day F1 for the default SVC configuration.
k_f_score(
    test_day, C, kernel, degree, gamma, coef0, shrinking,
    probability=False,
    tol=1e-3,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=None,
)

## Parameter search

In [None]:
# Display the current value of C, the variable passed as the SVC `C` argument.
C

In [None]:
# Parameter search: exhaustive grid over kernel, C and the weight of class 1,
# keeping the combination with the highest mean per-day F1 score.
kernel_set = ['rbf', 'poly']  # ['rbf','poly','linear']
C_set = [100, 10]  # [10000,5000,3000,1000,500,100,10,1,0.1]
classweight_set = [1, 10]  # [1, 2, 3, 5, 10]

best_f1_score = 0
best_kernel = ''
best_c = 0
best_weight = 0
update = 0
# The loop variables are deliberately NOT named `kernel` / `C`, so the notebook-wide
# defaults of those names are left untouched by the search.
for kernel_candidate in kernel_set:
    for c_candidate in C_set:
        for weight_candidate in classweight_set:
            score = k_f_score(test_day, C=c_candidate, kernel=kernel_candidate,
                              degree=degree, gamma=gamma,
                              coef0=coef0, shrinking=shrinking, probability=False,
                              tol=1e-3, cache_size=200, class_weight={1: weight_candidate},
                              verbose=False, max_iter=-1, decision_function_shape='ovr',
                              random_state=None)

            if score > best_f1_score:
                update += 1
                print(update, 'th update')
                print('f1_score : ', score)
                print('kernel : ', kernel_candidate)
                print('C : ', c_candidate)
                print('class weight : ', weight_candidate)
                best_f1_score = score
                best_kernel = kernel_candidate
                best_c = c_candidate
                best_weight = weight_candidate
                print()

print('Best setting')
print('f1_score : ', best_f1_score)
print('kernel : ', best_kernel)
print('C : ', best_c)
print('class weight : ', best_weight)