In [1]:
%config IPCompleter.greedy=True
import pandas as pd
import pickle
import datetime
import re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as pit
from jupyterthemes import jtplot
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np

def preprocessing(results, kako=5, max_kako=9):
    """Turn the merged race table into a numeric, model-ready feature frame.

    The input is not modified; a new DataFrame is returned.

    Steps, in order:
      1. Keep only rows whose '前走レースID(新)' consists of digits only.
      2. Strip the marks ▲ ☆ △ ◇ ★ from '斤量' and from 'N走前斤量'
         (N = 1..kako) and cast them to float.
         (NOTE(review): these look like apprentice-allowance marks - confirm.)
      3. Binarise 'rank': 1 when rank < 4, otherwise 0.
      4. Split '距離' (e.g. 'ダ1400') into 'コース' (first character) and an
         integer '距離'.
      5. Derive 'date' from the first 8 characters (YYYYMMDD) of 'レースID(新)'.
      6. Drop identifier / result columns, and the per-race columns of past
         races kako+1 .. max_kako that are not used as features.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame containing every column referenced above.
    kako : int, default 5
        Number of past races whose columns are kept.
    max_kako : int, default 9
        Number of past races present in ``results``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a referenced column is missing.
    IndexError
        If a '距離' value contains neither '芝' nor 'ダ'.
    """
    weight_marks = re.compile('▲|☆|△|◇|★')

    def _weight_to_float(column):
        # str() first so that values that are already numeric pass through.
        return column.map(lambda x: weight_marks.sub('', str(x))).astype(float)

    # Raw string: '\D' in a normal literal is an invalid escape sequence.
    # `~` is the boolean inverse; filtering before copying gives an
    # independent frame, so the assignments below do not trigger
    # SettingWithCopyWarning.
    has_non_digit = results['前走レースID(新)'].astype(str).str.contains(r'\D')
    df = results[~has_non_digit].copy()

    df['斤量'] = _weight_to_float(df['斤量'])
    df['rank'] = (df['rank'] < 4).astype(int)
    for num in range(1, kako + 1):
        column = str(num) + '走前斤量'
        df[column] = _weight_to_float(df[column])

    df['馬体重増減'] = df['馬体重増減'].astype(float)

    # 'コース' must be taken before '距離' is overwritten with the integer part.
    df['コース'] = df['距離'].map(lambda x: x[:1])
    df['距離'] = df['距離'].map(lambda x: re.split('芝|ダ', x)[1]).astype(int)
    df['date'] = df['レースID(新)'].astype(str).map(
        lambda x: datetime.datetime.strptime(x[:8], '%Y%m%d'))

    unused_columns = [
        'レースID(新)', '前走レースID(新)', 'クラス名', '出走頭数', '馬体重', '馬名', '着順',
        '単勝オッズ', '人気',
        '多頭出し', '所属', '騎手', '調教師', '種牡馬', '母父馬', '馬連', '３連単', '走破タイム', '着差',
        '1角', '2角', '3角', '4角', '上り3F', '上り3F順', '賞金', '付加賞金',
    ]

    # Per-race columns of the past races beyond `kako`.
    past_suffixes = ['斤量', '騎手コード', '馬番', '人気', '馬体重増減', '単勝オッズ']
    for num in range(kako + 1, max_kako + 1):
        prefix = str(num) + '走前'
        unused_columns.extend(prefix + suffix for suffix in past_suffixes)

    return df.drop(columns=unused_columns)

def split_data(df, test_size=0.3, max_races=50000):
    """Split a frame chronologically into train and test parts by index value.

    The unique index values are ordered by the 'date' column, truncated to the
    first ``max_races`` (i.e. the earliest ones), and cut so that the most
    recent ``test_size`` share becomes the test set. All rows sharing an index
    value end up on the same side of the split.
    (NOTE(review): the index is presumably the race id - confirm with caller.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column.
    test_size : float, default 0.3
        Fraction of the retained index values assigned to the test set.
    max_races : int or None, default 50000
        Maximum number of unique index values to use, counted from the
        earliest date. ``None`` uses all of them.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``.
    """
    sorted_ids = df.sort_values('date').index.unique()[:max_races]
    # Compute the cut position once so both halves are guaranteed to agree.
    n_train = round(len(sorted_ids) * (1 - test_size))

    train = df.loc[sorted_ids[:n_train]]
    test = df.loc[sorted_ids[n_train:]]

    return train, test

def process_categorical(df, target_columns):
    """Encode categorical features and return the result as a new frame.

    Each column in ``target_columns`` has its missing values replaced by the
    string 'Na', is label-encoded to integer codes and finally cast to the
    pandas 'category' dtype. Every other non-numeric column is one-hot encoded
    by ``pd.get_dummies``. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
    target_columns : iterable of str
        Columns to label-encode instead of one-hot encode.

    Returns
    -------
    pandas.DataFrame
    """
    encoded = df.copy()

    # Integer codes first, so get_dummies leaves these columns alone.
    for name in target_columns:
        filled = encoded[name].fillna('Na')
        encoded[name] = LabelEncoder().fit_transform(filled)

    encoded = pd.get_dummies(encoded)

    # Mark the label-encoded columns as categorical for the model.
    for name in target_columns:
        encoded[name] = encoded[name].astype('category')

    return encoded

def format_kako_race(df, kako=5, max_kako=9):
    """Build rank targets / past-race features and drop raw past-race columns.

    The input is not modified; a new DataFrame is returned.

    * Rows whose '着順' contains a non-digit character are removed, as are
      rows whose 'N走前着順' (N = 1..kako) contains a non-digit character.
      Missing values become the string 'nan' and are therefore removed too.
    * '着順' is cast to int; 'rank' is '着順' capped at 4.
    * For N = 1..kako: 'rank_N' is 1 when the N-th previous finishing position
      is below 4 and 0 otherwise; 'jockey_N' copies 'N走前騎手コード'.
    * For N = 1..max_kako the raw past-race result columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
    kako : int, default 5
        Number of past races turned into features.
    max_kako : int, default 9
        Number of past races present in ``df``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If a referenced column is missing.
    """
    non_digit = r'\D'  # raw string: '\D' is an invalid escape in a plain literal

    # The row filters are independent of each other, so they are combined into
    # one mask and applied once. Copying after filtering gives an independent
    # frame, so the assignments below cannot hit SettingWithCopyWarning.
    keep = ~df['着順'].astype(str).str.contains(non_digit).to_numpy()
    for num in range(1, kako + 1):
        past_rank = df[str(num) + '走前着順'].astype(str)
        keep &= ~past_rank.str.contains(non_digit).to_numpy()
    df_copy = df[keep].copy()

    df_copy['着順'] = df_copy['着順'].astype(int)
    # Positions 1-3 are kept as they are; everything else collapses to 4.
    df_copy['rank'] = df_copy['着順'].clip(upper=4)

    for num in range(1, kako + 1):
        str_num = str(num)
        # 1 if the horse finished in the top three in that past race, else 0.
        past_position = df_copy[str_num + '走前着順'].astype(int)
        df_copy['rank_' + str_num] = (past_position < 4).astype(int)
        df_copy['jockey_' + str_num] = df_copy[str_num + '走前騎手コード']

    # Raw past-race columns that are not used as features. 人気, 馬番,
    # 馬体重増減, 単勝オッズ, 斤量 and 騎手コード are deliberately kept.
    drop_suffixes = [
        '馬連', '３連単', 'クラス名', '着順', '騎手',
        '走破タイム', '着差', '場所', '距離', '馬体重', '枠番',
        '1角', '2角', '3角', '4角',
        '上り3F', '上り3F順', '賞金', '付加賞金',
    ]
    drop_columns = [
        str(num) + '走前' + suffix
        for num in range(1, max_kako + 1)
        for suffix in drop_suffixes
    ]

    return df_copy.drop(columns=drop_columns)

class ModelEvaluator:
    """Wrap a fitted binary classifier and simulate flat-stake betting returns.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict_proba(X)`` (and
        ``feature_importances_`` if ``feature_importance`` is used).
    haitou_table : pandas.DataFrame
        Table merged with the bets on its index. The return methods read the
        columns '馬番_1'..'馬番_4', '複勝_1'..'複勝_4' and '単勝配当' from it.
        (NOTE(review): presumably the payout table keyed by race id - confirm.)
    std : bool, default True
        If true, probabilities are standardised within each index group and
        then min-max scaled to [0, 1] over the whole input.
    """

    def __init__(self, model, haitou_table, std = True):
        self.model = model
        self.haitou = haitou_table
        self.std = std

    def predict_proba(self, X):
        """Return the positive-class probability for every row of ``X``.

        With ``std=True`` each value is z-scored inside its index group
        (``groupby(level=0)``) and the result is min-max scaled globally.
        A group with a single row, or with identical probabilities, has no
        usable standard deviation and therefore yields non-finite values.
        """
        proba = pd.Series(self.model.predict_proba(X)[:, 1], index=X.index)
        if self.std:
            standard_scaler = lambda x: (x - x.mean()) / x.std()
            proba = proba.groupby(level=0).transform(standard_scaler)
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba

    def predict(self, X, threshold=0.5):
        """Return a list of 0/1 predictions: 1 where the score >= ``threshold``.

        A NaN score is predicted as 0 (no bet). The previous form
        ``0 if p < threshold else 1`` turned NaN into 1, because every
        comparison with NaN is false.
        """
        return [1 if p >= threshold else 0 for p in self.predict_proba(X)]

    # Backward-compatible alias: the method was originally published under
    # this misspelled name.
    prefict = predict

    def score(self, y_true, X):
        """Return the ROC AUC of the predicted scores against ``y_true``."""
        return roc_auc_score(y_true, self.predict_proba(X))

    def feature_importance(self, X, n_display=20):
        """Return the ``n_display`` most important features, highest first."""
        importances = pd.DataFrame({'features': X.columns, 'importance': self.model.feature_importances_})
        return importances.sort_values('importance', ascending=False)[:n_display]

    def pred_table(self, X, threshold=0.5, bet_only=True):
        """Return the '馬番' column of ``X`` together with the prediction.

        With ``bet_only=True`` only rows predicted as 1 are returned and the
        'pred' column is removed; otherwise all rows are returned with 'pred'.
        """
        # Copy only the column that is needed instead of the whole frame.
        pred_table = X[['馬番']].copy()
        pred_table['pred'] = self.predict(X, threshold)
        if bet_only:
            return pred_table[pred_table['pred'] == 1].drop(columns=['pred'])
        return pred_table

    def _bets_with_payouts(self, X, threshold):
        """Return ``(n_bets, frame)`` where frame is the bets joined to haitou.

        The join is on the index and keeps every bet (``how='right'``).
        """
        bets = self.pred_table(X, threshold)
        merged = self.haitou.merge(bets, left_index=True, right_index=True, how='right')
        return len(bets), merged

    def fukusho_return(self, X, threshold=0.5):
        """Return ``(n_bets, money)`` for betting on every predicted row.

        Each bet costs 100. A bet collects '複勝_i' when its '馬番' equals
        '馬番_i' for i in 1..4. ``money`` is payouts minus stakes.
        """
        n_bets, df = self._bets_with_payouts(X, threshold)
        money = -100 * n_bets
        for i in range(1, 5):
            hit = df['馬番_' + str(i)] == df['馬番']
            money += df.loc[hit, '複勝_' + str(i)].sum()
        return n_bets, money

    def tansho_return(self, X, threshold=0.5):
        """Return ``(n_bets, money)`` for betting on every predicted row.

        Each bet costs 100. A bet collects '単勝配当' (cast to int) when its
        '馬番' equals '馬番_1'. ``money`` is payouts minus stakes.
        """
        n_bets, df = self._bets_with_payouts(X, threshold)
        money = -100 * n_bets
        payout = df['単勝配当'].astype(int)
        money += payout[df['馬番_1'] == df['馬番']].sum()
        return n_bets, money
    
def gain(return_func, X, n_samples=100, lower=50, min_threshold=0.5):
    """Sweep the decision threshold and collect the return rate per bet count.

    ``n_samples`` thresholds are tried, moving linearly from ``min_threshold``
    towards 1 (1 itself is not reached). For each one
    ``return_func(X, threshold)`` must give ``(n_bets, money)``; when
    ``n_bets`` exceeds ``lower`` the rate ``(n_bets*100 + money) / (n_bets*100)``
    is stored under the key ``n_bets``. A later threshold producing the same
    number of bets overwrites the earlier entry.

    Returns
    -------
    pandas.Series
        Return rate indexed by number of bets.
    """
    rate_by_bets = {}
    for step in tqdm(range(n_samples)):
        fraction = step / n_samples
        threshold = fraction + min_threshold * (1 - fraction)
        n_bets, money = return_func(X, threshold)
        if n_bets <= lower:
            # Too few bets for the rate to be meaningful.
            continue
        stake = n_bets * 100
        rate_by_bets[n_bets] = (stake + money) / stake
    return pd.Series(rate_by_bets)

In [2]:
# Load the speed-index table (column 'スピード指数', indexed by 'レースID(新)')
# from its pickle cache.
# The two commented-out lines are a one-off CSV -> pickle conversion kept for
# reference. NOTE(review): they do not set the index, so on their own they
# would presumably not reproduce the indexed frame shown below - confirm.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
# zi = pd.read_csv('./csv/horse_zi.csv')
# zi.to_pickle('horse_zi.pickle')
zi = pd.read_pickle('horse_zi.pickle')
# Bare expression: rendered as the cell output.
zi

Unnamed: 0_level_0,スピード指数
レースID(新),Unnamed: 1_level_1
200701060601010101,94
200701060601010102,91
200701060601010103,116
200701060601010104,114
200701060601010105,89
...,...
202205150703041202,95
202205150703041203,107
202205150703041204,94
202205150703041205,96


In [3]:
# Load the combined race / pedigree table, indexed by 'レースID(新/馬番無)'.
# NOTE(review): a later cell writes back to this same pickle, so the content
# loaded here depends on which cells have been run before - confirm.
allrace_peds_raceid = pd.read_pickle('allrace_peds_raceid.pickle')
# Bare expression: rendered as the cell output.
allrace_peds_raceid

Unnamed: 0_level_0,レースID(新),コンピ指数,コンピ順位,前走レースID(新),場所,距離,クラス名,出走頭数,着順,騎手,...,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61,horse_id,スピード指数
レースID(新/馬番無),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022051505020812,202205150502081205,78,1,202204030902040710,東京,ダ1400,3勝,16,１,川田将雅,...,Storm Bird,Terlingua,フォーティナイナー,Leap Lively,Mr. Prospector,Secrettame,Riverman,Teacher's Joy,2018105954,118
2022051505020812,202205150502081212,70,3,202204170902081212,東京,ダ1400,3勝,16,２,坂井瑠星,...,Storm Bird,Terlingua,Alydar,Track Robbery,Naskra,Candle Star,Whitesburg,Light Verse,2018105470,113
2022051505020812,202205150502081207,56,4,202204160902071011,東京,ダ1400,3勝,16,３,福永祐一,...,Northern Dancer,Sex Appeal,Mill Reef,Irish Lass,ゼダーン,Khairunissa,Sheshoon,Manush,2017106572,109
2022051505020812,202205150502081215,55,5,202204160902071006,東京,ダ1400,3勝,16,４,武豊,...,Hail to Reason,Cosmah,Understanding,Mountain Flower,Bold Ruler,Somethingroyal,Nijinsky,Prodana Neviesta,2016104952,108
2022051505020812,202205150502081206,75,2,202203120702011115,東京,ダ1400,3勝,16,５,ルメール,...,Hail to Reason,Cosmah,Understanding,Mountain Flower,テスコボーイ,ソシアルバターフライ,パーソロン,スイートルナ,2018105334,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007010608010101,200701060801010104,47,10,200612170903060116,京都,ダ1200,未勝利,16,12,武幸四郎,...,Red God,Runaway Bride,Donut King,Fleeting Doll,Northern Dancer,Windy Answer,Sir Gaylord,Windsail,2004102278,102
2007010608010101,200701060801010115,45,11,200612170903060409,京都,ダ1200,未勝利,16,13,川島信二,...,Princequillo,Knights Daughter,Johnstown,Vienna,Princely Gift,Costa Sola,ダイハード,メジロマンゲツ,2004103399,93
2007010608010101,200701060801010103,40,16,200607020804060504,京都,ダ1200,未勝利,16,14,赤木高太,...,Nearctic,Natalma,Round Table,Zonah,Battle Joined,Fast Turn,Olden Times,Chavalon,2004101085,84
2007010608010101,200701060801010114,41,15,200610140805030211,京都,ダ1200,未勝利,16,15,安部幸夫,...,Aristophanes,Trevisa,Nantallah,Rough Shod,Tom Fool,Busanda,Bold Ruler,Grey Flight,2004105239,109


In [21]:
# Attach the speed index to every runner, then restore the race-level index.
# Requires `allrace_peds_raceid` and `zi` from earlier cells.
# Inner join on 'レースID(新)': rows without a speed index are discarded.
a = pd.merge(allrace_peds_raceid, zi, on='レースID(新)', how='inner')
# Mapping table used to bring 'レースID(新/馬番無)' back after the merge above.
# NOTE(review): assumes raceid.csv has both 'レースID(新)' and
# 'レースID(新/馬番無)' columns - confirm.
raceid = pd.read_csv("./csv/raceid.csv")
race = pd.merge(raceid, a, on='レースID(新)', how='inner')
# Index by race id without horse number, so all runners of a race share a key.
race = race.set_index('レースID(新/馬番無)')

Unnamed: 0_level_0,レースID(新),コンピ指数,コンピ順位,前走レースID(新),場所,距離,クラス名,出走頭数,着順,騎手,...,peds_54,peds_55,peds_56,peds_57,peds_58,peds_59,peds_60,peds_61,horse_id,スピード指数
レースID(新/馬番無),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022051505020812,202205150502081205,78,1,202204030902040710,東京,ダ1400,3勝,16,１,川田将雅,...,Storm Bird,Terlingua,フォーティナイナー,Leap Lively,Mr. Prospector,Secrettame,Riverman,Teacher's Joy,2018105954,118
2022051505020812,202205150502081212,70,3,202204170902081212,東京,ダ1400,3勝,16,２,坂井瑠星,...,Storm Bird,Terlingua,Alydar,Track Robbery,Naskra,Candle Star,Whitesburg,Light Verse,2018105470,113
2022051505020812,202205150502081207,56,4,202204160902071011,東京,ダ1400,3勝,16,３,福永祐一,...,Northern Dancer,Sex Appeal,Mill Reef,Irish Lass,ゼダーン,Khairunissa,Sheshoon,Manush,2017106572,109
2022051505020812,202205150502081215,55,5,202204160902071006,東京,ダ1400,3勝,16,４,武豊,...,Hail to Reason,Cosmah,Understanding,Mountain Flower,Bold Ruler,Somethingroyal,Nijinsky,Prodana Neviesta,2016104952,108
2022051505020812,202205150502081206,75,2,202203120702011115,東京,ダ1400,3勝,16,５,ルメール,...,Hail to Reason,Cosmah,Understanding,Mountain Flower,テスコボーイ,ソシアルバターフライ,パーソロン,スイートルナ,2018105334,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007010608010101,200701060801010104,47,10,200612170903060116,京都,ダ1200,未勝利,16,12,武幸四郎,...,Red God,Runaway Bride,Donut King,Fleeting Doll,Northern Dancer,Windy Answer,Sir Gaylord,Windsail,2004102278,102
2007010608010101,200701060801010115,45,11,200612170903060409,京都,ダ1200,未勝利,16,13,川島信二,...,Princequillo,Knights Daughter,Johnstown,Vienna,Princely Gift,Costa Sola,ダイハード,メジロマンゲツ,2004103399,93
2007010608010101,200701060801010103,40,16,200607020804060504,京都,ダ1200,未勝利,16,14,赤木高太,...,Nearctic,Natalma,Round Table,Zonah,Battle Joined,Fast Turn,Olden Times,Chavalon,2004101085,84
2007010608010101,200701060801010114,41,15,200610140805030211,京都,ダ1200,未勝利,16,15,安部幸夫,...,Aristophanes,Trevisa,Nantallah,Rough Shod,Tom Fool,Busanda,Bold Ruler,Grey Flight,2004105239,109


In [6]:
# Build the speed-deviation table ('スピード偏差値') keyed by both race-id
# variants and cache it as a pickle.
raceid = pd.read_csv("./csv/raceid.csv")
dev = pd.read_csv("./csv/horse_zi_dev.csv")
# Inner join: only runners present in both files are kept.
race = pd.merge(raceid, dev, on='レースID(新)', how='inner')
# Indexing by race id is intentionally left disabled here, so `race` keeps a
# default integer index and both id columns.
# race = race.set_index('レースID(新/馬番無)')
# race.drop(['レースID(新)'], axis=1, inplace=True)
race.to_pickle('horse_zi_dev.pickle')
# Read the file straight back; this rebinds `race` to the pickled copy.
# NOTE: `race` is also assigned by another cell with a different layout, so
# its meaning depends on execution order.
race = pd.read_pickle('horse_zi_dev.pickle')
# Bare expression: rendered as the cell output.
race

Unnamed: 0,レースID(新),レースID(新/馬番無),スピード偏差値
0,202205150502081205,2022051505020812,65.9
1,202205150502081212,2022051505020812,60.2
2,202205150502081207,2022051505020812,55.7
3,202205150502081215,2022051505020812,54.5
4,202205150502081206,2022051505020812,68.2
...,...,...,...
679656,200701060801010104,2007010608010101,50.0
679657,200701060801010115,2007010608010101,40.0
679658,200701060801010103,2007010608010101,30.0
679659,200701060801010114,2007010608010101,57.8


In [None]:
# Experimental left join of the deviation table onto the race table by index.
# This cell has no execution count, i.e. it was not run in the saved notebook.
# NOTE(review): `allrace_peds_raceid` is indexed by 'レースID(新/馬番無)', while
# `race`, as built from horse_zi_dev.csv, has a default integer index. An
# index-on-index join would then match nothing - verify before relying on it.
allrace_peds_raceid = pd.read_pickle('allrace_peds_raceid.pickle')
merge = allrace_peds_raceid.merge(race, left_index=True, right_index=True, how='left')
# Bare expression: rendered as the cell output.
merge

In [14]:
# Join `race` onto the race / pedigree table per runner and persist the result.
# Requires `allrace_peds_raceid` and `race` from earlier cells.
# Inner join on 'レースID(新)': runners missing from `race` are discarded.
race_dev = pd.merge(allrace_peds_raceid, race, on='レースID(新)', how='inner')
# race1 = pd.merge(raceid, race_dev, on='レースID(新)', how='inner')
# NOTE(review): assumes `race` supplies the 'レースID(新/馬番無)' column, since
# merging on a column discards the left frame's index - confirm.
race1 = race_dev.set_index('レースID(新/馬番無)')
# WARNING: this overwrites the very pickle `allrace_peds_raceid` is loaded
# from, so the cell is not idempotent: after a reload, running it again merges
# onto an already-merged table.
race1.to_pickle('allrace_peds_raceid.pickle')