In [1]:
%config IPCompleter.greedy=True
import pandas as pd
import pickle
import datetime
import re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as pit
from jupyterthemes import jtplot
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt

def preprocessing(results, kako=5):
    """Clean the merged race table and derive the modelling columns.

    Steps, in order:

    * keep only rows whose '前走レースID(新)' and '着順' consist solely of
      digits once rendered as strings (missing values render as 'nan' and
      are therefore removed as well);
    * strip the jockey-allowance marks (▲ ☆ △ ◇ ★) from '斤量' and cast it,
      and '馬体重増減', to float;
    * add 'rank' (1 when the finishing position is below 4, else 0),
      'コース' (first character of '距離') and 'date' (first eight characters
      of 'レースID(新)' parsed as %Y%m%d), and reduce '距離' to its integer
      part after the 芝/ダ prefix;
    * drop the identifier, odds-board and result columns listed below;
    * for each of the 1..9 previous runs: turn the '----' margin placeholder
      into NaN and cast the margin to float, blank out finishing positions
      equal to 0.0 or 0.5, add '<n>走前_rank' (position capped at 4, NaN
      where the position is missing) and clean '<n>走前斤量' like '斤量';
    * index the result by 'レースID(新/馬番無)'.

    '着順' and '着差' are removed by the drop step, so they are only read,
    never converted in place.

    Args:
        results: DataFrame holding the columns named above. It is not
            modified.
        kako: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame indexed by 'レースID(新/馬番無)'.
    """
    allowance_marks = re.compile('▲|☆|△|◇|★')

    def strip_marks(value):
        return allowance_marks.sub('', str(value))

    # Raw-string pattern: "\D" in a plain literal is an invalid escape sequence.
    prev_id_is_numeric = ~results['前走レースID(新)'].astype(str).str.contains(r"\D")
    finish_is_numeric = ~results['着順'].astype(str).str.contains(r"\D")
    # One explicit copy, so the column assignments below write to an
    # independent frame instead of a filtered slice of the caller's data.
    df = results[prev_id_is_numeric & finish_is_numeric].copy()

    df['斤量'] = df['斤量'].map(strip_marks).astype(float)
    df['rank'] = df['着順'].astype(int).map(lambda x: 1 if x < 4 else 0)
    df['馬体重増減'] = df['馬体重増減'].astype(float)

    df['コース'] = df['距離'].map(lambda x: x[:1])
    df['距離'] = df['距離'].map(lambda x: re.split('芝|ダ', x)[1]).astype(int)
    df['date'] = df['レースID(新)'].astype(str).map(
        lambda x: datetime.datetime.strptime(x[:8], '%Y%m%d'))

    df = df.drop(columns=[
        'レースID(新)', '前走レースID(新)', 'クラス名', '出走頭数', '馬体重', '馬名', '着順',
        '人気',
        'コンピ指数', 'コンピ順位',
        '多頭出し', '所属', '騎手', '調教師', '種牡馬', '母父馬', '馬連', '３連単', '走破タイム', '着差',
        '1角', '2角', '3角', '4角',
        '上り3F', '上り3F順', '賞金', '付加賞金',
    ])

    for i in range(1, 10):
        prefix = str(i) + '走前'
        margin_col = prefix + '着差'
        finish_col = prefix + '着順'
        weight_col = prefix + '斤量'

        # Assigning the filtered Series realigns on the index, so the rows
        # that were filtered out end up as NaN.
        df[margin_col] = df.loc[df[margin_col] != '----', margin_col].astype(float)

        finish = df[finish_col]
        df[finish_col] = finish[(finish != 0.0) & (finish != 0.5)]

        known_finish = df.loc[df[finish_col].notna(), finish_col]
        df[prefix + '_rank'] = known_finish.astype(int).map(lambda x: x if x < 4 else 4)

        df[weight_col] = df[weight_col].map(strip_marks).astype(float)

    return df.set_index('レースID(新/馬番無)')

def split_data(df, test_size=0.3):
    """Split a frame into earlier and later parts by index label.

    The unique index labels are ordered by their first appearance after
    sorting the rows on 'date'. The leading ``round(n * (1 - test_size))``
    labels form the training part; the remaining labels form the test part.
    Every row carrying a selected label is returned.

    Args:
        df: DataFrame with a 'date' column.
        test_size: Fraction of the unique index labels assigned to the test part.

    Returns:
        Tuple ``(train, test)`` of DataFrames.
    """
    ordered_ids = df.sort_values('date').index.unique()
    cut = round(len(ordered_ids) * (1 - test_size))
    return df.loc[ordered_ids[:cut]], df.loc[ordered_ids[cut:]]

def process_categorical(df, target_columns):
    """Encode categorical columns of a copy of ``df``.

    Each column named in ``target_columns`` has its missing values replaced
    by the string 'Na' and is then label-encoded with a fresh LabelEncoder.
    The frame is passed through ``pd.get_dummies`` (which expands whatever
    columns it selects by default), and finally the label-encoded columns
    are cast to the 'category' dtype.

    Args:
        df: Input DataFrame; it is not modified.
        target_columns: Iterable of column names to label-encode.

    Returns:
        The encoded DataFrame.
    """
    encoded = df.copy()

    for name in target_columns:
        filled = encoded[name].fillna('Na')
        encoded[name] = LabelEncoder().fit_transform(filled)

    encoded = pd.get_dummies(encoded)

    for name in target_columns:
        encoded[name] = encoded[name].astype('category')

    return encoded

def format_kako_race(df, kako=5):
    """Keep rows with a numeric finishing position and add a capped rank.

    The index is moved into columns, rows whose '着順' contains any non-digit
    character (when rendered as a string) are removed, '着順' is cast to int,
    'rank' is set to the finishing position capped at 4, and the frame is
    re-indexed by 'レースID(新/馬番無)'.

    Args:
        df: DataFrame with a '着順' column; 'レースID(新/馬番無)' must be
            available as a column after ``reset_index()``. It is not modified.
        kako: Unused; kept so existing callers keep working.

    Returns:
        A new DataFrame indexed by 'レースID(新/馬番無)'.
    """
    races = df.reset_index()

    # Raw-string pattern: "\D" in a plain literal is an invalid escape sequence.
    finish_is_numeric = ~races['着順'].astype(str).str.contains(r"\D")
    # Explicit copy so the assignments below do not write to a filtered slice.
    races = races[finish_is_numeric].copy()

    races['着順'] = races['着順'].astype(int)
    races['rank'] = races['着順'].map(lambda x: x if x < 4 else 4)

    return races.set_index('レースID(新/馬番無)')

class ModelEvaluator:
    """Score a fitted classifier's picks against a payout table.

    Args:
        model: Object exposing ``predict_proba(X)`` that returns a 2-D array;
            column 1 is used as the score. ``feature_importance`` also reads
            ``model.feature_importances_``.
        haitou_table: Payout DataFrame that is joined to the bets on the
            index. The methods read the columns '馬番_1'..'馬番_4',
            '複勝_1'..'複勝_4' and '単勝配当'.
        std: When true, scores are standardised within each index label and
            then min-max scaled over the whole input.
    """

    def __init__(self, model, haitou_table, std = True):
        self.model = model
        self.haitou = haitou_table
        self.std = std

    def predict_proba(self, X):
        """Return the score of every row of ``X`` as a Series indexed like ``X``.

        With ``self.std`` set, each score is z-scored inside its index-label
        group and the result is rescaled to [0, 1] using the global min/max.
        """
        proba = pd.Series(self.model.predict_proba(X)[:, 1], index=X.index)
        if self.std:
            proba = proba.groupby(level=0).transform(lambda x: (x - x.mean()) / x.std())
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba

    def prefict(self, X, threshold=0.5):
        """Return a list of 0/1 labels: 0 where the score is below ``threshold``.

        The misspelled name is kept because existing callers use it;
        ``predict`` is the same method under the intended name.

        NOTE(review): ``p < threshold`` is False for NaN, so a NaN score is
        labelled 1 — confirm this is intended.
        """
        return [0 if p < threshold else 1 for p in self.predict_proba(X)]

    def predict(self, X, threshold=0.5):
        """Correctly spelled alias of ``prefict``."""
        return self.prefict(X, threshold)

    def score(self, y_true, X):
        """Return ``roc_auc_score`` of the scores for ``X`` against ``y_true``."""
        return roc_auc_score(y_true, self.predict_proba(X))

    def feature_importance(self, X, n_display=20):
        """Return the ``n_display`` largest feature importances as a DataFrame.

        Only ``X.columns`` is read; it supplies the feature names.
        """
        importances = pd.DataFrame({'features': X.columns, 'importance': self.model.feature_importances_})
        return importances.sort_values('importance', ascending=False)[:n_display]

    def pred_table(self, X, threshold=0.5, bet_only=True):
        """Return '馬番' and '単勝オッズ' for the rows of ``X``.

        With ``bet_only`` only the rows labelled 1 are returned; otherwise
        every row is returned together with a 'pred' column.
        """
        # Select the two columns first so only they are copied, not all of X.
        pred_table = X[['馬番', '単勝オッズ']].copy()
        pred_table['pred'] = self.prefict(X, threshold)
        if bet_only:
            return pred_table.loc[pred_table['pred'] == 1, ['馬番', '単勝オッズ']]
        return pred_table

    def fukusho_return(self, X, threshold=0.5):
        """Evaluate 100-unit bets on every picked row using the '複勝_i' payouts.

        A pick counts as a hit for slot ``i`` (1..4) when its '馬番' equals
        '馬番_i' of the joined payout row.

        Returns:
            ``(n_bets, return_rate, n_hits)`` where ``return_rate`` is the
            total payout divided by the total stake.
        """
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        money = -100 * n_bets
        # merge() returns a new frame, so the payout table needs no copy.
        df = self.haitou.merge(pred_table, left_index=True, right_index=True, how='right')

        n_hits = 0
        for i in range(1, 5):
            hit = df['馬番_' + str(i)] == df['馬番']
            n_hits += int(hit.sum())
            money += df.loc[hit, '複勝_' + str(i)].sum()
        return_rate = (n_bets * 100 + money) / (n_bets * 100)
        return n_bets, return_rate, n_hits

    def tansho_return(self, X, threshold=0.5):
        """Evaluate 100-unit bets on every picked row using '単勝配当'.

        A pick is a hit when its '馬番' equals '馬番_1' of the joined payout row.

        Returns:
            ``(n_bets, return_rate, n_hits)`` where ``return_rate`` is the
            total payout divided by the total stake.
        """
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        money = -100 * n_bets

        # Join against the instance's payout table (not a module-level name).
        df = self.haitou.merge(pred_table, left_index=True, right_index=True, how='right')
        df['単勝配当'] = df['単勝配当'].astype(int)

        hit = df['馬番_1'] == df['馬番']
        n_hits = int(hit.sum())

        money += df.loc[hit, '単勝配当'].sum()
        return_rate = (n_bets * 100 + money) / (n_bets * 100)
        return n_bets, return_rate, n_hits

    def tansho_return_proper(self, X, threshold=0.5):
        """Evaluate bets whose stake on each pick is ``1 / 単勝オッズ``.

        A pick is a hit when its '馬番' equals '馬番_1' of the joined payout row.

        Returns:
            ``(n_bets, return_rate, n_hits)`` where ``return_rate`` is
            ``n_hits`` divided by the sum of the stakes.
        """
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        df = self.haitou.merge(pred_table, left_index=True, right_index=True, how='right')

        bet_money = (1 / pred_table['単勝オッズ']).sum()

        n_hits = int((df['馬番_1'] == df['馬番']).sum())
        return_rate = n_hits / bet_money
        return n_bets, return_rate, n_hits
    
def gain(return_func, X, n_samples=100, lower=50, min_threshold=0.5):
    """Sweep the decision threshold and tabulate the outcome per bet count.

    The threshold moves linearly from ``min_threshold`` (step 0) towards 1
    over ``n_samples`` steps. At each step ``return_func(X, threshold)`` must
    return ``(n_bets, return_rate, n_hits)``. The sweep stops at the first
    step with zero bets; steps with ``n_bets <= lower`` are not recorded.
    Results are keyed by ``n_bets``, so a later step with the same bet count
    replaces the earlier one.

    Returns:
        DataFrame indexed by ``n_bets`` with columns 'return_rate' and 'n_hits'.
    """
    by_bets = {}
    for step in tqdm(range(n_samples)):
        threshold = 1 * step / n_samples + min_threshold * (1 - step / n_samples)
        n_bets, return_rate, n_hits = return_func(X, threshold)
        if n_bets == 0:
            break
        if n_bets <= lower:
            continue
        by_bets[n_bets] = {'return_rate': return_rate, 'n_hits': n_hits}
    return pd.DataFrame(by_bets).T

In [23]:
# Read the merged race table from a local pickle and take a separate copy of it.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
allrace = pd.read_pickle('./pickle/all_merge.pickle')
df_copy = allrace.copy()

In [24]:
# Join the owner CSV onto the copy using the 'レースID(新)' key. With how='inner',
# rows lacking a match on either side are dropped. `allrace` is rebound to the
# joined frame; `df_copy` still holds the un-joined data.
owner = pd.read_csv("./csv/owner.csv")
allrace = pd.merge(df_copy, owner, on='レースID(新)', how='inner')

In [25]:
# Clean the joined table and derive the model columns (see preprocessing()).
all_r = preprocessing(allrace)

In [26]:
standard_scaler = lambda  x: (x - x.mean()) / x.std()
cp_all_r = all_r.copy()

for i in range(1, 10):
    cp_all_r[str(i) + '走前着差'] = cp_all_r.groupby(level=0)[str(i) + '走前着差'].transform(standard_scaler)

In [31]:
for i in range(1, 10):
    cp_all_r[str(i) + '走前着差'] = (cp_all_r[str(i) + '走前着差']- cp_all_r[str(i) + '走前着差'].min()) / (cp_all_r[str(i) + '走前着差'].max() - cp_all_r[str(i) + '走前着差'].min())

In [34]:
cp_all_r.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 659917 entries, 2022051505020812 to 2007010608010101
Data columns (total 365 columns):
 #    Column       Dtype         
---   ------       -----         
 0    場所           object        
 1    距離           int64         
 2    性別           object        
 3    年齢           int64         
 4    斤量           float64       
 5    馬体重増減        float64       
 6    単勝オッズ        float64       
 7    枠番           int64         
 8    馬番           int64         
 9    コンピ一位指数      float64       
 10   コンピ一位指数差     float64       
 11   コンピ前指数       float64       
 12   コンピ後指数       float64       
 13   1走前着順        float64       
 14   1走前賞金        float64       
 15   1走前jockeyid  float64       
 16   1走前コンピ指数     float64       
 17   1走前コンピ順位     float64       
 18   1走前タイム指数     float64       
 19   1走前タイム指数偏差値  float64       
 20   1走前場所        object        
 21   1走前クラス       object        
 22   1走前コース       object        
 23   1走前距離  