In [1]:
#!pip install optuna
import pandas as pd
import pickle
import datetime
import re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, brier_score_loss
import matplotlib.pyplot as pit
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import torch
from torch import nn
# from google.colab import drive

# exports
def plot_calibration_curve(named_classifiers, X_test, y_test):
    fig = plt.figure(figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="完全な補正")
    for name, clf in named_classifiers.items():
        prob_pos = clf.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, prob_pos)
        brier = brier_score_loss(y_test, prob_pos)
        print("%s:" % name)
        print("\tAUC  : %1.3f" % auc)
        print("\tBrier: %1.3f" % (brier))
        print()

        fraction_of_positives, mean_predicted_value = calibration_curve(
            y_test,
            prob_pos,
            n_bins=10,
        )

        ax1.plot(
            mean_predicted_value,
            fraction_of_positives,
            "s-",
            label="%s (%1.3f)" % (name, brier),
        )

        ax2.hist(prob_pos, range=(0, 1), bins=10, label=name, histtype="step", lw=2)

    ax1.set_ylabel("正例の比率")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title("信頼性曲線")

    ax2.set_xlabel("予測値の平均")
    ax2.set_ylabel("サンプル数")
    ax2.legend(loc="upper center", ncol=2)

    plt.tight_layout()

def preprocessing(results, kako=5):
    df = results.copy()

    df.drop([
        'compi',
        'compi_num', 
        "speed", 
        'rank',
#         'result',
#         'course',
        'born',
        '1走前走破タイム', '2走前走破タイム', '3走前走破タイム',
        '4走前走破タイム', '5走前走破タイム',
        '1走前補正タイム', '2走前補正タイム', '3走前補正タイム',
        '4走前補正タイム', '5走前補正タイム',
        '1走前結果', '2走前結果', '3走前結果',
        '4走前結果', '5走前結果',
        '1走前オッズ', '2走前オッズ', '3走前オッズ',
        '4走前オッズ', '5走前オッズ',
        '1走前コンピ指数', '2走前コンピ指数', '3走前コンピ指数',
        '4走前コンピ指数', '5走前コンピ指数',
        'horse_race_id',  'body_weight','body_weight_in_de',
            '騎手全体勝率','騎手全体連対率','騎手全体複勝率','騎手競馬場別騎乗回数','騎手競馬場別勝率','騎手競馬場別連対率','騎手競馬場別複勝率',
    '騎手コース別騎乗回数','騎手コース別勝率','騎手コース別連対率','騎手コース別複勝率','騎手距離別騎乗回数','騎手距離別勝率','騎手距離別連対率',
    '騎手距離別複勝率','騎手同コース同距離別騎乗回数','騎手同コース同距離別勝率','騎手同コース同距離別連対率','騎手同コース同距離別複勝率',
    '調教師全体勝率','調教師全体連対率','調教師全体複勝率','調教師競馬場別騎乗回数','調教師競馬場別勝率','調教師競馬場別連対率',
    '調教師競馬場別複勝率','調教師コース別騎乗回数','調教師コース別勝率','調教師コース別連対率','調教師コース別複勝率','調教師距離別騎乗回数',
    '調教師距離別勝率','調教師距離別連対率','調教師距離別複勝率','調教師同コース同距離別騎乗回数','調教師同コース同距離別勝率',
    '調教師年齢別年間勝率', '調教師年齢別年間連対率', '調教師年齢別年間複勝率','調教師年齢別勝率', '調教師年齢別連対率', '調教師年齢別複勝率',
    '調教師同コース同距離別連対率','調教師同コース同距離別複勝率','種牡馬全体勝率','種牡馬全体連対率','種牡馬全体複勝率',
    '種牡馬競馬場別出走頭数','種牡馬競馬場別勝率','種牡馬競馬場別連対率','種牡馬競馬場別複勝率','種牡馬コース別出走頭数','種牡馬コース別勝率',
    '種牡馬コース別連対率','種牡馬コース別複勝率','種牡馬距離別出走頭数','種牡馬距離別勝率','種牡馬距離別連対率','種牡馬距離別複勝率',
    '種牡馬同コース同距離別出走頭数','種牡馬同コース同距離別勝率','種牡馬同コース同距離別連対率','種牡馬同コース同距離別複勝率',
    '種牡馬同周り勝率', '種牡馬同周り連対率', '種牡馬同周り複勝率',  '種牡馬同枠勝率', '種牡馬同枠連対率', '種牡馬同枠複勝率',
    '父系統出走頭数','父系統全体勝率','父系統全体連対率','父系統全体複勝率','父系統競馬場別出走頭数','父系統競馬場別勝率','父系統競馬場別連対率','父系統競馬場別複勝率','父系統コース別出走頭数','父系統コース別勝率',
    '父系統コース別連対率','父系統コース別複勝率','父系統距離別出走頭数','父系統距離別勝率','父系統距離別連対率','父系統距離別複勝率',
    '父系統同コース同距離別出走頭数','父系統同コース同距離別勝率','父系統同コース同距離別連対率','父系統同コース同距離別複勝率',
    '季節勝率', '季節連対率', '季節複勝率', '逃げ率','先行率','中団率','追込率','マクリ率','上がり3F平均',
    '勝率','同競馬場勝率','同距離勝率','同競馬場同距離勝率','同騎手騎乗勝率','コースタイプ勝率','同距離同クラス勝率','同枠タイプ生涯勝率',
    '連対率','同競馬場連対率','同距離連対率','同競馬場同距離連対率','同騎手騎乗連対率','コースタイプ連対率','同距離同クラス連対率','同枠タイプ生涯連対率',
    '複勝率','同競馬場複勝率','同距離複勝率','同競馬場同距離複勝率','同騎手騎乗複勝率','コースタイプ複勝率','同距離同クラス複勝率','同枠タイプ生涯複勝率',
    '生涯出遅れ率','騎乗騎手年間出遅れ率', '同周り勝率', '同周り連対率', '同周り複勝率',

    ], axis=1, inplace=True)
    df['date'] = df['date'].astype(str).map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))
    
    df = df.sort_values(by='date', ascending = False)
    df = df.set_index('race_id')
    return df

def split_data(df, test_size=0.3, place=None):
    sorted_ids = df.sort_values('date').index.unique()
    train_ids = sorted_ids[:round(len(sorted_ids) * (1-test_size))]
    test_ids = sorted_ids[round(len(sorted_ids) * (1-test_size)):]

    train = df.loc[train_ids]
    test = df.loc[test_ids]
    
    return train, test

def train_valid_split_data(df, test_size=0.3):
    sorted_ids = df.sort_values('date').index.unique()
    train_ids = sorted_ids[:round(len(sorted_ids) * (1-test_size))]
    test_ids = sorted_ids[round(len(sorted_ids) * (1-test_size)):]
    
    train = df.loc[train_ids]
    valid = df.loc[test_ids]
    
    return train, valid

def process_categorical(df, target_columns):
    df2 = df.copy()
    for column in target_columns:
        df2[column] = LabelEncoder().fit_transform(df2[column].fillna('Na'))
    # df2 = pd.get_dummies(df2, sparse=True)
    df2 = pd.get_dummies(df2)
    for column in target_columns:
        df2[column] = df2[column].astype('category')
        
    return df2

class TimeModel:
    def __init__(self, model, base_data):
        self.model = model
        self.base_data = base_data
        
    def pred_time(self, X):
        pred_time = self.base_data.copy()[['id', 'popular']]
        actual_table = X.copy()[['id', 'h_num', 'place_id']]

        X = X.drop(['id'], axis=1)
        actual_table['pred_time'] = model.predict(X)

        actual_table = actual_table.reset_index()
        pred_time = pred_time.reset_index()
        actual = pred_time.merge(actual_table, left_index=True, right_index=True, how='right')
        actual.drop(['id_x', 'id_y', 'race_id_y'], axis=1, inplace=True)

        return actual
    
    def race_pred_time(self, X):
        actual = self.pred_time(X)
        groups = actual.groupby('race_id_x').groups
        column_list = ["h_num", 'pred_time', 'popular']

        new_df = pd.DataFrame()
        max_length = 0
        for group, indexes in groups.items():
            # 最後に並び替えをさせるのに最大作成された項目数を記録
            length = len(indexes)+1
            if length > max_length:
                max_length = length

            columns = list()
            values = list()
            columns += ['race_id', 'place_id']
            values += [actual.iloc[indexes]['race_id_x'].T.tolist()[0], actual.iloc[indexes]['place_id'].T.tolist()[0]]

            for target_column in column_list:
                columns += [f'{target_column}_{x}' for x in range(1, length)]
                sort_values = actual.iloc[indexes, :].sort_values(by='pred_time', ascending = False)
                values += sort_values[target_column].T.tolist()

            record_df = pd.DataFrame([values], columns=columns)
            new_df = pd.concat([new_df, record_df], axis=0)

        return new_df

class ModelEvaluator:
    def __init__(self, model, haitou_table, std = True):
        self.model = model
        self.haitou = haitou_table
        self.std = std
        self.pp = None
        
    def predict_proba(self, X, std=True):
#         proba = pd.Series(self.model.predict_proba(X)[:, 1], index=X.index)
        if self.pp is not None:
          return self.pp

        proba = pd.Series(self.model.predict_proba(X.drop(['id', 'odds', 'time_odds'], axis=1))[:, 1], index=X.index)
        if std:
            standard_scaler = lambda x: (x - x.mean()) / x.std()
            proba = proba.groupby(level=0).transform(standard_scaler)
            proba = (proba - proba.min()) / (proba.max() - proba.min())
            
        self.pp = proba
        return proba
    
    def prefict(self, X, threshold=0.5):
        y_pred = self.predict_proba(X)
        return [0 if p < threshold else 1 for p in y_pred]
    
    def win_ratio(self, X):
        sum1 = pd.DataFrame(self.predict_proba(X).groupby(level=0).sum())
        y_pred = self.predict_proba(X)

        return [(p / sum1.loc[i])[0] for i, p in y_pred.items()]
    
    def score(self, y_true, X):
        proba = self.predict_proba(X, True)
        n = lambda x: 0.0 if np.isnan(x) else x
        proba = proba.map(n)
        return roc_auc_score(y_true, proba)
    
    def feature_importance(self, X, n_display=20):
        importances = pd.DataFrame({'features': X.columns, 'importance': self.model.feature_importances_})
        return importances.sort_values('importance', ascending=False)[:n_display]
    
    def pred_table(self, X, threshold=0.5, bet_only=True):
        pred_table = X.copy()[['h_num', 'odds', 'time_odds']]
        pred_table['pred'] = self.prefict(X, threshold)
        pred_table['win_ratio'] = self.win_ratio(X)
        if bet_only:
            pred_table = pred_table[pred_table['pred'] == 1][['h_num', 'odds', 'time_odds', 'win_ratio']]
#             pred_table = pred_table[pred_table['pred'] == 1][['h_num', 'odds', 'time_odds']]
            return pred_table
        else:
            return pred_table
        
    def fukusho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        money = -100 * n_bets
        haitou = self.haitou.copy()
        df = haitou.merge(pred_table, left_index=True, right_index=True, how='right')

        n_hits = len(df[df['1着馬番'] == df['h_num']]) + len(df[df['2着馬番'] == df['h_num']]) + len(df[df['3着馬番'] == df['h_num']]) + len(df[df['4着馬番'] == df['h_num']])
        for i in range(1, 5):
            money += df[df[str(i) + '着馬番'] == df['h_num']]['複勝' + str(i)].sum()
        return_rate =  (n_bets*100 + money) / (n_bets * 100)
        return n_bets, return_rate,n_hits
    
    def tansho_return(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        
        n_races = pred_table.index.nunique()
        
        money = -100 * n_bets
        df = self.haitou.copy()
        df = df.merge(pred_table, left_index=True, right_index=True, how='right')
        df['単勝配当'] = df['単勝'].astype(int)
        
#         std = ((df['1着馬番'] ==  df['h_num']) * df['単勝配当'])\
#         .groupby(level=0).sum().std() * np.sqrt(n_races) / (100 * n_bets)
        
        n_hits = len(df[df['1着馬番'] == df['h_num']])
        
        money += df[df['1着馬番'] == df['h_num']]['単勝配当'].sum()
        return_rate =  (n_bets*100 + money) / (n_bets * 100)
        return n_bets, return_rate, n_hits
    
    def tansho_return_proper(self, X, threshold=0.5):
        pred_table = self.pred_table(X, threshold)
        n_bets = len(pred_table)
        n_races = pred_table.index.nunique()
        df = self.haitou.copy()
        df = df.merge(pred_table, left_index=True, right_index=True, how='right')
        
        bet_money = (1/pred_table['odds']).sum()
        std = ((df['1着馬番'] == df['h_num']).astype(int)).groupby(level=0).sum().std() * np.sqrt(n_races) / bet_money
        
        df['h_num'] = df['h_num'].astype(float)
        df['馬番_1'] = df['1着馬番']
        n_hits = len(df.query('馬番_1 == h_num'))
        return_rate = n_hits/bet_money
        return n_bets, return_rate, n_hits
        
    
def gain(return_func, X, n_samples=100, lower=50, min_threshold=0.5):
    gain = {}
    for i in tqdm(range(n_samples)):
        # 閾値を増やす        
        threshold = 1 * i /n_samples + min_threshold * (1 - i/n_samples)
        n_bets, return_rate, n_hits = return_func(X, threshold)
        if n_bets == 0:
            break;
        if n_bets > lower:
            gain[n_bets] = { 'return_rate': return_rate, 'n_hits': n_hits }
    return pd.DataFrame(gain).T

In [2]:
haitou = pd.read_csv('./csv_new2/race_detail.csv')
haitou = haitou.set_index('race_id')

In [4]:
# allrace = pd.read_pickle('./pickle_new/base_race_20220813_5.pickle')
allrace = pd.read_pickle('./pickle_new/base_race_20220813_2.pickle')
time = pd.read_csv('./csv_new2/base/race_time.csv')
allrace = allrace.merge(time, how='left', on='id')

time_odds_base = pd.read_csv('./csv_new2/time_odds.csv')
allrace = allrace.merge(time_odds_base, how='left', on='id')

df = allrace.query('(course == 2 | course == 1)')
all_r = preprocessing(df)
# all_r = preprocessing(allrace)
# all_r['popular'] = all_r['popular'].map(lambda x: 1 if x == 1 else 0)

all_r.drop([
  '気温', '風速', '風向',
  '1走前着差', '2走前着差', '3走前着差',
    '4走前着差', '5走前着差',
  '1走前スピードZI','2走前スピードZI', '3走前スピードZI',
    '4走前スピードZI', '5走前スピードZI',
  '1走前スピード指数','2走前スピード指数', '3走前スピード指数',
    '4走前スピード指数', '5走前スピード指数',
    '1走前相対着順', '2走前相対着順','3走前相対着順','4走前相対着順','5走前相対着順',
    '1走前相対人気', '2走前相対人気','3均走前相対人気','4走前相対人気','5走前相対人気',
      '1走前スピード指数偏差','2走前スピード指数偏差', '3走前スピード指数偏差',
    '4走前スピード指数偏差', '5走前スピード指数偏差',
], axis=1, inplace=True)

categorical = process_categorical(all_r, [
    'producer', 'owner', 'training_course', 
    'jockey_id', 'gender', 'trainer_id', 'weight',
    '天候', '馬場状態', 'grade', 'age', 'place_id',
    'color_id', 'stallion_id', 'affiliation_id'
])

categorical = categorical.reset_index()
vec = pd.read_pickle('./pickle_new/peds_vec.pickle')
categorical = categorical.merge(vec.drop(['name'], axis=1), on='horse_id')

categorical = categorical.set_index('race_id')

# target = pd.read_pickle('./pickle_new/new_race_20220904.pickle')
# time_odds = pd.read_csv('./csv_new2/20220904/time_odds.csv')
# target = target.merge(time_odds, how='left', on='id')
# target = target[target['date'].notnull()]

# target = target.query('(course == 2 | course == 1)')
# target = preprocessing(target)
# target['result'] = target['result'].map(lambda x: 1 if x == 1 else 0)
# target.drop([
#   '気温', '風速', '風向',
#   '1走前着差', '2走前着差', '3走前着差', '4走前着差', '5走前着差',
#   '1走前スピードZI','2走前スピードZI', '3走前スピードZI','4走前スピードZI', '5走前スピードZI',
#   '1走前スピード指数','2走前スピード指数', '3走前スピード指数','4走前スピード指数', '5走前スピード指数',
# #   '先行指数', 'ペース指数', '上がり指数', 'スピード指数'
# ], axis=1, inplace=True)
# for i in range(1, 63):
#     target.drop(['peds' + str(i)], axis=1, inplace=True)
# test1 = process_categorical(target,  [
#     'producer', 'owner', 'training_course', 
#     'jockey_id', 'gender', 'trainer_id', 'weight',
#     '天候', '馬場状態', 'grade', 'age', 'place_id',
#     'color_id', 'stallion_id', 'affiliation_id'
# ])
c = categorical.copy()

In [589]:
cc = c.copy()
cc['result'] = cc['result'].map(lambda x: 1 if x <= 3 else 0)
result_d = cc.fillna(0)

train1, valid1  = split_data(result_d)
valid1, test1  = train_valid_split_data(valid1)

X_train1_d  = train1.drop(['id', 'date', 'result',  'time_popular', 'time_odds', 'odds', 'popular', 'correct_time', 'horse_id'], axis=1)
t_train1_d  = train1['result']
X_valid1_d  = valid1.drop(['id', 'date', 'result',  'time_popular', 'correct_time', 'horse_id'], axis=1)
t_valid1_d  = valid1['result']

X_train_d = torch.Tensor(X_train1_d.values)
t_train_d = torch.Tensor(t_train1_d.values)
X_valid_d = torch.Tensor(X_valid1_d.drop(['odds', 'popular', 'time_odds'], axis=1).values)
t_valid_d = torch.Tensor(t_valid1_d.values)

t_train_d = t_train_d.reshape([-1, 1])
t_valid_d = t_valid_d.reshape([-1, 1])

X_test1_d = test1.drop(['id', 'date', 'result',  'time_popular', 'correct_time', 'horse_id'], axis=1)
t_test1_d = test1['result']
X_test_d = torch.Tensor(X_test1_d.drop(['odds', 'popular', 'time_odds'], axis=1).values)
t_test_d = torch.Tensor(t_test1_d.values)

t_test_d = t_test_d.reshape([-1, 1])

In [22]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_train_d, t_train_d)
loader = DataLoader(dataset, batch_size=128, shuffle=True)

from torch import optim

model = nn.Sequential(
    nn.Linear(575, 128),
    nn.BatchNorm1d(128),
    nn.Linear(128, 1),
    nn.Sigmoid(),
)

loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters())

# optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(50):
    model.train()

    for X, t in loader:
        optimizer.zero_grad()
        y = model(X)
        loss = loss_fn(y, t)
        # 傾きの計算
        loss.backward()
        # optimizerの更新
        optimizer.step()
        
    model.eval()
    y_train_d = model(X_train_d)
    y_valid_d = model(X_valid_d)
   #  平均二乗誤差 予測値と正解値の誤差の計算
    loss_train = loss_fn(y_train_d, t_train_d)
    loss_valid = loss_fn(y_valid_d, t_valid_d)
    auc_train = roc_auc_score(t_train_d.detach().numpy(), y_train_d.detach().numpy())
    auc_valid = roc_auc_score(t_valid_d.detach().numpy(), y_valid_d.detach().numpy())
    
    print('epoch: {}, train:[loss={:.3f}, AUC={:.3f}], test:[loss={:.3f}, AUC={:.3f}]'.  format(epoch, loss_train, auc_train, loss_valid, auc_valid))

epoch: 0, train:[loss=0.143, AUC=0.756], test:[loss=0.150, AUC=0.748]
epoch: 1, train:[loss=0.142, AUC=0.766], test:[loss=0.147, AUC=0.761]
epoch: 2, train:[loss=0.140, AUC=0.767], test:[loss=0.145, AUC=0.763]
epoch: 3, train:[loss=0.139, AUC=0.771], test:[loss=0.143, AUC=0.767]
epoch: 4, train:[loss=0.142, AUC=0.772], test:[loss=0.147, AUC=0.763]
epoch: 5, train:[loss=0.138, AUC=0.771], test:[loss=0.143, AUC=0.765]
epoch: 6, train:[loss=0.138, AUC=0.773], test:[loss=0.144, AUC=0.763]
epoch: 7, train:[loss=0.140, AUC=0.775], test:[loss=0.144, AUC=0.771]
epoch: 8, train:[loss=0.139, AUC=0.774], test:[loss=0.144, AUC=0.769]
epoch: 9, train:[loss=0.143, AUC=0.767], test:[loss=0.151, AUC=0.758]
epoch: 10, train:[loss=0.139, AUC=0.771], test:[loss=0.145, AUC=0.761]
epoch: 11, train:[loss=0.139, AUC=0.771], test:[loss=0.143, AUC=0.765]
epoch: 12, train:[loss=0.139, AUC=0.777], test:[loss=0.143, AUC=0.772]
epoch: 13, train:[loss=0.138, AUC=0.772], test:[loss=0.143, AUC=0.764]
epoch: 14, train

In [23]:
y_test_d = model(X_test_d)
roc_auc_score(t_test_d.detach().numpy(), y_test_d.detach().numpy())

0.7682880557954992

In [650]:
grade = pd.read_csv('./csv_new2/races.csv')

In [793]:
x = valid1[['h_num', 'odds', 'time_odds', 'popular']]
t_pred_d = pd.Series(np.around(torch.flatten(y_valid_d).detach().numpy(), decimals=5), index=x.index)
sum1 = pd.DataFrame(t_pred_d.groupby(level=0).sum())

x['proba'] = t_pred_d
proba = x[['proba']]
standard_scaler = lambda x: (x - x.mean()) / x.std()
proba = proba.groupby(level=0).transform(standard_scaler)
proba = (proba - proba.min()) / (proba.max() - proba.min())

x['proba'] = proba
x['pred'] = x['proba'].map(lambda x: 0 if x <= 0.5 else 1)

shisuu = pd.read_csv('./shisuu_new.csv')
v = valid1.reset_index()[['race_id', 'h_num', 'id']]
v['horse_race_id'] = v['id']

sv = v.drop(['id'], axis=1).merge(shisuu, on='horse_race_id')
sv = sv.merge(x, on=['race_id', 'h_num'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['proba'] = t_pred_d
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['proba'] = proba
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['pred'] = x['proba'].map(lambda x: 0 if x <= 0.5 else 1)


In [809]:
x1 = sv[['race_id', 'h_num', 'odds', 'time_odds', 'pred', 'proba', 'popular','score']]
x1['s_proba'] = (x1['proba'] * 100).astype(int)
x1['shisuu'] = (((50 + x1['score']) / 100)  * x1['s_proba']).astype(int)
x1 = x1.merge(grade, on='race_id')

bt1 = x1[
    (x1['pred'] == 1)
    &
#     ((x1['pred_rank'] == 1))
#     &
#      (x1['time_odds'] / x1['d_odds'] >= 1)
#     &
#     (x1['grade'] > 2)
#     &
    (x1['score'] > 0)
]
bt1

Unnamed: 0,race_id,h_num,odds,time_odds,pred,proba,popular,score,s_proba,shisuu,grade
33,2018041509020807,12,4.3,4.5,1,0.547701,2.0,8,54,31,3
38,2018041509020807,5,7.3,7.4,1,0.547039,4.0,11,54,32,3
50,2018041506030805,10,4.5,5.0,1,0.589088,2.0,2,58,30,3
77,2018041503010407,4,6.1,7.0,1,0.624692,3.0,3,62,32,3
78,2018041503010407,1,3.4,4.2,1,0.621195,1.0,3,62,32,3
...,...,...,...,...,...,...,...,...,...,...,...
137462,2021042405020111,2,3.1,2.9,1,0.816141,1.0,17,81,54,7
137476,2021042405020109,2,3.6,4.9,1,0.710751,2.0,5,71,39,4
137488,2021042405020109,7,2.1,2.5,1,0.811091,1.0,11,81,49,4
137503,2021042409020907,2,2.7,3.3,1,0.547634,2.0,2,54,28,3


In [810]:
bh = bt1.merge(haitou, on='race_id')

money = 0
f_c = 0
for i in range(1, 5):
    s = str(i)
    f_c += len(bh[bh['h_num'] == bh[s + '着馬番']]['複勝' + s])
    money += bh[bh['h_num'] == bh[s + '着馬番']]['複勝' + s].sum()
    
print("点数：{} レース数:{} 対象レース数:{} 出現頻度:{:.1%} 的中率:{:.1%} 的中数:{} 賭金:{:,}円 配当合計:{:,}円 回収率:{:.1%}". format(
    len(bt1),\
    len(valid1.groupby('race_id')),\
    len(bt1.groupby('race_id')),\
    len(bt1.groupby('race_id')) / len(valid1.groupby('race_id')),\
    f_c / len(bt1),\
    f_c,\
    len(bt1) * 100,\
    int(money),\
    (money / (len(bt1) * 100))
))

点数：12743 レース数:10060 対象レース数:7692 出現頻度:76.5% 的中率:51.3% 的中数:6542 賭金:1,274,300円 配当合計:1,074,130円 回収率:84.3%


In [811]:
b_umaren = bt1.merge(x1, on='race_id')
b_umaren = b_umaren[
    b_umaren['h_num_x'] != b_umaren['h_num_y']
]

umaren = b_umaren[
#     (b_umaren['score_y'] >= 0)
#     &
    ((b_umaren['shisuu_x'] + b_umaren['shisuu_y']) > 75)
]

In [812]:
umaren_bets = umaren.merge(haitou, on='race_id')[['race_id', 'h_num_x', 'h_num_y', '1着馬番', '2着馬番', '馬連', '馬単']]
tekichu = umaren_bets[
    (
        (umaren_bets['h_num_x'] == umaren_bets['1着馬番'])
        &
        (umaren_bets['h_num_y'] == umaren_bets['2着馬番'])
    )
    |
    (
        (umaren_bets['h_num_x'] == umaren_bets['2着馬番'])
        &
        (umaren_bets['h_num_y'] == umaren_bets['1着馬番'])
    )
]

print("点数：{} レース数:{} 出現頻度:{:.1%} 的中率:{:.1%} 的中数:{} 賭金:{:,}円 配当合計:{:,}円 最高配当:{:,}円 回収率:{:.1%}". format(
    len(umaren),\
    len(umaren.groupby('race_id')),\
    len(umaren.groupby('race_id')) / len(valid1.groupby('race_id')),\
    len(tekichu) / len(umaren),\
    len(tekichu),\
    len(umaren) * 100,\
    int(tekichu['馬連'].sum()),\
    int(tekichu['馬連'].max()),\
    (int(tekichu['馬連'].sum()) / (len(umaren) * 100))
))

点数：3077 レース数:1345 出現頻度:13.4% 的中率:12.1% 的中数:371 賭金:307,700円 配当合計:296,400円 最高配当:8,560円 回収率:96.3%


In [813]:
umatan = b_umaren[
    ((b_umaren['shisuu_x'] + b_umaren['shisuu_y']) > 75)
]
umatan_bets = umatan.merge(haitou, on='race_id')[['race_id', 'h_num_x', 'h_num_y', '1着馬番', '2着馬番', '馬連', '馬単']]

umatan_tekichu = umatan_bets[
    (
        (umatan_bets['h_num_x'] == umatan_bets['1着馬番'])
        &
        (umatan_bets['h_num_y'] == umatan_bets['2着馬番'])
    )
    |
    (
        (umatan_bets['h_num_x'] == umatan_bets['2着馬番'])
        &
        (umatan_bets['h_num_y'] == umatan_bets['1着馬番'])
    )
]

print("点数：{} レース数:{} 出現頻度:{:.1%} 的中率:{:.1%} 的中数:{} 賭金:{:,}円 配当合計:{:,}円 最高配当:{:,}円 回収率:{:.1%}". format(
    len(umatan),\
    len(umatan.groupby('race_id')),\
    len(umatan.groupby('race_id')) / len(valid1.groupby('race_id')),\
    len(umatan_tekichu) / len(umaren),\
    len(umatan_tekichu),\
    len(umatan) * 200,\
    int(umatan_tekichu['馬単'].sum()),\
    int(umatan_tekichu['馬単'].max()),\
    (int(umatan_tekichu['馬単'].sum()) / (len(umatan) * 200))
))

点数：3077 レース数:1345 出現頻度:13.4% 的中率:12.1% 的中数:371 賭金:615,400円 配当合計:574,660円 最高配当:17,070円 回収率:93.4%


In [783]:
b_sanren = b_umaren.merge(x1, on='race_id')
b_sanren = b_sanren[
    (b_sanren['h_num_x'] != b_sanren['h_num'])
    &
    (b_sanren['h_num_y'] != b_sanren['h_num'])
]

t_sanren = b_sanren[
    (b_sanren['score'] > 0)
    &
    ((b_sanren['shisuu_x'] + b_sanren['shisuu_y'] + b_sanren['shisuu']) >= 95)
]

In [612]:
sanren = t_sanren.merge(haitou, on='race_id')[['race_id', 'h_num_x', 'h_num_y', 'h_num', '1着馬番', '2着馬番', '3着馬番', '3連複', '3連単']]
san_tekichu = sanren[
    (
        (sanren['h_num_x'] == sanren['1着馬番']) & (sanren['h_num_y'] == sanren['2着馬番']) & (sanren['h_num'] == sanren['3着馬番'])
    )
    |
    (
        (sanren['h_num_x'] == sanren['1着馬番']) & (sanren['h_num_y'] == sanren['3着馬番']) & (sanren['h_num'] == sanren['2着馬番'])
    )
    |
    
    (
        (sanren['h_num_x'] == sanren['2着馬番']) & (sanren['h_num_y'] == sanren['1着馬番']) & (sanren['h_num'] == sanren['3着馬番'])
    )
    |
    (
        (sanren['h_num_x'] == sanren['2着馬番']) & (sanren['h_num_y'] == sanren['3着馬番']) & (sanren['h_num'] == sanren['1着馬番'])
    )
    |
    (
        (sanren['h_num_x'] == sanren['3着馬番']) & (sanren['h_num_y'] == sanren['2着馬番']) & (sanren['h_num'] == sanren['1着馬番'])
    )
    |
    (
        (sanren['h_num_x'] == sanren['3着馬番']) & (sanren['h_num_y'] == sanren['1着馬番']) & (sanren['h_num'] == sanren['2着馬番'])
    )
]

print("点数：{} レース数:{} 出現頻度:{:.1%} 的中率:{:.1%} 的中数:{} 賭金:{:,}円 配当合計:{:,}円 最高配当:{:,}円 回収率:{:.1%}". format(
    len(sanren),\
    len(sanren.groupby('race_id')),\
    len(sanren.groupby('race_id')) / len(valid1.groupby('race_id')),\
    len(san_tekichu) / len(sanren),\
    len(san_tekichu),\
    len(sanren) * 100,\
    int(san_tekichu['3連複'].sum()),\
    int(san_tekichu['3連複'].max()),\
    (int(san_tekichu['3連複'].sum()) / (len(sanren) * 100))
))

点数：20578 レース数:1808 出現頻度:18.0% 的中率:2.6% 的中数:529 賭金:2,057,800円 配当合計:1,534,250円 最高配当:35,650円 回収率:74.6%


In [613]:
print("点数：{} レース数:{} 出現頻度:{:.1%} 的中率:{:.1%} 的中数:{} 賭金:{:,}円 配当合計:{:,}円 最高配当:{:,}円 回収率:{:.1%}". format(
    len(sanren),\
    len(sanren.groupby('race_id')),\
    len(sanren.groupby('race_id')) / len(valid1.groupby('race_id')),\
    len(san_tekichu) / len(sanren),\
    len(san_tekichu),\
    len(sanren) * 600,\
    int(san_tekichu['3連単'].sum()),\
    int(san_tekichu['3連単'].max()),\
    (int(san_tekichu['3連単'].sum()) / (len(sanren) * 600))
))

点数：20578 レース数:1808 出現頻度:18.0% 的中率:2.6% 的中数:529 賭金:12,346,800円 配当合計:8,418,330円 最高配当:180,340円 回収率:68.2%
