In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import re
from urllib.request import urlopen
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
from selenium.webdriver import Chrome,ChromeOptions

In [4]:
def scrape_race_results(race_id_list, pre_race_results=None):
    """Scrape the first HTML table of every race page on db.netkeiba.com.

    Parameters
    ----------
    race_id_list : iterable of str
        Race IDs; each one is appended to ``https://db.netkeiba.com/race/``.
    pre_race_results : dict, optional
        Previously scraped ``{race_id: DataFrame}`` mapping. IDs already
        present are skipped. The mapping is copied, so the caller's object
        is never mutated.

    Returns
    -------
    dict
        ``{race_id: DataFrame}`` holding the previous results plus every
        table fetched before the loop finished or was stopped.
    """
    # Work on a copy so repeated calls never share (or modify) the same dict.
    race_results = {} if pre_race_results is None else pre_race_results.copy()
    for race_id in tqdm(race_id_list):
        if race_id in race_results:
            continue
        try:
            # Throttle: one request per second.
            time.sleep(1)
            url = "https://db.netkeiba.com/race/" + race_id
            race_results[race_id] = pd.read_html(url)[0]
        except IndexError:
            # No table at position 0 for this ID: skip it.
            continue
        except Exception as e:
            # Report the error and stop, keeping what has been scraped so far.
            print(e)
            break
        except KeyboardInterrupt:
            # A manual interrupt stops the loop but still returns the partial
            # results (previously done implicitly by a bare ``except``).
            print("interrupted; returning the results scraped so far")
            break
    return race_results

# Helper: turn the race-condition text into a dict of fields
def _parse_race_info(texts):
    """Extract race-condition fields from the concatenated intro text.

    The text is tokenised with ``\\w+`` and each token is matched against the
    known vocabularies. Keys that never match are simply absent.

    Returns a dict with any of the keys ``race_type``, ``course_len``,
    ``ground_state``, ``weather`` and ``date``.
    """
    info_dict = {}
    for text in re.findall(r"\w+", texts):
        if text in ['芝','ダート']:
            info_dict['race_type'] = text
        if '障' in text:
            info_dict['race_type'] = '障害'
        # Take the digits immediately before 'm' so that other digits in the
        # same token are not mistaken for the distance; a token containing
        # 'm' without such digits is ignored instead of raising IndexError.
        length_match = re.search(r'(\d+)m', text)
        if length_match:
            info_dict['course_len'] = int(length_match.group(1))
        if text in ['良','稍重','重','不良']:
            info_dict['ground_state'] = text
        if text in ['曇','晴','雨','小雨','小雪','雪']:
            info_dict['weather'] = text
        if '年' in text:
            info_dict['date'] = text
    return info_dict


# Scrape the race conditions (surface, distance, going, weather, date)
def scrape_race_info(race_id_list):
    """Scrape race-condition info for every race ID from db.netkeiba.com.

    Parameters
    ----------
    race_id_list : iterable of str
        Race IDs; each one is appended to ``https://db.netkeiba.com/race/``.

    Returns
    -------
    dict
        ``{race_id: info_dict}`` for every page that contained a
        ``div.data_intro`` block with at least two ``<p>`` elements.
        Pages without it are skipped.
    """
    race_infos = {}

    for race_id in tqdm(race_id_list):
        try:
            # Throttle before every request, including the ones that end up
            # being skipped, so the server is never hit in a tight loop.
            time.sleep(1)
            url = 'https://db.netkeiba.com/race/' + race_id
            # A timeout keeps one stalled connection from hanging the scrape.
            html = requests.get(url, timeout=30)
            html.encoding = 'EUC-JP'
            soup = BeautifulSoup(html.text, 'html.parser')

            intro = soup.find("div", attrs={'class': 'data_intro'})
            if intro is None:
                # No intro block on this page: skip the ID rather than
                # letting an AttributeError abort the entire scrape.
                continue
            paragraphs = intro.find_all('p')
            texts = paragraphs[0].text + paragraphs[1].text

            race_infos[race_id] = _parse_race_info(texts)
        except IndexError:
            # Fewer than two <p> elements in the intro block: skip this ID.
            continue
        except Exception as e:
            # Report the error and stop, keeping what has been scraped so far.
            print(e)
            break
        except KeyboardInterrupt:
            # A manual interrupt stops the loop but still returns the partial
            # results (previously done implicitly by a bare ``except``).
            print("interrupted; returning the info scraped so far")
            break
    return race_infos

# Load the previously saved pickle data.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files that come from a trusted source.
#results = pd.read_pickle('results.pickle')
# presumably the race results with the scraped race info merged in - verify
results2 = pd.read_pickle('results_addinfo.pickle')
# Clean the raw race-result table into typed columns
def preprocessing(results):
    """Return a cleaned copy of the race-result table.

    Steps, in order:
      * keep only the rows whose '着順' (finishing position) is numeric and
        store it as int,
      * split '性齢' into '性' (first character) and '年齢' (the rest, int),
      * split '馬体重' at '(' into '体重' and '体重変化' (both int),
      * cast '単勝' to float and 'course_len' to int,
      * drop the columns that are no longer needed,
      * parse 'date' with the '%Y年%m月%d日' format.

    The input frame is not modified.
    """
    # Rows whose finishing position is not a number are discarded.
    finish_order = pd.to_numeric(results['着順'], errors='coerce')
    has_finish = finish_order.notna().to_numpy()
    frame = results.loc[has_finish].copy()
    frame['着順'] = finish_order.to_numpy()[has_finish].astype(int)

    # Sex is the first character of '性齢', age is everything after it.
    sex_age = frame['性齢'].map(str)
    frame['性'] = sex_age.map(lambda token: token[0])
    frame['年齢'] = sex_age.map(lambda token: token[1:]).astype(int)

    # '馬体重' is split at '('; the trailing character of the second part
    # is cut off before the integer conversion.
    weight_parts = frame['馬体重'].str.split("(", expand=True)
    frame['体重'] = weight_parts[0].astype(int)
    frame['体重変化'] = weight_parts[1].str[:-1].astype(int)

    # Numeric casts, removal of unused columns, then date parsing.
    unused_columns = ["タイム", "着差", "調教師", "性齢", "馬体重", '馬名', '騎手']
    return (
        frame
        .astype({'単勝': float, 'course_len': int})
        .drop(columns=unused_columns)
        .assign(date=lambda d: pd.to_datetime(d['date'], format='%Y年%m月%d日'))
    )

# Chronological train/test split
def split_data(df,test_size=0.3):
    """Split ``df`` into train and test parts by date.

    The unique index values are ordered by the 'date' column; the first
    ``round(n * (1 - test_size))`` of them form the training set and the
    remainder the test set. The 'date' column is removed from both parts.

    Returns
    -------
    (train, test) : tuple of DataFrame
    """
    ordered_ids = df.sort_values('date').index.unique()
    n_train = round(len(ordered_ids) * (1 - test_size))
    train_ids, test_ids = ordered_ids[:n_train], ordered_ids[n_train:]
    train = df.loc[train_ids].drop(columns=['date'])
    test = df.loc[test_ids].drop(columns=['date'])
    return train, test

class HorseResults:
    """Past race records per horse, used to build averaged history features.

    The frame handed to the constructor must contain the columns '日付',
    '着順' and '賞金'. Its index is used as the horse ID: ``average`` groups
    on index level 0 and ``merge`` joins it against the 'horse_id' column.
    """

    def __init__(self,horse_results):
        self.horse_results = horse_results[['日付','着順','賞金']]
        self.preprocessing()

    def preprocessing(self):
        """Clean ``self.horse_results`` and store the cleaned frame back.

        Rows whose '着順' is not numeric are removed, '着順' becomes int,
        '日付' is parsed into a new 'date' column (and dropped), and missing
        '賞金' values are replaced by 0.
        """
        # Remove the rows whose finishing position is not a number.
        finish_order = pd.to_numeric(self.horse_results['着順'], errors='coerce')
        has_finish = finish_order.notna().to_numpy()
        df = self.horse_results.loc[has_finish].copy()
        df['着順'] = finish_order.to_numpy()[has_finish].astype(int)

        df['date'] = pd.to_datetime(df['日付'])

        # Fill missing prize money with 0. Assign the result back: a chained
        # ``df['賞金'].fillna(0, inplace=True)`` only fills a temporary object
        # under pandas Copy-on-Write and leaves the NaNs in ``df``.
        df['賞金'] = df['賞金'].fillna(0)

        self.horse_results = df.drop(columns=['日付'])

    def average(self,horse_id_list,date,n_samples='all'):
        """Average '着順' and '賞金' of the given horses over races before ``date``.

        Parameters
        ----------
        horse_id_list : list-like
            Horse IDs to look up. IDs without any record are ignored.
        date : datetime-like
            Only records strictly earlier than this date are used.
        n_samples : 'all' or int > 0
            'all' uses every earlier record; an integer uses only the
            ``n_samples`` most recent earlier records of each horse.

        Returns
        -------
        DataFrame
            Indexed by horse ID, with columns '着順_{n_samples}R' and
            '賞金_{n_samples}R'.

        Raises
        ------
        ValueError
            If ``n_samples`` is a number that is not greater than 0.
        """
        # ``isin`` instead of ``.loc[list]``: horses that have no record do
        # not raise KeyError, and repeated IDs do not duplicate rows.
        target_df = self.horse_results[self.horse_results.index.isin(horse_id_list)]
        past_df = target_df[target_df['date'] < date]

        # Select how many past races are taken into account.
        if n_samples == 'all':
            filtered_df = past_df
        elif n_samples > 0:
            filtered_df = past_df.sort_values('date',ascending=False).\
                groupby(level=0).head(n_samples)
        else:
            raise ValueError('n_samples must be >0')

        average = filtered_df.groupby(level = 0)[['着順','賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples),'賞金':'賞金_{}R'.format(n_samples)})

    def merge(self,results,date,n_samples = 'all'):
        """Left-join the averaged history onto the rows of ``results`` run on ``date``."""
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list,date,n_samples),left_on='horse_id',right_index=True,how='left')
        return merged_df

    def merge_all(self,results,n_samples = 'all'):
        """Apply ``merge`` for every distinct date in ``results`` and concatenate."""
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results,date,n_samples) for date in tqdm(date_list)])
        return merged_df

class Return:
    """Accessors for payout tables.

    ``return_tables`` is a DataFrame whose column 0 holds the bet type
    ('複勝', '単勝', ...), column 1 the winning horse number(s) and column 2
    the payout(s). Several values in one cell are separated by the literal
    text 'br'.
    """

    def __init__(self,return_tables):
        self.return_tables = return_tables

    @staticmethod
    def _split_top3(column, prefix):
        """Split a 'br'-separated column into exactly three '<prefix>_<i>' columns.

        Thousands separators are removed first. ``reindex`` keeps columns
        0..2 whatever number of pieces the split produced: extra pieces are
        discarded and missing ones become NaN, instead of relying on the
        split yielding exactly four columns.
        """
        parts = (
            column.str.replace(',', '', regex=False)
            .str.split('br', expand=True)
            .reindex(columns=range(3))
        )
        parts.columns = ['{}_{}'.format(prefix, i) for i in range(3)]
        return parts

    # Exposed as a property so it can be read like an attribute (no parentheses).
    @property
    def fukusho(self):
        """Integer table of the '複勝' rows.

        Columns are 'win_0'..'win_2' (horse numbers) and
        'return_0'..'return_2' (payouts); missing entries are 0.
        """
        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
        wins = self._split_top3(fukusho[1], 'win')
        returns = self._split_top3(fukusho[2], 'return')

        df = pd.concat([wins,returns],axis=1)
        # Convert column by column to numbers, then fill the gaps with 0.
        return df.apply(pd.to_numeric).fillna(0).astype(int)

    @property
    def tansho(self):
        """Numeric table of the '単勝' rows with columns 'win' and 'return'.

        Cells that are not a single number (after removing thousands
        separators) become NaN.
        """
        # Copy so the column assignments below never write into a slice of
        # ``return_tables``.
        tansho = self.return_tables[self.return_tables[0]=='単勝'][[1,2]].copy()
        tansho.columns = ['win','return']

        for column in tansho.columns:
            # Strip thousands separators first, consistent with ``fukusho``;
            # otherwise a payout such as '1,020' would be coerced to NaN.
            cleaned = tansho[column].astype(str).str.replace(',', '', regex=False)
            tansho[column] = pd.to_numeric(cleaned,errors='coerce')
        return tansho

# Drop the horse-name and jockey-name columns, which are no longer needed
# once the *_id columns are used instead
def prefix(results_fix):
    """Return a copy of ``results_fix`` without the '馬名' and '騎手' columns.

    The input frame is left untouched.
    """
    trimmed = results_fix.drop(columns=['馬名','騎手'])
    return trimmed.copy()

class ModelEvaluator:
    """Evaluate a fitted classifier, including simulated betting returns.

    Parameters
    ----------
    model :
        Fitted estimator exposing ``predict_proba`` and ``feature_importances_``.
    return_tables : DataFrame
        Payout tables, passed to ``Return``.
    std : bool, default True
        If True, predicted probabilities are standardised within each index
        group and then min-max scaled to [0, 1] (see ``predict_proba``).
    """

    def __init__(self,model,return_tables,std=True):
        self.model = model
        self.fukusho = Return(return_tables).fukusho
        self.tansho = Return(return_tables).tansho
        self.std = std

    def predict_proba(self,x):
        """Return the positive-class score of every row as a Series indexed like ``x``."""
        proba = pd.Series(self.model.predict_proba(x)[:,1], index=x.index)
        if self.std:
            # Standardise within each index group (level 0), then rescale
            # the whole series to the [0, 1] range.
            standard_scaler = lambda group: (group - group.mean()) / group.std()
            proba = proba.groupby(level=0).transform(standard_scaler)
            proba = (proba - proba.min()) / (proba.max() - proba.min())
        return proba

    def predict(self,x,threshold=0.5):
        """Return a list of 0/1 labels: 1 where the score is at least ``threshold``."""
        y_pred = self.predict_proba(x)
        return [0 if p<threshold else 1 for p in y_pred]

    def score(self,y_true,x):
        """Return the ROC AUC of the scores for ``x`` against ``y_true``."""
        return roc_auc_score(y_true,self.predict_proba(x))

    def feature_importance(self,x,n_display=20):
        """Return the ``n_display`` most important features.

        Feature names are taken from the columns of ``x`` (the argument, not
        a notebook-level variable), so ``x`` must have the same columns, in
        the same order, as the data the model was fitted on.
        """
        importances = pd.DataFrame({'features':x.columns,
                                    'importance':self.model.feature_importances_})
        return importances.sort_values('importance',ascending=False)[:n_display]

    def pred_table(self,x,threshold=0.5,bet_only=True):
        """Return the horse numbers together with the predicted label.

        With ``bet_only=True`` only the '馬番' Series of the rows predicted
        as 1 is returned; otherwise the full two-column table.
        """
        pred_table = x[['馬番']].copy()
        pred_table['pred'] = self.predict(x,threshold)
        if bet_only:
            return pred_table[pred_table['pred']==1]['馬番']
        else:
            return pred_table

    def fukusho_return(self,x,threshold=0.5):
        """Simulate 100-unit '複勝' bets on every horse predicted as 1.

        Returns ``(n_bets, money)`` where ``money`` is the summed payout
        minus the total stake.
        """
        pred_table = self.pred_table(x,threshold)
        n_bets = len(pred_table)
        money = -100 * n_bets
        df = self.fukusho.copy()
        df = df.merge(pred_table,left_index=True, right_index=True,how='right')
        # A bet pays out when the horse number matches any of the three
        # placed horses; add the payout of the matching slot.
        for i in range(3):
            money += df[df['win_{}'.format(i)]==df['馬番']]['return_{}'.format(i)].sum()
        return n_bets,money

    def tansho_return(self,x,threshold=0.5):
        """Simulate 100-unit '単勝' bets on every horse predicted as 1.

        Returns ``(n_bets, money)`` where ``money`` is the summed payout
        minus the total stake.
        """
        pred_table = self.pred_table(x,threshold)
        n_bets = len(pred_table)
        money = -100 * n_bets
        df = self.tansho.copy()
        df = df.merge(pred_table,left_index=True, right_index=True,how='right')
        money += df[df['win']==df['馬番']]['return'].sum()
        return n_bets,money

def gain(return_func,x,n_samples=100,lower=50,min_threshold=0.5):
    """Compute the return rate for a sweep of decision thresholds.

    The threshold moves linearly from ``min_threshold`` towards 1 in
    ``n_samples`` steps. For every step ``return_func(x, threshold)`` must
    return ``(n_bets, money)``; steps with ``n_bets`` not above ``lower``
    are skipped.

    Returns
    -------
    Series
        Return rate ``(stake + money) / stake`` keyed by ``n_bets`` (a later
        step with the same ``n_bets`` overwrites an earlier one).
    """
    rate_by_bets = {}
    for step in tqdm(range(n_samples)):
        fraction = step / n_samples
        threshold = fraction + min_threshold * (1 - fraction)
        n_bets, money = return_func(x, threshold)
        if n_bets <= lower:
            continue
        stake = n_bets * 100
        rate_by_bets[n_bets] = (stake + money) / stake
    return pd.Series(rate_by_bets)

def process_categorical(df,target_columns):
    """Encode the categorical columns of ``df`` and return a new frame.

    Each column in ``target_columns`` is label-encoded (missing values are
    first replaced by the string 'Na') and finally cast to the 'category'
    dtype. Every other categorical column is expanded into dummy variables
    by ``pd.get_dummies``. The input frame is not modified.
    """
    encoded = df.copy()
    for name in target_columns:
        filled = encoded[name].fillna('Na')
        encoded[name] = LabelEncoder().fit_transform(filled)

    # Any categorical column outside target_columns becomes dummy variables.
    # This must happen before the category cast below, otherwise the target
    # columns would be expanded as well.
    encoded = pd.get_dummies(encoded)

    return encoded.astype({name: 'category' for name in target_columns})

In [6]:
race_id_list = []
for place in range(1, 11, 1):
    for kai in range(1, 6, 1):
        for day in range(1, 13, 1):
            for r in range(1, 13, 1):
                race_id = "2020" + str(place).zfill(2) + str(kai).zfill(2) +\
		        str(day).zfill(2) + str(r).zfill(2)
                race_id_list.append(race_id)


In [7]:
# Scraping: fetch the race-condition info for every ID in race_id_list
# (one HTTP request per ID, so this cell takes a long time to run)
race_infos = scrape_race_info(race_id_list)

  0%|          | 0/7200 [00:00<?, ?it/s]