In [1]:
import pandas as pd
import numpy as np
import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import requests
from bs4 import BeautifulSoup
import time
from tqdm.notebook import tqdm
import re
from urllib.request import urlopen
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
from selenium.webdriver import Chrome,ChromeOptions

In [2]:
#pickleデータの読み込み
#results = pd.read_pickle('results.pickle')
results2 = pd.read_pickle('results_addinfo.pickle')
#データの分別
def preprocessing(results):
    df = results.copy()

    # 着順に数字以外の文字列が含まれているものを取り除く
    df['着順'] = pd.to_numeric(df['着順'],errors='coerce')
    df.dropna(subset=['着順'],inplace=True)
    df["着順"] = df["着順"].astype(int)

    # 性齢を性と年齢に分ける
    df["性"] = df["性齢"].map(lambda x: str(x)[0])
    df["年齢"] = df["性齢"].map(lambda x: str(x)[1:]).astype(int)

    # 馬体重を体重と体重変化に分ける
    df["体重"] = df["馬体重"].str.split("(", expand=True)[0].astype(int)
    df["体重変化"] = df["馬体重"].str.split("(", expand=True)[1].str[:-1].astype(int)

    # データをint, floatに変換
    df["単勝"] = df["単勝"].astype(float)
    df["course_len"] = df["course_len"].astype(int)

    # 不要な列を削除
    df.drop(["タイム", "着差", "調教師", "性齢", "馬体重",'馬名','騎手'], axis=1, inplace=True)

    #新しい規格
    df['date'] = pd.to_datetime(df['date'],format='%Y年%m月%d日')

    return df

#ダミーデータの生成
def split_data(df,test_size=0.3):
    sorted_id_list = df.sort_values('date').index.unique()
    train_id_list = sorted_id_list[:round(len(sorted_id_list)*(1-test_size))]
    test_id_list = sorted_id_list[round(len(sorted_id_list)*(1-test_size)):]
    train = df.loc[train_id_list].drop(['date'],axis=1)
    test = df.loc[test_id_list].drop(['date'],axis=1)
    return train,test

class HorseResults:
    def __init__(self,horse_results):
        self.horse_results = horse_results[['日付','着順','賞金']]
        self.preprocessing()
        #self.horse_results.rename(columns={'着順':'着順_ave','賞金':'賞金_ave'},inplace=True)

    def preprocessing(self):
        df = self.horse_results.copy()

        # 着順に数字以外の文字列が含まれているものを取り除く
        df['着順'] = pd.to_numeric(df['着順'],errors='coerce')
        df.dropna(subset=['着順'],inplace=True)
        #df = df[~(df["着順"].astype(str).str.contains("\D"))]
        df["着順"] = df["着順"].astype(int)

        #新しい規格
        df['date'] = pd.to_datetime(df['日付'])
        df.drop(['日付'],axis=1,inplace=True)

        #賞金のNaNを0で埋める
        df['賞金'].fillna(0,inplace=True)

        self.horse_results = df
    def average(self,horse_id_list,date,n_samples='all'):
        target_df = self.horse_results.loc[horse_id_list]

        #過去何年分のデータを取り出すかの設定
        if n_samples == 'all':
            filtered_df = target_df[target_df['date'] < date]
        elif n_samples > 0:
            filtered_df = target_df[target_df['date'] < date].\
                sort_values('date',ascending=False).groupby(level=0).head(n_samples)
        else:
            raise Exception['n_samples must be >0']
        
        average = filtered_df.groupby(level = 0)[['着順','賞金']].mean()
        return average.rename(columns={'着順':'着順_{}R'.format(n_samples),'賞金':'賞金_{}R'.format(n_samples)})

    def merge(self,results,date,n_samples = 'all'):
        df = results[results['date']==date]
        horse_id_list = df['horse_id']
        merged_df = df.merge(self.average(horse_id_list,date,n_samples),left_on='horse_id',right_index=True,how='left')
        return merged_df
    def merge_all(self,results,n_samples = 'all'):
        date_list = results['date'].unique()
        merged_df = pd.concat([self.merge(results,date,n_samples)for date in tqdm(date_list)])
        return merged_df

class Return:
    def __init__(self,return_tables):
        self.return_tables = return_tables
        #self.fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
        
    #変数のように使える()が必要ない
    @property
    def fukusho(self):
        fukusho = self.return_tables[self.return_tables[0]=='複勝'][[1,2]]
        wins = fukusho[1].str.split('br',expand=True).drop([3],axis=1)
        wins.columns = ['win_0','win_1','win_2']
        returns = fukusho[2].str.split('br',expand=True).drop([3],axis=1)
        returns.columns = ['return_0','return_1','return_2']

        df = pd.concat([wins,returns],axis=1)
        for column in df.columns:
            df[column] = df[column].str.replace(',','')
        return df.fillna(0).astype(int)
        
def prefix(results_fix):
    df = results_fix.copy()

    df.drop(['馬名','騎手'], axis=1, inplace=True)
    return df

In [1]:
print("hello")

hello


In [2]:
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options) 

driver.get('https://www.google.com/')
print(driver.title)

search_box = driver.find_element_by_name("q")
search_box.send_keys('ChromeDriver')
search_box.submit()
print(driver.title)

driver.save_screenshot('search_results_re.png')
driver.quit()

Google


  search_box = driver.find_element_by_name("q")


ChromeDriver - Google 検索


In [4]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
q = u"ワールドカップ 2018"
driver.get('https://www.google.com/search?q=%s' % (q))

In [3]:
results = pd.read_pickle('results_id.pickle')
results

Unnamed: 0,着順,枠番,馬番,馬名,斤量,騎手,単勝,人気,course_len,weather,race_type,ground_state,date,horse_id,jockey_id,性,年齢,体重,体重変化
201901010101,1,1,1,ゴルコンダ,54.0,ルメール,1.4,1.0,1800,曇,芝,良,2019-07-27,2017105318,05339,牡,2,518,-16
201901010101,2,3,3,プントファイヤー,54.0,岩田康誠,3.5,2.0,1800,曇,芝,良,2019-07-27,2017104612,05203,牡,2,496,-8
201901010101,3,4,4,ラグリマスネグラス,51.0,団野大成,46.6,6.0,1800,曇,芝,良,2019-07-27,2017103879,01180,牡,2,546,6
201901010101,4,8,9,キタノコドウ,51.0,菅原明良,56.8,7.0,1800,曇,芝,良,2019-07-27,2017106259,01179,牡,2,458,-8
201901010101,5,5,5,ネモフィラブルー,54.0,川島信二,140.3,9.0,1800,曇,芝,良,2019-07-27,2017104140,01062,牡,2,436,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201910021212,12,6,11,スリープレッピー,56.0,森裕太朗,120.3,15.0,2600,曇,芝,重,2019-09-01,2013104167,01165,セ,6,458,8
201910021212,13,1,1,バリオラージュ,54.0,斎藤新,7.5,4.0,2600,曇,芝,重,2019-09-01,2014105643,01178,牡,5,460,2
201910021212,14,2,3,サンライズアミーゴ,54.0,亀田温心,99.2,12.0,2600,曇,芝,重,2019-09-01,2015102081,01176,牡,4,478,14
201910021212,15,6,12,トロハ,52.0,武豊,17.5,8.0,2600,曇,芝,重,2019-09-01,2016104221,00666,牝,3,468,2


In [6]:
horse_id_list = results['horse_id'].unique()

In [7]:
def scrape_peds(horse_id_list,pre_peds={}):
    peds = pre_peds.copy()
    for horse_id in tqdm(horse_id_list):
        if horse_id in peds.keys():
            continue
        try:
            url = 'https://db.netkeiba.com/horse/ped/' + horse_id
            df = pd.raed_html(url)[0]
            generations = {}
            for i in reversed(range(5)):
                generations[i] = df[i]
                df.drop([i],axis=1,inplace=True)
                df = df.drop_duplicates()
            ped = pd.concat([generations[i] for i in range(5)]).rename(horse_id)
            peds[horse_id] = ped.reset_index(drop=True)
            time.sleep(1)
        except IndexError:
            continue
	#この部分は動画中に無いですが、捕捉できるエラーは拾った方が、エラーが出たときに分かりやすいです
        except Exception as e:
            print(e)
            break
        except:
	        break
    return peds


In [8]:
peds = scrape_peds(horse_id_list[:5])

  0%|          | 0/5 [00:00<?, ?it/s]

module 'pandas' has no attribute 'raed_html'


In [11]:
url = 'https://db.netkeiba.com/horse/ped/2017105318'
pd.raed_html(url)[0]
pd.read_html(url)[0]
generations = {}

AttributeError: module 'pandas' has no attribute 'raed_html'