# 前処理

## 環境構築

In [1]:
# Notebook setup: draw matplotlib figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
# Silence all Python warnings emitted after this point
warnings.filterwarnings('ignore')

In [6]:
import datetime as dt
import glob
import itertools
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

import sqlalchemy as sa
import sqlite3
import re
import shutil

# Apply seaborn's 'ticks' style to the plots drawn in this notebook
sns.set_style(style='ticks')

In [4]:
# フォント初期設定
!pip install japanize_matplotlib
import japanize_matplotlib

Collecting japanize_matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 8.2 MB/s eta 0:00:01
Building wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25ldone
[?25h  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.1.3-py3-none-any.whl size=4120274 sha256=bd332adb455bb6d0ca247675f236ea618d572ffb121134822c30f97da7df5a32
  Stored in directory: /home/jovyan/.cache/pip/wheels/83/97/6b/e9e0cde099cc40f972b8dd23367308f7705ae06cd6d4714658
Successfully built japanize-matplotlib
Installing collected packages: japanize-matplotlib
Successfully installed japanize-matplotlib-1.1.3


In [5]:
# Input and output directories of this preprocessing step (relative paths)
DIR_IN = '../data/scatter_preprocess/in'
DIR_OUT = '../data/scatter_preprocess/out'

In [7]:
# Database file name and table names
FN_DB = 'keiba.db'
TN_RACE = 'race'
TN_RES = 'result'

In [12]:
def make_df_by_query(path_db, query):
    """Run ``query`` against the SQLite database at ``path_db``.

    Args:
        path_db: Path of the SQLite database file.
        query: SQL text to execute.

    Returns:
        pandas.DataFrame holding the rows returned by the query.
    """
    # isolation_level=None is kept from the original (sqlite3 autocommit mode).
    conn = sqlite3.connect(path_db, isolation_level=None)
    try:
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, even when the query fails;
        # previously it was left open on every call.
        conn.close()

## raceテーブルの更新

In [18]:
def format_title(race_name):
    """Strip edition numbers, grade markers and years from a race name.

    The patterns below are removed one after another, in the listed order.

    Args:
        race_name: Raw race name, e.g. '第36回安田記念(G1)'.

    Returns:
        The race name with all matched parts removed, e.g. '安田記念'.
    """
    # Raw strings are used so that \s, \d and \( reach the regex engine
    # unchanged instead of being (invalid) Python string escapes.
    patterns = (
        r'\s$',                          # a single trailing whitespace character
        r'第\d+回',                       # edition number: "第n回"
        r'\((J\.G|G)\d+\)',              # numbered grade: (Gn), (J.Gn); the dot is literal
        r'\((L|G)\)',                    # listed race (L), un-numbered graded race (G)
        r'\(\d+\)',                      # bare number in parentheses: (n)
        r'第\d+戦',                       # leg number: "第n戦"
        r'19[8-9][0-9]|20[0-2][0-9]',    # four-digit year, 1980-2029
        r'’([8-9][0-9]|[0-2][0-9])',     # abbreviated year such as ’99
        # The trailing-digit strip must run after the two year patterns;
        # running it first turned a trailing "2020" into "202" and "’99"
        # into "’9", so the year patterns no longer matched.
        r'\d$',                          # one trailing digit
    )
    title = race_name
    for pattern in patterns:
        title = re.sub(pattern, '', title)
    return title

In [19]:
def get_grade(race_name):
    """Extract the grade marker embedded in a race name.

    Args:
        race_name: Raw race name, e.g. '第36回安田記念(G1)'.

    Returns:
        The marker without its parentheses ('G1', 'J.G1', 'L', 'G', ...),
        or None when the name carries no such marker.
    """
    # Raw string avoids invalid Python escapes; the dot in "J.G" is escaped
    # so that it only matches a literal '.'.
    found = re.search(r'\((J\.G|G)\d+\)|\((L|G)\)', race_name)
    if found is None:
        return None
    # Remove the surrounding parentheses from the matched text.
    return found.group().replace('(', '').replace(')', '')

In [20]:
# DB path in the input directory and the matching path in the output directory
path_from = os.path.join(DIR_IN, FN_DB)
path_to = os.path.join(DIR_OUT, FN_DB)

In [21]:
# NOTE: this overwrites any DB already present at the destination, so
# changes made to that copy by an earlier run are lost.
# The output directory is created first so the copy also works when it
# does not exist yet.
os.makedirs(DIR_OUT, exist_ok=True)
shutil.copy(path_from, path_to)

'../data/scatter_preprocess/out/keiba.db'

In [26]:
# Load every row of the race table from the DB in the output directory
path_db = os.path.join(DIR_OUT, FN_DB)
df_race = make_df_by_query(path_db, 'SELECT * FROM race;')

In [27]:
# Preview the first rows, transposed so that each column is shown as a row
df_race.head().T

Unnamed: 0,0,1,2,3,4
index,0,1,2,3,4
race_id,198601010101,198601010102,198601010103,198601010104,198601010105
date,1986-06-07,1986-06-07,1986-06-07,1986-06-07,1986-06-07
place,札幌,札幌,札幌,札幌,札幌
race_name,4歳以上300万下,3歳新馬,4歳未勝利,4歳未勝利,4歳未勝利
distance,1500,1000,1200,1800,1500
dart,True,True,True,True,True
dart_cond,良,良,良,良,良
turf,False,False,False,False,False
turf_cond,,,,,


In [28]:
# Derive two helper columns for aggregation from the raw race name:
# the cleaned title and the grade marker.
race_names = df_race['race_name']
df_race['title'] = race_names.apply(format_title)
df_race['grade'] = race_names.apply(get_grade)

In [29]:
# A plain to_sql is very slow, and method='multi' raised an error on the raw
# sqlite3 connection (apparently https://github.com/pandas-dev/pandas/issues/29921),
# so the table is written through a SQLAlchemy engine instead.
# NOTE(review): to_sql also writes the DataFrame index as an extra column
# (default index=True); this looks like the origin of the 'level_0' column
# that is dropped later in this notebook. Confirm before changing it.
engine = sa.create_engine(
    f'sqlite:///{path_db}', echo=False)
try:
    df_race.to_sql(
        TN_RACE, engine, if_exists='replace',
        method='multi', chunksize=5000)
finally:
    # Release the engine's pooled connection once the write is done,
    # whether it succeeded or not.
    engine.dispose()

## 集計用csvの出力

- raceとresultをジョインし，重賞のみにフィルタした`all_res.csv`を作成
- 育成ウマ娘の適性データを保存した`umamusume.csv`はそのままコピー

### `all_res.csv`

In [30]:
def query_results_and_races_by_grade(grade):
    """Build the SQL that joins the races of one grade with their results.

    The sub-query keeps rows of ``race`` whose ``grade`` equals the given
    value and whose ``steeple`` column equals the text 'False'; those rows
    are inner-joined with ``result`` on ``race_id``.

    Args:
        grade: Grade value to filter on, e.g. 'G1'.

    Returns:
        The SQL query as a string.
    """
    # Double any single quote so the value cannot terminate the SQL string
    # literal it is embedded in.
    grade_sql = str(grade).replace("'", "''")
    q = f'''
        SELECT *
        FROM (
            SELECT * FROM race
            WHERE grade = '{grade_sql}'
            AND steeple = 'False'
        ) AS race_g
        INNER JOIN result
        ON race_g.race_id = result.race_id;
    '''
    return q

In [31]:
def add_average_speed_to_df(df):
    """Return a new frame with average-speed columns appended.

    Rows missing ``seconds_total`` or ``seconds_3f`` are dropped and the
    index is renumbered from 0. Two columns are then added:

    - ``speed_total``: ``distance / seconds_total * 60 * 60 / 1000``
    - ``speed_3f``: ``600 / seconds_3f * 60 * 60 / 1000``

    (presumably metres and seconds converted to km/h, with 600 being the
    length of the final three furlongs; confirm against the data source)

    The input frame itself is not modified.
    """
    timed = (
        df.dropna(subset=['seconds_total', 'seconds_3f'])
        .reset_index(drop=True)
    )
    overall = timed['distance'] / timed['seconds_total']
    last_3f = 600 / timed['seconds_3f']
    timed['speed_total'] = overall * 60 * 60 / 1000
    timed['speed_3f'] = last_3f * 60 * 60 / 1000
    return timed

In [32]:
def get_distance_class(distance):
    """Map a race distance to a distance category label.

    Reference for the categories: https://altema.jp/umamusume/kyoritekisei

    Returns 'short' below 1600, 'mile' below 2000, 'intermediate' below
    2500 and 'long' from 2500 upwards. A value for which every comparison
    is false (e.g. NaN) yields None.

    NOTE(review): the cut-offs put 1500 in 'short' and 1900 in 'mile';
    check that this matches the category boundaries given at the
    reference above.
    """
    upper_bounds = (
        (1600, 'short'),
        (2000, 'mile'),
        (2500, 'intermediate'),
    )
    for bound, label in upper_bounds:
        if distance < bound:
            return label
    # Explicit comparison (rather than a bare fallback) so that NaN maps to None.
    return 'long' if distance >= 2500 else None

In [33]:
def add_distance_class_to_df(df):
    """Return a copy of ``df`` with a ``distance_class`` column.

    The column holds the result of ``get_distance_class`` applied to each
    value of the ``distance`` column. The input frame is not modified.
    """
    labelled = df.copy()
    distance_classes = labelled['distance'].apply(get_distance_class)
    labelled['distance_class'] = distance_classes
    return labelled

In [34]:
def make_df_by_grade_from_db(path_db, grade):
    """Build the DataFrame for one grade from the database.

    Runs the grade query against ``path_db``, then adds the average-speed
    columns and the distance-class column to the result.
    """
    sql = query_results_and_races_by_grade(grade)
    return (
        make_df_by_query(path_db, sql)
        .pipe(add_average_speed_to_df)
        .pipe(add_distance_class_to_df)
    )

In [35]:
# Grades to keep when filtering
grades = ['G1', 'G2', 'G3', 'G', 'L']

In [36]:
# Collect one frame per grade and concatenate once at the end: calling
# pd.concat inside the loop re-copies all accumulated rows on every pass.
frames_by_grade = []
for g in tqdm(grades):
    df = make_df_by_grade_from_db(path_db, g)
    frames_by_grade.append(df)
df_all = pd.concat(frames_by_grade, ignore_index=True)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [37]:
# Preview the first rows, transposed so that each column is shown as a row
df_all.head().T

Unnamed: 0,0,1,2,3,4
level_0,1038,1038,1038,1038,1038
index,1038,1038,1038,1038,1038
race_id,198605020811,198605020811,198605020811,198605020811,198605020811
date,1986-05-11,1986-05-11,1986-05-11,1986-05-11,1986-05-11
place,東京,東京,東京,東京,東京
race_name,第36回安田記念(G1),第36回安田記念(G1),第36回安田記念(G1),第36回安田記念(G1),第36回安田記念(G1)
distance,1600,1600,1600,1600,1600
dart,False,False,False,False,False
dart_cond,,,,,
turf,True,True,True,True,True


In [39]:
# Drop the 'level_0' and 'index' columns, which are not wanted in the output.
# errors='ignore' keeps this cell re-runnable: df_all is reassigned here, so
# a second run would otherwise raise KeyError on the already-removed columns.
df_all = df_all.drop(columns=['index', 'level_0'], errors='ignore')

In [41]:
# Write the combined frame to all_res.csv in the output directory, without the index
df_all.to_csv(os.path.join(DIR_OUT, 'all_res.csv'), index=False)