In [None]:
# List every entry (dotfiles included) in the current working directory.
!ls -a

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

import utils.read_data as rd
import utils.io_model as io_m
import utils.normalization as nm

In [None]:
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path

In [None]:
# Load environment variables in two passes: first without an explicit path
# (python-dotenv looks for a .env file on its own), then from the .env file
# located in the current working directory.
load_dotenv(verbose=True)
dotenv_path = str(Path().resolve() / '.env')
load_dotenv(dotenv_path=dotenv_path)

In [None]:
# Root directory of the project data, taken from the GOOGLE_DRIVE_PATH
# environment variable (loaded from .env by the previous cell).
_drive_root = os.environ.get("GOOGLE_DRIVE_PATH")
if _drive_root is None:
    # Fail with an explicit message instead of the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'".
    raise RuntimeError(
        "GOOGLE_DRIVE_PATH is not set; define it in the environment or in the .env file"
    )

GOOGLE_DRIVE_PATH = _drive_root + '/horse_racing'
# Directory holding the CSV files; the trailing slash is kept as-is.
DATA_PATH = GOOGLE_DRIVE_PATH + '/csv/'

In [None]:
# Echo the resolved project root path as a quick sanity check.
print(GOOGLE_DRIVE_PATH)

In [None]:
# Load the race data through the project helper, pointing it at DATA_PATH.
# NOTE(review): presumably returns a pandas DataFrame (later cells call
# .info() / .head() on it) — confirm in utils.read_data.
df = rd.read_horse_csv(DATA_PATH)

In [None]:
# Summarize the columns, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Preview the first rows of the loaded data.
df.head()

## 前処理

In [None]:
# Columns used for training; past-race data covers up to the three previous races.
# These columns are processed in the following cells, which adds new derived columns,
# so the final feature columns are defined in a separate cell (input_columns).
columns = ["race_course", "weather", "ground_status",
           "where_racecourse", "race_class", "running_condition",
           "frame_number", "horse_number",
           "sex_and_age", "burden_weight", "rider_id",
           "tamer_id", "horse_weight", "odds", "popular",
           "rank", "total_horse_number_x",
           "rank_1", "rank_2", "rank_3",
           "total_horse_number_x_1", "total_horse_number_x_2", "total_horse_number_x_3",
           "goal_time_1", "goal_time_2", "goal_time_3",
           "last_time_1", "last_time_2", "last_time_3",
           "half_way_rank_1", "half_way_rank_2", "half_way_rank_3",
           "prize_1", "prize_2", "prize_3"]

# .copy() detaches the selection from the loaded frame, so the column
# assignments in the following cells operate on an independent DataFrame and
# do not raise SettingWithCopyWarning.
df = df[columns].copy()
df.head()

In [None]:
# Derive "kyakusitu_1" from the field size and the half-way rank of the
# previous race, one row at a time, via nm.kyakusitu_code_c.
# NOTE(review): the same column is assigned again in the next preprocessing
# cell, so this cell looks redundant.
df["kyakusitu_1"] = [
    nm.kyakusitu_code_c(field_size, half_way_rank)
    for field_size, half_way_rank in zip(
        df["total_horse_number_x_1"].values, df["half_way_rank_1"]
    )
]

In [None]:
# Transform the racecourse field with the project helper nm.extract_place.
df["where_racecourse"] = df["where_racecourse"].map(nm.extract_place)

# Split "sex_and_age": the first character becomes "sex", the remainder "age".
df["sex"] = df["sex_and_age"].map(lambda sex_and_age: sex_and_age[0])
df["age"] = df["sex_and_age"].map(lambda sex_and_age: sex_and_age[1:])

# Convert the goal time of each past race with nm.to_seconds.
# Each goal_time_N column is converted from its OWN raw values.
for lag in (1, 2, 3):
    df[f"goal_time_{lag}"] = df[f"goal_time_{lag}"].map(nm.to_seconds)

df["horse_weight"] = df["horse_weight"].map(nm.extract_weight)


def _strip_thousands_separator(prize):
    """Remove "," from string prizes; leave non-string values untouched."""
    return prize.replace(",", "") if isinstance(prize, str) else prize


# Prize columns arrive as text with thousands separators; make them numeric.
for lag in (1, 2, 3):
    df[f"prize_{lag}"] = df[f"prize_{lag}"].map(_strip_thousands_separator).astype(np.float64)

# Derive a "kyakusitu" code per past race from the field size and the
# half-way rank of that race (see nm.kyakusitu_code_c).
for lag in (1, 2, 3):
    df[f"kyakusitu_{lag}"] = [
        nm.kyakusitu_code_c(n, r)
        for n, r in zip(df[f"total_horse_number_x_{lag}"].values, df[f"half_way_rank_{lag}"])
    ]

# Missing-value handling: the '---' placeholder and NaN both become -1.
df = df.replace('---', -1)
df = df.fillna(-1)

# Cast odds only after the placeholder replacement above.
df["odds"] = df["odds"].astype(np.float64)

In [None]:
# Check for missing values: total number of null cells in the whole frame.
df.isnull().sum().sum()

### ラベルの作成


In [None]:
def make_label(rank_values, horse_number_values):
    """Turn finishing ranks into coarse labels, one per (rank, field size) pair.

    Rules, applied in this order to each pair:
      * a rank equal to -1 (missing value) is passed through unchanged;
      * a rank whose text contains one of "中", "取", "除", "降" (no proper
        finishing position) is labelled "low";
      * otherwise the relative rank ``int(rank) / horse_number`` decides:
        below 1/3 -> "high", below 2/3 -> "middle", anything else -> "low".

    Args:
        rank_values: iterable of ranks (numbers or strings).
        horse_number_values: iterable of field sizes, aligned with rank_values.

    Returns:
        list with one label per input pair.
    """
    upper_bound_high, upper_bound_middle = 1 / 3, 2 / 3
    unranked_markers = ("中", "取", "除", "降")

    def classify(rank, horse_number):
        # Missing value: keep the sentinel as the label.
        if rank == -1:
            return rank

        # Entries without a real finishing position get the lowest label.
        rank_text = str(rank)
        if any(marker in rank_text for marker in unranked_markers):
            return "low"

        ratio = int(rank) / horse_number
        if ratio < upper_bound_high:
            return "high"
        if ratio < upper_bound_middle:
            return "middle"
        return "low"

    return [
        classify(rank, horse_number)
        for rank, horse_number in zip(rank_values, horse_number_values)
    ]

In [None]:
# Target label for the current race.
df["label"] = make_label(df["rank"].values, df["total_horse_number_x"].values)

# Replace the raw rank of each past race with its label, in place.
for lag in (1, 2, 3):
    rank_column = f"rank_{lag}"
    field_size_column = f"total_horse_number_x_{lag}"
    df[rank_column] = make_label(df[rank_column].values, df[field_size_column].values)

In [None]:
# Final set of columns passed to one-hot encoding in the next cell.
# "label" is part of the list so that get_dummies also expands it into the
# label_* columns that are later used as the training target.
input_columns = ["race_course", "weather", "ground_status", 
                 "where_racecourse", "race_class", "running_condition", 
                 "frame_number", "horse_number",
                 "sex", "age", "burden_weight", "rider_id", 
                 "tamer_id", "horse_weight", "odds", "popular",
                 "rank_1", "rank_2", "rank_3", 
                 "goal_time_1", "goal_time_2", "goal_time_3",
                 "last_time_1", "last_time_2", "last_time_3", 
                 "kyakusitu_1", "kyakusitu_2", "kyakusitu_3", 
                 "prize_1", "prize_2", "prize_3", "label"]

In [None]:
# One-hot encode the selected columns with pandas.get_dummies.
# NOTE(review): df is overwritten here, so re-running this cell without
# re-running the preprocessing cells will fail on the missing input columns.
df = pd.get_dummies(df[input_columns])

## 学習

In [None]:
# Build the datasets used for training.
# Target: the three one-hot label columns produced by get_dummies.
label_columns = ["label_high", "label_middle", "label_low"]

# Features: every column EXCEPT the one-hot label columns. All "label_*"
# columns are excluded (not only the three targets) so that no label
# information leaks into the model input.
feature_columns = [c for c in df.columns if not str(c).startswith("label_")]

# Cast explicitly to float32: depending on the pandas version the dummy columns
# are uint8 or bool, and mixing bool with float columns would otherwise yield an
# object array that TensorFlow cannot consume.
x = df[feature_columns].to_numpy(dtype=np.float32)
y = df[label_columns].to_numpy(dtype=np.float32)

# df itself is kept alive because a later cell still displays it.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=11)
del x, y
print(x_train.shape)
print(x_test.shape)

In [None]:
# Preview the one-hot encoded frame.
df.head()

In [None]:
# Wrap the splits in tf.data pipelines. Only the training set is shuffled
# (buffer of 10000 elements); both sets are served in batches of 1024.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.shuffle(10000).batch(1024)

test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_ds = test_ds.batch(1024)

In [None]:
# Imported under a distinct alias so the module is not shadowed by the
# `model` instance created below.
import utils.sample_model as sample_model

# Create the model instance.
# NOTE(review): the arguments are the number of feature columns and 3 —
# presumably the number of label classes; confirm against HorseModel.
model = sample_model.HorseModel(x_train.shape[1], 3)

In [None]:
# The targets are three one-hot columns trained with categorical cross-entropy,
# so accuracy must compare the arg-max of prediction and label
# (CategoricalAccuracy). BinaryAccuracy would threshold each of the three
# outputs independently and report an inflated score.
metrics = [
      tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
      tf.keras.metrics.AUC(name='auc'),
]
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=metrics)

In [None]:
# Train on the batched training dataset for a single epoch.
model.fit(train_ds, epochs=1) 

In [None]:
# Evaluate loss and the compiled metrics on the held-out test dataset.
model.evaluate(test_ds, verbose=2)

In [None]:
# テストデータの予測値と正解ラベルの確認
#for x, y in zip(x_test, y_test):
#    print(f"pred: {model.predict(x.reshape(1, -1))}, label: {y}")

In [None]:
# モデルの保存
# io_m.save_model(model, model_name="first_model")

In [None]:
# 保存したモデルに不具合がないか確認
# model = io_m.read_model("first_model")
# model.evaluate(test_ds, verbose=2)