In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

import utils.read_data as rd
import utils.preprocessing as pp
import utils.io_model as im
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path
import tensorflow as tf

In [None]:
# Load environment variables: once with python-dotenv's default lookup, then
# explicitly from the .env file in the current working directory.
load_dotenv(verbose=True)
dotenv_path = join(Path().resolve(), '.env')
load_dotenv(dotenv_path)

# Fail with a clear message instead of an opaque "NoneType + str" TypeError
# when the variable is not configured.
google_drive_root = os.environ.get("GOOGLE_DRIVE_PATH")
if google_drive_root is None:
    raise RuntimeError("GOOGLE_DRIVE_PATH is not set (expected in the environment or in .env)")
GOOGLE_DRIVE_PATH = google_drive_root + '/horse_racing'
DATA_PATH = GOOGLE_DRIVE_PATH + '/csv/'

# Make the data of the other races held on the Japan Cup day usable from this
# notebook: every race/horse CSV pair found directly in test_data/ is moved
# into its own test_data/target_<race name>/ directory.
import shutil

test_data_dir = DATA_PATH+"test_data"

# Only plain files are candidates. The target_<race name> directories created
# by a previous run also contain "race" in their name and must not be picked
# up again as if they were race files.
target_file_names = [name for name in os.listdir(test_data_dir)
                     if os.path.isfile(os.path.join(test_data_dir, name))
                     and "japan" not in name and "arima" not in name]
race_files = sorted(name for name in target_file_names if "race" in name)
horse_files = sorted(name for name in target_file_names if "horse" in name)

# Race and horse files are paired by sorted position, so a missing counterpart
# would silently pair a race with the wrong horse file. Refuse to move
# anything in that case.
if len(race_files) != len(horse_files):
    raise ValueError("race/horse file count mismatch in %s: %d race file(s), %d horse file(s)"
                     % (test_data_dir, len(race_files), len(horse_files)))

# The directory suffix is the race file name up to its first "."
# (e.g. "race-202005050902").
race_names = [race_file.split(".")[0] for race_file in race_files]
for race_name, race_file, horse_file in zip(race_names, race_files, horse_files):
    target_data_path = test_data_dir+"/target_"+race_name
    os.makedirs(target_data_path, exist_ok=True)
    shutil.move(test_data_dir+"/"+race_file, target_data_path+"/")
    shutil.move(test_data_dir+"/"+horse_file, target_data_path+"/")

In [None]:
# Show what is currently inside the test_data directory.
test_data_listing_path = DATA_PATH+"test_data"
os.listdir(test_data_listing_path)

In [None]:
# Directory holding the CSV files of the race to predict.
# Alternative target (one of the per-race directories under test_data):
#   GOOGLE_DRIVE_PATH + '/csv/test_data/target_race-202005050902/'
TARGET_DATA_PATH = ''.join((GOOGLE_DRIVE_PATH, '/csv/arima_kinen/'))

# 前処理

In [None]:
# Read the horse rows and the race rows of the target race, then attach the
# race columns to every horse row through the shared race_id key. The left
# join keeps every horse row.
df_target_horse = rd.read_target_horse_csv(TARGET_DATA_PATH)
df_target_race  = rd.read_target_race_csv(TARGET_DATA_PATH)
df_target = pd.merge(left=df_target_horse, right=df_target_race, how='left', on='race_id')

In [None]:
# Read the historical horse and race tables and combine them on race_id. The
# left join keeps every horse row.
# Alternative source: the pre-merged CSV at DATA_PATH+"/learning/horse_race.csv".
df_horse = rd.read_horse_csv(DATA_PATH)
df_race = rd.read_race_csv(DATA_PATH)
df = pd.merge(left=df_horse, right=df_race, how='left', on='race_id')

In [None]:
# Join the target rows with the historical data.
# Presumably the last argument is the number of previous races attached per
# horse (the "-1"/"-2"/"-3" column suffixes used later) - confirm in pp.
# NOTE: this rebinds df, so re-running this cell alone feeds the already
# joined frame back in. Re-run the loading cell first.
n_past_races = 3
df = pp.join_n_race_for_prediction(df_target, df, n_past_races)

In [None]:
# Quick look at the first rows of the joined frame.
df.head(n=5)

## 前処理

In [None]:
# Columns kept from the joined frame.
# Presumably the "-1"/"-2"/"-3" suffixed columns come from the previous races
# attached by pp.join_n_race_for_prediction - confirm in pp.
columns = ["race_course", "weather", "ground_status", 
           "where_racecourse", "race_class", "running_condition", 
           "frame_number", "horse_number",
           "sex_and_age", "burden_weight", "rider_id", 
           "tamer_id", "horse_weight",
           "total_horse_number_x", 
           "rank-1", "rank-2", "rank-3",
           "total_horse_number_x-1", "total_horse_number_x-2","total_horse_number_x-3",
           "goal_time-1", "goal_time-2", "goal_time-3",
           "last_time-1", "last_time-2", "last_time-3", 
           "half_way_rank-1", "half_way_rank-2", "half_way_rank-3", 
           "prize-1", "prize-2", "prize-3"]

# Take an explicit copy. The following cells assign new columns to df, and
# doing that on a bare selection triggers pandas' SettingWithCopyWarning.
df = df[columns].copy()

In [None]:
# Convert the racecourse field with pp.extract_place.
df["where_racecourse"] = df["where_racecourse"].map(pp.extract_place)

# Split "sex_and_age" into its first character and the remainder.
sex_and_age_values = df["sex_and_age"]
df["sex"] = sex_and_age_values.map(lambda value: value[0])
df["age"] = sex_and_age_values.map(lambda value: value[1:])

# Goal times of the previous races, converted with pp.to_seconds.
for goal_time_column in ("goal_time-1", "goal_time-2", "goal_time-3"):
    df[goal_time_column] = df[goal_time_column].map(pp.to_seconds)

df["horse_weight"] = df["horse_weight"].map(pp.extract_weight).astype(np.int64)

# Prize values: drop thousands separators from string entries, then cast to float.
for prize_column in ("prize-1", "prize-2", "prize-3"):
    df[prize_column] = df[prize_column].map(
        lambda prize: prize.replace(",", "") if type(prize) == str else prize
    ).astype(np.float64)

# One kyakusitu code per previous race, computed by pp.kyakusitu_code_c from
# that race's total horse count and half-way rank.
for code_column, horse_count_column, half_way_column in (
        ("kyakusitu-1", "total_horse_number_x-1", "half_way_rank-1"),
        ("kyakusitu-2", "total_horse_number_x-2", "half_way_rank-2"),
        ("kyakusitu-3", "total_horse_number_x-3", "half_way_rank-3")):
    df[code_column] = [pp.kyakusitu_code_c(n, r)
                       for n, r in zip(df[horse_count_column].values, df[half_way_column])]

# Missing-value handling: the '---' placeholder and NaN both become -1.
df = df.replace('---', -1).fillna(-1)

In [None]:
# Replace the rank of each previous race by the label pp.make_label derives
# from that rank and the race's total horse count.
# NOTE: the rank columns are overwritten in place, so re-running this cell
# applies make_label to already converted values.
for rank_column, horse_count_column in (("rank-1", "total_horse_number_x-1"),
                                        ("rank-2", "total_horse_number_x-2"),
                                        ("rank-3", "total_horse_number_x-3")):
    df[rank_column] = pp.make_label(df[rank_column].values, df[horse_count_column].values)

In [None]:
def fill_missing_columns(df_a, df_b):
    """Give two DataFrames the same set of columns, in place.

    Every column present in only one of the frames is added to the other one,
    filled with 0. Existing columns and their values are left untouched.

    Added columns are appended in the order in which they appear in the frame
    they come from, so the resulting column order is the same on every run.
    Iterating over a set difference would not guarantee that.

    Note that the two frames end up with the same columns but not necessarily
    in the same order.

    Args:
        df_a: first DataFrame, modified in place.
        df_b: second DataFrame, modified in place.

    Returns:
        None.
    """
    # Snapshot both column sets before either frame is modified.
    original_a_columns = set(df_a.columns)
    original_b_columns = set(df_b.columns)

    missing_in_b = [column for column in df_a.columns if column not in original_b_columns]
    missing_in_a = [column for column in df_b.columns if column not in original_a_columns]

    for column in missing_in_b:
        df_b[column] = 0

    for column in missing_in_a:
        df_a[column] = 0

In [None]:
# To predict on the target data, its feature matrix needs the categorical
# (one-hot) columns that were used for learning but do not occur in the target
# data. The learning data is therefore pushed through the same preprocessing
# and encoding, and the target frame is then aligned to its columns.
df_used_learning = pd.read_csv(DATA_PATH+"/learning/horse_race.csv")

columns = ["race_course", "weather", "ground_status", 
           "where_racecourse", "race_class", "running_condition", 
           "frame_number", "horse_number",
           "sex_and_age", "burden_weight", "rider_id", 
           "tamer_id", "horse_weight",
           "total_horse_number_x", 
           "rank-1", "rank-2", "rank-3",
           "total_horse_number_x-1", "total_horse_number_x-2","total_horse_number_x-3",
           "goal_time-1", "goal_time-2", "goal_time-3",
           "last_time-1", "last_time-2", "last_time-3", 
           "half_way_rank-1", "half_way_rank-2", "half_way_rank-3", 
           "prize-1", "prize-2", "prize-3"]

# Explicit copy: columns are assigned below, and doing that on a bare
# selection triggers pandas' SettingWithCopyWarning.
df_used_learning = df_used_learning[columns].copy()

df_used_learning["where_racecourse"] = df_used_learning["where_racecourse"].map(pp.extract_place)

# Split "sex_and_age" into its first character and the remainder.
df_used_learning["sex"] = df_used_learning["sex_and_age"].map(lambda sex_and_age: sex_and_age[0])
df_used_learning["age"] = df_used_learning["sex_and_age"].map(lambda sex_and_age: sex_and_age[1:])

for goal_time_column in ("goal_time-1", "goal_time-2", "goal_time-3"):
    df_used_learning[goal_time_column] = df_used_learning[goal_time_column].map(pp.to_seconds)

df_used_learning["horse_weight"] = df_used_learning["horse_weight"].map(pp.extract_weight).astype(np.int64)

# Prize values: drop thousands separators from string entries, then cast to float.
for prize_column in ("prize-1", "prize-2", "prize-3"):
    df_used_learning[prize_column] = df_used_learning[prize_column].map(
        lambda prize: prize.replace(",", "") if type(prize) == str else prize
    ).astype(np.float64)

# One kyakusitu code per previous race, from total horse count and half-way rank.
for code_column, horse_count_column, half_way_column in (
        ("kyakusitu-1", "total_horse_number_x-1", "half_way_rank-1"),
        ("kyakusitu-2", "total_horse_number_x-2", "half_way_rank-2"),
        ("kyakusitu-3", "total_horse_number_x-3", "half_way_rank-3")):
    df_used_learning[code_column] = [
        pp.kyakusitu_code_c(n, r)
        for n, r in zip(df_used_learning[horse_count_column].values, df_used_learning[half_way_column])
    ]

# Missing-value handling: the '---' placeholder and NaN both become -1.
df_used_learning = df_used_learning.replace('---', -1)
df_used_learning = df_used_learning.fillna(-1)

for rank_column, horse_count_column in (("rank-1", "total_horse_number_x-1"),
                                        ("rank-2", "total_horse_number_x-2"),
                                        ("rank-3", "total_horse_number_x-3")):
    df_used_learning[rank_column] = pp.make_label(df_used_learning[rank_column].values,
                                                  df_used_learning[horse_count_column].values)

input_columns = ["race_course", "weather", "ground_status", 
                 "where_racecourse", "race_class", "running_condition", 
                 "frame_number", "horse_number",
                 "sex", "age", "burden_weight", "rider_id", 
                 "tamer_id", "horse_weight",
                 "rank-1", "rank-2", "rank-3", 
                 "goal_time-1", "goal_time-2", "goal_time-3",
                 "last_time-1", "last_time-2", "last_time-3", 
                 "kyakusitu-1", "kyakusitu-2", "kyakusitu-3", 
                 "prize-1", "prize-2", "prize-3"]

df_used_learning = pp.one_hot_encoding(df_used_learning[input_columns])
df = pp.one_hot_encoding(df[input_columns])

# Columns that exist only in the target data cannot be matched to a learning
# column. Report them before they are dropped by the alignment below.
unseen_columns = [column for column in df.columns if column not in df_used_learning.columns]
if unseen_columns:
    print("Dropping %d target-only column(s) not present in the learning data: %s"
          % (len(unseen_columns), unseen_columns))

# Align the target frame to the learning frame: same columns, same order, and
# 0 for every column the target data does not have. Merely appending the
# missing columns would leave them at different positions than in the learning
# frame, which matters because df is later turned into a positional array.
# NOTE(review): assumes the model was trained on the columns of
# pp.one_hot_encoding(df_used_learning[input_columns]) in this order - confirm
# against the training notebook.
df = df.reindex(columns=df_used_learning.columns, fill_value=0)

# 予測

In [None]:
# Load the stored model registered under this name via the project's io_model helper.
model_name = "second_model"
model = im.read_model(model_name)

In [None]:
# Feature matrix for the model: a new array holding the values of df,
# one row per row of df.
x = np.array(df.values)

In [None]:
# Feature rows of the entries at 0-based positions 1, 2, 4 and 5, kept for
# manual inspection.
test = []

# Predict all rows in one batched call instead of one model.predict call per
# row. The empty case is guarded so that no call is made without data.
predictions = model.predict(x) if len(x) else []

# NOTE(review): assumes the rows of x are in the same order as the rows of
# df_target, and that position + 1 is the horse number - confirm against
# pp.join_n_race_for_prediction.
for umaban, (x_input, pred, horse_id) in enumerate(zip(x, predictions, df_target["horse_id"])):
    if umaban in (1, 2, 4, 5):
        test.append(x_input)
    # The predicted class is the index of the largest of the first three outputs.
    prediction_class = np.argmax([pred[0], pred[1], pred[2]])
    print("class: %d, (%f, %f, %f), %d (%d)" % (prediction_class, pred[0], pred[1], pred[2], umaban + 1, horse_id))