In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import pickle
import utils.read_data as rd
import utils.preprocessing as pp
import utils.join_race_data as jrd
import utils.prepare_data as prepare_data

import utils.io_model as im
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path

In [None]:
# Load environment variables: first with python-dotenv's default lookup, then
# explicitly from the .env file in the current working directory.
load_dotenv(verbose=True)
dotenv_path = join(Path().resolve(), '.env')
load_dotenv(dotenv_path)

# Root directory of the project data; the data paths below are derived from it.
GOOGLE_DRIVE_PATH = os.environ.get("GOOGLE_DRIVE_PATH")
if GOOGLE_DRIVE_PATH is None:
    # Fail here with an actionable message instead of the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" further down.
    raise RuntimeError(
        "Environment variable GOOGLE_DRIVE_PATH is not set; "
        "define it in the environment or in " + dotenv_path
    )
TRAIN_DATA_PATH = GOOGLE_DRIVE_PATH + '/train_data'

# Place the data of the other races held on Japan Cup day into per-race
# directories so that this notebook can use them.
# NOTE(review): DATA_PATH is not defined in any cell visible here — confirm it
# is set (ending with a path separator) before this cell runs.
import shutil

test_data_dir = DATA_PATH + "test_data"

# Candidates are the loose files whose name mentions neither "japan" nor
# "arima". Directories are skipped so that the target_* folders created by an
# earlier run of this cell are not mistaken for race files.
target_file_names = [
    name
    for name in os.listdir(test_data_dir)
    if "japan" not in name
    and "arima" not in name
    and os.path.isfile(os.path.join(test_data_dir, name))
]
race_files = sorted(name for name in target_file_names if "race" in name)
horse_files = sorted(name for name in target_file_names if "horse" in name)

# The directory suffix is the race file name up to its first "." (so a file
# named "race-<id>.csv" ends up in "target_race-<id>").
race_names = [race_file.split(".")[0] for race_file in race_files]

# Race and horse files are paired purely by their sorted position; zip() stops
# at the shorter list, so an unmatched trailing file is left where it is.
for race_name, race_file, horse_file in zip(race_names, race_files, horse_files):
    target_data_path = DATA_PATH + "test_data/target_" + race_name
    os.makedirs(target_data_path, exist_ok=True)
    shutil.move(DATA_PATH + "test_data/" + race_file, target_data_path + "/")
    shutil.move(DATA_PATH + "test_data/" + horse_file, target_data_path + "/")

In [None]:
# Directory containing the CSV files of the race to predict.
# Alternative target kept for reference (switch by swapping the two lines):
#TARGET_DATA_PATH = GOOGLE_DRIVE_PATH + '/csv/test_data/target_race-202005050902/'
TARGET_DATA_PATH = GOOGLE_DRIVE_PATH + '/test_data/takarazuka/'

# 前処理

In [None]:
# Read the horse-level and race-level CSV data of the target race, then attach
# the race columns to every horse row (left join on race_id keeps all horses).
df_target_horse = rd.read_target_horse_csv(TARGET_DATA_PATH)
df_target_race = rd.read_target_race_csv(TARGET_DATA_PATH)
target_df = df_target_horse.merge(df_target_race, how='left', on='race_id')

In [None]:
# Add information from the previous 3 races to the target data.
past_data_df = rd.read_horse_race_csv(TRAIN_DATA_PATH)
# Only columns without "-" in their name are handed to the join
# (presumably this drops already-joined history columns such as "rank-1").
columns_past_data = list(
    filter(lambda column: "-" not in column, past_data_df.columns)
)
df_for_prediction = jrd.join_n_race_for_test_data(
    past_data_df[columns_past_data],
    target_df,
    3,
)

## 前処理

In [None]:
def make_label(rank, fallback=30):
    """Convert a finishing rank to an integer label.

    Args:
        rank: Any value; it is converted with ``str()`` before being inspected.
        fallback: Label returned when the stringified ``rank`` is not made up
            solely of decimal digits (e.g. NaN, None, empty or free text).
            Defaults to 30.

    Returns:
        int: The parsed rank, or ``fallback`` when it cannot be parsed.
    """
    text = str(rank)
    # isdecimal() accepts exactly the digit characters int() can parse, whereas
    # isdigit() also accepts characters such as superscript or circled digits
    # on which int() raises ValueError.
    # Note: a float such as 1.0 stringifies to "1.0" and therefore falls back.
    if not text.isdecimal():
        return int(fallback)

    return int(text)

In [None]:
#df_for_prediction = df_for_prediction.rename(columns={'total_horse_number': 'total_horse_number_x'})
#df_for_prediction['rank'] = 1

In [None]:
# Convert the rank of each of the three previous races into an integer label.
for _rank_col in ("rank-1", "rank-2", "rank-3"):
    df_for_prediction[_rank_col] = df_for_prediction[_rank_col].apply(make_label)

In [None]:
# Apply the same rank-to-label conversion to the historical data.
for _rank_col in ("rank-1", "rank-2", "rank-3"):
    past_data_df[_rank_col] = past_data_df[_rank_col].apply(make_label)

In [None]:
# Build the final feature table from the target rows; the historical frame is
# passed alongside (see utils.prepare_data for how it is used).
df_for_prediction = prepare_data.prepare_data_for_prediction(df_for_prediction, past_data_df)

In [None]:
# Inspect the column names of the feature table in sorted order.
sorted(df_for_prediction.columns.tolist())

In [None]:
# Number of columns in the feature table.
len(df_for_prediction.columns)

# 予測

In [None]:
# Load the trained ranking model from disk.
# NOTE: pickle.load can execute arbitrary code — only load model files that
# come from a trusted source.
file = './model_data/lambdarank/lgb_model.pkl'
# The context manager closes the file handle once the model is loaded.
with open(file, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Convert the feature table into a NumPy array for the model.
# NOTE(review): the column order presumably has to match the one used at
# training time — confirm.
x = np.array(df_for_prediction)

In [None]:
# Score every row, limiting the model to its recorded best_iteration.
pred = model.predict(x, num_iteration=model.best_iteration)

In [None]:
# Collect one (horse_id, horse_number, score) row per prediction and build the
# frame in a single step. DataFrame.append was removed in pandas 2.0, and
# growing a frame row by row is quadratic.
# The sign of the model output is flipped when it is stored as "score".
# NOTE(review): this pairs pred with target_df by position, so it assumes the
# feature rows are still in target_df order — confirm.
# zip() stops at the shortest input, as the original loop did.
_score_rows = [
    (horse_id, horse_number, -score)
    for score, horse_number, horse_id in zip(
        pred, target_df["horse_number"], target_df["horse_id"]
    )
]
# The resulting index is 0..n-1.
res = pd.DataFrame(_score_rows, columns=['horse_id', 'horse_number', 'score'])

In [None]:
def softmax(a):
    """Return the softmax of ``a``, normalised over all of its elements.

    Args:
        a: Array-like of numbers.

    Returns:
        numpy.ndarray: Array of the same shape as ``a`` whose entries are
        ``exp(a_i) / sum_j exp(a_j)``; an empty input yields an empty array.
    """
    values = np.asarray(a)
    if values.size == 0:
        # Nothing to normalise (np.max would raise on an empty array).
        return np.exp(values)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (which would turn the output to NaN).
    exps = np.exp(values - np.max(values))
    return exps / np.sum(exps)

In [None]:
# Turn the scores into weights that sum to 1 across all rows of res.
res['proc'] = softmax(np.array(res["score"]))

In [None]:
# Sanity check: the softmax outputs should add up to 1 (up to rounding error).
sum(res.proc)

In [None]:
# Show the result rows ordered from the highest to the lowest score.
res.sort_values('score', ascending=False)

In [None]:
# Display the merged target data for reference.
target_df