In [None]:
import lightgbm as lgb

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

import utils.read_data as rd
import utils.io_model as io_m
import utils.preprocessing as pp
import pickle

In [None]:
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path


In [None]:
# Load environment variables from .env files: first with python-dotenv's default
# lookup, then explicitly from the .env file in the current working directory.
load_dotenv(verbose=True)
dotenv_path = str(Path().resolve() / '.env')
load_dotenv(dotenv_path)

In [None]:
# Root of the shared drive, taken from the environment (populated from .env above).
GOOGLE_DRIVE_PATH = os.environ.get("GOOGLE_DRIVE_PATH")
if not GOOGLE_DRIVE_PATH:
    # Without this guard the concatenation below fails with an opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'".
    raise RuntimeError(
        "GOOGLE_DRIVE_PATH is not set; define it in the environment or in the .env file"
    )

# Directory that holds the training CSV data.
DATA_PATH = GOOGLE_DRIVE_PATH + '/train_data'

In [None]:
# Read the race data found under DATA_PATH with the project helper.
# NOTE(review): presumably returns a pandas DataFrame (it is indexed with a
# column list in a later cell) — confirm in utils.read_data.
df = rd.read_horse_race_csv(DATA_PATH)

In [None]:
# Display the freshly loaded data for a quick visual check.
df

In [None]:
# Columns used for training; past-race data goes back up to three races.
# These columns are processed further below, which adds new columns, so the
# final feature columns are defined in a separate cell (input_columns).
columns = [
    # race-level attributes
    'race_id', "race_course", "weather", "ground_status",
    "where_racecourse", "race_class", "running_condition",
    # attributes of the entry in the current race
    "frame_number", "horse_number",
    "sex_and_age", "burden_weight", "rider_id",
    "tamer_id", "horse_weight", "odds", "popular",
    "rank", "total_horse_number_x",
] + [
    # "<field>-<k>" is the value of <field> k races ago (k = 1, 2, 3)
    "{}-{}".format(field, lag)
    for field in ("rank", "total_horse_number_x", "goal_time",
                  "last_time", "half_way_rank", "prize")
    for lag in (1, 2, 3)
]

# Keep only the selected columns. The explicit copy makes df an independent
# frame, so the column assignments in the following cells modify it directly
# instead of writing into a slice of the loaded data (SettingWithCopyWarning).
df = df[columns].copy()
df.head()

In [None]:
# Map the race-course value through the project helper.
# NOTE(review): the exact output of pp.extract_place is not visible here — confirm.
df["where_racecourse"] = df["where_racecourse"].map(pp.extract_place)

# "sex_and_age" is split by position: first character -> sex, remainder -> age.
df["sex"] = df["sex_and_age"].map(lambda value: value[0])
df["age"] = df["sex_and_age"].map(lambda value: value[1:])

# Goal times of the three previous races are converted with pp.to_seconds.
for lag in ("1", "2", "3"):
    goal_time_column = "goal_time-" + lag
    df[goal_time_column] = df[goal_time_column].map(pp.to_seconds)

df["horse_weight"] = df["horse_weight"].map(pp.extract_weight).astype(np.int64)


def _strip_thousands_separator(prize):
    """Remove "," from string prizes; non-string values pass through unchanged."""
    if type(prize) is str:
        return prize.replace(",", "")
    return prize


# Prize money of the three previous races as floats.
for lag in ("1", "2", "3"):
    prize_column = "prize-" + lag
    df[prize_column] = df[prize_column].map(_strip_thousands_separator).astype(np.float64)

# New "kyakusitu-<k>" columns, computed by the project helper from the field
# size and the half-way rank of the race k races ago.
for lag in ("1", "2", "3"):
    df["kyakusitu-" + lag] = list(map(
        pp.kyakusitu_code_c,
        df["total_horse_number_x-" + lag].values,
        df["half_way_rank-" + lag],
    ))

# Missing-value handling: both the '---' placeholder and NaN become -1.
df = df.replace('---', -1).fillna(-1)

df["odds"] = df["odds"].astype(np.float64)

In [None]:
# Display the preprocessed data for a quick visual check.
df

In [None]:
# Order the rows by race so that all entries of one race are contiguous; the
# per-race group sizes built later rely on this row order.
df = df.sort_values(by='race_id')

In [None]:
def make_label(rank, invalid_label=30):
    """Convert a finishing position into an integer label.

    Parameters
    ----------
    rank : int, float or str
        Raw finishing position. Plain non-negative integers, their string
        form ("7") and integral floats (7.0) are accepted as valid.
    invalid_label : int, optional
        Label used for everything else, e.g. annotated values such as
        "12(再)", placeholders, negative sentinels or NaN. Defaults to 30.
        NOTE(review): 30 looks chosen to stay within LightGBM's default
        label_gain table — confirm before changing it.

    Returns
    -------
    int
        Always an int, so the resulting column has a single numeric dtype
        instead of a mix of strings and integers.
    """
    # Columns that contained NaN are read as floats; 3.0 is still position 3.
    if isinstance(rank, float):
        if rank.is_integer() and rank >= 0:
            return int(rank)
        return invalid_label

    rank_text = str(rank)
    # isdecimal() only accepts characters that int() can parse, unlike
    # isdigit(), which also accepts e.g. superscript digits.
    if rank_text.isdecimal():
        return int(rank_text)
    return invalid_label

In [None]:
# Normalise the current finishing position and those of the three previous
# races with make_label.
for rank_column in ("rank", "rank-1", "rank-2", "rank-3"):
    df[rank_column] = df[rank_column].apply(make_label)


In [None]:
# Number of rows (entries) per race, in ascending race_id order — the same
# order the rows were sorted in. size() counts rows, whereas count() on a
# single column would skip rows where that column is null and undercount.
query = df.groupby('race_id', sort=True).size().tolist()

In [None]:
# LightGBM training parameters: lambdarank objective with gradient-boosted
# trees, evaluated with NDCG at the cut-offs listed in 'ndcg_eval_at'.
params = dict(
    objective='lambdarank',
    metric='ndcg',
    ndcg_eval_at=[3, 5],
    boosting_type='gbdt',
)

In [None]:
# Final feature columns fed to the model: attributes of the current race and
# entry, followed by the history fields of the three previous races.
input_columns = [
    "race_course", "weather", "ground_status",
    "where_racecourse", "race_class", "running_condition",
    "frame_number", "horse_number",
    "sex", "age", "burden_weight", "rider_id",
    "tamer_id", "horse_weight",
] + [
    # "<field>-<k>" is the value of <field> k races ago (k = 1, 2, 3)
    "{}-{}".format(field, lag)
    for field in ("rank", "goal_time", "last_time", "kyakusitu", "prize")
    for lag in (1, 2, 3)
]

In [None]:
# One-hot encode the selected feature columns. pd.get_dummies only expands
# non-numeric columns; numeric columns are passed through unchanged.
df_one_hot = pd.get_dummies(data=df.loc[:, input_columns])

In [None]:
# Names of all columns of the one-hot encoded frame, as a plain Python list.
features = df_one_hot.columns.tolist()

In [None]:
# Number of feature columns after one-hot encoding.
len(features)

In [None]:
# Display the full list of feature names.
features

In [None]:
# Sanity check: look for rows whose rank is still the raw annotated value.
# When the cells are run in order, make_label has already replaced such
# values, so this is expected to be empty.
df.loc[df['rank'] == '12(再)', 'rank']

In [None]:
# Build the datasets used for training.
# Explicit numeric dtypes: without them, a frame mixing boolean dummy columns
# with numeric columns, or a label column mixing strings and ints, is turned
# into an object array.
x = np.asarray(df_one_hot[features], dtype=np.float64)
y = np.asarray(df['rank'], dtype=np.int64)
# NOTE(review): the label is the finishing position (1 = winner), while
# LightGBM's lambdarank treats larger labels as more relevant — confirm that
# this orientation is intended and that predictions are sorted accordingly.

# Each entry of `query` is the number of rows of one race; the split below is
# only correct if those sizes add up to the number of rows.
assert sum(query) == len(x), "group sizes in `query` must add up to the number of rows"

# Split by race so that no race is divided between the two sets.
# NOTE(review): only the first third of the races is used for training and the
# remaining two thirds for validation; the train_test_split call that used to
# be commented out here used test_size=0.3 — confirm the ratio is intended.
split = len(query) // 3
n_train_rows = sum(query[:split])  # row index where the validation races start

query_train = query[:split]
x_train = x[:n_train_rows]
y_train = y[:n_train_rows]

query_test = query[split:]
x_test = x[n_train_rows:]
y_test = y[n_train_rows:]

print(x_train.shape)
print(x_test.shape)

In [None]:
# Wrap both splits as LightGBM datasets; `group` carries the per-race row
# counts required by the ranking objective, and the validation set references
# the training set.
dtrain = lgb.Dataset(data=x_train, label=y_train, group=query_train)
dval = lgb.Dataset(data=x_test, label=y_test, group=query_test, reference=dtrain)

# Train with the parameters defined above, evaluating on the validation set.
model = lgb.train(params=params, train_set=dtrain, valid_sets=dval)

In [None]:
# Persist the trained model with pickle.
file = './model_data/lambdarank/lgb_model.pkl'

# Create the target directory if needed so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(file), exist_ok=True)

# The context manager closes (and flushes) the file even if pickling fails.
with open(file, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the model from disk to verify the saved file.
# NOTE: pickle.load executes arbitrary code from the file — only load model
# files that were produced by this project.
with open(file, 'rb') as model_file:
    loaded_model = pickle.load(model_file)