In [None]:
import lightgbm as lgb

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

import utils.read_data as rd
import utils.io_model as io_m
import utils.preprocessing as pp
import utils.prepare_data as prepare_data
import pickle

In [None]:
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path


In [None]:
# Load environment variables from .env files: once with python-dotenv's
# default lookup, then once more from the .env in the current working directory.
load_dotenv(verbose=True)
dotenv_path = str(Path().resolve() / '.env')
load_dotenv(dotenv_path=dotenv_path)

In [None]:
# Root directory of the shared data, taken from the environment
# (populated by the load_dotenv calls above).
GOOGLE_DRIVE_PATH = os.environ.get("GOOGLE_DRIVE_PATH")
if not GOOGLE_DRIVE_PATH:
    # Fail with an actionable message instead of the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" raised by
    # the concatenation below when the variable is missing.
    raise RuntimeError(
        "Environment variable GOOGLE_DRIVE_PATH is not set; "
        "define it in the environment or in the .env file."
    )
# Directory that holds the training CSV data.
DATA_PATH = GOOGLE_DRIVE_PATH + '/train_data'

In [None]:
# Load the horse-race training data from DATA_PATH using the project helper.
# NOTE(review): the result is used as a pandas DataFrame below — confirm
# against utils.read_data.
df = rd.read_horse_race_csv(DATA_PATH)

In [None]:
# Sort rows by race_id in descending order.
# NOTE: the per-race counts computed from groupby come out in ascending
# race_id order, so they do not line up with this descending row order;
# a later cell re-sorts ascending to fix this.
df = df.sort_values('race_id', ascending=False)

In [None]:
# Per-race counts of non-null race_course values, listed in ascending
# race_id order (groupby sorts its keys).
query = list(df.groupby('race_id')['race_course'].count())

### ↓ クエリー毎に複数のレースが含まれているかの確認用

In [None]:
# Show the per-race counts (indexed by race_id) that `query` is built from.
df.groupby('race_id')['race_course'].count()

In [None]:
# Show the race_id order in which groupby emits its groups.
df.groupby('race_id')['race_course'].count().index

In [None]:
# Show the race_id column in the frame's current row order, for comparison
# with the groupby order.
df['race_id']

In [None]:
# Display the list of per-race group sizes.
query

In [None]:
# For the first ten group sizes, slice the corresponding run of rows out of
# df and print the distinct race_ids found there. A correctly aligned
# `query` yields exactly one race_id per printed set.
offset = 0
for q in query[:10]:
    rows = df[offset:offset + q]
    race_id = rows["race_id"].to_numpy().astype(int)
    print(set(race_id))
    offset += q

groupbyメソッドは、結果をrace_idの昇順にソートして出力する。

一方、元々のdfはrace_idの降順に並んでいるため、queryの並び順とdfの行の並び順が対応していない。

### ----ここまで----

In [None]:
# Corrected version: sort the rows by race_id ascending so that the row order
# matches the (ascending) key order that groupby produces.
df = df.sort_values('race_id', ascending=True)
# Group sizes = number of rows per race. `size()` counts rows, whereas
# `count()` on a column only counts its non-null values and would
# under-report a race whose race_course is missing on some rows.
query = list(df.groupby('race_id').size())

In [None]:
def make_label(rank):
    """Convert a raw rank value into an integer label.

    Args:
        rank: Raw rank value. May be an int, a float, a string, or a
            missing value (None / NaN).

    Returns:
        int: The rank itself when it is a plain non-negative whole number
        (e.g. ``3``, ``"3"``, ``3.0``); otherwise ``30``. The fallback covers
        annotated strings such as ``"12(再)"``, other non-numeric text, and
        missing values.
    """
    fallback_label = 30

    # pandas stores a numeric column that contains missing values as float,
    # so a rank can arrive as 3.0; str(3.0) is "3.0", which is not all digits.
    # NaN and infinities are not integral and take the fallback.
    if isinstance(rank, float):
        if rank.is_integer() and rank >= 0:
            return int(rank)
        return fallback_label

    text = str(rank)
    # isdecimal() (unlike isdigit()) only accepts characters that int() can
    # parse; isdigit() is also True for e.g. superscript digits, on which
    # int() raises ValueError.
    if text.isdecimal():
        return int(text)
    return fallback_label

In [None]:
# Derive the integer training label from the raw "rank" column.
df["label"] = df["rank"].apply(make_label)

# Normalise the other rank columns in place with the same conversion.
# NOTE(review): presumably these hold ranks from earlier races — confirm.
for rank_column in ("rank-1", "rank-2", "rank-3"):
    df[rank_column] = df[rank_column].apply(make_label)


In [None]:
# Look for rows whose label still equals the raw string '12(再)'. Labels are
# integers produced by make_label, so this selection is expected to be empty.
# NOTE(review): presumably a sanity check of the conversion — confirm.
df.loc[df['label'] == '12(再)', 'label']

In [None]:
# Build the training table from the labelled frame via the project helper.
# NOTE(review): the group sizes in `query` are derived from `df`, so the
# training code below assumes prepare_train_data keeps df's rows and their
# order — confirm against utils.prepare_data.
df_for_learning = prepare_data.prepare_train_data(df)

In [None]:
# Preview the first rows of the labelled source frame.
df.head()

In [None]:
# Preview the first rows of the prepared training table.
df_for_learning.head()

In [None]:
# Collect every column name of the prepared training table as a plain list
# and display it.
columns_for_learning = df_for_learning.columns.tolist()
columns_for_learning

In [None]:
# Exclude the target column from the feature list. A filter is used instead
# of list.remove() so that re-running this cell is harmless; remove() raises
# ValueError once "label" is already gone.
columns_for_learning = [name for name in columns_for_learning if name != "label"]

In [None]:
# Display the feature column names in alphabetical order (the list itself is
# not modified).
sorted(columns_for_learning)

In [None]:
# Build the datasets used for training.
# Feature matrix and label vector, in the row order of df_for_learning.
x = np.array(df_for_learning[columns_for_learning])
y = np.array(df_for_learning['label'])

# Split on race boundaries so that no race is cut in half: the first
# `split` races go to training, all remaining races to validation.
# NOTE(review): this sends only the first 1/5 of the races to training and
# 4/5 to validation — confirm that this ratio is intended.
split = len(query) // 5
query_train, query_test = query[:split], query[split:]

# Number of rows covered by the training races.
n_train_rows = sum(query_train)

x_train, x_test = x[:n_train_rows], x[n_train_rows:]
y_train, y_test = y[:n_train_rows], y[n_train_rows:]

print(x_train.shape)
print(x_test.shape)

In [None]:
# LightGBM parameters for the learning-to-rank model.
# (An earlier, smaller parameter dict that was immediately overwritten by
# this one has been removed.)
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    # Report NDCG at these cut-off positions.
    'ndcg_eval_at': [1, 3, 5],
    'boosting_type': 'gbdt',
    'num_iterations': 500,
    'max_bin': 100,
    'num_leaves': 50,
    'learning_rate': 0.05,
    'early_stopping_rounds': 50,
}

In [None]:
# Wrap the splits in LightGBM datasets; `group` carries the per-race row
# counts so that rows are ranked within their own race.
# NOTE(review): LightGBM's lambdarank treats larger label values as more
# relevant, while make_label maps a rank of 1 to 1 and unparsable ranks to 30.
# Confirm that the `label` column coming out of prepare_train_data follows
# the "larger is better" convention.
dtrain = lgb.Dataset(data=x_train, label=y_train, group=query_train)
dval = lgb.Dataset(data=x_test, label=y_test, group=query_test, reference=dtrain)

# Train, evaluating on the validation set.
model = lgb.train(params=params, train_set=dtrain, valid_sets=[dval])

In [None]:
# Persist the trained model with pickle.
file = './model_data/lambdarank/lgb_model.pkl'
# Make sure the target directory exists so that a finished training run is
# not lost to a FileNotFoundError.
os.makedirs(dirname(file), exist_ok=True)
# The context manager closes (and flushes) the file handle deterministically.
with open(file, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the pickled model; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code — only load files that this
# notebook (or another trusted source) produced.
with open(file, 'rb') as model_file:
    loaded_model = pickle.load(model_file)