In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import pickle
import utils.read_data as rd
import utils.preprocessing as pp
import utils.join_race_data as jrd
import utils.prepare_data as prepare_data

import utils.io_model as im
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path

In [2]:
# Load environment variables: first from the .env file python-dotenv locates
# on its own, then explicitly from the .env in the current working directory.
load_dotenv(verbose=True)
dotenv_path = join(Path().resolve(), '.env')
load_dotenv(dotenv_path)

# Root directory of the data files; must be supplied via the environment or .env.
GOOGLE_DRIVE_PATH = os.environ.get("GOOGLE_DRIVE_PATH")
if GOOGLE_DRIVE_PATH is None:
    # Fail here with an actionable message instead of the opaque
    # "NoneType + str" TypeError the concatenation below would raise.
    raise RuntimeError(
        "Environment variable GOOGLE_DRIVE_PATH is not set; "
        "define it in the environment or in " + str(dotenv_path)
    )
TRAIN_DATA_PATH = GOOGLE_DRIVE_PATH + '/train_data'

In [3]:
# Directory from which the target (to-be-predicted) horse and race CSVs are read.
TARGET_DATA_PATH = GOOGLE_DRIVE_PATH + '/test_data/arima/'

# 前処理

In [4]:
# Read the per-horse and per-race tables of the target race, then attach the
# race-level columns to every horse row (left join on race_id keeps all horses).
df_target_horse = rd.read_target_horse_csv(TARGET_DATA_PATH)
df_target_race = rd.read_target_race_csv(TARGET_DATA_PATH)
target_df = df_target_horse.merge(df_target_race, how='left', on='race_id')

In [5]:
# Add the information of the past 3 races to the target data.
past_data_df = rd.read_horse_race_csv(TRAIN_DATA_PATH)
# Pass only the columns whose name contains no "-" to the join
# (presumably the "-N" suffixed lag columns are rebuilt by the helper — TODO confirm).
columns_past_data = [c for c in past_data_df.columns if "-" not in c]
# The trailing 3 is the number of past races to join per horse.
df_for_prediction = jrd.join_n_race_for_test_data(past_data_df[columns_past_data], target_df, 3)

horse_race_20201128.csv


  if (await self.run_code(code, result,  async_=asy)):


## 前処理

In [6]:
def make_label(rank):
    """Convert a finishing-position value into an integer label.

    The value is stringified first. If the resulting text consists solely of
    digits it is returned as an ``int``; every other value (text markers,
    ``None``/NaN, float-like text such as ``"3.0"``, empty string) maps to
    the fallback label 30.
    """
    text = str(rank)
    return int(text) if text.isdigit() else 30

In [7]:
#df_for_prediction = df_for_prediction.rename(columns={'total_horse_number': 'total_horse_number_x'})
#df_for_prediction['rank'] = 1

In [8]:
# Convert the rank columns of the three previous races to integer labels
# (non-numeric entries become the fallback value chosen by make_label).
for past_rank_col in ("rank-1", "rank-2", "rank-3"):
    df_for_prediction[past_rank_col] = df_for_prediction[past_rank_col].apply(make_label)

In [9]:
# Apply the same integer-label conversion to the historical data so that its
# "rank-N" columns use the same encoding as the prediction frame.
for past_rank_col in ("rank-1", "rank-2", "rank-3"):
    past_data_df[past_rank_col] = past_data_df[past_rank_col].apply(make_label)

In [10]:
# Build the final feature frame from the prediction rows and the historical data.
# NOTE(review): this rebinds df_for_prediction, so re-running the cell feeds the
# already-prepared frame back into the helper — restart the kernel before re-running.
df_for_prediction = prepare_data.prepare_data_for_prediction(df_for_prediction, past_data_df)

In [11]:
# Show the feature column names in alphabetical order.
sorted(df_for_prediction.columns)

['age',
 'burden_weight',
 'frame_number',
 'goal_time-1',
 'goal_time-2',
 'goal_time-3',
 'ground_status_ダート : 不良',
 'ground_status_ダート : 稍重',
 'ground_status_ダート : 良',
 'ground_status_ダート : 重',
 'ground_status_芝 : 不良',
 'ground_status_芝 : 不良\xa0\xa0ダート : 不良',
 'ground_status_芝 : 稍重',
 'ground_status_芝 : 稍重\xa0\xa0ダート : 不良',
 'ground_status_芝 : 稍重\xa0\xa0ダート : 稍重',
 'ground_status_芝 : 稍重\xa0\xa0ダート : 重',
 'ground_status_芝 : 良',
 'ground_status_芝 : 良\xa0\xa0ダート : 不良',
 'ground_status_芝 : 良\xa0\xa0ダート : 稍重',
 'ground_status_芝 : 良\xa0\xa0ダート : 良',
 'ground_status_芝 : 良\xa0\xa0ダート : 重',
 'ground_status_芝 : 重',
 'ground_status_芝 : 重\xa0\xa0ダート : 不良',
 'ground_status_芝 : 重\xa0\xa0ダート : 重',
 'horse_number',
 'horse_weight',
 'kyakusitu-1',
 'kyakusitu-2',
 'kyakusitu-3',
 'last_time-1',
 'last_time-2',
 'last_time-3',
 'odds-1',
 'odds-2',
 'odds-3',
 'popular-1',
 'popular-2',
 'popular-3',
 'prize-1',
 'prize-2',
 'prize-3',
 'race_direction_右',
 'race_direction_左',
 'race_direction_直',
 

In [12]:
# Number of feature columns handed to the model.
len(df_for_prediction.columns)

74

# 予測

In [13]:
# Load the pickled model used for prediction.
# NOTE: pickle.load can execute arbitrary code contained in the file — only
# load model files that were produced by this project or another trusted source.
file = './model_data/lambdarank/lgb_model.pkl'
with open(file, 'rb') as model_file:
    # The context manager closes the handle even if unpickling fails;
    # the original bare open() left the file descriptor open.
    model = pickle.load(model_file)

In [14]:
# Feature matrix as a plain NumPy array (column names are dropped here).
# NOTE(review): presumably the column order must match the one used at training time — verify.
x = np.array(df_for_prediction)

In [15]:
# Score every row of the feature matrix, using the model's recorded best iteration.
pred = model.predict(x, num_iteration=model.best_iteration)

In [16]:
# One row per horse: identifier, horse number and the NEGATED model output.
# The rows are collected first and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
# zip() keeps the original pairing/truncation semantics. The resulting index is
# 0..n-1 instead of the all-zero index the row-wise append used to produce.
result_columns = ['horse_id', 'horse_number', 'score']
res = pd.DataFrame(
    [
        [horse_id, horse_number, -score]
        for score, horse_number, horse_id in zip(
            pred, list(target_df["horse_number"]), list(target_df["horse_id"])
        )
    ],
    columns=result_columns,
)

In [17]:
def softmax(a):
    """Return the softmax of ``a``, normalised over all of its elements.

    Parameters
    ----------
    a : array-like of numbers
        Scores to convert; coerced to a float64 NumPy array (this also
        accepts object-dtype arrays holding plain numbers).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``a`` whose entries are positive and sum
        to 1. An empty input yields an empty array.
    """
    values = np.asarray(a, dtype=np.float64)
    if values.size == 0:
        # np.max would raise on an empty array; nothing to normalise.
        return values
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (which would produce NaN) on large scores.
    exps = np.exp(values - np.max(values))
    return exps / np.sum(exps)

In [18]:
# Turn the (negated) scores into a distribution over the horses that sums to 1.
# NOTE(review): the column name 'proc' looks like it was meant to be 'prob' — confirm before renaming.
res['proc'] = softmax(np.array(res["score"]))

In [19]:
# Sanity check: the softmax outputs should add up to 1 (up to floating-point error).
sum(res.proc)

0.9999999999999999

In [21]:
# Display the merged target data for visual inspection.
target_df

Unnamed: 0,race_id,rank,frame_number,horse_number,horse_id,sex_and_age,burden_weight,rider_id,goal_time,goal_time_dif,...,hukusyo_second,hukusyo_third,wakuren,umaren,wide_1_2,wide_1_3,wide_2_3,umatan,renhuku3,rentan3
0,202006100000.0,1,5,9,2016104750,牝4,55,1102,2:35.0,,...,770,160,380,10330,2320,270,2550,11360,7370,50150
1,202006100000.0,2,7,14,2015104793,牝5,55,1126,2:35.0,クビ,...,770,160,380,10330,2320,270,2550,11360,7370,50150
2,202006100000.0,3,7,13,2015105075,牡5,57,5339,2:35.1,クビ,...,770,160,380,10330,2320,270,2550,11360,7370,50150
3,202006100000.0,4,4,7,2015105046,牝5,55,1014,2:35.5,2.1/2,...,770,160,380,10330,2320,270,2550,11360,7370,50150
4,202006100000.0,5,3,5,2016104854,牡4,57,666,2:35.6,1/2,...,770,160,380,10330,2320,270,2550,11360,7370,50150
5,202006100000.0,5,5,10,2016105089,牝4,55,1032,2:35.6,同着,...,770,160,380,10330,2320,270,2550,11360,7370,50150
6,202006100000.0,7,4,8,2014105258,牡6,57,1096,2:35.6,アタマ,...,770,160,380,10330,2320,270,2550,11360,7370,50150
7,202006100000.0,8,2,3,2014104386,牡6,57,1163,2:35.8,1.1/2,...,770,160,380,10330,2320,270,2550,11360,7370,50150
8,202006100000.0,9,8,15,2016101209,牡4,57,660,2:35.9,クビ,...,770,160,380,10330,2320,270,2550,11360,7370,50150
9,202006100000.0,10,2,4,2016104648,牝4,55,5212,2:35.9,クビ,...,770,160,380,10330,2320,270,2550,11360,7370,50150


In [26]:
# Rank the horses by score, largest score first (rank 1.0). Because 'score' holds
# the negated model output, rank 1 goes to the horse with the lowest raw prediction.
res["rank"] = res["score"].rank(ascending=False)
# Display the result table.
res

Unnamed: 0,horse_id,horse_number,score,proc,rank
0,2016104750,9,2.583979,0.122127,1.0
0,2015104793,14,1.721137,0.051533,9.0
0,2015105075,13,2.258222,0.088173,5.0
0,2015105046,7,2.462005,0.108103,3.0
0,2016104854,5,2.341699,0.095849,4.0
0,2016105089,10,2.104818,0.075633,6.0
0,2014105258,8,-0.099916,0.008341,16.0
0,2014104386,3,1.657182,0.04834,11.0
0,2016101209,15,0.458138,0.014574,15.0
0,2016104648,4,2.091183,0.074609,7.0


In [46]:
# Predicted ranking as a NumPy array.
predicted_ranks = res["rank"].values
# Map the non-numeric marker "中" to 16 so the column can be cast to float
# (presumably a horse that did not finish, placed last in a 16-horse field — TODO confirm).
# The result is assigned back instead of using replace(..., inplace=True) on the
# selected column: that chained in-place form does not update target_df under
# pandas Copy-on-Write and is deprecated.
target_df["rank"] = target_df["rank"].replace("中", 16)
# Actual finishing positions as floats, in the row order of target_df.
target_ranks = target_df["rank"].astype(np.float64).values

In [47]:
# Print the predicted ranks ("予測データ" = predicted data) next to the
# actual ranks ("正解データ" = ground-truth data) for a side-by-side comparison.
print("予測データ", predicted_ranks)
print("正解データ", target_ranks)

予測データ [ 1.  9.  5.  3.  4.  6. 16. 11. 15.  7. 10. 13.  8.  2. 12. 14.]
正解データ [ 1.  2.  3.  4.  5.  5.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.]


In [48]:
import utils.correct_rate as cr

In [51]:
# Evaluate the prediction against the actual result
# (presumably the hit rate for horses finishing within the top 3 — see utils.correct_rate).
cr.correct_rate_of_horse_within_3(target_ranks, predicted_ranks)

0.3333333333333333

In [50]:
# Evaluate the prediction against the actual result
# (presumably whether the predicted winner matches the actual winner — see utils.correct_rate).
cr.correct_rate_of_top_horse(target_ranks, predicted_ranks)

1.0