In [None]:
import sys
sys.path.append("..")
import os
from os.path import join, dirname
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

import lightgbm as lgb

import utils.top_n_box as top_n_box
import utils.read_data as rd
import utils.preprocessing as pp
import utils.prepare_data as prepare_data

In [None]:
# Load environment variables from a .env file using python-dotenv's default lookup.
load_dotenv(verbose=True)
# Additionally load the .env file that sits in the current working directory
# (Path().resolve() is the absolute path of the working directory).
dotenv_path = join(Path().resolve(), '.env')
load_dotenv(dotenv_path)

In [None]:
# Root directory of the data, read from the environment; os.environ.get returns
# None when the variable is not set.
GOOGLE_DRIVE_PATH = os.environ.get("GOOGLE_DRIVE_PATH")
# NOTE(review): if GOOGLE_DRIVE_PATH is unset, this concatenation raises
# TypeError (None + str) -- make sure the .env file defines it.
DATA_PATH = GOOGLE_DRIVE_PATH + '/train_data'

## サンプルデータを準備

In [None]:
df = rd.read_horse_race_csv(DATA_PATH)

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Names of the payout columns of the race data (one per bet type); the return
# calculations further down read these columns as the prize of each bet.
prize_columns = [
    "tansyo", 
    "hukusyo_first", 
    "hukusyo_second", 
    "hukusyo_third", 
    "umaren", 
    "umatan", 
    "wide_1_2", 
    "wide_1_3", 
    "wide_2_3",
    "renhuku3",
    "rentan3"
]

In [None]:
# Work on an independent copy: the cells below add/overwrite columns on
# top_n_test_df, and a plain alias would silently mutate the raw frame `df`
# that earlier cells have already displayed.
top_n_test_df = df.copy()
top_n_test_df.head()

In [None]:
# Distinct race ids present in the data; the cell displays how many there are.
race_ids = list(set(top_n_test_df["race_id"].values))
len(race_ids)

In [None]:
def make_label(rank):
    """Convert a finishing position into an integer label.

    The value is stringified first; if that string consists solely of digits
    it is returned as an int, otherwise (e.g. text markers, floats such as
    ``1.0``, negative numbers, empty strings) the sentinel label 30 is returned.
    """
    text = str(rank)
    return int(text) if text.isdigit() else 30

In [None]:
# Normalise the finishing-position columns to integers with make_label:
# "label" is derived from "rank", while the three "rank-N" columns are
# converted in place.
for target_col, source_col in (
    ("label", "rank"),
    ("rank-1", "rank-1"),
    ("rank-2", "rank-2"),
    ("rank-3", "rank-3"),
):
    top_n_test_df[target_col] = top_n_test_df[source_col].apply(make_label)

In [None]:
# Sort by race id so that all rows belonging to one race are contiguous
# (the per-race group sizes computed next rely on this layout).
top_n_test_df = top_n_test_df.sort_values('race_id', ascending=True)

In [None]:
# Number of rows (horses) per race, in ascending race_id order. size() counts
# every row of the group, whereas count() on a single column would skip rows
# whose value in that column is missing and yield group sizes that no longer
# add up to the number of rows.
query = top_n_test_df.groupby('race_id').size().tolist()

In [None]:
# Build the model-input frame with the project helper.
# NOTE(review): later cells slice df_for_learning and top_n_test_df with the
# same row offsets, so this is assumed to preserve row order and row count -- confirm.
df_for_learning = prepare_data.prepare_train_data(top_n_test_df)

In [None]:
# All column names of the prepared frame, as a plain Python list.
columns_for_learning = df_for_learning.columns.values.tolist()

In [None]:
# Exclude the target column from the feature list. Filtering (instead of
# list.remove) keeps this cell idempotent: re-running it no longer raises
# ValueError because "label" was already removed.
columns_for_learning = [col for col in columns_for_learning if col != "label"]

In [None]:
# Build the feature matrix and target vector used for training and evaluation.
x = np.array(df_for_learning[columns_for_learning])
y = np.array(df_for_learning['label'])

# Split by race (never inside a race): the first `split` races are used for
# training, the remaining ones for evaluation.
# NOTE(review): this trains on 1/5 of the races and evaluates on 4/5 --
# confirm that this ratio is intentional.
split = int(len(query) / 5)
query_train = query[:split]
query_test = query[split:]

# `query` holds the number of rows per race, so the row index where the
# evaluation races start is the total row count of the training races.
n_train_rows = sum(query_train)
x_train, x_test = x[:n_train_rows], x[n_train_rows:]
y_train, y_test = y[:n_train_rows], y[n_train_rows:]

print(x_train.shape)
print(x_test.shape)

In [None]:
# LightGBM LambdaRank configuration: NDCG is evaluated at the top 1, 3 and 5
# positions, with up to 500 boosting rounds and early stopping after 50 rounds
# without improvement on the validation set.
# (A shorter dict that used to be assigned first was dead code -- it was
# overwritten immediately -- and has been removed.)
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'boosting_type': 'gbdt',
    'num_iterations': 500,
    'max_bin': 100,
    'num_leaves': 50,
    'learning_rate': 0.05,
    'early_stopping_rounds': 50,
}

In [None]:
# Wrap the arrays as LightGBM datasets; `group` is the number of rows per race
# (query), in the same order as the rows.
dtrain = lgb.Dataset(x_train, y_train, group=query_train)
# The validation dataset is created with the training dataset as its reference.
dval = lgb.Dataset(x_test, y_test, reference=dtrain, group=query_test)
# Train the ranking model, evaluating on the held-out races.
model = lgb.train(params, dtrain, valid_sets=dval)

In [None]:
# Sanity check: show both shapes. The prediction cell slices these two frames
# with identical row offsets, so their row counts are expected to match.
print(df_for_learning.shape)
print(top_n_test_df.shape)

In [None]:
# NOTE(review): this list is identical to the prize_columns list defined
# earlier in this notebook; one of the two definitions is redundant.
prize_columns = [
    "tansyo",
    "hukusyo_first",
    "hukusyo_second",
    "hukusyo_third",
    "umaren",
    "umatan",
    "wide_1_2",
    "wide_1_3",
    "wide_2_3",
    "renhuku3",
    "rentan3"
]

In [None]:
# Score every evaluation race with the trained model and collect, per race,
# a (race_id, rank) frame. The frames are gathered in a list and concatenated
# once: DataFrame.append inside a loop is quadratic and no longer exists in
# pandas 2.x.
per_race_frames = []

offset = sum(query_train)  # row index where the evaluation races start
for q in tqdm(query_test):
    # Rows offset .. offset+q-1 belong to the current race.
    race_features = np.array(df_for_learning[offset:offset + q][columns_for_learning])
    race_id_column = top_n_test_df[offset:offset + q]["race_id"].values.astype(int)
    pred = model.predict(race_features, num_iteration=model.best_iteration)
    # NOTE(review): np.argsort returns the 0-based row positions that would
    # sort |score| ascending, so this is an ordering of 1-based row positions,
    # not "the rank of each row" -- confirm this is what TopNBox expects.
    predicted_ranks = np.argsort(abs(pred)) + 1
    per_race_frames.append(
        pd.DataFrame({"race_id": race_id_column, "rank": predicted_ranks})
    )

    offset += q

if per_race_frames:
    predicted_ranks_df = pd.concat(per_race_frames, ignore_index=True)
else:
    # No evaluation races: keep the expected (empty) schema.
    predicted_ranks_df = pd.DataFrame(columns=["race_id", "rank"])

predicted_ranks_df = predicted_ranks_df.astype({"rank": int})
predicted_ranks_df.head()

In [None]:
# Debug output: total number of rows in the first 10 evaluation races, and the
# shape of the prediction frame.
print(sum(query[split:split+10]))
print(predicted_ranks_df.shape)

prize columns <br>
2   tansyo          61 non-null     object <br>
 3   hukusyo_first   61 non-null     object  <br>
 4   hukusyo_second  61 non-null     object  <br>
 5   hukusyo_third   61 non-null     object  <br>
 6   umaren          61 non-null     object  <br>
 7   umatan          61 non-null     object  <br>
 8   wide_1_2        61 non-null     object  <br>
 9   wide_1_3        61 non-null     object  <br>
 10  wide_2_3        61 non-null     object  <br>
 11  renhuku3        61 non-null     object  <br>

## リターン計算例

In [None]:
def extract_race(horse_race_df, race_id):
    """Return the rows of ``horse_race_df`` whose ``race_id`` column equals ``race_id``."""
    is_target_race = horse_race_df["race_id"] == race_id
    return horse_race_df[is_target_race]

In [None]:
from utils.top_n_box import TopNBox
import utils.return_calculation as return_calculation

In [None]:
# Ticket type names exposed by TopNBox; the evaluation cells iterate over
# (slices of) this sequence to choose which bet types to simulate.
ticket_types = TopNBox.ticket_types
print(ticket_types)

In [None]:
# Stake per betting ticket; multiplied by ticket counts to get the amount paid.
PRICE_OF_BETTING_TICKET = 100

In [None]:
# Distinct ids of the races for which predictions were produced.
test_race_ids = set(predicted_ranks_df["race_id"].values)

In [None]:
len(test_race_ids)

In [None]:
# Simulate Top-N box "tansho" bets over all evaluation races.
N = 3
hit_num = 0      # number of winning tickets
ret = []         # payout per race
ret_rates = []   # payout / stake per race
ticket_num = 0   # total number of tickets bought

for race_id in tqdm(test_race_ids):
    extracted_race_df = extract_race(top_n_test_df, race_id)
    target_ranks = extracted_race_df["label"].values.astype(int)
    # The payout is stored as text that may contain thousands separators.
    prize = int(extracted_race_df["tansyo"].values[0].replace(",", ""))
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    # Named `box` so that the `top_n_box` module alias imported at the top of
    # the notebook is not overwritten.
    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("tansho")
    n, r = return_calculation.tansho(tickets, target_ranks, prize)
    hit_num += n
    ticket_num += len(tickets)
    ret.append(r)
    # Per-race return rate: divide by the stake of THIS race. (Dividing by the
    # cumulative ticket count made later races look progressively worse.)
    pay = len(tickets) * PRICE_OF_BETTING_TICKET
    if pay > 0:  # a race without tickets has no defined return rate
        ret_rates.append(r / pay)

print("")
print("-----tansho resulut-----")
print("total tickets:", ticket_num)
print("total hit:", hit_num)
print("total return:", np.sum(ret))
print("hit rate:", hit_num/ticket_num)
print("return rate:", np.sum(ret)/(ticket_num*PRICE_OF_BETTING_TICKET))
print("return rate std:", np.std(ret_rates, ddof=1))

In [None]:
def top_n_tansho_ret(race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "tansho" bet for a single race.

    Args:
        race_df: Rows of one race; must provide the ``race_id`` and ``tansyo``
            columns (the payout is text that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``race_df`` rather than the global ``race_id``, because existing
            callers may pass a stale value here.
        target_ranks: Actual results, forwarded to ``return_calculation.tansho``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.tansho`` (accumulated by callers as hit count and
        payout) and the generated tickets.
    """
    # Use the race that was passed in instead of the globals
    # `extracted_race_df` / `race_id`.
    race_id = race_df["race_id"].values[0]
    prize = int(race_df["tansyo"].values[0].replace(",", ""))
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("tansho")
    n, r = return_calculation.tansho(tickets, target_ranks, prize)
    return n, r, tickets

In [None]:
def top_n_fukusho_ret(race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "fukusho" bet for a single race.

    Args:
        race_df: Rows of one race; must provide ``race_id`` and the three
            payout columns ``hukusyo_first`` / ``hukusyo_second`` /
            ``hukusyo_third`` (text that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``race_df`` rather than the global ``race_id``, because existing
            callers may pass a stale value here.
        target_ranks: Actual results, forwarded to ``return_calculation.fukusho``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.fukusho`` and the generated tickets.
    """
    race_id = race_df["race_id"].values[0]
    first_prize, second_prize, third_prize = (
        int(race_df[column].values[0].replace(",", ""))
        for column in ("hukusyo_first", "hukusyo_second", "hukusyo_third")
    )
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("fukusho")
    n, r = return_calculation.fukusho(tickets, target_ranks, first_prize, second_prize, third_prize)

    return n, r, tickets

In [None]:
def top_n_wide_ret(race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "wide" bet for a single race.

    Args:
        race_df: Rows of one race; must provide ``race_id`` and the three
            payout columns ``wide_1_2`` / ``wide_1_3`` / ``wide_2_3`` (text
            that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``race_df`` rather than the global ``race_id``, because existing
            callers may pass a stale value here.
        target_ranks: Actual results, forwarded to ``return_calculation.wide``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.wide`` and the generated tickets.
    """
    race_id = race_df["race_id"].values[0]
    # The 2nd-3rd payout comes from "wide_2_3"; it was previously read from
    # "hukusyo_third" (copy-paste slip), which is a different bet type.
    first_second_prize, first_third_prize, second_third_prize = (
        int(race_df[column].values[0].replace(",", ""))
        for column in ("wide_1_2", "wide_1_3", "wide_2_3")
    )
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("wide")
    n, r = return_calculation.wide(tickets, target_ranks, first_second_prize, first_third_prize, second_third_prize)

    return n, r, tickets

In [None]:
def top_n_umaren_ret(race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "umaren" bet for a single race.

    Args:
        race_df: Rows of one race; must provide the ``race_id`` and ``umaren``
            columns (the payout is text that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``race_df`` rather than the global ``race_id``, because existing
            callers may pass a stale value here.
        target_ranks: Actual results, forwarded to ``return_calculation.umaren``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.umaren`` and the generated tickets.
    """
    # Use the race that was passed in instead of the globals
    # `extracted_race_df` / `race_id`.
    race_id = race_df["race_id"].values[0]
    prize = int(race_df["umaren"].values[0].replace(",", ""))
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("umaren")
    n, r = return_calculation.umaren(tickets, target_ranks, prize)
    return n, r, tickets

In [None]:
def top_n_umatan_ret(race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "umatan" bet for a single race.

    Args:
        race_df: Rows of one race; must provide the ``race_id`` and ``umatan``
            columns (the payout is text that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``race_df`` rather than the global ``race_id``, because existing
            callers may pass a stale value here.
        target_ranks: Actual results, forwarded to ``return_calculation.umatan``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.umatan`` and the generated tickets.
    """
    # Use the race that was passed in instead of the globals
    # `extracted_race_df` / `race_id`.
    race_id = race_df["race_id"].values[0]
    prize = int(race_df["umatan"].values[0].replace(",", ""))
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("umatan")
    n, r = return_calculation.umatan(tickets, target_ranks, prize)
    return n, r, tickets

In [None]:
def top_n_sanrempuku_ret(extracted_race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "sanrempuku" bet for a single race.

    Args:
        extracted_race_df: Rows of one race; must provide the ``race_id`` and
            ``renhuku3`` columns (the payout is text that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``extracted_race_df`` rather than the global ``race_id``, because
            existing callers may pass a stale value here.
        target_ranks: Actual results, forwarded to
            ``return_calculation.sanrempuku``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.sanrempuku`` and the generated tickets.
    """
    # Derive the race id from the argument instead of the global `race_id`.
    race_id = extracted_race_df["race_id"].values[0]
    prize = int(extracted_race_df["renhuku3"].values[0].replace(",", ""))
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("sanrempuku")
    n, r = return_calculation.sanrempuku(tickets, target_ranks, prize)
    return n, r, tickets

In [None]:
def top_n_sanrentan_ret(extracted_race_df, predicted_ranks, target_ranks, N):
    """Evaluate a Top-N box "sanrentan" bet for a single race.

    Args:
        extracted_race_df: Rows of one race; must provide the ``race_id`` and
            ``rentan3`` columns (the payout is text that may contain commas).
        predicted_ranks: Kept for interface compatibility only. As before, the
            predictions are looked up in the notebook-level
            ``predicted_ranks_df``; the lookup key is now the race id of
            ``extracted_race_df`` rather than the global ``race_id``, because
            existing callers may pass a stale value here.
        target_ranks: Actual results, forwarded to
            ``return_calculation.sanrentan``.
        N: Box size forwarded to ``TopNBox``.

    Returns:
        ``(n, r, tickets)``: the two values returned by
        ``return_calculation.sanrentan`` and the generated tickets.
    """
    # Derive the race id from the argument instead of the global `race_id`.
    race_id = extracted_race_df["race_id"].values[0]
    prize = int(extracted_race_df["rentan3"].values[0].replace(",", ""))
    predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values

    box = TopNBox(predicted_ranks, N)
    tickets = box.output_tickets("sanrentan")
    n, r = return_calculation.sanrentan(tickets, target_ranks, prize)
    return n, r, tickets

### Top-N-BoxのN=1~5までの結果をそれぞれ出力

In [None]:
# Top-1 box: only the single-horse bet types (first two ticket types) apply.
N = 1

print(f"Top-{N}-Box")
for ticket_type in ticket_types[:2]:
    hit_num = 0      # number of winning tickets
    ret = []         # payout per race
    ret_rates = []   # payout / stake per race
    ticket_num = 0   # total number of tickets bought
    total_pay = 0    # total stake
    print(ticket_type)
    for race_id in tqdm(test_race_ids):
        extracted_race_df = extract_race(top_n_test_df, race_id)
        target_ranks = extracted_race_df["label"].values.astype(int)
        # Pass the predictions of THIS race (previously a stale global was passed).
        predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values
        if ticket_type == "tansho":
            n, r, tickets = top_n_tansho_ret(extracted_race_df, predicted_ranks, target_ranks, N)
        else:  # "fukusho"
            # Fixed typo: the result used to be bound to `ticketsr`, leaving
            # `tickets` stale for every fukusho race.
            n, r, tickets = top_n_fukusho_ret(extracted_race_df, predicted_ranks, target_ranks, N)

        hit_num += n
        ticket_num += len(tickets)
        ret.append(r)
        # Stake of this race only. Using the cumulative ticket count here
        # inflated total_pay and skewed every return rate.
        pay = len(tickets) * PRICE_OF_BETTING_TICKET
        total_pay += pay
        if pay > 0:  # a race without tickets has no defined return rate
            ret_rates.append(r / pay)

    print(f"---{ticket_type} resulut---")
    print("total tickets:", ticket_num)
    print("total pay:", total_pay)
    print("total hit:", hit_num)
    print("total return:", np.sum(ret))
    print("hit rate:", hit_num/ticket_num)
    print("total return rate:", np.sum(ret)/total_pay)
    print("return rate avg:", np.mean(ret_rates))
    print("return rate std:", np.std(ret_rates, ddof=1))
    print()

In [None]:
# Top-2 box: bet types that need at most two horses (first five ticket types).
N = 2

# Ticket type -> evaluation function. A lookup fails loudly (KeyError) for an
# unexpected ticket type instead of silently reusing the previous race's values.
ret_funcs = {
    "tansho": top_n_tansho_ret,
    "fukusho": top_n_fukusho_ret,
    "wide": top_n_wide_ret,
    "umaren": top_n_umaren_ret,
    "umatan": top_n_umatan_ret,
}

print(f"Top-{N}-Box")
for ticket_type in ticket_types[:5]:
    hit_num = 0      # number of winning tickets
    ret = []         # payout per race
    ret_rates = []   # payout / stake per race
    ticket_num = 0   # total number of tickets bought
    total_pay = 0    # total stake
    print(ticket_type)
    for race_id in tqdm(test_race_ids):
        extracted_race_df = extract_race(top_n_test_df, race_id)
        target_ranks = extracted_race_df["label"].values.astype(int)
        # Pass the predictions of THIS race (previously a stale global was passed).
        predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values
        n, r, tickets = ret_funcs[ticket_type](extracted_race_df, predicted_ranks, target_ranks, N)

        hit_num += n
        ticket_num += len(tickets)
        ret.append(r)
        # Stake of this race only. Using the cumulative ticket count here
        # inflated total_pay and skewed every return rate.
        pay = len(tickets) * PRICE_OF_BETTING_TICKET
        total_pay += pay
        if pay > 0:  # a race without tickets has no defined return rate
            ret_rates.append(r / pay)

    print(f"---{ticket_type} resulut---")
    print("total tickets:", ticket_num)
    print("total pay:", total_pay)
    print("total hit:", hit_num)
    print("total return:", np.sum(ret))
    print("hit rate:", hit_num/ticket_num)
    print("total return rate:", np.sum(ret)/total_pay)
    print("return rate avg:", np.mean(ret_rates))
    print("return rate std:", np.std(ret_rates, ddof=1))
    print()

In [None]:
# Top-3 box: all ticket types are evaluated.
N = 3

# Ticket type -> evaluation function. A lookup fails loudly (KeyError) for an
# unexpected ticket type instead of silently reusing the previous race's values.
ret_funcs = {
    "tansho": top_n_tansho_ret,
    "fukusho": top_n_fukusho_ret,
    "wide": top_n_wide_ret,
    "umaren": top_n_umaren_ret,
    "umatan": top_n_umatan_ret,
    "sanrempuku": top_n_sanrempuku_ret,
    "sanrentan": top_n_sanrentan_ret,
}

print(f"Top-{N}-Box")
for ticket_type in ticket_types:
    hit_num = 0      # number of winning tickets
    ret = []         # payout per race
    ret_rates = []   # payout / stake per race
    ticket_num = 0   # total number of tickets bought
    total_pay = 0    # total stake
    print(ticket_type)
    for race_id in tqdm(test_race_ids):
        extracted_race_df = extract_race(top_n_test_df, race_id)
        target_ranks = extracted_race_df["label"].values.astype(int)
        # Pass the predictions of THIS race (previously a stale global was passed).
        predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values
        n, r, tickets = ret_funcs[ticket_type](extracted_race_df, predicted_ranks, target_ranks, N)

        hit_num += n
        ticket_num += len(tickets)
        ret.append(r)
        # Stake of this race only. Using the cumulative ticket count here
        # inflated total_pay and skewed every return rate.
        pay = len(tickets) * PRICE_OF_BETTING_TICKET
        total_pay += pay
        if pay > 0:  # a race without tickets has no defined return rate
            ret_rates.append(r / pay)

    print(f"---{ticket_type} resulut---")
    print("total tickets:", ticket_num)
    print("total pay:", total_pay)
    print("total hit:", hit_num)
    print("total return:", np.sum(ret))
    print("hit rate:", hit_num/ticket_num)
    print("total return rate:", np.sum(ret)/total_pay)
    print("return rate avg:", np.mean(ret_rates))
    print("return rate std:", np.std(ret_rates, ddof=1))
    print()

In [None]:
# Top-4 box: all ticket types are evaluated.
N = 4

# Ticket type -> evaluation function. A lookup fails loudly (KeyError) for an
# unexpected ticket type instead of silently reusing the previous race's values.
ret_funcs = {
    "tansho": top_n_tansho_ret,
    "fukusho": top_n_fukusho_ret,
    "wide": top_n_wide_ret,
    "umaren": top_n_umaren_ret,
    "umatan": top_n_umatan_ret,
    "sanrempuku": top_n_sanrempuku_ret,
    "sanrentan": top_n_sanrentan_ret,
}

print(f"Top-{N}-Box")
for ticket_type in ticket_types:
    hit_num = 0      # number of winning tickets
    ret = []         # payout per race
    ret_rates = []   # payout / stake per race
    ticket_num = 0   # total number of tickets bought
    total_pay = 0    # total stake
    print(ticket_type)
    for race_id in tqdm(test_race_ids):
        extracted_race_df = extract_race(top_n_test_df, race_id)
        target_ranks = extracted_race_df["label"].values.astype(int)
        # Pass the predictions of THIS race (previously a stale global was passed).
        predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values
        n, r, tickets = ret_funcs[ticket_type](extracted_race_df, predicted_ranks, target_ranks, N)

        hit_num += n
        ticket_num += len(tickets)
        ret.append(r)
        # Stake of this race only. Using the cumulative ticket count here
        # inflated total_pay and skewed every return rate.
        pay = len(tickets) * PRICE_OF_BETTING_TICKET
        total_pay += pay
        if pay > 0:  # a race without tickets has no defined return rate
            ret_rates.append(r / pay)

    print(f"---{ticket_type} resulut---")
    print("total tickets:", ticket_num)
    print("total pay:", total_pay)
    print("total hit:", hit_num)
    print("total return:", np.sum(ret))
    print("hit rate:", hit_num/ticket_num)
    print("total return rate:", np.sum(ret)/total_pay)
    print("return rate avg:", np.mean(ret_rates))
    print("return rate std:", np.std(ret_rates, ddof=1))
    print()

In [None]:
# Top-5 box: all ticket types are evaluated.
N = 5

# Ticket type -> evaluation function. A lookup fails loudly (KeyError) for an
# unexpected ticket type instead of silently reusing the previous race's values.
ret_funcs = {
    "tansho": top_n_tansho_ret,
    "fukusho": top_n_fukusho_ret,
    "wide": top_n_wide_ret,
    "umaren": top_n_umaren_ret,
    "umatan": top_n_umatan_ret,
    "sanrempuku": top_n_sanrempuku_ret,
    "sanrentan": top_n_sanrentan_ret,
}

print(f"Top-{N}-Box")
for ticket_type in ticket_types:
    hit_num = 0      # number of winning tickets
    ret = []         # payout per race
    ret_rates = []   # payout / stake per race
    ticket_num = 0   # total number of tickets bought
    total_pay = 0    # total stake
    print(ticket_type)
    for race_id in tqdm(test_race_ids):
        extracted_race_df = extract_race(top_n_test_df, race_id)
        target_ranks = extracted_race_df["label"].values.astype(int)
        # Pass the predictions of THIS race (previously a stale global was passed).
        predicted_ranks = extract_race(predicted_ranks_df, race_id)["rank"].values
        n, r, tickets = ret_funcs[ticket_type](extracted_race_df, predicted_ranks, target_ranks, N)

        hit_num += n
        ticket_num += len(tickets)
        ret.append(r)
        # Stake of this race only. Using the cumulative ticket count here
        # inflated total_pay and skewed every return rate.
        pay = len(tickets) * PRICE_OF_BETTING_TICKET
        total_pay += pay
        if pay > 0:  # a race without tickets has no defined return rate
            ret_rates.append(r / pay)

    print(f"---{ticket_type} resulut---")
    print("total tickets:", ticket_num)
    print("total pay:", total_pay)
    print("total hit:", hit_num)
    print("total return:", np.sum(ret))
    print("hit rate:", hit_num/ticket_num)
    print("total return rate:", np.sum(ret)/total_pay)
    print("return rate avg:", np.mean(ret_rates))
    print("return rate std:", np.std(ret_rates, ddof=1))
    print()