# preparation

In [1]:
import pandas as pd
import numpy as np

#モデル
import lightgbm as lgb

#クロスバリデーション
from sklearn.model_selection import KFold

#エヴァリュエーション
from sklearn.metrics import precision_score, recall_score, mean_squared_error

#可視化
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#保存
import pickle

In [2]:
# Identifier of the race to predict; it also selects which race-page CSV is read.
race_id = 202355092306
# Path is relative, so it is resolved against the notebook's working directory.
csv_file_path = f'../datasets/targetdata/rawdata/racepage/{race_id}.csv'
# Race-page data for the target race (one CSV per race_id).
n_targetrace_df = pd.read_csv(csv_file_path)

In [3]:
# Load the trained models.
# `models` is later iterated and passed to len(), and each element is used via
# .predict(...) and .best_iteration in the prediction cell.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# relative path — confirm where the pickle lives relative to the notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load model files that come from a trusted source.
with open('bestmodels/benchmark_optunaed.pkl', 'rb') as f:
    models = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'bestmodels/benchmark_optunaed.pkl'

In [None]:
# Load the pickled n_uma_race table (presumably one row per horse per race — TODO confirm).
n_uma_race_df = pd.read_pickle('../datasets/targetdata/n_uma_race.pkl')

In [None]:
# Preview the first three rows of the race-page data.
n_targetrace_df.head(3)

In [None]:
# Preview the first three rows of the n_uma_race data.
n_uma_race_df.head(3)

In [None]:
# Cast the merge-key columns to str in both frames so the keys share one dtype
# when the two frames are merged.
columns_to_convert = [
                    'keibajo_code',
                    'race_bango',
                    'kaisai_nen',
                    'kaisai_tsukihi',
                    'wakuban',
                    'umaban',
                    ]

for column in columns_to_convert:
    # Count missing values BEFORE the cast: astype(str) turns NaN into the
    # literal string 'nan', after which isna() would always report 0.
    missing_values = n_uma_race_df[column].isna().sum()
    print(f"n_uma_race_dfの{column} の欠損値の量: {missing_values}")
    missing_values = n_targetrace_df[column].isna().sum()
    print(f"n_targetrace_dfの{column} の欠損値の量: {missing_values}")

    # The cast itself is unchanged, so downstream cells see the same values.
    n_uma_race_df[column] = n_uma_race_df[column].astype(str)
    n_targetrace_df[column] = n_targetrace_df[column].astype(str)

In [None]:
# merged_df = pd.merge(n_uma_race_df, n_targetrace_df, on=['keibajo_code', 
#                                                         'race_bango', 
#                                                         'kaisai_nen', 
#                                                         'kaisai_tsukihi', 
#                                                         'wakuban', 
#                                                         'umaban'
#                                                        ])

In [None]:
# Merge n_uma_race_df with n_targetrace_df on the race/runner keys.
# pd.merge defaults to an inner join, so rows without a match in both frames
# are dropped. Overlapping non-key columns keep their name on the
# n_uma_race_df side and receive the '_from_target' suffix on the
# n_targetrace_df side.
keys = ['keibajo_code', 'race_bango', 'kaisai_nen', 'kaisai_tsukihi', 'wakuban', 'umaban']
merged_df = pd.merge(n_uma_race_df, n_targetrace_df, on=keys, suffixes=('', '_from_target'))

# Columns whose values are taken from n_targetrace_df
# (bataiju, zogen_fugo, zogen_sa, kyakushitsu_hantei).
# NOTE(review): the suffix is only added when n_uma_race_df also has a column
# of the same name; otherwise this selection raises KeyError — confirm both
# frames carry these four columns.
columns_from_target = ['bataiju_from_target', 'zogen_fugo_from_target', 'zogen_sa_from_target', 'kyakushitsu_hantei_from_target']

# Every other column is taken from n_uma_race_df.
columns_from_uma_race = [col for col in n_uma_race_df.columns if col not in ['bataiju', 'zogen_fugo', 'zogen_sa', 'kyakushitsu_hantei']]

# Select the columns and strip the suffix in one chained expression.
# rename() without inplace returns a new, independent DataFrame, so later cells
# can assign columns on final_df without pandas warning that they are writing
# to a slice of merged_df.
final_df = merged_df[columns_from_uma_race + columns_from_target].rename(columns={
    'bataiju_from_target': 'bataiju',
    'zogen_fugo_from_target': 'zogen_fugo',
    'zogen_sa_from_target': 'zogen_sa',
    'kyakushitsu_hantei_from_target': 'kyakushitsu_hantei'
})



In [None]:
# Display the merged frame.
final_df

In [None]:
# Turn each key column back into a number and report how many values are missing.
# errors='coerce' replaces anything that cannot be parsed with NaN.
for column in columns_to_convert:
    as_number = pd.to_numeric(final_df[column], errors='coerce')
    missing_values = as_number.isna().sum()
    final_df[column] = as_number
    print(f"{column} の欠損値の量: {missing_values}")

In [None]:
# Preview the first three rows after the numeric conversion.
final_df.head(3)

In [None]:
# Print column dtypes and non-null counts.
final_df.info()

In [None]:
# Alias, not a copy: both names refer to the same DataFrame object, so columns
# added to target_uma_race_df below also appear on final_df.
target_uma_race_df = final_df

### ターゲットデータで予測

In [None]:
# Build a per-race group key of the form "<kaisai_nen>-<kaisai_tsukihi>-<keibajo_code>-<race_bango>"
group_parts = [
    target_uma_race_df[part].astype(str)
    for part in ('kaisai_nen', 'kaisai_tsukihi', 'keibajo_code', 'race_bango')
]
target_uma_race_df['group'] = group_parts[0].str.cat(group_parts[1:], sep="-")

# Number of rows in each group
group_counts = target_uma_race_df['group'].value_counts()
print(group_counts)

In [None]:
# Feature columns and target column used for learning-to-rank.
# `features` is the exact column list handed to model.predict in the prediction
# cell. Entries that are commented out are excluded from the model input
# (presumably because they are names or only known after the race — TODO confirm).
features = [
            'kaisai_nen',
            'kaisai_tsukihi',
            'keibajo_code',
            # 'kaisai_kai',
            # 'kaisai_nichime',
            'race_bango',
            'wakuban',
            'umaban',
            'ketto_toroku_bango',
            # 'bamei',
            'umakigo_code',
            'seibetsu_code',
            'hinshu_code',
            'moshoku_code',
            'barei',
            'tozai_shozoku_code',
            'chokyoshi_code',
            'banushi_code',
            # 'banushimei',
            'futan_juryo',
            'blinker_shiyo_kubun',
            'kishu_code',
            # 'kishumei_ryakusho',
            'kishu_minarai_code',
            'bataiju',
            'zogen_fugo',
            'zogen_sa',
            'ijo_kubun_code',
            # 'nyusen_juni',
            ## 'kakutei_chakujun',
            # 'dochaku_kubun',
            # 'dochaku_tosu',
            # 'soha_time',
            # 'chakusa_code_1',
            # 'chakusa_code_2',
            # 'chakusa_code_3',
            # 'corner_1',
            # 'corner_2',
            # 'corner_3',
            # 'corner_4',
            # 'tansho_odds',
            # 'tansho_ninkijun',
            # 'kakutoku_honshokin',
            # 'kakutoku_fukashokin',
            # 'kohan_4f',
            # 'kohan_3f',
            # 'aiteuma_joho_1',
            # 'aiteuma_joho_2',
            # 'aiteuma_joho_3',
            # 'time_sa',
            # 'record_koshin_kubun',
            'kyakushitsu_hantei',
            ]


# Name of the label column; it is commented out of `features` above and is not
# referenced again in the visible cells of this notebook.
target = 'kakutei_chakujun'

In [None]:
# Score every row with each model, then average the scores across models.
feature_frame = target_uma_race_df[features]
per_model_scores = [
    model.predict(feature_frame, num_iteration=model.best_iteration)
    for model in models
]
target_uma_race_df['y_pred'] = sum(per_model_scores) / len(models)

# Rank the averaged score within each race group; ties share the lowest rank.
# NOTE(review): rank() is ascending, so the smallest y_pred gets rank 1 —
# confirm this matches how the models' scores should be read.
target_uma_race_df['predicted_rank'] = (
    target_uma_race_df
    .groupby('group')['y_pred']
    .rank(method='min')
)

In [None]:
# Sort by group, then by predicted_rank.
# sort_values returns a new DataFrame, so target_uma_race_df keeps its row order.
sorted_df = target_uma_race_df.sort_values(by=['group', 'predicted_rank'])

# # Change the pandas display option so that all rows are shown
# pd.set_option('display.max_rows', None)

# print(sorted_df[['group', 'y_pred', 'predicted_rank', 'umaban', 'bamei']])

In [None]:
# Define the mapping from racecourse code to racecourse name.
# Keys are integers, so keibajo_code must be numeric at this point (it is
# converted with pd.to_numeric in an earlier cell).
race_venue_mapping = {
    1: '札幌',
    2: '函館',
    3: '福島',
    4: '新潟',
    5: '東京',
    6: '中山',
    7: '中京',
    8: '京都',
    9: '阪神',
    10: '小倉',
    30: '門別',
    31: '北見',
    32: '岩見沢',
    33: '帯広',
    34: '旭川',
    35: '盛岡',
    36: '水沢',
    37: '上山',
    38: '三条',
    39: '足利',
    40: '宇都宮',
    41: '高崎',
    42: '浦和',
    43: '船橋',
    44: '大井',
    45: '川崎',
    46: '金沢',
    47: '笠松',
    48: '名古屋',
    49: '紀三井寺',
    50: '園田',
    51: '姫路',
    52: '益田',
    53: '福山',
    54: '高知',
    55: '佐賀',
    56: '荒尾',
    57: '中津',
    58: '札幌（地方競馬）',
    59: '函館（地方競馬）',
    60: '新潟（地方競馬）',
    61: '中京（地方競馬）',
    0: '未設定・未整備時'
}
# Add the racecourse name as a new column; codes that are not keys of the
# mapping become NaN.
sorted_df['keibajo'] = sorted_df['keibajo_code'].map(race_venue_mapping)

In [None]:
# Keep only the columns shown in the final result table, in display order.
display_columns = [
    'predicted_rank',
    'bamei',
    'kaisai_nen',
    'kaisai_tsukihi',
    'keibajo',
    'race_bango',
    'wakuban',
    'umaban',
    'zogen_sa',
    'y_pred',
]
sorted_df = sorted_df[display_columns]

In [None]:
# Display the final table, ordered by group and predicted rank.
sorted_df