In [216]:
# 基本ライブラリ
import sys
from pathlib import Path
import pandas as pd

# Relative location of the tabular data; joined onto a base directory by later cells
SAVE_DIR = Path("data") / "rawdf"

# Work relative to the directory the notebook kernel is running in
current_dir = Path.cwd()
# Make the directory two levels above the working directory importable
sys.path.append(
    str(current_dir.resolve().parent.parent)
)


In [222]:
# Project root: two levels above the notebook's working directory.
# It is derived directly rather than read back from sys.path[-1], because the
# last sys.path entry stops being the project root as soon as anything else
# appends to sys.path (hidden state that breaks on re-runs).
full_path = Path.cwd().resolve().parent.parent
print(full_path)
# The files carry a .csv extension but are read as tab-separated
race_results = pd.read_csv(full_path / SAVE_DIR / "preprocessed_race_results.csv", sep="\t")
horse_results = pd.read_csv(full_path / SAVE_DIR / "preprocessed_horse_results.csv", sep="\t")

C:\Users\kenni\horse_racing_predictions


In [None]:
# Display the horse results table read from preprocessed_horse_results.csv
horse_results

In [None]:
# horse_id values to look up in horse_results
horse_id_list = [
    2012100683,
    2022110151,
]

In [None]:
# Show only the rows whose horse_id is in horse_id_list
# (the @ prefix makes query() read the Python variable)
horse_results.query("horse_id in @horse_id_list")

In [None]:
# レース情報取得

from bs4 import BeautifulSoup
import re
# Constant definitions (directories are relative; they are joined onto full_path)
HTML_DIR = Path("data", "html")
SAVE_DIR = Path("data", "rawdf")
HTML_RACE_DIR = HTML_DIR / "race"
HTML_HORSE_DIR = HTML_DIR / "horse"
# glob() yields entries in no guaranteed order; sorting makes html_path_list[0]
# and the processing order identical on every run.
html_path_list = sorted((full_path / HTML_RACE_DIR).glob("*bin"))

In [None]:
# Read the first race page as raw bytes (the entries of html_path_list are Path objects)
html = html_path_list[0].read_bytes()

In [None]:
# Inspect the raw bytes read from the first race HTML file
html

In [None]:
# Parse the raw page with the lxml parser
soup = BeautifulSoup(html, features="lxml")

In [None]:
# First <div class="data_intro"> of the page (None when the page has no such div)
soup_info = soup.find(name="div", class_="data_intro")

In [None]:
# Inspect the data_intro block extracted from the page
soup_info

In [None]:
# Text of the first <h1> inside the data_intro block
soup_info.find("h1").get_text()

In [None]:
# Text of the first <p> inside the data_intro block, with every ASCII space removed
tmp = "".join(soup_info.find("p").text.split(" "))
tmp

In [None]:
# Extract every run of word characters from tmp
re.compile(r"\w+").findall(tmp)

In [None]:
# Same extraction, but keep ":" as part of each token as well
re.compile(r"[\w:]+").findall(tmp)


In [None]:
# Text of the fifth <p> element (index 4) found under soup
tmp_1 = soup.find_all("p")[4].get_text()
tmp_1

In [None]:
# Split tmp_1 into tokens made of word characters and ":"
re.compile(r"[\w:]+").findall(tmp_1)

In [None]:
# Gather the three pieces explored above into one dict:
#   title - text of the <h1> in the data_intro block
#   info1 - tokens of the first <p> in the data_intro block (spaces removed first)
#   info2 - tokens of the fifth <p> found under soup
info_dict = {
    "title": soup_info.find("h1").text,
    "info1": re.findall(r"[\w:]+", soup_info.find("p").text.replace(" ","")),
    "info2": re.findall(r"[\w:]+", soup.find_all("p")[4].text),
}
info_dict

In [None]:
# One-row preview: each key of info_dict becomes a column after the transpose.
# from_dict is a classmethod, so it is called on the class itself instead of on
# a throwaway empty DataFrame instance.
pd.DataFrame.from_dict(info_dict, orient="index").T

In [None]:
# tqdm.auto selects the notebook or console progress bar; tqdm_notebook is deprecated
from tqdm.auto import tqdm

RACE_INFO_CSV = "race_info.csv"
# Regular expression for a date written like 2023年1月5日
DATE_PATTERN = r'\d{4}年\d{1,2}月\d{1,2}日'

# race_id -> one-row DataFrame with the columns title / info1 / info2
dfs = {}

for html_path in tqdm(html_path_list):
    # race_id is the file name without its extension. It is resolved before any
    # parsing so the handlers below always report the file that actually failed
    # (never an undefined name or the id of the previous file).
    race_id = html_path.stem
    with open(html_path, "rb") as f:
        race_html = f.read()
    try:
        # Separate names are used here so the exploration cells' soup / html /
        # info_dict variables are not overwritten by this loop.
        data_intro = BeautifulSoup(race_html, "lxml").find("div", class_="data_intro")
        race_info = {}
        race_info["title"] = data_intro.find("h1").text
        p_list = data_intro.find_all("p")
        # Tokens of the first <p>, spaces removed before tokenising
        race_info["info1"] = re.findall(r"[\w:]+", p_list[0].text.replace(" ", ""))
        # First <p> whose text contains a date; None when there is none
        date_p = next((p for p in p_list if re.search(DATE_PATTERN, p.text)), None)
        if date_p is None:
            # Skip instead of silently tokenising an unrelated paragraph
            print(f"date paragraph not found at {race_id}")
            continue
        race_info["info2"] = re.findall(r"[\w:]+", date_p.text)
    except IndexError:
        # p_list[0] failed: the data_intro block holds no <p>
        print(f"table not found at {race_id}")
        continue
    except AttributeError as e:
        # data_intro or its <h1> is missing (find() returned None)
        print(f"{e} at {race_id}")
        continue
    race_df = pd.DataFrame.from_dict(race_info, orient="index").T
    race_df.index = [race_id] * len(race_df)
    dfs[race_id] = race_df

concat_df = pd.concat(dfs.values())
concat_df.index.name = "race_id"
concat_df.columns = concat_df.columns.str.replace(" ", "")
# Write below the project root: that is where the table is read back from
# (full_path / SAVE_DIR / "race_info.csv"), not the notebook's working directory.
race_info_dir = full_path / SAVE_DIR
race_info_dir.mkdir(exist_ok=True, parents=True)
concat_df.to_csv(race_info_dir / RACE_INFO_CSV, sep="\t")

In [None]:
# Load the tab-separated race info table from the data directory under full_path
race_infos = pd.read_csv(
    full_path.joinpath(SAVE_DIR, "race_info.csv"),
    sep="\t",
)

In [None]:
# Display the race info table read from race_info.csv
race_infos

In [None]:
# race_id: ID of the race
# Helper that wraps the regular-expression matching
def get_match(pattern, string, group_num=1):
    """Return one group of the first match of ``pattern`` in ``string``.

    Args:
        pattern: Regular expression handed to ``re.search``.
        string: Text to search.
        group_num: Group to return from the match (default: the first group).

    Returns:
        The text of the requested group, or ``None`` when ``pattern`` does not
        occur anywhere in ``string``.
    """
    found = re.search(pattern, string)
    if found is None:
        return None
    return found.group(group_num)


In [None]:
# Regular-expression patterns
RACE_TYPE_PATTERN = r"(芝|ダ|障)"
AROUND_PATTERN = r"(右|左)"
CORCE_LEN_PATTERN = r"(\d+)m"
GROUND_STATE_PATTERN = r"(芝|ダート|障):(.+)"
# Group 2 is the text between "<n>回" and "<n>日目". It is matched with \D+ and
# the day with \d+ so that a two-digit day does not leak its first digit into
# the captured text (e.g. "5回東京12日目" must give "東京", not "東京1").
PLACE_PATTERN = r"(\d+回(\D+)\d+日目)"

# Column names
COLUMN_RACE_ID = "race_id"
COLUMN_HORSE_ID = "horse_id"
COLUMN_JOCKEY_ID = "jockey_id"
COLUMN_TRAINER_ID = "trainer_id"
COLUMN_OWNER_ID = "owner_id"
COLUMN_RANK = "rank"
COLUMN_WAKUBAN = "wakuban"
COLUMN_UMABAN = "umaban"
COLUMN_SEX = "sex"
COLUMN_AGE = "age"
COLUMN_WEIGHT = "weight"
COLUMN_WEIGHT_DIFF = "weight_diff"
COLUMN_TANSYO = "tansyo"
COLUMN_POPULARITY = "popularity"
COLUMN_IMPOST = "impost"
COLUMN_DATE = "date"
COLUMN_WEATHER = "weather"
COLUMN_RACE_TYPE = "race_type"
COLUMN_COURSE_LEN = "course_len"
COLUMN_GROUND_STATE = "ground_state"
COLUMN_RANK_DIFF = "rank_diff"
COLUMN_PRIZE = "prize"
COLUMN_RACE_CLASS = "race_class"
COLUMN_AROUND = "around"
COLUMN_PLACE = "place"

import pandas as pd
import ast
import re

# Columns of the transformed table, in output order
columns = [
    COLUMN_RACE_ID,
    COLUMN_DATE,
    COLUMN_RACE_TYPE,
    COLUMN_AROUND,
    COLUMN_COURSE_LEN,
    COLUMN_WEATHER,
    COLUMN_GROUND_STATE,
    COLUMN_RACE_CLASS,
    COLUMN_PLACE,
]

# One record per race. The DataFrame is built once after the loop: growing it
# with pd.concat on every iteration copies all previous rows each time and
# concatenating onto an empty frame triggers a pandas FutureWarning.
records = []

for index, row in race_infos.iterrows():
    # info1 / info2 were written to the CSV as the text form of Python lists;
    # parse them back into lists of tokens.
    info1 = ast.literal_eval(row['info1'])
    info2 = ast.literal_eval(row['info2'])

    # First info2 token is the date, e.g. 2023年1月5日 -> 2023-1-5 -> 2023-01-05
    formatted_date = re.sub(r"(\d+)年(\d+)月(\d+)日", r"\1-\2-\3", info2[0])
    formatted_date = re.sub(r"-(\d)-", r"-0\1-", formatted_date)  # zero-pad the month
    formatted_date = re.sub(r"-(\d)$", r"-0\1", formatted_date)   # zero-pad the day

    records.append(
        {
            COLUMN_RACE_ID: row["race_id"],
            COLUMN_DATE: formatted_date,
            # Race type, direction and distance are all searched in the first info1 token
            COLUMN_RACE_TYPE: get_match(RACE_TYPE_PATTERN, info1[0]),
            COLUMN_AROUND: get_match(AROUND_PATTERN, info1[0]),
            COLUMN_COURSE_LEN: get_match(CORCE_LEN_PATTERN, info1[0]),
            # Drops the first three characters of the second token
            # (presumably a "天候:" label - TODO confirm against the raw data)
            COLUMN_WEATHER: info1[1][3:],
            COLUMN_GROUND_STATE: get_match(GROUND_STATE_PATTERN, info1[2], group_num=2),
            # Third info2 token is taken as the race class - TODO confirm
            COLUMN_RACE_CLASS: info2[2],
            COLUMN_PLACE: get_match(PLACE_PATTERN, info2[1], group_num=2),
        }
    )

# columns= fixes the column order and keeps the header even when there are no rows
result_df = pd.DataFrame(records, columns=columns)

# Save the result (tab-separated, without the index)
SAVE_DIR.mkdir(parents=True, exist_ok=True)
result_df.to_csv(SAVE_DIR / "race_info_transformed.csv", sep="\t", index=False)


In [266]:
# Load the mapping files
from mapping import MappingLoader
current_dir = Path.cwd()  # current working directory
# Build the directory once and use it for both the printout and the loader so
# the two cannot drift apart (the loader used to receive "mappng").
mapping_dir = current_dir / "mapping"
print(mapping_dir)
mapping_loader = MappingLoader(mapping_dir=mapping_dir)

# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'MappingLoader' object has no attribute 'get_around_mapping'".
# MappingLoader is defined outside this notebook, so the getter names used
# below must be checked against that class.
df = pd.read_csv(SAVE_DIR / "race_info_transformed.csv", sep="\t")
df[COLUMN_DATE] = pd.to_datetime(df[COLUMN_DATE])
# Replace each raw value with the value the corresponding mapping assigns to it
df[COLUMN_RACE_TYPE] = df[COLUMN_RACE_TYPE].map(mapping_loader.get_race_type_mapping())
df[COLUMN_AROUND] = df[COLUMN_AROUND].map(mapping_loader.get_around_mapping())
df[COLUMN_COURSE_LEN] = df[COLUMN_COURSE_LEN].map(mapping_loader.get_cource_len_mapping())
df[COLUMN_WEATHER] = df[COLUMN_WEATHER].map(mapping_loader.get_weather_mapping())
df[COLUMN_GROUND_STATE] = df[COLUMN_GROUND_STATE].map(mapping_loader.get_ground_state_mapping())
df[COLUMN_RACE_CLASS] = df[COLUMN_RACE_CLASS].map(mapping_loader.get_race_class_info_mapping())
df[COLUMN_PLACE] = df[COLUMN_PLACE].map(mapping_loader.get_place_mapping())
# Columns to keep, in output order
df = df[
    [
        COLUMN_RACE_ID,
        COLUMN_DATE,
        COLUMN_RACE_TYPE,
        COLUMN_AROUND,
        COLUMN_COURSE_LEN,
        COLUMN_WEATHER,
        COLUMN_GROUND_STATE,
        COLUMN_RACE_CLASS,
        COLUMN_PLACE
    ]
]
SAVE_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(SAVE_DIR / "race_info_transformed2.csv", sep="\t", index=False)


c:\Users\kenni\horse_racing_predictions\notebooks\preprocessing\mapping


AttributeError: 'MappingLoader' object has no attribute 'get_around_mapping'