# Feature engineering with cleaned column names

In [None]:

# Binary target labels as described in Borowski & Chlebus (2021):
# target_win flags the runner that finished first, target_quinella flags
# runners that finished first or second. A missing finish position compares
# False and therefore yields 0 in both columns.
finish_pos = merged["finish_position"]
merged["target_win"] = finish_pos.eq(1).astype(int)
merged["target_quinella"] = finish_pos.le(2).astype(int)

# Race-wise z-score standardization: every candidate column that exists in
# `merged` is centred and scaled within its own race (rows sharing a race_id)
# and stored next to the raw column as "<col>_z".
# NOTE(review): some candidates (e.g. race_runners, race_distance,
# prize_total) look race-level; if they are constant within a race their
# z-score is all zeros — confirm they belong in this list.
z_candidates = [
    "weight_lb",
    "decimal_price",
    "age",
    "distance_behind",
    "over_weight",
    "out_handicap",
    "rpr_rating",
    "tr_rating",
    "or_rating",
    "race_runners",
    "margin",
    "race_distance",
    "winning_time",
    "prize_total",
]


def _race_z(x):
    """Return the z-scores of one race's values.

    Uses the population standard deviation (ddof=0). When the spread is zero
    or undefined (NaN), dividing would produce inf/NaN, so a zero-filled
    Series with the same index is returned instead.
    """
    std = x.std(ddof=0)
    if std == 0 or np.isnan(std):
        return pd.Series(np.zeros(len(x)), index=x.index)
    return (x - x.mean()) / std


# The helper is defined once above rather than re-created on every iteration.
for col in z_candidates:
    if col in merged.columns:
        merged[f"{col}_z"] = merged.groupby("race_id")[col].transform(_race_z)

# Rolling history features per horse (last 3 races).
# Rows are ordered chronologically within each horse so that, inside a
# horse_id group, "previous rows" means "earlier races". Every feature is
# built only from races strictly before the current row, and stays NaN until
# `window` prior races are available.
window = 3
merged = merged.sort_values(["horse_id", "race_date", "race_time", "race_id"])
merged["mean_finish_pos_3"] = merged.groupby("horse_id")["finish_position"].transform(
    lambda x: x.shift().rolling(window).mean()
)
# The indicator is averaged first and shifted afterwards. Comparing the
# shifted series directly (`x.shift() <= 3`) turns the NaN that shift()
# introduces into False, which made a horse's third race look like it had a
# third prior result outside the top 3.
merged["top3_ratio_3"] = merged.groupby("horse_id")["finish_position"].transform(
    lambda x: (x <= 3).astype(float).rolling(window).mean().shift()
)
merged["avg_decimal_price_3"] = merged.groupby("horse_id")["decimal_price"].transform(
    lambda x: x.shift().rolling(window).mean()
)
if "prize_total" in merged.columns:
    merged["total_prize_3"] = merged.groupby("horse_id")["prize_total"].transform(
        lambda x: x.shift().rolling(window).sum()
    )

history_cols = [
    col
    for col in ["mean_finish_pos_3", "top3_ratio_3", "avg_decimal_price_3", "total_prize_3"]
    if col in merged.columns
]
# Rows without a full window of prior races are filled with 0.
# NOTE(review): 0 is not a valid finish position or price, so it doubles as a
# "no history" marker here — confirm downstream models expect that.
for col in history_cols:
    merged[col] = merged[col].fillna(0)

# Assemble the feature table. Column order is: identifiers, standardized
# ("_z") columns, rolling-history columns, then the two targets. Identifier
# and "_z" columns are kept only when they actually exist in `merged`.
target_cols = ["target_win", "target_quinella"]
z_cols = [
    z_name
    for z_name in (f"{base}_z" for base in z_candidates)
    if z_name in merged.columns
]
identifier_cols = [
    col
    for col in (
        "race_id",
        "horse_id",
        "horse_name",
        "race_date",
        "course",
        "going",
        "race_distance",
        "trainer_name",
        "jockey_name",
    )
    if col in merged.columns
]

# .copy() detaches the feature table from `merged`.
feature_df = merged[identifier_cols + z_cols + history_cols + target_cols].copy()

# Write the full merged table and the feature table as CSV files under
# data/, creating the directory when it does not exist yet.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)
merged_path = output_dir.joinpath("merged_horse_race_2019.csv")
feature_path = output_dir.joinpath("feature_engineered_2019.csv")
merged.to_csv(merged_path, index=False)
feature_df.to_csv(feature_path, index=False)

# Short console summary of what was written.
print(f"Merged dataset shape: {merged.shape}")
print(f"Feature dataset shape: {feature_df.shape}")
print("Feature columns:", list(feature_df.columns), sep="\n")
if z_cols:
    print("Sample standardized features:", feature_df[z_cols].head(), sep="\n")


# Merge the race and horse DataFrames

In [None]:
import pandas as pd
import numpy as np

np.random.seed(42)

# Load data
# NOTE(review): these loaders are commented out, yet `races_df` is used in the
# merge below — confirm it is defined by an earlier cell.
#races_path = "races_2019.csv"
#horses_path = "horses_2019.csv"
#races_df = pd.read_csv(races_path)
#horses_df = pd.read_csv(horses_path)

# Inner-join the horse-level frame with the race-level frame on the "rid" key.
# NOTE(review): any other column present in both frames is renamed with the
# "_horse" / "_race" suffix, so un-suffixed names looked up later may no
# longer exist — verify against the real schemas.
df = pd.merge(merged, races_df, on="rid", how="inner", suffixes=("_horse", "_race"))

# Basic cleaning: force the numeric columns that are present to a numeric
# dtype; values that cannot be parsed become NaN.
numeric_cols = ["weight_carried", "odds", "draw", "age", "distance", "time", "prize"]
for col in numeric_cols:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Target variables: target_win flags first place, target_quinella flags a
# finish in the first two. A missing finish position yields 0 in both.
# NOTE(review): the KeyError: 'finish_position' recorded at the end of this
# notebook may originate here if the merge renamed or dropped that column —
# confirm the column name after merging.
finish_pos = df["finish_position"]
df["target_win"] = finish_pos.eq(1).astype(int)
df["target_quinella"] = finish_pos.le(2).astype(int)

# Per-race z-scores for numeric performance columns
def _race_z(x):
    """Return the z-scores of one race's values.

    Uses the population standard deviation (ddof=0), computed once per group.
    When the spread is zero or undefined (NaN) a zero-filled Series with the
    same index is returned. A plain truthiness test on the std would miss the
    NaN case because NaN is truthy.
    """
    std = x.std(ddof=0)
    if std == 0 or np.isnan(std):
        return pd.Series(np.zeros(len(x)), index=x.index)
    return (x - x.mean()) / std


# Only the numeric columns that exist in `df` are standardized; each result is
# stored next to its raw column as "<col>_z", grouped by the "rid" race key.
z_cols = [col for col in numeric_cols if col in df.columns]
for col in z_cols:
    df[f"{col}_z"] = df.groupby("rid")[col].transform(_race_z)

# Historical features per horse: last 3 races.
# Rows are ordered by date within each horse so that earlier rows of a
# horse_id group are earlier races. Every feature uses only races strictly
# before the current row and stays NaN until `window` prior races exist.
window = 3
df = df.sort_values(["horse_id", "date", "rid"])

df["mean_finish_pos_3"] = (
    df.groupby("horse_id")["finish_position"].transform(lambda x: x.shift().rolling(window).mean())
)
# The indicator is averaged first and shifted afterwards. Comparing the
# shifted series directly (`x.shift() <= 3`) turns the NaN that shift()
# introduces into False, which made a horse's third race look like it had a
# third prior result outside the top 3.
df["top3_ratio_3"] = (
    df.groupby("horse_id")["finish_position"]
    .transform(lambda x: (x <= 3).astype(float).rolling(window).mean().shift())
)
df["avg_odds_3"] = (
    df.groupby("horse_id")["odds"].transform(lambda x: x.shift().rolling(window).mean())
)
if "prize" in df.columns:
    df["total_prize_3"] = (
        df.groupby("horse_id")["prize"].transform(lambda x: x.shift().rolling(window).sum())
    )

# Discard rows that lack a value in any essential column. Essential columns
# that are not present in `df` are skipped rather than raising.
required_cols = ["finish_position", "odds"]
df = df.dropna(
    subset=[col for col in required_cols if col in df.columns],
)

# Select relevant columns for outputs. Identifier and history columns are
# kept only when they exist in `df`.
identifier_cols = [
    "race_id",
    "horse_id",
    "horse_name",
    "jockey",
    "trainer",
    "date",
    "track",
    "going",
]
identifier_cols = [col for col in identifier_cols if col in df.columns]
# Turn the raw column names collected earlier into their "_z" counterparts.
z_cols = [f"{col}_z" for col in z_cols]
history_cols = [col for col in ["mean_finish_pos_3", "top3_ratio_3", "avg_odds_3", "total_prize_3"] if col in df.columns]
target_cols = ["target_win", "target_quinella"]

# The merged output holds the union of the two source schemas, sorted by name.
# NOTE(review): `horses` and `races` are not defined in the visible cells (the
# merge above uses `merged` and `races_df`) — confirm these names exist.
merged_cols = sorted(set(horses.columns) | set(races.columns))
# Source columns that the merge renamed with a suffix, or that are otherwise
# absent from `df`, are skipped; selecting them would raise a KeyError.
merged_cols = [col for col in merged_cols if col in df.columns]
merged_df = df[merged_cols]
features_df = df[identifier_cols + z_cols + history_cols + target_cols]

# Save outputs as CSV files in the current working directory, without the
# DataFrame index.
for frame, filename in (
    (merged_df, "merged_horse_race_2019.csv"),
    (features_df, "feature_engineered_2019.csv"),
):
    frame.to_csv(filename, index=False)

# Summary output: shapes, feature column names, and a preview of the
# standardized columns.
print(f"Merged dataset shape: {merged_df.shape}")
print(f"Feature dataset shape: {features_df.shape}")
print("Feature columns:", list(features_df.columns), sep="\n")
print("Sample standardized features:", features_df[z_cols].head(), sep="\n")

KeyError: 'finish_position'