# HORSE + RACE Feature Engineering Encyclopedia — V2 (Expanded)
Generated: 2025-11-10 23:44:10

This edition expands each field with **multiple** engineered features (10x+ where sensible).  Format: reasoning markdown, then a separate code cell writing to `merged`.  Safe parsing and within‑race standardization used where applicable.

In [None]:
import numpy as np, pandas as pd
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)
# Parse race_date once up front; values that cannot be parsed become NaT.
if "race_date" in merged.columns:
    _parsed_dates = pd.to_datetime(merged["race_date"], errors="coerce")
    merged["race_date"] = _parsed_dates

## HORSE Fields — Expanded

### finish_position: Finishing position, 40 if horse didn't finish

#### Normalized rank within race
0–1 rank comparable across field sizes.

In [None]:
# Percentile rank of finish_position inside each race, so the value is
# comparable between small and large fields.
_finish_by_race = merged.groupby("race_id")["finish_position"]
merged["finish_rank_pct"] = _finish_by_race.rank(pct=True)

#### Binary flags (finish/place/win)
Convenience multi-task targets.

In [None]:
fp = pd.to_numeric(merged["finish_position"], errors="coerce")
# Comparisons against NaN are False, so an unparseable position gives 0
# for every flag below.
merged["finished_flag"] = fp.lt(40).astype(int)
merged["placed_flag"] = fp.le(3).astype(int)
merged["won_flag"] = fp.eq(1).astype(int)

#### Top-k finishes
Flexible thresholds for evaluation.

In [None]:
# One 0/1 column per threshold: top2, top3, top4, top5.
_finish_pos = pd.to_numeric(merged["finish_position"], errors="coerce")
for k in (2, 3, 4, 5):
    merged[f"top{k}"] = _finish_pos.le(k).astype(int)

#### Within-race z-score
Standardize by race distribution.

In [None]:
def _z(s):
    """Z-score a Series; when its std is NaN or 0 return ``s * 0`` instead.

    ``s * 0`` keeps the index and leaves NaN entries as NaN.
    """
    sd = s.std()
    if pd.isna(sd) or sd == 0:
        return s * 0
    return (s - s.mean()) / sd


# NOTE(review): DNF rows carry the sentinel 40 (per the field description),
# which is included in each race's mean/std here — confirm that is intended.
merged["finish_z"] = merged.groupby("race_id")["finish_position"].transform(
    lambda x: _z(pd.to_numeric(x, errors="coerce"))
)

#### Expected rank from implied_prob
Market baseline vs realized finish.

In [None]:
if "implied_prob" in merged.columns:
    # Rank 1 goes to the highest implied_prob in the race; ties share the
    # average rank.
    _prob_by_race = merged.groupby("race_id")["implied_prob"]
    merged["prob_rank"] = _prob_by_race.rank(method="average", ascending=False)
    _finish_pos = pd.to_numeric(merged["finish_position"], errors="coerce")
    merged["finish_residual_vs_prob"] = _finish_pos - merged["prob_rank"]

#### Brier-style error vs prob
Per-horse calibration error.

In [None]:
if "implied_prob" in merged.columns:
    # Squared gap between the realised win flag and the implied probability.
    _win_gap = merged["won_flag"] - merged["implied_prob"]
    merged["brier_finish_win"] = _win_gap ** 2

#### Surprise magnitude
Absolute deviation from market rank.

In [None]:
# The guard looks for prob_rank, which is created in the same cell as
# finish_residual_vs_prob.
if "prob_rank" in merged.columns:
    merged["finish_surprise_mag"] = abs(merged["finish_residual_vs_prob"])

#### Longshot overperformance
Wins where implied_prob is small.

In [None]:
if "implied_prob" in merged.columns:
    # A longshot win is a winner whose implied probability was below 0.1.
    _is_winner = merged["won_flag"].eq(1)
    _is_longshot = merged["implied_prob"].lt(0.1)
    merged["win_longshot"] = (_is_winner & _is_longshot).astype(int)

#### DNF flag
Finish position ≥ 40 is treated as Did Not Finish (40 is the DNF sentinel).

In [None]:
# Any parsed position of 40 or more counts as a non-finisher; NaN gives 0.
_finish_pos = pd.to_numeric(merged["finish_position"], errors="coerce")
merged["dnf_flag"] = _finish_pos.ge(40).astype(int)

#### Rank delta vs draw
Alignment between post position and finish.

In [None]:
if "draw" in merged.columns:
    _draw_by_race = merged.groupby("race_id")["draw"]
    merged["draw_rank"] = _draw_by_race.rank()
    # "size" counts every row in the race, including rows with a missing draw.
    _rows_in_race = _draw_by_race.transform("size")
    _draw_rank_share = merged["draw_rank"] / _rows_in_race
    merged["finish_minus_draw_rank"] = merged["finish_rank_pct"] - _draw_rank_share

### positionL: How far behind next horse at finish (lengths)

#### Numeric mapping for text codes
Map 'hd','nk','shd' etc. to lengths.

In [None]:
# Text margins mapped to approximate lengths.
_map = {
    "hd": 0.2,
    "nk": 0.3,
    "shd": 0.1,
    "snk": 0.25,
}
raw = merged["positionL"].astype(str).str.lower()
num = pd.to_numeric(raw, errors="coerce")
mask = raw.isin(_map)
# Take the mapped value where the text is a known code, else the parsed number.
merged["positionL_num"] = raw.map(_map).where(mask, num)

#### Cumulative deficit
Approx sum of gaps to winner.

In [None]:
# Accumulate the gaps in finishing order within each race. A plain cumsum
# follows whatever order the rows happen to be in, which makes the "deficit
# to winner" depend on row order.
_gaps = pd.DataFrame({
    "race_id": merged["race_id"].to_numpy(),
    "fp": pd.to_numeric(merged["finish_position"], errors="coerce").to_numpy(),
    "gap": merged["positionL_num"].fillna(0).to_numpy(),
})
# _gaps has a fresh RangeIndex, so sort_index() restores the original row
# order after the per-race cumulative sum; rows with NaN finish sort last.
_gaps = _gaps.sort_values(["race_id", "fp"])
_cum_gap = _gaps.groupby("race_id")["gap"].cumsum().sort_index()
merged["cum_lengths_deficit"] = _cum_gap.to_numpy()

#### Quartile bins
Nonlinear bucketization.

In [None]:
q = merged["positionL_num"].quantile([0.25, 0.5, 0.75])
# pd.cut rejects repeated or NaN bin edges, which happens when many margins
# share a value or the column is empty. Drop NaN and duplicate cut points
# first; with three distinct quartiles this yields the original labels 0..3,
# otherwise fewer (coarser) bins.
_edges = [-np.inf, *pd.unique(q.dropna()), np.inf]
merged["positionL_bin"] = pd.cut(
    merged["positionL_num"],
    bins=_edges,
    labels=list(range(len(_edges) - 1)),
)

#### Z-score per race
Standardized lengths.

In [None]:
# Within-race z-score via the notebook's `_z` helper. The previous inline
# test `s.std() not in (0, np.nan)` never matched a NaN std (NaN != NaN), so
# single-runner races produced NaN instead of 0.
merged["positionL_z"] = merged.groupby("race_id")["positionL_num"].transform(_z)

### distance_behind: Distance behind winner (lengths)

#### Numeric + clipping
Cap outliers at 99th pct.

In [None]:
# Parse to numeric, then clip to [0, 99th percentile] to cap outliers.
_behind = pd.to_numeric(merged["distance_behind"], errors="coerce")
merged["distance_behind_num"] = _behind
q99 = merged["distance_behind_num"].quantile(0.99)
merged["distance_behind_num"] = _behind.clip(lower=0, upper=q99)

#### Within L thresholds
1L/2L/5L proximity flags.

In [None]:
# Proximity flags: within_1L, within_2L, within_5L (NaN distance gives 0).
for L in (1, 2, 5):
    merged[f"within_{L}L"] = merged["distance_behind_num"].le(L).astype(int)

#### Residual vs market speed
Compare to implied probability.

In [None]:
if "implied_prob" in merged.columns:
    # NOTE(review): the per-race lambda is element-wise (1 - p), and lengths
    # are subtracted from a probability complement — confirm the intended scale.
    _prob_complement = merged.groupby("race_id")["implied_prob"].transform(lambda s: 1 - s)
    merged["db_resid_prob"] = merged["distance_behind_num"] - _prob_complement

### weight_st / weight_lb / official_weight: Weight features

#### Unified kg
Build `weight_kg` from all sources.

In [None]:
merged["weight_st_num"] = pd.to_numeric(merged.get("weight_st", np.nan), errors="coerce")
merged["weight_lb_num"] = pd.to_numeric(merged.get("weight_lb", np.nan), errors="coerce")
# Priority order: official_weight, then stones converted to kg, then pounds
# converted to kg.
# NOTE(review): assumes official_weight is already in kg and that weight_st /
# weight_lb are each a complete weight, not the two halves of "st-lb" — confirm.
if "official_weight" in merged.columns:
    wkg = pd.to_numeric(merged["official_weight"], errors="coerce")
else:
    wkg = pd.Series(np.nan, index=merged.index)
_kg_from_st = merged["weight_st_num"] * 6.35029
_kg_from_lb = merged["weight_lb_num"] * 0.453592
wkg = wkg.fillna(_kg_from_st).fillna(_kg_from_lb)
merged["weight_kg"] = wkg

#### Z-weight per race
Standardize within race.

In [None]:
# Within-race z-score via the notebook's `_z` helper. The previous inline
# test `s.std() not in (0, np.nan)` never matched a NaN std (NaN != NaN), so
# single-runner races produced NaN instead of 0.
merged["z_weight"] = merged.groupby("race_id")["weight_kg"].transform(_z)

#### Weight×distance
Interaction captures load sensitivity over distance.

In [None]:
if "race_distance" in merged.columns:
    # Interaction term: parsed race_distance times weight_kg.
    _dist = pd.to_numeric(merged["race_distance"], errors="coerce")
    merged["weightXdistance"] = _dist * merged["weight_kg"]

#### Weight decile
Within-race percentile rank of `weight_kg` (0–1) to mitigate scale; despite the column name, values are not bucketed into deciles.

In [None]:
# Despite the column name this is a 0-1 percentile rank of weight_kg within
# the race, not a decile bucket.
_weight_by_race = merged.groupby("race_id")["weight_kg"]
merged["weight_decile"] = _weight_by_race.rank(pct=True)

### over_weight / out_handicap: Penalties

#### Numeric penalties + flags
Convert to numeric and binary indicators.

In [None]:
for col in ("over_weight", "out_handicap"):
    # A missing source column falls back to NaN, so the *_num column still
    # exists and the flag is 0 everywhere.
    _penalty = pd.to_numeric(merged.get(col, np.nan), errors="coerce")
    merged[f"{col}_num"] = _penalty
    merged[f"{col}_flag"] = merged[f"{col}_num"].fillna(0).gt(0).astype(int)

#### Penalty×distance
Heavier penalties more costly at long trips.

In [None]:
# Penalty times distance, written to <penalty>_num_X_dist for each penalty
# column that exists.
for col in ("over_weight_num", "out_handicap_num"):
    if col not in merged.columns or "race_distance" not in merged.columns:
        continue
    _dist = pd.to_numeric(merged["race_distance"], errors="coerce")
    merged[f"{col}_X_dist"] = merged[col] * _dist

### headgear: Equipment codes

#### Type flags
Blinkers/visor/hood/tongue‑tie flags.

In [None]:
hg = merged["headgear"].astype(str).str.upper().str.strip()
# Code letter -> flag suffix. Each flag is a substring match on the
# upper-cased headgear text.
# NOTE(review): both "C" and "P" map to cheekpiece-like names — confirm the
# code letters against the data provider's key.
_headgear_codes = {
    "B": "blinkers",
    "V": "visor",
    "C": "cheekpieces",
    "H": "hood",
    "T": "tongue_tie",
    "P": "pieces",
}
for k, v in _headgear_codes.items():
    _has_code = hg.str.contains(k, na=False)
    merged[f"headgear_{v}"] = _has_code.astype(int)

#### First‑time change
Detect gear change vs last start.

In [None]:
# Order each horse's runs chronologically so shift(1) is its previous start.
merged = merged.sort_values(["horse_id", "race_date"])
hg = merged["headgear"].fillna("")
_prev_hg = hg.groupby(merged["horse_id"]).shift(1)
# A horse's first recorded start has no previous run (shift gives NaN).
# Comparing "" != NaN used to flag every debut as a gear change; require a
# previous start to exist before calling it a change.
merged["headgear_changed"] = (_prev_hg.notna() & (hg != _prev_hg)).astype(int)

#### Headgear×prob
Market interaction.

In [None]:
if "implied_prob" in merged.columns:
    # Equals implied_prob on rows with a gear change, 0 elsewhere.
    merged["headgear_prob_interaction"] = merged["headgear_changed"].mul(merged["implied_prob"])

### rpr_rating: RP rating

#### Numeric + within‑race z & percentile
Standardize for comparability.

In [None]:
merged["rpr_rating_num"] = pd.to_numeric(merged.get("rpr_rating", np.nan), errors="coerce")
_rpr_by_race = merged.groupby("race_id")["rpr_rating_num"]
# Use the notebook's `_z` helper: the old inline `s.std() not in (0, np.nan)`
# never matched a NaN std, so single-runner races came out as NaN.
merged["rpr_rating_z"] = _rpr_by_race.transform(_z)
# Percentile is taken within the race, as the section heading states; it was
# previously ranked across the whole dataset.
merged["rpr_rating_pct"] = _rpr_by_race.rank(pct=True)

#### Momentum (diff from prev)
Rating delta for the same horse vs last run.

In [None]:
# Chronological order per horse, then current rating minus the previous run's.
merged = merged.sort_values(by=["horse_id", "race_date"])
_prev_rpr = merged.groupby("horse_id")["rpr_rating_num"].shift(1)
merged["rpr_rating_delta_prev"] = merged["rpr_rating_num"] - _prev_rpr

#### Rating×distance
Interaction for stamina/speed profiles.

In [None]:
if "race_distance" in merged.columns:
    # Interaction term: rating times parsed race_distance.
    _dist = pd.to_numeric(merged["race_distance"], errors="coerce")
    merged["rpr_rating_X_dist"] = merged["rpr_rating_num"] * _dist

#### Residual vs market z
Rating z minus implied‑prob z per race.

In [None]:
if "implied_prob" in merged.columns:
    # Within-race z-score of implied_prob via the notebook's `_z` helper; the
    # old inline `s.std() not in (0, np.nan)` never matched a NaN std, so
    # single-runner races came out as NaN.
    prob_z = merged.groupby("race_id")["implied_prob"].transform(_z)
    merged["rpr_rating_resid_probz"] = merged["rpr_rating_z"] - prob_z

### tr_rating: Topspeed rating

#### Numeric + within‑race z & percentile
Standardize for comparability.

In [None]:
merged["tr_rating_num"] = pd.to_numeric(merged.get("tr_rating", np.nan), errors="coerce")
_tr_by_race = merged.groupby("race_id")["tr_rating_num"]
# Use the notebook's `_z` helper: the old inline `s.std() not in (0, np.nan)`
# never matched a NaN std, so single-runner races came out as NaN.
merged["tr_rating_z"] = _tr_by_race.transform(_z)
# Percentile is taken within the race, as the section heading states; it was
# previously ranked across the whole dataset.
merged["tr_rating_pct"] = _tr_by_race.rank(pct=True)

#### Momentum (diff from prev)
Rating delta for the same horse vs last run.

In [None]:
# Chronological order per horse, then current rating minus the previous run's.
merged = merged.sort_values(by=["horse_id", "race_date"])
_prev_tr = merged.groupby("horse_id")["tr_rating_num"].shift(1)
merged["tr_rating_delta_prev"] = merged["tr_rating_num"] - _prev_tr

#### Rating×distance
Interaction for stamina/speed profiles.

In [None]:
if "race_distance" in merged.columns:
    # Interaction term: rating times parsed race_distance.
    _dist = pd.to_numeric(merged["race_distance"], errors="coerce")
    merged["tr_rating_X_dist"] = merged["tr_rating_num"] * _dist

#### Residual vs market z
Rating z minus implied‑prob z per race.

In [None]:
if "implied_prob" in merged.columns:
    # Within-race z-score of implied_prob via the notebook's `_z` helper; the
    # old inline `s.std() not in (0, np.nan)` never matched a NaN std, so
    # single-runner races came out as NaN.
    prob_z = merged.groupby("race_id")["implied_prob"].transform(_z)
    merged["tr_rating_resid_probz"] = merged["tr_rating_z"] - prob_z

### or_rating: Official rating

#### Numeric + within‑race z & percentile
Standardize for comparability.

In [None]:
merged["or_rating_num"] = pd.to_numeric(merged.get("or_rating", np.nan), errors="coerce")
_or_by_race = merged.groupby("race_id")["or_rating_num"]
# Use the notebook's `_z` helper: the old inline `s.std() not in (0, np.nan)`
# never matched a NaN std, so single-runner races came out as NaN.
merged["or_rating_z"] = _or_by_race.transform(_z)
# Percentile is taken within the race, as the section heading states; it was
# previously ranked across the whole dataset.
merged["or_rating_pct"] = _or_by_race.rank(pct=True)

#### Momentum (diff from prev)
Rating delta for the same horse vs last run.

In [None]:
# Chronological order per horse, then current rating minus the previous run's.
merged = merged.sort_values(by=["horse_id", "race_date"])
_prev_or = merged.groupby("horse_id")["or_rating_num"].shift(1)
merged["or_rating_delta_prev"] = merged["or_rating_num"] - _prev_or

#### Rating×distance
Interaction for stamina/speed profiles.

In [None]:
if "race_distance" in merged.columns:
    # Interaction term: rating times parsed race_distance.
    _dist = pd.to_numeric(merged["race_distance"], errors="coerce")
    merged["or_rating_X_dist"] = merged["or_rating_num"] * _dist

#### Residual vs market z
Rating z minus implied‑prob z per race.

In [None]:
if "implied_prob" in merged.columns:
    # Within-race z-score of implied_prob via the notebook's `_z` helper; the
    # old inline `s.std() not in (0, np.nan)` never matched a NaN std, so
    # single-runner races came out as NaN.
    prob_z = merged.groupby("race_id")["implied_prob"].transform(_z)
    merged["or_rating_resid_probz"] = merged["or_rating_z"] - prob_z

### father: Sire name

#### Shrunk win rate
Target‑encoding with Bayesian shrinkage for sparse pedigrees.

In [None]:
# NOTE(review): each row's own outcome is part of its sire's win rate, so this
# encoding leaks the target — consider out-of-fold or leave-one-out encoding.
merged["won"] = pd.to_numeric(merged["finish_position"], errors="coerce").eq(1).astype(int)
agg = merged.groupby("father")["won"].agg(m="mean", n="count")
global_m = merged["won"].mean()
prior = 50
# Shrink each sire's rate towards the global rate with `prior` pseudo-runs.
_wins_plus_prior = agg["m"] * agg["n"] + global_m * prior
agg["shrunk"] = _wins_plus_prior / (agg["n"] + prior)
merged["father_win_shrunk"] = merged["father"].map(agg["shrunk"])

#### Surface pref
Win rate by surface_condition pivoted per pedigree.

In [None]:
if "surface_condition" in merged.columns:
    # One column per going value, holding the sire's win rate on that going;
    # sire/going pairs with no runs are filled with 0.
    _rate = merged.groupby(["father", "surface_condition"])["won"].mean()
    pv = _rate.unstack(fill_value=0).add_prefix("father_cond_")
    merged = merged.merge(pv, how="left", on="father")

#### Distance group pref
Sprint/mid/staying preferences.

In [None]:
if "metric" in merged.columns:
    d = pd.to_numeric(merged["metric"], errors="coerce")
    _edges = [0, 1400, 2000, 4000, 1e9]
    _names = ["sprint", "mid", "staying", "extreme"]
    merged["dist_group"] = pd.cut(d, _edges, labels=_names)
    # Sire win rate per distance group, pivoted to one column per group.
    _rate = merged.groupby(["father", "dist_group"])["won"].mean()
    pv = _rate.unstack(fill_value=0).add_prefix("father_dist_")
    merged = merged.merge(pv, how="left", on="father")

#### Pedigree popularity
Count of runners carrying this pedigree (log scaled).

In [None]:
# log(1 + number of rows carrying this sire); an unmapped sire counts as 0.
cnt = merged["father"].value_counts()
_n_rows = merged["father"].map(cnt).fillna(0)
merged["father_log_count"] = np.log1p(_n_rows)

### mother: Dam name

#### Shrunk win rate
Target‑encoding with Bayesian shrinkage for sparse pedigrees.

In [None]:
# NOTE(review): each row's own outcome is part of its dam's win rate, so this
# encoding leaks the target — consider out-of-fold or leave-one-out encoding.
merged["won"] = pd.to_numeric(merged["finish_position"], errors="coerce").eq(1).astype(int)
agg = merged.groupby("mother")["won"].agg(m="mean", n="count")
global_m = merged["won"].mean()
prior = 50
# Shrink each dam's rate towards the global rate with `prior` pseudo-runs.
_wins_plus_prior = agg["m"] * agg["n"] + global_m * prior
agg["shrunk"] = _wins_plus_prior / (agg["n"] + prior)
merged["mother_win_shrunk"] = merged["mother"].map(agg["shrunk"])

#### Surface pref
Win rate by surface_condition pivoted per pedigree.

In [None]:
if "surface_condition" in merged.columns:
    # One column per going value, holding the dam's win rate on that going;
    # dam/going pairs with no runs are filled with 0.
    _rate = merged.groupby(["mother", "surface_condition"])["won"].mean()
    pv = _rate.unstack(fill_value=0).add_prefix("mother_cond_")
    merged = merged.merge(pv, how="left", on="mother")

#### Distance group pref
Sprint/mid/staying preferences.

In [None]:
if "metric" in merged.columns:
    d = pd.to_numeric(merged["metric"], errors="coerce")
    _edges = [0, 1400, 2000, 4000, 1e9]
    _names = ["sprint", "mid", "staying", "extreme"]
    merged["dist_group"] = pd.cut(d, _edges, labels=_names)
    # Dam win rate per distance group, pivoted to one column per group.
    _rate = merged.groupby(["mother", "dist_group"])["won"].mean()
    pv = _rate.unstack(fill_value=0).add_prefix("mother_dist_")
    merged = merged.merge(pv, how="left", on="mother")

#### Pedigree popularity
Count of runners carrying this pedigree (log scaled).

In [None]:
# log(1 + number of rows carrying this dam); an unmapped dam counts as 0.
cnt = merged["mother"].value_counts()
_n_rows = merged["mother"].map(cnt).fillna(0)
merged["mother_log_count"] = np.log1p(_n_rows)

### gfather: Grandsire name

#### Shrunk win rate
Target‑encoding with Bayesian shrinkage for sparse pedigrees.

In [None]:
# NOTE(review): each row's own outcome is part of its grandsire's win rate, so
# this encoding leaks the target — consider out-of-fold or leave-one-out encoding.
merged["won"] = pd.to_numeric(merged["finish_position"], errors="coerce").eq(1).astype(int)
agg = merged.groupby("gfather")["won"].agg(m="mean", n="count")
global_m = merged["won"].mean()
prior = 50
# Shrink each grandsire's rate towards the global rate with `prior` pseudo-runs.
_wins_plus_prior = agg["m"] * agg["n"] + global_m * prior
agg["shrunk"] = _wins_plus_prior / (agg["n"] + prior)
merged["gfather_win_shrunk"] = merged["gfather"].map(agg["shrunk"])

#### Surface pref
Win rate by surface_condition pivoted per pedigree.

In [None]:
if "surface_condition" in merged.columns:
    # One column per going value, holding the grandsire's win rate on that
    # going; grandsire/going pairs with no runs are filled with 0.
    _rate = merged.groupby(["gfather", "surface_condition"])["won"].mean()
    pv = _rate.unstack(fill_value=0).add_prefix("gfather_cond_")
    merged = merged.merge(pv, how="left", on="gfather")

#### Distance group pref
Sprint/mid/staying preferences.

In [None]:
if "metric" in merged.columns:
    d = pd.to_numeric(merged["metric"], errors="coerce")
    _edges = [0, 1400, 2000, 4000, 1e9]
    _names = ["sprint", "mid", "staying", "extreme"]
    merged["dist_group"] = pd.cut(d, _edges, labels=_names)
    # Grandsire win rate per distance group, pivoted to one column per group.
    _rate = merged.groupby(["gfather", "dist_group"])["won"].mean()
    pv = _rate.unstack(fill_value=0).add_prefix("gfather_dist_")
    merged = merged.merge(pv, how="left", on="gfather")

#### Pedigree popularity
Count of runners carrying this pedigree (log scaled).

In [None]:
# log(1 + number of rows carrying this grandsire); unmapped counts as 0.
cnt = merged["gfather"].value_counts()
_n_rows = merged["gfather"].map(cnt).fillna(0)
merged["gfather_log_count"] = np.log1p(_n_rows)

### race_runners: Field size

#### Numeric + small/large flags
Size categories and interactions.

In [None]:
merged["race_runners_num"] = pd.to_numeric(merged["race_runners"], errors="coerce")
# Small field: 6 or fewer runners; large field: 12 or more. NaN gives 0 for both.
merged["field_small"] = merged["race_runners_num"].le(6).astype(int)
merged["field_large"] = merged["race_runners_num"].ge(12).astype(int)

#### HHI of implied probabilities
Dominance/concentration per race.

In [None]:
if "implied_prob" in merged.columns:
    # Herfindahl-style concentration: sum of squared implied probabilities
    # per race, broadcast to every runner.
    _prob_sq = merged["implied_prob"] ** 2
    merged["race_hhi_prob"] = _prob_sq.groupby(merged["race_id"]).transform("sum")

#### Entropy of market
Uncertainty across entrants.

In [None]:
if "implied_prob" in merged.columns:
    def ent(s):
        """Shannon entropy (natural log) of a race's implied probabilities.

        Probabilities are renormalised to sum to 1 when the sum is positive;
        zero entries are dropped from the sum by turning them into NaN.
        """
        total = s.sum()
        if total > 0:
            p = s / total
        else:
            p = s
        p = p.replace(0, np.nan)
        return -(p * np.log(p)).sum()

    _prob_by_race = merged.groupby("race_id")["implied_prob"]
    merged["race_prob_entropy"] = _prob_by_race.transform(ent)

### margin: Sum of decimalPrices (implied prob) per race

#### Overround
Sum of implied probabilities; >1 means bookmaker margin.

In [None]:
if "implied_prob" in merged.columns:
    # Sum of implied probabilities in the race, broadcast to every runner.
    _prob_by_race = merged.groupby("race_id")["implied_prob"]
    merged["race_overround"] = _prob_by_race.transform("sum")

#### Normalized implied probability
Scale to sum=1 within race.

In [None]:
if "implied_prob" in merged.columns:
    # Divide by the race total so probabilities sum to 1 within each race.
    rr = merged.groupby("race_id")["implied_prob"].transform("sum")
    merged["implied_prob_norm"] = merged["implied_prob"].div(rr)

#### Fav-longshot spread
Difference between top and median probability.

In [None]:
if "implied_prob" in merged.columns:
    # Gap between the race's highest and median implied probability.
    _prob_by_race = merged.groupby("race_id")["implied_prob"]
    top = _prob_by_race.transform("max")
    med = _prob_by_race.transform("median")
    merged["fav_spread"] = top.sub(med)

### result_win / result_place: Outcome flags

#### Binary normalization
Ensure 0/1 ints.

In [None]:
for col in ("result_win", "result_place"):
    if col not in merged.columns:
        continue
    # Unparseable or missing outcomes are treated as 0.
    _as_number = pd.to_numeric(merged[col], errors="coerce")
    merged[col] = _as_number.fillna(0).astype(int)

#### Calib error vs prob
Per-horse calibration residual (`result_win` − `implied_prob`).

In [None]:
if {"implied_prob", "result_win"}.issubset(merged.columns):
    # Per-row residual: realised win (0/1) minus implied probability.
    merged["prob_error"] = merged["result_win"].sub(merged["implied_prob"])

## RACE Fields — Expanded

### race_id: Unique race identifier

#### Race size
Entrants per race.

In [None]:
# Number of rows in each race, broadcast to every runner.
_horses_by_race = merged.groupby("race_id")["horse_id"]
merged["race_size"] = _horses_by_race.transform("size")

#### Num starters vs scratches
Requires a 'scratched' flag if available.

In [None]:
if "scratched" in merged.columns:
    def _count_not_scratched(s):
        """Number of entries in the race whose scratched value equals 0."""
        return s.eq(0).sum()

    merged["num_starters"] = merged.groupby("race_id")["scratched"].transform(_count_not_scratched)

### course: Course (country in brackets; AW=All Weather)

#### Clean name + country code
Split course and country.

In [None]:
course = merged["course"].astype(str).str.strip()
# The country is whatever sits inside the first "(...)" of the course text.
_country = course.str.extract(r"\(([^)]+)\)", expand=False)
merged["course_country"] = _country.str.upper()
# The clean name is the course text with every "(...)" group removed.
_without_brackets = course.str.replace(r"\s*\([^\)]*\)", "", regex=True)
merged["course_name_clean"] = _without_brackets.str.strip()

#### Top‑k course dummies
Avoid dimensional explosion.

In [None]:
# One 0/1 column for each of the 20 most frequent courses.
top = merged["course_name_clean"].value_counts().head(20).index
for c in top:
    # Column suffix: first 25 characters, spaces to underscores, lower-cased.
    _slug = c[:25].replace(" ", "_").lower()
    merged[f"course_is_{_slug}"] = merged["course_name_clean"].eq(c).astype(int)

#### Home/away proxy
If horse has 'home_course' in history.

In [None]:
if "home_course" in merged.columns:
    # 1 when the cleaned course name matches the row's home_course exactly.
    _at_home = merged["course_name_clean"] == merged["home_course"]
    merged["is_home_course"] = _at_home.astype(int)

### race_time: hh:mm local (London TZ)

#### Hour/minute/session
Temporal buckets.

In [None]:
# Only strict "H:MM" / "HH:MM" strings match; anything else gives NaN.
t = merged["race_time"].astype(str).str.extract(r"^(\d{1,2}):(\d{2})$")
merged["race_hour"] = pd.to_numeric(t[0], errors="coerce")
merged["race_minute"] = pd.to_numeric(t[1], errors="coerce")
# Evening: hour 17 to 21 inclusive.
merged["race_evening"] = merged["race_hour"].between(17, 21).astype(int)

#### Peak‑hour flag
Crowded cards may shift tactics.

In [None]:
# Peak hours: 14 to 17 inclusive; a missing hour gives 0.
_in_peak = (merged["race_hour"] >= 14) & (merged["race_hour"] <= 17)
merged["race_peakhour"] = _in_peak.astype(int)

### race_date: Date

#### Calendar features
Year/month/week/weekday.

In [None]:
merged["race_year"] = merged["race_date"].dt.year
merged["race_month"] = merged["race_date"].dt.month
# isocalendar().week is a nullable UInt32; casting it to plain int32 raises
# when race_date contains NaT (possible after errors="coerce" parsing).
# Keep int32 when every date is valid, otherwise fall back to float with NaN,
# matching what .dt.year / .dt.month produce for NaT rows.
_iso_week = merged["race_date"].dt.isocalendar().week
if _iso_week.notna().all():
    merged["race_week"] = _iso_week.astype("int32")
else:
    merged["race_week"] = _iso_week.astype("float64")
merged["race_dow"] = merged["race_date"].dt.dayofweek

#### Meet cycle index
Progress through the meet (0..1).

In [None]:
# Position of each race date between the first and last date seen for its
# course (0 = first date, 1 = last date).
grp = merged.groupby("course_name_clean")["race_date"]
min_d = grp.transform("min")
max_d = grp.transform("max")
# A course with a single date has a zero span; use 1 to avoid dividing by 0.
den = (max_d - min_d).dt.days.replace(0, 1)
_days_elapsed = (merged["race_date"] - min_d).dt.days
merged["course_meet_progress"] = _days_elapsed / den

### title: Race title

#### Keyword flags
Extract class/type clues.

In [None]:
txt = merged["title"].astype(str).str.lower()
# Substring match on the lower-cased title, one 0/1 column per keyword.
_keywords = ("maiden", "handicap", "listed", "group", "stakes", "novice", "claiming", "derby", "cup")
for kw in _keywords:
    _has_kw = txt.str.contains(kw, na=False)
    merged[f"title_kw_{kw}"] = _has_kw.astype(int)
merged["title_len"] = txt.str.len()

### rclass / race_class: Class

#### Ordinal mapping
Lower ordinal = higher class.

In [None]:
rc = merged["rclass"].astype(str).str.upper().str.strip()
# Lower ordinal = higher class; labels not listed here map to NaN.
map_order = {
    "GROUP 1": 1, "G1": 1,
    "GROUP 2": 2, "G2": 2,
    "GROUP 3": 3, "G3": 3,
    "LISTED": 4, "L": 4,
    "CLASS 1": 5,
    "CLASS 2": 6,
    "CLASS 3": 7,
    "CLASS 4": 8,
    "CLASS 5": 9,
    "CLASS 6": 10,
}
merged["rclass_ord"] = rc.map(map_order)

#### Disagreement flag
Raw vs derived inconsistency.

In [None]:
if "race_class" in merged.columns:
    _raw_class = pd.to_numeric(merged["race_class"], errors="coerce")
    # NaN != NaN is True, so rows with a missing value on either side used to
    # be flagged as disagreements. Only compare rows where both are present.
    # NOTE(review): rclass_ord runs 1..10 (Group 1 .. Class 6); confirm that
    # race_class uses the same scale, otherwise most rows will disagree.
    _both_known = _raw_class.notna() & merged["rclass_ord"].notna()
    _differs = _raw_class != merged["rclass_ord"]
    merged["race_class_disagree"] = (_both_known & _differs).astype(int)

### band: Rating band

#### Min/max extraction
From like '0-85'.

In [None]:
b = merged["band"].astype(str)
# First "<digits>-<digits>" pair in the text (hyphen or en dash); no match
# gives NaN for both columns.
rng = b.str.extract(r"(\d+)\s*[-–]\s*(\d+)", expand=True)
_low_txt, _high_txt = rng[0], rng[1]
merged["band_min"] = pd.to_numeric(_low_txt, errors="coerce")
merged["band_max"] = pd.to_numeric(_high_txt, errors="coerce")

### ages: Age restrictions

#### Parse minimum and only flag
Support '3yo+', '2yo only'.

In [None]:
ages = merged["ages"].astype(str).str.lower()
# The first number written directly before "yo" is taken as the minimum age.
_age_txt = ages.str.extract(r"(\d+)\s*yo", expand=False)
merged["age_min_allowed"] = pd.to_numeric(_age_txt, errors="coerce")
_mentions_only = ages.str.contains("only", na=False)
merged["age_only_flag"] = _mentions_only.astype(int)

#### Compliance
Horse meets rules?

In [None]:
if "age" in merged.columns and "age_min_allowed" in merged.columns:
    _age = pd.to_numeric(merged["age"], errors="coerce")
    _min_age = merged["age_min_allowed"]
    # "Nyo only" races require the age to equal N; the previous `>=` test also
    # accepted older horses. Without an age_only_flag column every race is
    # treated as "N or older".
    if "age_only_flag" in merged.columns:
        _only = merged["age_only_flag"].astype(bool)
    else:
        _only = pd.Series(False, index=merged.index)
    _meets_rule = (_age == _min_age).where(_only, _age >= _min_age)
    merged["age_within_rules"] = _meets_rule.astype(int)

### race_distance / metric: Distance

#### Meters canonical + log
Use metric when available.

In [None]:
# Column-level choice: use `metric` if that column exists, else `race_distance`.
# NOTE(review): assumes both columns are already in metres — confirm units.
if "metric" in merged.columns:
    _distance_source = merged["metric"]
else:
    _distance_source = merged.get("race_distance")
merged["race_distance_m"] = pd.to_numeric(_distance_source, errors="coerce")
merged["log_distance"] = np.log1p(merged["race_distance_m"])

#### Distance bins
Sprint/mid/staying/extreme.

In [None]:
# Right-closed bins: (0, 1400], (1400, 2000], (2000, 4000], (4000, 1e9].
_edges = [0, 1400, 2000, 4000, 1e9]
_names = ["sprint", "mid", "staying", "extreme"]
merged["distance_bin"] = pd.cut(merged["race_distance_m"], _edges, labels=_names)

#### Distance×field
Interactions for congestion effects.

In [None]:
if "race_runners_num" in merged.columns:
    # Interaction term: distance times field size.
    merged["distXfield"] = merged["race_distance_m"].mul(merged["race_runners_num"])

### surface_condition: Going

#### Ordinal mapping
Firm→Heavy.

In [None]:
cond = merged["surface_condition"].astype(str).str.lower().str.strip()
# Firm (1) to heavy (6); "aw" shares the value of "good". Labels not listed
# here map to NaN.
order = {
    "firm": 1,
    "good to firm": 2,
    "good": 3,
    "good to soft": 4,
    "soft": 5,
    "heavy": 6,
    "aw": 3,
}
merged["going_ord"] = cond.map(order)

#### Going×distance
Interaction to capture stamina on soft ground.

In [None]:
if {"going_ord", "race_distance_m"}.issubset(merged.columns):
    # Interaction term: going ordinal times distance.
    merged["goingXdistance"] = merged["going_ord"].mul(merged["race_distance_m"])

### hurdles: Obstacles

#### Count + type flags
Extract count and hurdle/chase identifiers.

In [None]:
hud = merged["hurdles"].astype(str).str.lower()
# The count is the first run of digits in the text; NaN if there is none.
_count_txt = hud.str.extract(r"(\d+)", expand=False)
merged["hurdle_count"] = pd.to_numeric(_count_txt, errors="coerce")
for kw in ("hurdle", "fence", "chase", "steeple"):
    _has_kw = hud.str.contains(kw, na=False)
    merged[f"hurdles_kw_{kw}"] = _has_kw.astype(int)

### prize_total / prize_breakdown: Purse

#### Log prize + per runner
Stabilize and scale by field size.

In [None]:
_prize = pd.to_numeric(merged.get("prize_total", np.nan), errors="coerce")
merged["prize_total_num"] = _prize
merged["log_prize_total"] = np.log1p(merged["prize_total_num"])
if "race_runners_num" in merged.columns:
    # Purse divided by the parsed field size.
    merged["prize_per_runner"] = merged["prize_total_num"].div(merged["race_runners_num"])

#### Prize slot count
From breakdown entries.

In [None]:
def _n_matches(found):
    """Length of a findall result; NaN when the value is not a list."""
    if isinstance(found, list):
        return len(found)
    return np.nan


# Use an all-empty Series when the breakdown column is absent.
if "prize_breakdown" in merged.columns:
    _breakdown = merged["prize_breakdown"]
else:
    _breakdown = pd.Series("", index=merged.index)
bd = _breakdown.astype(str).str.lower()
# Each number-like token in the breakdown text counts as one prize slot.
merged["prize_slots"] = bd.str.findall(r"(\d+[\.,]?\d*)").apply(_n_matches)

### winning_time: Time

#### Seconds parsed
Parse m:s.ms to seconds.

In [None]:
wt = merged["winning_time"].astype(str)
# Minutes: leading digits followed by ":" or "m"; absent minutes count as 0.
_minutes_txt = wt.str.extract(r"^(\d+)\s*[:m]", expand=False)
# Seconds: trailing number, optionally followed by "s".
_seconds_txt = wt.str.extract(r"(\d+(?:\.\d+)?)\s*s?$", expand=False)
mins = pd.to_numeric(_minutes_txt, errors="coerce").fillna(0)
secs = pd.to_numeric(_seconds_txt, errors="coerce")
merged["winning_time_s"] = secs + mins * 60

#### Speed m/s + class z
Distance‑normalized speed and class standardization.

In [None]:
if "race_distance_m" in merged.columns:
    # A parsed time of 0 would give an infinite speed and poison the per-class
    # mean/std, so treat it as missing.
    _time_s = merged["winning_time_s"].replace(0, np.nan)
    merged["speed_mps"] = merged["race_distance_m"] / _time_s
    # Per-class z-score via the notebook's `_z` helper; the old inline
    # `s.std() not in (0, np.nan)` never matched a NaN std, so classes with a
    # single row came out as NaN.
    merged["speed_class_z"] = merged.groupby("rclass")["speed_mps"].transform(_z)

### country_code / ncond: Jurisdiction & condition type

#### Country dummies + hemisphere
Jurisdictional effects.

In [None]:
cc = merged["country_code"].astype(str).str.upper().str.strip()
# One 0/1 column for each of the 10 most frequent country codes.
top = cc.value_counts().head(10).index
for c in top:
    merged[f"country_is_{c}"] = cc.eq(c).astype(int)
# NOTE(review): any code outside this fixed set gets 0 here, including other
# northern-hemisphere countries — confirm the list against the data.
_northern_codes = {"GB", "IE", "FR", "US", "CA", "JP", "HK"}
merged["hemisphere_north"] = cc.isin(_northern_codes).astype(int)

#### ncond one‑hots
Encode derived condition buckets.

In [None]:
nc = merged["ncond"].astype(str).str.lower().str.strip()
# One 0/1 column for each of the 10 most frequent ncond values.
top = nc.value_counts().head(10).index
for v in top:
    # Column suffix: first 25 characters with spaces turned into underscores.
    _slug = str(v)[:25].replace(" ", "_")
    merged[f"ncond_is_{_slug}"] = nc.eq(v).astype(int)

## Final Sanity Checks

In [None]:
# Source columns; every other column in `merged` counts as engineered.
_source_columns = {
    "finish_position", "positionL", "distance_behind", "weight_st", "weight_lb",
    "over_weight", "out_handicap", "headgear",
    "rpr_rating", "tr_rating", "or_rating", "father", "mother", "gfather",
    "race_runners", "margin", "official_weight",
    "result_win", "result_place", "race_id", "course", "race_time", "race_date",
    "title", "rclass", "band", "ages",
    "race_distance", "surface_condition", "hurdles", "prize_breakdown",
    "winning_time", "prize_total", "metric",
    "country_code", "ncond", "race_class", "draw", "horse_id", "implied_prob",
}
engineered = sorted(set(merged.columns).difference(_source_columns))
print("Engineered feature count:", len(engineered))
engineered[:100]