<a href="https://colab.research.google.com/github/hedgingmybets/Horse-racing-value/blob/main/DailyLongshotModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install xgboost reportlab



In [None]:
import pandas as pd, numpy as np, re, xgboost as xgb
from reportlab.lib.pagesizes import landscape, A4
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet

# === Paths ===
# Trained booster file; upload once, reuse daily.
MODEL_PATH = "/content/xgboost_longshot.model"
# Race-card CSV to score; replace with today's file name.
DATA_PATH = "/content/racedata-2025-10-01.csv"

# === Features used in training ===
# Column names selected from the race-card DataFrame, in this order,
# to build the matrix that is passed to the model.
FEATURES = [
    "PR Rank",
    "VDW Rank",
    "SHorBES",
    "Class Par",
    "ClassEdge",
    "SClGR",
    "SClLr4",
    "ACSPCLTD",
    "SPDFIGLrAdj",
    "Horse Strength",
    "Trainer Win Rate in Last 14 Days",
    "Jockey Win Rate in Last 14 Days",
    "CDflag",
    "Top4Count",
]

In [None]:
# Distance parser (robust for m/f/y)
def parse_distance(val):
    """Convert a distance string such as ``"1m 2½f 50y"`` into furlongs.

    The value is lower-cased and scanned for a number followed by ``m``
    (miles), ``f`` (furlongs) and ``y`` (yards). The result is
    ``miles * 8 + furlongs + yards / 220``.

    Numbers may be integers, decimals (``"1.5m"``) or use the vulgar
    fractions ½, ¼ and ¾ (``"1½m"``, ``"½f"``).

    Parameters
    ----------
    val : object
        Raw distance cell. Anything that is not missing is passed
        through ``str()``.

    Returns
    -------
    float
        Distance in furlongs, or ``nan`` when ``val`` is missing or
        contains no recognisable m/f/y component.
    """
    if pd.isna(val):
        return np.nan

    # Map fractions to a decimal *suffix* (".5", not "0.5"): "1½" must
    # become "1.5", whereas inserting "0.5" would produce "10.5".
    text = (
        str(val)
        .lower()
        .replace("½", ".5")
        .replace("¼", ".25")
        .replace("¾", ".75")
    )

    # Integer, decimal, or bare fraction such as ".5".
    number = r"(\d+(?:\.\d+)?|\.\d+)"
    miles = re.search(number + "m", text)
    furlongs = re.search(number + "f", text)
    yards = re.search(number + "y", text)

    # Nothing parseable: report "unknown" rather than a zero-length race.
    if not (miles or furlongs or yards):
        return np.nan

    total = 0.0
    if miles:
        total += float(miles.group(1)) * 8
    if furlongs:
        total += float(furlongs.group(1))
    if yards:
        total += float(yards.group(1)) / 220
    return total

def prepare_df(df):
    """Coerce model features to numeric and add the derived columns.

    The frame is modified in place and also returned.

    Steps:
      * every column named in ``FEATURES`` that is present is stripped of
        ``%`` signs and converted with ``pd.to_numeric`` (bad values
        become NaN);
      * ``ClassEdge``  = ``SHorBES - Class Par``;
      * ``CDflag``     = 1 when ``Course Distance Winner`` contains
        "course" or "distance" (case-insensitive), else 0;
      * ``Top4Count``  = number of characters among the first six of
        ``Last 6 Form Figs`` that are 1, 2, 3 or 4;
      * ``DistanceF``  = ``parse_distance`` applied to ``Distance``.

    A missing source column is treated as empty text (or 0 for the two
    ``ClassEdge`` inputs) instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw race-card data.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with the columns above added/overwritten.
    """

    def text_column(name):
        # DataFrame.get(name, "") would return the bare string "" when the
        # column is absent, and a str has no .astype/.apply. Fall back to
        # an index-aligned Series of empty strings instead.
        if name in df.columns:
            return df[name]
        return pd.Series("", index=df.index, dtype=object)

    def count_top4(form):
        if pd.isna(form):
            return 0
        return sum(ch in "1234" for ch in str(form)[:6])

    # Numeric conversions
    for col in FEATURES:
        if col in df.columns:
            cleaned = df[col].astype(str).str.replace("%", "", regex=False)
            df[col] = pd.to_numeric(cleaned, errors="coerce")

    # Derived features
    # Scalar defaults are fine here: arithmetic with 0 broadcasts.
    df["ClassEdge"] = df.get("SHorBES", 0) - df.get("Class Par", 0)
    df["CDflag"] = (
        text_column("Course Distance Winner")
        .astype(str)
        .str.contains("Course|Distance", case=False)
        .astype(int)
    )
    df["Top4Count"] = text_column("Last 6 Form Figs").apply(count_top4)
    df["DistanceF"] = text_column("Distance").apply(parse_distance)
    return df

In [None]:
# Restore the trained booster straight from the saved model file.
bst = xgb.Booster(model_file=MODEL_PATH)

# Read today's race card and run it through feature preparation.
df = prepare_df(pd.read_csv(DATA_PATH))

In [None]:
# Score every runner. Missing feature values are filled with 0 and the
# matrix is cast to float32 before being wrapped in a DMatrix.
# Booster.inplace_predict only takes raw array-like input and rejects a
# DMatrix, so the DMatrix is scored with Booster.predict instead.
feature_matrix = df[FEATURES].fillna(0).astype("float32")
df["Probability"] = bst.predict(xgb.DMatrix(feature_matrix))

# Apply locked-in rules
# A runner must satisfy every condition below to stay in contention:
#   - race classification mentions "handicap" but not nursery /
#     apprentice / conditional;
#   - DistanceF below 14;
#   - at most 15 runners (a missing count is treated as 0);
#   - Class between 2 and 6 inclusive;
#   - CDflag set;
#   - trainer 14-day win rate of at least 5;
#   - either a recorded Last Winning Class, or SHorBES >= Class Par.
mask = (
    df["Classifications"].str.contains("handicap",case=False,na=False) &
    ~df["Classifications"].str.contains("nursery|apprentice|conditional",case=False,na=False) &
    (df["DistanceF"] < 14) &
    (df["Runners"].fillna(0) <= 15) &
    df["Class"].between(2,6, inclusive="both") &
    (df["CDflag"] == 1) &
    (pd.to_numeric(df["Trainer Win Rate in Last 14 Days"], errors="coerce").fillna(0) >= 5) &
    ((df["Last Winning Class"].notna()) | (df["SHorBES"] >= df["Class Par"]))
)

# Keep rule-passing runners at or above the probability cut-off, then
# order by race time and course with the strongest runner first.
# NOTE(review): the cut-off here is 0.58 while the reporting cells use a
# 0.55 lower band — confirm that difference is intended.
shortlist = df[mask & (df["Probability"] >= 0.58)].copy()
shortlist = shortlist.sort_values(["Time","Course","Probability"], ascending=[True,True,False])

In [None]:
# Summary counts for this run. Leaving `diagnostics` as the last
# expression makes the notebook display it.
# NOTE: "Handicaps only" counts rows passing the complete `mask`
# (every rule), not just the handicap test.
shortlist_prob = shortlist["Probability"]
in_top_band = shortlist_prob >= 0.70
in_mid_band = shortlist_prob.between(0.60, 0.70, inclusive="left")
in_low_band = shortlist_prob.between(0.55, 0.60, inclusive="left")

diagnostics = {
    "Total runners": len(df),
    "Handicaps only": mask.sum(),
    "Final shortlist": len(shortlist),
    "≥0.70 band": in_top_band.sum(),
    "0.60–0.69 band": in_mid_band.sum(),
    "0.55–0.59 band": in_low_band.sum(),
}
diagnostics

In [None]:
pdf_path = "/content/Longshot_Selections.pdf"
csv_path = "/content/Longshot_Selections.csv"

# --- CSV export: the shortlist as-is, without the index column ---
shortlist.to_csv(csv_path, index=False)

# --- PDF export: landscape A4 with a title and one table (or a notice) ---
doc = SimpleDocTemplate(pdf_path, pagesize=landscape(A4))
styles = getSampleStyleSheet()
elements = [
    Paragraph("XGBoost Longshot Model — Daily Selections", styles["Title"]),
    Spacer(1,12),
]

# Probability floors, highest first, paired with the advisory odds label.
# The first floor a runner reaches wins; below all of them is "Discard".
band_floors = [
    (0.70, "No minimum"),
    (0.60, "≥15/1"),
    (0.55, "≥20/1"),
]

if shortlist.empty:
    elements.append(Paragraph("No qualifying runners today", styles["Normal"]))
else:
    table_data = [["Time","Course","Horse","Probability","Advisory Odds"]]
    for _, runner in shortlist.iterrows():
        prob = runner["Probability"]
        band = next((label for floor, label in band_floors if prob >= floor), "Discard")
        table_data.append([
            runner.get("Time",""),
            runner.get("Course",""),
            runner.get("Horse",""),
            f"{prob:.2f}",
            band,
        ])

    # Header row repeats on every page; grey header, centred cells, full grid.
    table_style = TableStyle([
        ("BACKGROUND",(0,0),(-1,0),colors.grey),
        ("TEXTCOLOR",(0,0),(-1,0),colors.whitesmoke),
        ("ALIGN",(0,0),(-1,-1),"CENTER"),
        ("GRID",(0,0),(-1,-1),0.5,colors.black)
    ])
    tbl = Table(table_data, repeatRows=1)
    tbl.setStyle(table_style)
    elements.append(tbl)

doc.build(elements)

# Displayed by the notebook: output paths plus a preview of the picks.
(pdf_path, csv_path, shortlist[["Time","Course","Horse","Probability"]])