# Cross-Market Arbitrage Scanner (Polymarket + Kalshi)

This notebook fetches live market data from **Polymarket** and **Kalshi**, heuristically matches similar YES/NO markets, and scans for potential cross-market arbitrage opportunities.

It focuses on two locked-payout structures for matched binary questions:
- Buy **YES** on Polymarket + Buy **NO** on Kalshi
- Buy **NO** on Polymarket + Buy **YES** on Kalshi

A positive `net_locked_return` means the pair may be profitable **after** a configurable fee buffer.

> Always manually verify that matched questions refer to the exact same proposition and resolution criteria before trading.

In [1]:
#!/usr/bin/env python3
from __future__ import annotations

import importlib
import json
import re
from difflib import SequenceMatcher
from pathlib import Path

import pandas as pd
import requests

import polymarket_edge as _polymarket_edge
_polymarket_edge = importlib.reload(_polymarket_edge)
fetch_markets = _polymarket_edge.fetch_markets
fetch_markets_kalshi = _polymarket_edge.fetch_markets_kalshi

pd.set_option("display.max_colwidth", 120)
pd.set_option("display.width", 200)

MAX_PM_MARKETS = 300
MAX_KALSHI_MARKETS = 300

MIN_VOLUME = 5_000
MIN_LIQUIDITY = 1_000

# Kalshi often reports low/zero volume+liquidity on active markets via API, so use separate filters.
MIN_KALSHI_VOLUME = 0
MIN_KALSHI_LIQUIDITY = 0

# Mode switch: "strict" or "exploratory"
SCAN_MODE = "exploratory"

# Relaxed defaults for exploratory matching.
MIN_SIMILARITY = 0.45  # text similarity threshold for candidate matching
MIN_TOKEN_OVERLAP = 1
FEE_BUFFER = 0.00      # set > 0 for realistic fee/slippage assumptions
NEAR_MISS_MIN_NET_RETURN = -0.25  # show pairs down to -25% net when no positive arb exists

# Suggested notional to deploy per venue leg for sizing hints.
POSITION_NOTIONAL_USD = 50.0

# Manual market injection for known missing API pagination coverage.
MANUAL_PM_EVENT_SLUGS = [
    "fed-decision-in-march-885",
]
MANUAL_KALSHI_MARKET_TICKERS = [
    "KXFEDDECISION-26MAR-H0",
]

TOP_N = 30
OUTPUT_PATH = Path("results/cross_market_arbitrage_candidates.csv")

print("Configuration loaded.")

Configuration loaded.


In [2]:
# Stopwords toexclude from token overlap and similarity calculations, as they add noise and aren't helpful for matching.
STOPWORDS = {
    "the", "and", "for", "with", "from", "this", "that", "will",
    "what", "when", "where", "which", "into", "over", "under",
    "market", "event", "price", "contract", "yes", "no",
}

PM_EVENTS_API_URL = "https://gamma-api.polymarket.com/events"
KALSHI_MARKET_API_URL = "https://api.elections.kalshi.com/trade-api/v2/markets"

def _to_float(value: object, default: float = 0.0) -> float:
    try:
        return float(value)
    except Exception:
        return default


def _parse_list(value: object) -> list:
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
            return parsed if isinstance(parsed, list) else []
        except Exception:
            return []
    return []


def _normalize_text(text: str) -> str:
    out = text.lower()
    out = re.sub(r"[^a-z0-9\s]", " ", out)
    out = re.sub(r"\s+", " ", out).strip()
    return out


def _token_set(text: str) -> set[str]:
    tokens = [
        t
        for t in _normalize_text(text).split()
        if len(t) >= 3 and t not in STOPWORDS and not t.isdigit()
    ]
    return set(tokens)


def _similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, _normalize_text(a), _normalize_text(b)).ratio()


def _extract_polymarket_yes_no(market: dict) -> tuple[float | None, float | None]:
    outcomes = _parse_list(market.get("outcomes"))
    prices = _parse_list(market.get("outcomePrices"))
    if len(outcomes) != len(prices) or len(outcomes) < 2:
        return None, None

    yes_idx = None
    no_idx = None
    for idx, outcome in enumerate(outcomes):
        label = str(outcome).strip().lower()
        if label == "yes":
            yes_idx = idx
        elif label == "no":
            no_idx = idx

    if yes_idx is None or no_idx is None:
        return None, None

    yes_price = _to_float(prices[yes_idx], -1)
    no_price = _to_float(prices[no_idx], -1)
    if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
        return None, None
    return yes_price, no_price


def _extract_kalshi_yes_no(market: dict) -> tuple[float | None, float | None]:
    yes_ask = _to_float(market.get("yes_ask"), -1)
    no_ask = _to_float(market.get("no_ask"), -1)
    if yes_ask < 0 or no_ask < 0:
        return None, None
    yes_price = yes_ask / 100.0
    no_price = no_ask / 100.0
    if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
        return None, None
    return yes_price, no_price


def _fetch_polymarket_markets_for_event_slug(event_slug: str) -> list[dict]:
    try:
        resp = requests.get(PM_EVENTS_API_URL, params={"slug": event_slug}, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
    except Exception:
        return []

    if not isinstance(payload, list) or not payload:
        return []

    event = payload[0]
    markets = event.get("markets", []) if isinstance(event, dict) else []
    return markets if isinstance(markets, list) else []


def _fetch_kalshi_market_by_ticker(ticker: str) -> dict | None:
    try:
        resp = requests.get(f"{KALSHI_MARKET_API_URL}/{ticker}", timeout=30)
        if resp.status_code != 200:
            return None
        payload = resp.json()
    except Exception:
        return None

    if not isinstance(payload, dict):
        return None
    market = payload.get("market")
    return market if isinstance(market, dict) else None

In [3]:
# Build DataFrames for Polymarket and Kalshi markets with necessary fields and filtering applied.

def build_polymarket_df(max_markets: int = MAX_PM_MARKETS) -> pd.DataFrame:
    rows = []
    seen_ids: set[str] = set()

    def append_market(m: dict) -> None:
        yes_price, no_price = _extract_polymarket_yes_no(m)
        if yes_price is None or no_price is None:
            return

        volume = _to_float(m.get("volumeNum", m.get("volume")), 0.0)
        liquidity = _to_float(m.get("liquidityNum", m.get("liquidity")), 0.0)
        if volume < MIN_VOLUME or liquidity < MIN_LIQUIDITY:
            return

        market_id = str(m.get("id", ""))
        question = str(m.get("question", "")).strip()
        slug = str(m.get("slug", market_id))

        if not market_id or not question or market_id in seen_ids:
            return

        seen_ids.add(market_id)
        rows.append({
            "provider": "polymarket",
            "market_id": market_id,
            "question": question,
            "yes_price": yes_price,
            "no_price": no_price,
            "spread": max(0.0, _to_float(m.get("bestAsk"), yes_price) - _to_float(m.get("bestBid"), yes_price)),
            "volume": volume,
            "liquidity": liquidity,
            "close_time": m.get("endDate"),
            "url": f"https://polymarket.com/event/{slug}",
            "tokens": _token_set(question),
        })

    for m in fetch_markets(max_markets=max_markets):
        append_market(m)

    manual_added = 0
    for event_slug in MANUAL_PM_EVENT_SLUGS:
        for m in _fetch_polymarket_markets_for_event_slug(event_slug):
            before = len(rows)
            append_market(m)
            if len(rows) > before:
                manual_added += 1

    if manual_added:
        print(f"Manual Polymarket markets added: {manual_added}")

    return pd.DataFrame(rows)


def build_kalshi_df(max_markets: int = MAX_KALSHI_MARKETS) -> pd.DataFrame:
    rows = []
    seen_ids: set[str] = set()

    def append_market(m: dict) -> None:
        status = str(m.get("status", "")).lower()
        if status not in {"open", "active"}:
            return

        yes_price, no_price = _extract_kalshi_yes_no(m)
        if yes_price is None or no_price is None:
            return

        volume = _to_float(m.get("volume"), 0.0)
        liquidity = _to_float(m.get("liquidity"), 0.0)
        if volume < MIN_KALSHI_VOLUME or liquidity < MIN_KALSHI_LIQUIDITY:
            return

        market_id = str(m.get("ticker", ""))
        question = str(
            m.get("title", m.get("subtitle", m.get("yes_sub_title", m.get("no_sub_title", ""))))
        ).strip()

        if not market_id or not question or market_id in seen_ids:
            return

        seen_ids.add(market_id)
        rows.append({
            "provider": "kalshi",
            "market_id": market_id,
            "question": question,
            "yes_price": yes_price,
            "no_price": no_price,
            "spread": max(0.0, _to_float(m.get("yes_ask"), yes_price * 100) / 100 - _to_float(m.get("yes_bid"), yes_price * 100) / 100),
            "volume": volume,
            "liquidity": liquidity,
            "close_time": m.get("close_time"),
            "url": f"https://kalshi.com/markets/{market_id}",
            "tokens": _token_set(question),
        })

    for m in fetch_markets_kalshi(max_markets=max_markets):
        append_market(m)

    manual_added = 0
    for ticker in MANUAL_KALSHI_MARKET_TICKERS:
        market = _fetch_kalshi_market_by_ticker(ticker)
        if market is None:
            print(f"Manual Kalshi ticker not found: {ticker}")
            continue
        before = len(rows)
        append_market(market)
        if len(rows) > before:
            manual_added += 1

    if manual_added:
        print(f"Manual Kalshi markets added: {manual_added}")

    return pd.DataFrame(rows)


pm_df = build_polymarket_df()
kalshi_df = build_kalshi_df()

print(f"Polymarket rows: {len(pm_df)}")
print(f"Kalshi rows: {len(kalshi_df)}")

pm_df.head(3)

Manual Polymarket markets added: 4
Manual Kalshi markets added: 1
Polymarket rows: 297
Kalshi rows: 301


Unnamed: 0,provider,market_id,question,yes_price,no_price,spread,volume,liquidity,close_time,url,tokens
0,polymarket,517310,"Will Trump deport less than 250,000?",0.0305,0.9695,0.005,1186489.0,18874.37802,2025-12-31T12:00:00Z,https://polymarket.com/event/will-trump-deport-less-than-250000,"{trump, less, than, deport}"
1,polymarket,517311,"Will Trump deport 250,000-500,000 people?",0.902,0.098,0.004,7504796.0,5919.51741,2025-12-31T12:00:00Z,https://polymarket.com/event/will-trump-deport-250000-500000-people,"{trump, deport, people}"
2,polymarket,517313,"Will Trump deport 500,000-750,000- people?",0.0325,0.9675,0.011,525604.1,4110.90003,2025-12-31T12:00:00Z,https://polymarket.com/event/will-trump-deport-500000-750000-people,"{trump, deport, people}"


In [4]:
#   Scan for cross-market arbitrage opportunities by comparing each Polymarket market against each Kalshi market, applying text similarity and token overlap filters, and calculating potential locked returns for both "YES/NO" pairing strategies.

def scan_cross_market_arbitrage(
    pm: pd.DataFrame,
    kalshi: pd.DataFrame,
    min_net_locked_return: float = 0.0,
    enforce_text_match: bool = True,
) -> pd.DataFrame:
    if pm.empty or kalshi.empty:
        return pd.DataFrame()

    opportunities: list[dict] = []

    pm_rows = pm.to_dict(orient="records")
    kalshi_rows = kalshi.to_dict(orient="records")

    for pm_row in pm_rows:
        pm_tokens = pm_row["tokens"]
        if not pm_tokens and enforce_text_match:
            continue

        for ks_row in kalshi_rows:
            overlap = len(pm_tokens.intersection(ks_row["tokens"]))
            sim = _similarity(pm_row["question"], ks_row["question"])

            if enforce_text_match:
                if overlap < MIN_TOKEN_OVERLAP:
                    continue
                if sim < MIN_SIMILARITY:
                    continue

            cost_yes_pm_no_ks = pm_row["yes_price"] + ks_row["no_price"]
            gross_yes_pm_no_ks = 1.0 - cost_yes_pm_no_ks
            net_yes_pm_no_ks = gross_yes_pm_no_ks - FEE_BUFFER

            cost_no_pm_yes_ks = pm_row["no_price"] + ks_row["yes_price"]
            gross_no_pm_yes_ks = 1.0 - cost_no_pm_yes_ks
            net_no_pm_yes_ks = gross_no_pm_yes_ks - FEE_BUFFER

            if net_yes_pm_no_ks <= min_net_locked_return and net_no_pm_yes_ks <= min_net_locked_return:
                continue

            if net_yes_pm_no_ks >= net_no_pm_yes_ks:
                strategy = "Buy YES (Polymarket) + Buy NO (Kalshi)"
                gross_locked = gross_yes_pm_no_ks
                net_locked = net_yes_pm_no_ks
                leg_cost = cost_yes_pm_no_ks
            else:
                strategy = "Buy NO (Polymarket) + Buy YES (Kalshi)"
                gross_locked = gross_no_pm_yes_ks
                net_locked = net_no_pm_yes_ks
                leg_cost = cost_no_pm_yes_ks

            cheap_side_pm = "YES" if pm_row["yes_price"] <= pm_row["no_price"] else "NO"
            expensive_side_pm = "NO" if cheap_side_pm == "YES" else "YES"
            cheap_side_kalshi = "YES" if ks_row["yes_price"] <= ks_row["no_price"] else "NO"
            expensive_side_kalshi = "NO" if cheap_side_kalshi == "YES" else "YES"

            pm_order = f"BUY {cheap_side_pm} (short {expensive_side_pm})"
            kalshi_order = f"BUY {cheap_side_kalshi} (short {expensive_side_kalshi})"

            pm_order_price = pm_row["yes_price"] if cheap_side_pm == "YES" else pm_row["no_price"]
            kalshi_order_price = ks_row["yes_price"] if cheap_side_kalshi == "YES" else ks_row["no_price"]

            pm_est_contracts = round(POSITION_NOTIONAL_USD / max(pm_order_price, 0.01), 2)
            kalshi_est_contracts = round(POSITION_NOTIONAL_USD / max(kalshi_order_price, 0.01), 2)
            position_size_hint = (
                f"~${POSITION_NOTIONAL_USD:.0f}/leg: {pm_est_contracts} PM contracts, "
                f"{kalshi_est_contracts} Kalshi contracts"
            )

            trade_ready = (
                sim >= 0.75
                and overlap >= 2
                and 0 < ks_row["yes_price"] < 1
                and 0 < ks_row["no_price"] < 1
            )

            opportunities.append({
                "similarity": round(sim, 4),
                "token_overlap": overlap,
                "strategy": strategy,
                "cheap_side_pm": cheap_side_pm,
                "expensive_side_pm": expensive_side_pm,
                "pm_order": pm_order,
                "cheap_side_kalshi": cheap_side_kalshi,
                "expensive_side_kalshi": expensive_side_kalshi,
                "kalshi_order": kalshi_order,
                "pm_order_price": round(pm_order_price, 4),
                "kalshi_order_price": round(kalshi_order_price, 4),
                "pm_est_contracts": pm_est_contracts,
                "kalshi_est_contracts": kalshi_est_contracts,
                "position_size_hint": position_size_hint,
                "trade_ready": trade_ready,
                "leg_cost": round(leg_cost, 4),
                "gross_locked_return": round(gross_locked, 4),
                "net_locked_return": round(net_locked, 4),
                "pm_market_id": pm_row["market_id"],
                "pm_question": pm_row["question"],
                "pm_yes_price": round(pm_row["yes_price"], 4),
                "pm_no_price": round(pm_row["no_price"], 4),
                "pm_volume": round(pm_row["volume"], 2),
                "pm_liquidity": round(pm_row["liquidity"], 2),
                "pm_url": pm_row["url"],
                "kalshi_market_id": ks_row["market_id"],
                "kalshi_question": ks_row["question"],
                "kalshi_yes_price": round(ks_row["yes_price"], 4),
                "kalshi_no_price": round(ks_row["no_price"], 4),
                "kalshi_volume": round(ks_row["volume"], 2),
                "kalshi_liquidity": round(ks_row["liquidity"], 2),
                "kalshi_url": ks_row["url"],
            })

    if not opportunities:
        return pd.DataFrame()

    out = pd.DataFrame(opportunities)
    out = out.sort_values(
        ["net_locked_return", "similarity", "token_overlap"],
        ascending=[False, False, False],
    ).reset_index(drop=True)
    return out


all_matches_df = scan_cross_market_arbitrage(
    pm_df,
    kalshi_df,
    min_net_locked_return=-1.0,
    enforce_text_match=True,
)

if all_matches_df.empty:
    print("No text-matched pairs found under current similarity/token thresholds.")
    display_df = all_matches_df
else:
    review_df = all_matches_df.copy()
    review_df["yes_price_mismatch"] = (review_df["pm_yes_price"] - review_df["kalshi_yes_price"]).round(4)
    review_df["no_price_mismatch"] = (review_df["pm_no_price"] - review_df["kalshi_no_price"]).round(4)
    review_df["yes_abs_mismatch"] = review_df["yes_price_mismatch"].abs().round(4)
    review_df["no_abs_mismatch"] = review_df["no_price_mismatch"].abs().round(4)
    review_df["max_abs_mismatch"] = review_df[["yes_abs_mismatch", "no_abs_mismatch"]].max(axis=1).round(4)

    display_cols = [
        "pm_question",
        "kalshi_question",
        "pm_yes_price",
        "kalshi_yes_price",
        "yes_price_mismatch",
        "pm_no_price",
        "kalshi_no_price",
        "no_price_mismatch",
        "max_abs_mismatch",
        "strategy",
        "leg_cost",
        "gross_locked_return",
        "net_locked_return",
        "similarity",
        "token_overlap",
        "trade_ready",
        "pm_order",
        "kalshi_order",
        "position_size_hint",
        "pm_url",
        "kalshi_url",
    ]
    display_df = review_df[display_cols].sort_values(
        ["max_abs_mismatch", "similarity", "token_overlap"],
        ascending=[False, False, False],
    ).reset_index(drop=True)

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    display_df.to_csv(OUTPUT_PATH, index=False)

    print(f"Saved {len(display_df)} matched pairs to {OUTPUT_PATH}")
    print(f"Trade-ready pairs: {int(display_df['trade_ready'].sum())}")
    print(f"Median max mismatch: {display_df['max_abs_mismatch'].median():.4f}")

display_df

Saved 10 matched pairs to results\cross_market_arbitrage_candidates.csv
Trade-ready pairs: 3
Median max mismatch: 0.0845


Unnamed: 0,pm_question,kalshi_question,pm_yes_price,kalshi_yes_price,yes_price_mismatch,pm_no_price,kalshi_no_price,no_price_mismatch,max_abs_mismatch,strategy,...,gross_locked_return,net_locked_return,similarity,token_overlap,trade_ready,pm_order,kalshi_order,position_size_hint,pm_url,kalshi_url
0,Will the Fed increase interest rates by 25+ bps after the March 2026 meeting?,Will the Federal Reserve Hike rates by 0bps at their March 2026 meeting?,0.0065,0.93,-0.9235,0.9935,0.08,0.9135,0.9235,Buy YES (Polymarket) + Buy NO (Kalshi),...,0.9135,0.9135,0.7534,3,True,BUY YES (short NO),BUY NO (short YES),"~$50/leg: 5000.0 PM contracts, 625.0 Kalshi contracts",https://polymarket.com/event/will-the-fed-increase-interest-rates-by-25-bps-after-the-march-2026-meeting,https://kalshi.com/markets/KXFEDDECISION-26MAR-H0
1,Will the Fed decrease interest rates by 50+ bps after the March 2026 meeting?,Will the Federal Reserve Hike rates by 0bps at their March 2026 meeting?,0.0075,0.93,-0.9225,0.9925,0.08,0.9125,0.9225,Buy YES (Polymarket) + Buy NO (Kalshi),...,0.9125,0.9125,0.7671,3,True,BUY YES (short NO),BUY NO (short YES),"~$50/leg: 5000.0 PM contracts, 625.0 Kalshi contracts",https://polymarket.com/event/will-the-fed-decrease-interest-rates-by-50-bps-after-the-march-2026-meeting,https://kalshi.com/markets/KXFEDDECISION-26MAR-H0
2,Will the Fed decrease interest rates by 25 bps after the March 2026 meeting?,Will the Federal Reserve Hike rates by 0bps at their March 2026 meeting?,0.055,0.93,-0.875,0.945,0.08,0.865,0.875,Buy YES (Polymarket) + Buy NO (Kalshi),...,0.865,0.865,0.7534,3,True,BUY YES (short NO),BUY NO (short YES),"~$50/leg: 909.09 PM contracts, 625.0 Kalshi contracts",https://polymarket.com/event/will-the-fed-decrease-interest-rates-by-25-bps-after-the-march-2026-meeting,https://kalshi.com/markets/KXFEDDECISION-26MAR-H0
3,Will the San Antonio Spurs win the 2026 NBA Finals?,"no New York wins by over 4.5 Points,yes San Antonio wins by over 4.5 Points",0.0845,0.39,-0.3055,0.9155,1.0,-0.0845,0.3055,Buy YES (Polymarket) + Buy NO (Kalshi),...,-0.0845,-0.0845,0.464,2,False,BUY YES (short NO),BUY YES (short NO),"~$50/leg: 591.72 PM contracts, 128.21 Kalshi contracts",https://polymarket.com/event/will-the-san-antonio-spurs-win-the-2026-nba-finals,https://kalshi.com/markets/KXMVESPORTSMULTIGAMEEXTENDED-S2026A05791C388C-03F11FD183E
4,Will the San Antonio Spurs win the 2026 NBA Finals?,"yes Cleveland,yes San Antonio,yes Minnesota,yes Sacramento",0.0845,0.0,0.0845,0.9155,1.0,-0.0845,0.0845,Buy NO (Polymarket) + Buy YES (Kalshi),...,0.0845,0.0845,0.463,2,False,BUY YES (short NO),BUY YES (short NO),"~$50/leg: 591.72 PM contracts, 5000.0 Kalshi contracts",https://polymarket.com/event/will-the-san-antonio-spurs-win-the-2026-nba-finals,https://kalshi.com/markets/KXMVESPORTSMULTIGAMEEXTENDED-S2026F20BE6A61C1-9893E17E82A
5,Will the San Antonio Spurs win the 2026 NBA Finals?,"yes Cleveland,yes San Antonio,yes Memphis",0.0845,0.0,0.0845,0.9155,1.0,-0.0845,0.0845,Buy NO (Polymarket) + Buy YES (Kalshi),...,0.0845,0.0845,0.4615,2,False,BUY YES (short NO),BUY YES (short NO),"~$50/leg: 591.72 PM contracts, 5000.0 Kalshi contracts",https://polymarket.com/event/will-the-san-antonio-spurs-win-the-2026-nba-finals,https://kalshi.com/markets/KXMVESPORTSMULTIGAMEEXTENDED-S2026C915B8B0A32-E6738274FE0
6,Will the Detroit Pistons win the 2026 NBA Finals?,"yes Detroit,yes Cade Cunningham: 20+,yes Duncan Robinson: 15+",0.0825,0.14,-0.0575,0.9175,1.0,-0.0825,0.0825,Buy NO (Polymarket) + Buy YES (Kalshi),...,-0.0575,-0.0575,0.4571,1,False,BUY YES (short NO),BUY YES (short NO),"~$50/leg: 606.06 PM contracts, 357.14 Kalshi contracts",https://polymarket.com/event/will-the-detroit-pistons-win-the-2026-nba-finals,https://kalshi.com/markets/KXMVESPORTSMULTIGAMEEXTENDED-S2026CCCE3DE20A4-62D111C3F6B
7,Will the Cleveland Cavaliers win the 2026 NBA Finals?,"yes Michael Porter Jr.: 20+,yes Cleveland wins by over 12.5 Points",0.068,0.0,0.068,0.932,1.0,-0.068,0.068,Buy NO (Polymarket) + Buy YES (Kalshi),...,0.068,0.068,0.487,1,False,BUY YES (short NO),BUY YES (short NO),"~$50/leg: 735.29 PM contracts, 5000.0 Kalshi contracts",https://polymarket.com/event/will-the-cleveland-cavaliers-win-the-2026-nba-finals,https://kalshi.com/markets/KXMVESPORTSMULTIGAMEEXTENDED-S2026D8FE1833D66-82DD9D06E6D
8,Will the Cleveland Cavaliers win the 2026 NBA Finals?,"yes Cleveland,yes San Antonio,yes Memphis",0.068,0.0,0.068,0.932,1.0,-0.068,0.068,Buy NO (Polymarket) + Buy YES (Kalshi),...,0.068,0.068,0.4516,1,False,BUY YES (short NO),BUY YES (short NO),"~$50/leg: 735.29 PM contracts, 5000.0 Kalshi contracts",https://polymarket.com/event/will-the-cleveland-cavaliers-win-the-2026-nba-finals,https://kalshi.com/markets/KXMVESPORTSMULTIGAMEEXTENDED-S2026C915B8B0A32-E6738274FE0
9,Will there be no change in Fed interest rates after the March 2026 meeting?,Will the Federal Reserve Hike rates by 0bps at their March 2026 meeting?,0.925,0.93,-0.005,0.075,0.08,-0.005,0.005,Buy YES (Polymarket) + Buy NO (Kalshi),...,-0.005,-0.005,0.6621,3,False,BUY NO (short YES),BUY NO (short YES),"~$50/leg: 666.67 PM contracts, 625.0 Kalshi contracts",https://polymarket.com/event/will-there-be-no-change-in-fed-interest-rates-after-the-march-2026-meeting,https://kalshi.com/markets/KXFEDDECISION-26MAR-H0


## Tuning Tips
- Increase `MIN_SIMILARITY` (e.g. to `0.80`) to reduce false matches.
- Increase `FEE_BUFFER` if your real all-in fees/slippage are higher.
- Raise `MIN_VOLUME` / `MIN_LIQUIDITY` for easier execution.
- Inspect `pm_question` vs `kalshi_question` manually before acting.

In [10]:
# Below we fetch all open Kalshi event questions and export to CSV for potential manual review or LLM processing.
# This version includes per-question market options (e.g., Democratic party | Republican party).
KALSHI_OPEN_EXPORT_PATH = Path("results/kalshi_open_questions_prices.csv")
KALSHI_OPEN_EXPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

KALSHI_EVENTS_API_URL = "https://api.elections.kalshi.com/trade-api/v2/events"
KALSHI_MARKETS_API_URL = "https://api.elections.kalshi.com/trade-api/v2/markets"

# Set to False to export all open event questions.
POLITICS_ONLY = True
POLITICS_KEYWORDS = [
    "election", "president", "senate", "house", "governor", "mayor",
    "nominee", "party", "congress", "parliament", "prime minister",
    "cabinet", "vote", "voter", "campaign", "republican", "democratic",
    "gop", "trump", "biden", "rfk", "harris",
]

def _is_politics_event(title: str, category: str) -> bool:
    category_lc = category.lower()
    title_lc = title.lower()
    if "politic" in category_lc or "election" in category_lc:
        return True
    return any(keyword in title_lc for keyword in POLITICS_KEYWORDS)

def _clean_market_option(value: object) -> str:
    text = str(value or "").strip()
    if text.lower() in {"", "nan", "none", "::"}:
        return ""
    if text.startswith("::"):
        text = text[2:].strip()
    return text

def _to_float(value: object, default: float = -1.0) -> float:
    try:
        return float(value)
    except Exception:
        return default

def _format_market_options(options: list[str], max_items: int = 8) -> str:
    unique = list(dict.fromkeys([_clean_market_option(x) for x in options if _clean_market_option(x)]))
    if not unique:
        return ""
    if len(unique) > max_items:
        return " | ".join(unique[:max_items]) + f" | +{len(unique) - max_items} more"
    return " | ".join(unique)

def _format_price_ladder(price_rows: list[dict[str, object]], max_items: int = 5) -> str:
    if not price_rows:
        return ""

    by_option: dict[str, dict[str, object]] = {}
    for row in price_rows:
        option = _clean_market_option(row.get("option"))
        if not option:
            continue
        yes_price = _to_float(row.get("yes_price"), -1.0)
        no_price = _to_float(row.get("no_price"), -1.0)
        if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
            continue

        key = option.lower()
        prev = by_option.get(key)
        if prev is None or yes_price > float(prev["yes_price"]):
            by_option[key] = {
                "option": option,
                "yes_price": yes_price,
                "no_price": no_price,
            }

    ranked = sorted(
        by_option.values(),
        key=lambda row: (float(row["yes_price"]), -float(row["no_price"]), str(row["option"]).lower()),
        reverse=True,
    )
    if not ranked:
        return ""

    shown = ranked[:max_items]
    items = [
        f"({row['option']}; Yes {float(row['yes_price']):.2f}; No {float(row['no_price']):.2f})"
        for row in shown
    ]
    if len(ranked) > max_items:
        items.append(f"+{len(ranked) - max_items} more")
    return " | ".join(items)

def _request_json_with_backoff(url: str, params: dict | None = None, retries: int = 5) -> dict:
    import time

    params = params or {}
    for attempt in range(retries):
        resp = requests.get(url, params=params, timeout=30)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.json() if resp.content else {}

        sleep_s = min(2 ** attempt, 12)
        print(f"Rate limited at {url}; retrying in {sleep_s}s (attempt {attempt + 1}/{retries})")
        time.sleep(sleep_s)

    raise RuntimeError(f"Exceeded retries due to rate limits for {url}")

def _extract_kalshi_option_price_row(market: dict) -> dict[str, object] | None:
    option = _clean_market_option(market.get("subtitle"))
    if not option:
        option = _clean_market_option(market.get("yes_sub_title"))
    if not option:
        return None

    yes_price = _to_float(market.get("yes_ask"), -1.0)
    no_price = _to_float(market.get("no_ask"), -1.0)
    if yes_price < 0 or no_price < 0:
        return None

    yes_price /= 100.0
    no_price /= 100.0
    if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
        return None

    return {
        "option": option,
        "yes_price": yes_price,
        "no_price": no_price,
    }

def _fetch_event_market_details(event_ticker: str) -> tuple[list[str], list[dict[str, object]]]:
    url = f"{KALSHI_EVENTS_API_URL}/{event_ticker}"
    try:
        payload = _request_json_with_backoff(url)
    except Exception:
        return [], []

    markets = payload.get("markets", []) if isinstance(payload, dict) else []
    if not isinstance(markets, list):
        return [], []

    options: list[str] = []
    price_rows: list[dict[str, object]] = []
    for market in markets:
        if not isinstance(market, dict):
            continue
        row = _extract_kalshi_option_price_row(market)
        if row is None:
            continue
        options.append(str(row["option"]))
        price_rows.append(row)

    return options, price_rows

def fetch_open_kalshi_event_rows(
    max_events: int = 10000,
    page_size: int = 200,
    politics_only: bool = POLITICS_ONLY,
    max_markets: int = 10000,
    market_page_size: int = 200,
    detail_enrichment_limit: int = 350,
    ) -> list[dict[str, str]]:
    event_map: dict[str, str] = {}
    event_category_map: dict[str, str] = {}

    cursor = None
    while len(event_map) < max_events:
        params = {
            "limit": min(page_size, max_events - len(event_map)),
            "status": "open",
        }
        if cursor:
            params["cursor"] = cursor

        payload = _request_json_with_backoff(KALSHI_EVENTS_API_URL, params=params)
        events = payload.get("events", []) if isinstance(payload, dict) else []
        if not events:
            break

        for event in events:
            title = str(event.get("title", "")).strip()
            category = str(event.get("category", "")).strip()
            event_ticker = str(event.get("event_ticker", "")).strip()
            if not title or not event_ticker:
                continue
            if politics_only and not _is_politics_event(title, category):
                continue
            event_map[event_ticker] = title
            event_category_map[event_ticker] = category

        cursor = payload.get("cursor") if isinstance(payload, dict) else None
        if not cursor or len(events) < params["limit"]:
            break

    market_options_by_event: dict[str, list[str]] = {}
    market_price_rows_by_event: dict[str, list[dict[str, object]]] = {}
    cursor = None
    fetched_markets = 0
    while fetched_markets < max_markets:
        params = {
            "limit": min(market_page_size, max_markets - fetched_markets),
            "status": "open",
        }
        if cursor:
            params["cursor"] = cursor

        payload = _request_json_with_backoff(KALSHI_MARKETS_API_URL, params=params)
        markets = payload.get("markets", []) if isinstance(payload, dict) else []
        if not markets:
            break

        fetched_markets += len(markets)
        for market in markets:
            event_ticker = str(market.get("event_ticker", "")).strip()
            if not event_ticker or event_ticker not in event_map:
                continue

            row = _extract_kalshi_option_price_row(market)
            if row is None:
                continue

            market_options_by_event.setdefault(event_ticker, []).append(str(row["option"]))
            market_price_rows_by_event.setdefault(event_ticker, []).append(row)

        cursor = payload.get("cursor") if isinstance(payload, dict) else None
        if not cursor or len(markets) < params["limit"]:
            break

    missing_tickers = [ticker for ticker in event_map if not market_options_by_event.get(ticker)]
    priority_tickers = [
        ticker for ticker in missing_tickers
        if any(token in event_map[ticker].lower() for token in ["winner", "nominee", "primary", "matchup"])
    ]
    enriched = 0
    for event_ticker in priority_tickers:
        if enriched >= detail_enrichment_limit:
            break
        options, price_rows = _fetch_event_market_details(event_ticker)
        if options:
            market_options_by_event[event_ticker] = options
        if price_rows:
            market_price_rows_by_event[event_ticker] = price_rows
        enriched += 1

    rows: list[dict[str, str]] = []
    for event_ticker, question in event_map.items():
        options = market_options_by_event.get(event_ticker, [])
        price_rows = market_price_rows_by_event.get(event_ticker, [])
        rows.append({
            "event_ticker": event_ticker,
            "question": question,
            "market": _format_market_options(options, max_items=12),
            "market_prices_top5": _format_price_ladder(price_rows, max_items=5),
            "category": event_category_map.get(event_ticker, ""),
        })

    rows.sort(key=lambda row: row["question"].lower())
    return rows

kalshi_open_rows = fetch_open_kalshi_event_rows()
kalshi_open_prices_df = pd.DataFrame(kalshi_open_rows)
kalshi_open_prices_df.to_csv(KALSHI_OPEN_EXPORT_PATH, index=False)
print(f"Saved {len(kalshi_open_prices_df)} open Kalshi event questions with market options to {KALSHI_OPEN_EXPORT_PATH}")
kalshi_open_prices_df.head(20)

Saved 1414 open Kalshi event questions with market options to results\kalshi_open_questions_prices.csv


Unnamed: 0,event_ticker,question,market,market_prices_top5,category
0,KXBALANCEPOWERCOMBO-27FEB,2026 2026 Midterms: Congress Balance of Power?,,,Politics
1,KXWICOURT-26,2026 2026 Wisconsin Supreme Court winner?,,,Politics
2,KXDEMSENATEPRIMARYCOMBO-26NOV03,2026 Democratic Senate primaries combo,,,Politics
3,KXFARRERBYELECTION-26DEC31,2026 Farrer by-election winner?,ALP | LP | NP | GRN | PHON | Family First Party | Independent,(NP; Yes 0.42; No 0.68) | (Independent; Yes 0.34; No 0.75) | (PHON; Yes 0.24; No 0.82) | (LP; Yes 0.15; No 0.95) | (...,Politics
4,KXMOVNJ11SPECIAL-26APR16,2026 NJ-11 special election margin of victory?,,,Elections
5,KXGOPSENATEPRIMARYCOMBO-26NOV03,2026 Republican Senate primaries combo,,,Politics
6,KXTXSENCOMBO-26NOV,2026 Texas Senate matchup?,Talarico vs. Paxton | Talarico vs. Cornyn | Talarico vs. Hunt | Crockett vs. Paxton | Crockett vs. Cornyn | Crockett...,(Talarico vs. Paxton; Yes 0.63; No 0.39) | (Crockett vs. Paxton; Yes 0.25; No 0.79) | (Talarico vs. Cornyn; Yes 0.12...,Politics
7,KXTRUMPBEARCASECOMBO-27DEC,2026: Trump's bad year?,,,Politics
8,KXTRUMPBULLCASECOMBO-27DEC,2026: Trump's dream year?,,,Politics
9,KXPRESNOMD-28,2028 Democratic nominee for President?,Hunter Biden | James Talarico | Ruben Gallego | Ro Khanna | Chris Murphy | Zohran Mamdani | Kamala Harris | Gavin Ne...,(Gavin Newsom; Yes 0.31; No 0.70) | (Alexandria Ocasio-Cortez; Yes 0.11; No 0.90) | (Kamala Harris; Yes 0.07; No 0.9...,Politics


In [None]:
# Polymarket doesn't have a public API endpoint for listing all open event questions, so we fetch via pagination and apply similar politics-only filtering as Kalshi.

PM_OPEN_EXPORT_PATH = Path("results/polymarket_open_questions_prices.csv")
PM_OPEN_EXPORT_PATH.parent.mkdir(parents=True, exist_ok=True)

# Mirrors Cell 7 behavior; inherits POLITICS_ONLY and POLITICS_KEYWORDS if available.
PM_POLITICS_ONLY = POLITICS_ONLY if "POLITICS_ONLY" in globals() else True
PM_POLITICS_KEYWORDS = (
    POLITICS_KEYWORDS
    if "POLITICS_KEYWORDS" in globals()
    else [
        "election", "president", "senate", "house", "governor", "mayor",
        "nominee", "party", "congress", "parliament", "prime minister",
        "cabinet", "vote", "voter", "campaign", "republican", "democratic",
        "gop", "trump", "biden", "rfk", "harris",
    ]
)

def _is_pm_politics_event(title: str) -> bool:
    title_lc = title.lower()
    return any(keyword in title_lc for keyword in PM_POLITICS_KEYWORDS)

def _clean_pm_option(value: object) -> str:
    text = str(value or "").strip()
    if text.lower() in {"", "nan", "none"}:
        return ""
    return text

def _parse_pm_list(value: object) -> list[str]:
    if isinstance(value, list):
        return [str(x).strip() for x in value if str(x).strip()]
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        try:
            parsed = json.loads(text)
            if isinstance(parsed, list):
                return [str(x).strip() for x in parsed if str(x).strip()]
        except Exception:
            pass
    return []

def _to_float(value: object, default: float = -1.0) -> float:
    try:
        return float(value)
    except Exception:
        return default

def _norm_pm_text(text: object) -> str:
    out = str(text or "").strip().lower()
    out = re.sub(r"[^a-z0-9\s]", " ", out)
    out = re.sub(r"\s+", " ", out).strip()
    return out

def _pm_child_matches_event(event_title: str, market: dict) -> bool:
    event_norm = _norm_pm_text(event_title)
    if not event_norm:
        return False

    child_parts = [
        market.get("question", ""),
        market.get("title", ""),
        market.get("subtitle", ""),
        market.get("groupItemTitle", ""),
    ]
    child_norm = _norm_pm_text(" ".join([str(x or "") for x in child_parts]))
    if not child_norm:
        return False

    sim = SequenceMatcher(None, event_norm, child_norm).ratio()
    event_tokens = set(event_norm.split())
    child_tokens = set(child_norm.split())
    overlap = len(event_tokens.intersection(child_tokens))

    if sim >= 0.60:
        return True
    if overlap >= 3:
        return True
    return False

def _format_pm_market_options(options: list[str], max_items: int = 12) -> str:
    unique = list(dict.fromkeys([_clean_pm_option(x) for x in options if _clean_pm_option(x)]))
    if not unique:
        return ""
    if len(unique) > max_items:
        return " | ".join(unique[:max_items]) + f" | +{len(unique) - max_items} more"
    return " | ".join(unique)

def _format_pm_price_ladder(price_rows: list[dict[str, object]], max_items: int = 5) -> str:
    if not price_rows:
        return ""

    by_option: dict[str, dict[str, object]] = {}
    for row in price_rows:
        option = _clean_pm_option(row.get("option"))
        if not option:
            continue

        yes_price = _to_float(row.get("yes_price"), -1.0)
        no_price = _to_float(row.get("no_price"), -1.0)
        if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
            continue

        key = option.lower()
        prev = by_option.get(key)
        if prev is None or yes_price > float(prev["yes_price"]):
            by_option[key] = {
                "option": option,
                "yes_price": yes_price,
                "no_price": no_price,
            }

    ranked = sorted(
        by_option.values(),
        key=lambda row: (float(row["yes_price"]), -float(row["no_price"]), str(row["option"]).lower()),
        reverse=True,
    )
    if not ranked:
        return ""

    shown = ranked[:max_items]
    items = [
        f"({row['option']}; Yes {float(row['yes_price']):.2f}; No {float(row['no_price']):.2f})"
        for row in shown
    ]
    if len(ranked) > max_items:
        items.append(f"+{len(ranked) - max_items} more")
    return " | ".join(items)

def _extract_pm_market_price_rows(event: dict, event_title: str) -> list[dict[str, object]]:
    out: list[dict[str, object]] = []
    markets = event.get("markets", []) if isinstance(event, dict) else []
    if not isinstance(markets, list):
        return out

    for market in markets:
        if not isinstance(market, dict):
            continue
        if not _pm_child_matches_event(event_title, market):
            continue

        outcomes = _parse_pm_list(market.get("outcomes"))
        prices = _parse_pm_list(market.get("outcomePrices"))
        yes_price = -1.0
        no_price = -1.0

        if len(outcomes) == len(prices) and outcomes:
            for idx, outcome in enumerate(outcomes):
                label = str(outcome).strip().lower()
                if label == "yes":
                    yes_price = _to_float(prices[idx], -1.0)
                elif label == "no":
                    no_price = _to_float(prices[idx], -1.0)

        if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
            continue

        option = ""
        for key in ["groupItemTitle", "subtitle", "title"]:
            option = _clean_pm_option(market.get(key))
            if option and option.lower() not in {"yes", "no"}:
                break
            option = ""

        if not option:
            non_binary = [x for x in outcomes if _clean_pm_option(x).lower() not in {"", "yes", "no"}]
            if non_binary:
                option = _clean_pm_option(non_binary[0])

        if not option:
            continue

        out.append({
            "option": option,
            "yes_price": yes_price,
            "no_price": no_price,
        })

    return out

def fetch_open_polymarket_events(
    max_events: int = 10000,
    page_size: int = 200,
    politics_only: bool = PM_POLITICS_ONLY,
    ) -> list[dict[str, str]]:
    events_by_title: dict[str, dict[str, str]] = {}
    offset = 0

    while len(events_by_title) < max_events:
        params = {
            "limit": min(page_size, max_events - len(events_by_title)),
            "offset": offset,
            "active": True,
            "closed": False,
            "archived": False,
        }

        resp = requests.get(PM_EVENTS_API_URL, params=params, timeout=30)
        resp.raise_for_status()
        events = resp.json()
        if not isinstance(events, list) or not events:
            break

        for event in events:
            title = str(event.get("title", "")).strip()
            if not title:
                continue
            if politics_only and not _is_pm_politics_event(title):
                continue
            if title in events_by_title:
                continue

            price_rows = _extract_pm_market_price_rows(event, title)
            options = [str(row["option"]) for row in price_rows]
            events_by_title[title] = {
                "market": _format_pm_market_options(options, max_items=12),
                "market_prices_top5": _format_pm_price_ladder(price_rows, max_items=5),
            }

        offset += len(events)
        if len(events) < params["limit"]:
            break

    rows = [
        {
            "question": question,
            "market": events_by_title[question]["market"],
            "market_prices_top5": events_by_title[question]["market_prices_top5"],
        }
        for question in sorted(events_by_title.keys(), key=lambda s: s.lower())
    ]
    return rows

pm_event_rows = fetch_open_polymarket_events()
polymarket_open_questions_df = pd.DataFrame(pm_event_rows, columns=["question", "market", "market_prices_top5"])
polymarket_open_questions_df.to_csv(PM_OPEN_EXPORT_PATH, index=False)
print(f"Saved {len(polymarket_open_questions_df)} unique open Polymarket event questions to {PM_OPEN_EXPORT_PATH}")
polymarket_open_questions_df.head(20)
# Cell 9: Fuzzy match Kalshi and Polymarket open question sets to find potential overlaps for manual review.
KALSHI_INPUT_PATH = Path("results/kalshi_open_questions_prices.csv")
POLYMARKET_INPUT_PATH = Path("results/polymarket_open_questions_prices.csv")
JOINED_OUTPUT_PATH = Path("results/kalshi_polymarket_question_join.csv")
JOINED_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

MATCH_THRESHOLD = 0.62
HIGH_CONF_THRESHOLD = 0.80

GENERIC_TOKENS = {
    "election", "winner", "wins", "win", "nominee", "nominees", "primary", "primaries",
    "democratic", "republican", "president", "presidential", "senate", "house", "governor",
    "party", "parties", "mayor", "special", "round", "first", "second", "third", "place",
    "who", "will", "before", "after", "by", "for", "in", "of", "the", "and", "to",
}

kalshi_questions_df = pd.read_csv(KALSHI_INPUT_PATH)
polymarket_questions_df = pd.read_csv(POLYMARKET_INPUT_PATH)

def _clean_text(value: object) -> str:
    text = "" if value is None else str(value).strip()
    return "" if text.lower() in {"", "nan", "none"} else text

def _first_present_str(row: pd.Series, cols: list[str]) -> str:
    for col in cols:
        if col in row.index:
            text = _clean_text(row[col])
            if text:
                return text
    return ""

def _build_value_map(df: pd.DataFrame, value_cols: list[str]) -> dict[str, str]:
    if "question" not in df.columns:
        return {}
    out: dict[str, str] = {}
    for _, row in df.iterrows():
        question = _clean_text(row.get("question", ""))
        if not question:
            continue
        value = _first_present_str(row, value_cols)
        if value and question not in out:
            out[question] = value
    return out

def _build_market_map(df: pd.DataFrame, provider: str) -> dict[str, str]:
    q_col = "question" if "question" in df.columns else None
    if q_col is None:
        return {}

    out: dict[str, str] = {}
    market_cols = ["market", "market_id", "id", "ticker", "slug", "event_slug"]
    for _, row in df.iterrows():
        question = _clean_text(row.get(q_col, ""))
        if not question:
            continue

        market_value = _first_present_str(row, ["url", "market_url"])
        if not market_value:
            market_value = _first_present_str(row, market_cols)
            if market_value and ("slug" in row.index or "event_slug" in row.index):
                slug = _first_present_str(row, ["slug", "event_slug"])
                if slug:
                    market_value = f"https://kalshi.com/markets/{slug}" if provider == "kalshi" else f"https://polymarket.com/event/{slug}"

        if market_value and question not in out:
            out[question] = market_value
    return out

def _merge_maps(base: dict[str, str], extra: dict[str, str]) -> dict[str, str]:
    merged = dict(base)
    for question, market in extra.items():
        if question not in merged or not merged[question]:
            merged[question] = market
    return merged

def _safe_read_csv(path: Path) -> pd.DataFrame:
    try:
        return pd.read_csv(path)
    except pd.errors.EmptyDataError:
        return pd.DataFrame()

def _parse_text_list(value: object) -> list[str]:
    if isinstance(value, list):
        return [_clean_text(x) for x in value if _clean_text(x)]
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        try:
            parsed = json.loads(text)
            if isinstance(parsed, list):
                return [_clean_text(x) for x in parsed if _clean_text(x)]
        except Exception:
            pass
        return [_clean_text(x) for x in re.split(r"[|,]", text) if _clean_text(x)]
    return []

def _format_market_options(options: list[str], max_items: int = 8) -> str:
    unique = list(dict.fromkeys([_clean_text(x) for x in options if _clean_text(x)]))
    if not unique:
        return ""
    if len(unique) > max_items:
        return " | ".join(unique[:max_items]) + f" | +{len(unique) - max_items} more"
    return " | ".join(unique)

def _build_live_market_option_map(provider: str) -> dict[str, str]:
    out: dict[str, str] = {}
    try:
        if provider == "kalshi":
            max_markets = max(int(globals().get("MAX_KALSHI_MARKETS", 800)), 2500)
            live_markets = fetch_markets_kalshi(max_markets=max_markets, page_size=200)
            grouped: dict[str, list[str]] = {}
            for market in live_markets:
                question = _clean_text(market.get("title", market.get("subtitle", "")))
                if not question:
                    continue
                option = _clean_text(market.get("subtitle"))
                if not option:
                    option = _clean_text(market.get("yes_sub_title"))
                if option:
                    grouped.setdefault(question, []).append(option)
            for question, options in grouped.items():
                formatted = _format_market_options(options)
                if formatted:
                    out[question] = formatted
        else:
            max_markets = max(int(globals().get("MAX_PM_MARKETS", 800)), 2500)
            live_markets = fetch_markets(max_markets=max_markets, page_size=200)
            for market in live_markets:
                question = _clean_text(market.get("question", ""))
                if not question or question in out:
                    continue
                outcomes = _parse_text_list(market.get("outcomes"))
                formatted = _format_market_options(outcomes)
                if formatted:
                    out[question] = formatted
    except Exception as exc:
        print(f"Live {provider} market enrichment skipped: {exc}")
    return out

kalshi_market_map = _build_market_map(kalshi_questions_df, provider="kalshi")
polymarket_market_map = _build_market_map(polymarket_questions_df, provider="polymarket")
kalshi_market_prices_map = _build_value_map(kalshi_questions_df, ["market_prices_top5", "market_prices"])
polymarket_market_prices_map = _build_value_map(polymarket_questions_df, ["market_prices_top5", "market_prices"])

KALSHI_EDGE_PATH = Path("results/kalshi_edge_candidates.csv")
POLYMARKET_EDGE_PATH = Path("results/polymarket_edge_candidates.csv")
CROSS_CANDIDATES_PATH = Path("results/cross_market_arbitrage_candidates.csv")

if KALSHI_EDGE_PATH.exists():
    ks_edge_df = _safe_read_csv(KALSHI_EDGE_PATH)
    if not ks_edge_df.empty:
        kalshi_market_map = _merge_maps(kalshi_market_map, _build_market_map(ks_edge_df, provider="kalshi"))

if POLYMARKET_EDGE_PATH.exists():
    pm_edge_df = _safe_read_csv(POLYMARKET_EDGE_PATH)
    if not pm_edge_df.empty:
        polymarket_market_map = _merge_maps(polymarket_market_map, _build_market_map(pm_edge_df, provider="polymarket"))

if CROSS_CANDIDATES_PATH.exists():
    cross_df = _safe_read_csv(CROSS_CANDIDATES_PATH)
    if cross_df.empty:
        cross_df = pd.DataFrame()
    if "kalshi_question" in cross_df.columns and "kalshi_url" in cross_df.columns:
        for _, row in cross_df[["kalshi_question", "kalshi_url"]].dropna().drop_duplicates().iterrows():
            question = _clean_text(row["kalshi_question"])
            market = _clean_text(row["kalshi_url"])
            if question and market and question not in kalshi_market_map:
                kalshi_market_map[question] = market
    if "pm_question" in cross_df.columns and "pm_url" in cross_df.columns:
        for _, row in cross_df[["pm_question", "pm_url"]].dropna().drop_duplicates().iterrows():
            question = _clean_text(row["pm_question"])
            market = _clean_text(row["pm_url"])
            if question and market and question not in polymarket_market_map:
                polymarket_market_map[question] = market

kalshi_live_market_map = _build_live_market_option_map(provider="kalshi")
if kalshi_live_market_map:
    kalshi_market_map = _merge_maps(kalshi_live_market_map, kalshi_market_map)

polymarket_live_market_map = _build_live_market_option_map(provider="polymarket")
if polymarket_live_market_map:
    polymarket_market_map = _merge_maps(polymarket_live_market_map, polymarket_market_map)

kalshi_questions = [
    str(q).strip()
    for q in kalshi_questions_df.get("question", pd.Series(dtype=str)).dropna().tolist()
    if str(q).strip()
]
polymarket_questions = [
    str(q).strip()
    for q in polymarket_questions_df.get("question", pd.Series(dtype=str)).dropna().tolist()
    if str(q).strip()
]

def _norm_text(text: str) -> str:
    out = text.lower()
    out = re.sub(r"[^a-z0-9\s]", " ", out)
    out = re.sub(r"\s+", " ", out).strip()
    return out

def _tokenize(text: str) -> set[str]:
    return {tok for tok in _norm_text(text).split() if len(tok) >= 2 and not tok.isdigit()}

def _content_tokens(text: str) -> set[str]:
    return {tok for tok in _tokenize(text) if tok not in GENERIC_TOKENS and len(tok) >= 3}

def _numbers(text: str) -> set[str]:
    return set(re.findall(r"\d+", text))

def _jaccard(a: set[str], b: set[str]) -> float:
    if not a or not b:
        return 0.0
    inter = len(a.intersection(b))
    union = len(a.union(b))
    return inter / union if union else 0.0

def _question_features(question: str) -> dict[str, object]:
    norm = _norm_text(question)
    tokens = {tok for tok in norm.split() if len(tok) >= 2 and not tok.isdigit()}
    content = {tok for tok in tokens if tok not in GENERIC_TOKENS and len(tok) >= 3}
    numbers = set(re.findall(r"\d+", question))
    return {
        "question": question,
        "norm": norm,
        "tokens": tokens,
        "content": content,
        "numbers": numbers,
    }

def _match_score(k_feat: dict[str, object], p_feat: dict[str, object]) -> float:
    seq = SequenceMatcher(None, k_feat["norm"], p_feat["norm"]).ratio()
    jac_all = _jaccard(k_feat["tokens"], p_feat["tokens"])
    jac_content = _jaccard(k_feat["content"], p_feat["content"])
    return 0.55 * seq + 0.20 * jac_all + 0.25 * jac_content

def _normalize_option_text(text: str) -> str:
    out = _clean_text(text).lower()
    out = out.replace("current incumbent:", "").replace("retiring incumbent:", "")
    out = re.sub(r"[^a-z0-9\s]", " ", out)
    out = re.sub(r"\s+", " ", out).strip()
    return out

def _parse_price_ladder(text: str) -> list[dict[str, object]]:
    out: list[dict[str, object]] = []
    if not text:
        return out
    pattern = re.compile(r"\((.*?); Yes ([0-9]*\.?[0-9]+); No ([0-9]*\.?[0-9]+)\)")
    for option, yes_raw, no_raw in pattern.findall(text):
        try:
            yes_price = float(yes_raw)
            no_price = float(no_raw)
        except Exception:
            continue
        if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
            continue
        out.append({
            "option": _clean_text(option),
            "option_norm": _normalize_option_text(option),
            "yes": yes_price,
            "no": no_price,
        })
    return out

def _arb_potential_score(kalshi_ladder: str, polymarket_ladder: str) -> float:
    ks_rows = _parse_price_ladder(kalshi_ladder)
    pm_rows = _parse_price_ladder(polymarket_ladder)
    if not ks_rows or not pm_rows:
        return -1.0

    best = -1.0
    matched = False
    for ks in ks_rows:
        for pm in pm_rows:
            sim = SequenceMatcher(None, str(ks["option_norm"]), str(pm["option_norm"])).ratio()
            if sim < 0.72:
                continue
            matched = True
            ks_yes = float(ks["yes"])
            ks_no = float(ks["no"])
            pm_yes = float(pm["yes"])
            pm_no = float(pm["no"])

            locked_yes_pm_no_ks = 1.0 - (pm_yes + ks_no)
            locked_yes_ks_no_pm = 1.0 - (ks_yes + pm_no)
            best = max(best, locked_yes_pm_no_ks, locked_yes_ks_no_pm)

    if not matched:
        return -1.0
    return round(best, 4)

# Deduplicate while preserving order.
kalshi_questions = list(dict.fromkeys(kalshi_questions))
polymarket_questions = list(dict.fromkeys(polymarket_questions))

kalshi_features = [_question_features(q) for q in kalshi_questions]
polymarket_features = [_question_features(q) for q in polymarket_questions]

pm_indices_no_numbers: list[int] = []
pm_indices_by_number: dict[str, list[int]] = {}
for idx, feat in enumerate(polymarket_features):
    nums = feat["numbers"]
    if not nums:
        pm_indices_no_numbers.append(idx)
    else:
        for n in nums:
            pm_indices_by_number.setdefault(n, []).append(idx)

candidate_matches = []
for k_feat in kalshi_features:
    kalshi_q = k_feat["question"]
    kalshi_numbers = k_feat["numbers"]
    kalshi_content = k_feat["content"]

    if kalshi_numbers:
        candidate_idx = set(pm_indices_no_numbers)
        for n in kalshi_numbers:
            candidate_idx.update(pm_indices_by_number.get(n, []))
        candidate_indices = sorted(candidate_idx)
    else:
        candidate_indices = range(len(polymarket_features))

    for pm_idx in candidate_indices:
        p_feat = polymarket_features[pm_idx]
        polymarket_q = p_feat["question"]
        polymarket_numbers = p_feat["numbers"]
        polymarket_content = p_feat["content"]

        if kalshi_numbers and polymarket_numbers and not kalshi_numbers.intersection(polymarket_numbers):
            continue

        content_overlap = len(kalshi_content.intersection(polymarket_content))
        score = _match_score(k_feat, p_feat)

        if content_overlap == 0 and score < HIGH_CONF_THRESHOLD:
            continue
        if score >= MATCH_THRESHOLD:
            candidate_matches.append((score, kalshi_q, polymarket_q))

candidate_matches.sort(key=lambda x: x[0], reverse=True)

used_kalshi = set()
used_polymarket = set()
rows = []
for score, kalshi_q, polymarket_q in candidate_matches:
    if kalshi_q in used_kalshi or polymarket_q in used_polymarket:
        continue
    used_kalshi.add(kalshi_q)
    used_polymarket.add(polymarket_q)
    kalshi_prices = kalshi_market_prices_map.get(kalshi_q, "")
    polymarket_prices = polymarket_market_prices_map.get(polymarket_q, "")
    arb_score = _arb_potential_score(kalshi_prices, polymarket_prices)
    rows.append({
        "kalshi.question": kalshi_q,
        "kalshi.market_prices_top5": kalshi_prices,
        "polymarket.question": polymarket_q,
        "polymarket.market_prices_top5": polymarket_prices,
        "arb_potential_score": arb_score,
    })

joined_df = pd.DataFrame(rows)
if not joined_df.empty:
    joined_df = joined_df.sort_values(
        by=["arb_potential_score", "kalshi.question", "polymarket.question"],
        ascending=[False, True, True],
        na_position="last",
    ).reset_index(drop=True)

joined_df.to_csv(JOINED_OUTPUT_PATH, index=False)
print(f"Saved {len(joined_df)} fuzzy-matched pairs to {JOINED_OUTPUT_PATH}")
joined_df.head(30)
# Cell 10: Build a focused list of the most probable arbitrage opportunities with selection reasons.
TOP_ARB_INPUT_PATH = Path("results/kalshi_polymarket_question_join.csv")
TOP_ARB_OUTPUT_PATH = Path("results/kalshi_polymarket_top_arb_opportunities.csv")
TOP_ARB_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

TOP_N_ARB = 40
MIN_ARB_SCORE = 0.01

def _bucket_score(score: float) -> str:
    if score >= 0.25:
        return "very_high"
    if score >= 0.10:
        return "high"
    if score >= 0.03:
        return "moderate"
    return "low"

def _selection_reason(row: pd.Series) -> str:
    score = float(row.get("arb_potential_score", 0.0))
    reasons: list[str] = [f"arb_potential_score={score:.2f}"]

    bucket = _bucket_score(score)
    if bucket == "very_high":
        reasons.append("large estimated locked-return spread")
    elif bucket == "high":
        reasons.append("strong estimated locked-return spread")
    elif bucket == "moderate":
        reasons.append("meaningful positive spread")
    else:
        reasons.append("small but positive spread")

    has_ks = bool(_clean_text(row.get("kalshi.market_prices_top5", "")))
    has_pm = bool(_clean_text(row.get("polymarket.market_prices_top5", "")))
    if has_ks and has_pm:
        reasons.append("both venues have top-5 ladder data")

    return "; ".join(reasons)

join_df = pd.read_csv(TOP_ARB_INPUT_PATH)
if join_df.empty:
    top_arb_df = join_df.copy()
else:
    join_df["arb_potential_score"] = pd.to_numeric(join_df.get("arb_potential_score", -1.0), errors="coerce")

    join_df["has_kalshi_prices"] = join_df.get("kalshi.market_prices_top5", "").astype(str).str.strip().ne("")
    join_df["has_polymarket_prices"] = join_df.get("polymarket.market_prices_top5", "").astype(str).str.strip().ne("")
    join_df["has_both_prices"] = join_df["has_kalshi_prices"] & join_df["has_polymarket_prices"]

    candidates = join_df[
        (join_df["has_both_prices"]) &
        (join_df["arb_potential_score"].notna()) &
        (join_df["arb_potential_score"] >= MIN_ARB_SCORE)
    ].copy()

    candidates["selection_bucket"] = candidates["arb_potential_score"].apply(_bucket_score)
    candidates["selection_reason"] = candidates.apply(_selection_reason, axis=1)

    top_arb_df = candidates.sort_values(
        by=["arb_potential_score", "kalshi.question", "polymarket.question"],
        ascending=[False, True, True],
    ).head(TOP_N_ARB).reset_index(drop=True)

    keep_cols = [
        "kalshi.question",
        "polymarket.question",
        "arb_potential_score",
        "selection_bucket",
        "selection_reason",
        "kalshi.market_prices_top5",
        "polymarket.market_prices_top5",
    ]
    top_arb_df = top_arb_df[[c for c in keep_cols if c in top_arb_df.columns]]

top_arb_df.to_csv(TOP_ARB_OUTPUT_PATH, index=False)
print(f"Saved {len(top_arb_df)} probable arbitrage opportunities to {TOP_ARB_OUTPUT_PATH}")
top_arb_df.head(20)
# Cell 11: Create Kalshi-only actionable bet recommendations with plain-English reasons.
KALSHI_BET_INPUT_PATH = Path("results/kalshi_polymarket_top_arb_opportunities.csv")
KALSHI_BET_OUTPUT_PATH = Path("results/kalshi_only_bet_recommendations.csv")
KALSHI_BET_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

TOP_KALSHI_BETS = 30
MIN_PRICE_GAP = 0.03
MIN_OPTION_SIMILARITY = 0.72
MIN_KALSHI_YES_FOR_BUY_YES = 0.20

def _norm_option_text(text: object) -> str:
    out = str(text or "").strip().lower()
    out = out.replace("current incumbent:", "").replace("retiring incumbent:", "")
    out = re.sub(r"[^a-z0-9\s]", " ", out)
    out = re.sub(r"\s+", " ", out).strip()
    return out

def _parse_ladder_rows(text: object) -> list[dict[str, object]]:
    out: list[dict[str, object]] = []
    raw = str(text or "").strip()
    if not raw:
        return out

    pattern = re.compile(r"\((.*?); Yes ([0-9]*\.?[0-9]+); No ([0-9]*\.?[0-9]+)\)")
    for option, yes_raw, no_raw in pattern.findall(raw):
        try:
            yes_price = float(yes_raw)
            no_price = float(no_raw)
        except Exception:
            continue
        if not (0 <= yes_price <= 1 and 0 <= no_price <= 1):
            continue
        option_txt = str(option).strip()
        if not option_txt:
            continue
        out.append({
            "option": option_txt,
            "option_norm": _norm_option_text(option_txt),
            "yes": yes_price,
            "no": no_price,
        })
    return out

def _best_pm_match(kalshi_row: dict[str, object], pm_rows: list[dict[str, object]]) -> tuple[dict[str, object] | None, float]:
    best_row = None
    best_sim = 0.0
    k_norm = str(kalshi_row.get("option_norm", ""))
    for pm_row in pm_rows:
        p_norm = str(pm_row.get("option_norm", ""))
        sim = SequenceMatcher(None, k_norm, p_norm).ratio()
        if sim > best_sim:
            best_sim = sim
            best_row = pm_row
    if best_sim < MIN_OPTION_SIMILARITY:
        return None, best_sim
    return best_row, best_sim

def _reason_text(
    side: str,
    option: str,
    kalshi_question: str,
    kalshi_yes: float,
    pm_yes: float,
    gap_abs: float,
) -> str:
    if side == "BUY_YES_KALSHI":
        return (
            f"Place bet on {option} (YES) on Kalshi for '{kalshi_question}' because "
            f"Kalshi has {kalshi_yes:.0%} chance while Polymarket has {pm_yes:.0%} "
            f"(difference {gap_abs:.0%})."
        )
    return (
        f"Place bet on {option} (NO) on Kalshi for '{kalshi_question}' because "
        f"Kalshi has {kalshi_yes:.0%} YES chance while Polymarket has {pm_yes:.0%}, "
        f"so Kalshi appears overpriced by {gap_abs:.0%}."
    )

def _liquidity_score(yes_price: float, no_price: float) -> float:
    overround = abs((yes_price + no_price) - 1.0)
    consistency_gap = abs(yes_price - (1.0 - no_price))
    raw = 1.0 - (2.5 * overround + 2.5 * consistency_gap)
    return round(max(0.0, min(1.0, raw)), 4)

def _estimate_days_to_resolution(question_text: str) -> int:
    now = pd.Timestamp.now().normalize()
    m = re.search(r"\b(20[2-4][0-9])\b", str(question_text or ""))
    if not m:
        return -1
    year = int(m.group(1))

    q = str(question_text or "").lower()
    if any(tok in q for tok in ["primary", "nominee"]):
        target = pd.Timestamp(year=year, month=6, day=30)
    else:
        target = pd.Timestamp(year=year, month=11, day=5)

    delta_days = int((target - now).days)
    return max(delta_days, 0)

def _confidence_tier(drift_score: float, option_similarity: float, min_liq: float, days_to_resolution: int) -> str:
    if drift_score >= 0.08 and option_similarity >= 0.95 and min_liq >= 0.75 and (days_to_resolution == -1 or days_to_resolution <= 365):
        return "high"
    if drift_score >= 0.04 and option_similarity >= 0.85 and min_liq >= 0.55:
        return "medium"
    return "low"

def _risk_notes(side: str, option_similarity: float, min_liq: float, days_to_resolution: int, expected_value: float, kalshi_yes: float) -> str:
    notes: list[str] = []
    if min_liq < 0.50:
        notes.append("thin liquidity")
    if option_similarity < 0.85:
        notes.append("option text mismatch risk")
    if days_to_resolution == -1:
        notes.append("unknown resolution date")
    elif days_to_resolution > 400:
        notes.append("long time-to-resolution")
    if expected_value <= 0:
        notes.append("edge may be consumed by fees")
    if side == "BUY_YES_KALSHI" and kalshi_yes < 0.35:
        notes.append("long-shot yes volatility")
    if not notes:
        notes.append("standard execution risk")
    return "; ".join(notes)

bet_source_df = pd.read_csv(KALSHI_BET_INPUT_PATH)
recommendation_rows: list[dict[str, object]] = []

for _, row in bet_source_df.iterrows():
    kalshi_q = str(row.get("kalshi.question", "")).strip()
    pm_q = str(row.get("polymarket.question", "")).strip()
    source_score = float(pd.to_numeric(row.get("arb_potential_score", -1), errors="coerce"))

    kalshi_ladder = _parse_ladder_rows(row.get("kalshi.market_prices_top5", ""))
    pm_ladder = _parse_ladder_rows(row.get("polymarket.market_prices_top5", ""))
    if not kalshi_ladder or not pm_ladder:
        continue

    for ks in kalshi_ladder:
        pm_match, opt_sim = _best_pm_match(ks, pm_ladder)
        if pm_match is None:
            continue

        kalshi_yes = float(ks["yes"])
        kalshi_no = float(ks["no"])
        pm_yes = float(pm_match["yes"])
        yes_gap = pm_yes - kalshi_yes
        gap_abs = abs(yes_gap)

        if gap_abs < MIN_PRICE_GAP:
            continue

        side = "BUY_YES_KALSHI" if yes_gap > 0 else "BUY_NO_KALSHI"
        if side == "BUY_YES_KALSHI" and kalshi_yes < MIN_KALSHI_YES_FOR_BUY_YES:
            continue

        kalshi_liquidity_score = _liquidity_score(kalshi_yes, kalshi_no)
        polymarket_no = float(pm_match.get("no", 1.0 - pm_yes))
        polymarket_liquidity_score = _liquidity_score(pm_yes, polymarket_no)
        days_to_resolution = _estimate_days_to_resolution(kalshi_q)
        drift_score = round(gap_abs * ((kalshi_liquidity_score + polymarket_liquidity_score) / 2.0), 4)
        min_liq = min(kalshi_liquidity_score, polymarket_liquidity_score)

        fee_buffer = float(FEE_BUFFER) if "FEE_BUFFER" in globals() else 0.01
        expected_value = round(gap_abs - fee_buffer, 4)
        confidence_tier = _confidence_tier(drift_score, opt_sim, min_liq, days_to_resolution)
        risk_notes = _risk_notes(side, opt_sim, min_liq, days_to_resolution, expected_value, kalshi_yes)

        recommendation_rows.append({
            "kalshi.question": kalshi_q,
            "polymarket.question": pm_q,
            "kalshi_option": str(ks["option"]),
            "polymarket_option_match": str(pm_match["option"]),
            "recommended_kalshi_bet": side,
            "kalshi_yes_price": round(kalshi_yes, 4),
            "kalshi_no_price": round(kalshi_no, 4),
            "polymarket_yes_price": round(pm_yes, 4),
            "yes_price_gap": round(yes_gap, 4),
            "abs_price_gap": round(gap_abs, 4),
            "kalshi_liquidity_score": kalshi_liquidity_score,
            "polymarket_liquidity_score": polymarket_liquidity_score,
            "days_to_resolution": days_to_resolution,
            "drift_score": drift_score,
            "confidence_tier": confidence_tier,
            "expected_value_kalshi_only": expected_value,
            "risk_notes": risk_notes,
            "option_similarity": round(opt_sim, 4),
            "source_arb_potential_score": round(source_score, 4),
            "recommendation_reason": _reason_text(
                side=side,
                option=str(ks["option"]),
                kalshi_question=kalshi_q,
                kalshi_yes=kalshi_yes,
                pm_yes=pm_yes,
                gap_abs=gap_abs,
            ),
        })

recommendations_df = pd.DataFrame(recommendation_rows)
if recommendations_df.empty:
    final_bets_df = recommendations_df
else:
    final_bets_df = recommendations_df.sort_values(
        by=["abs_price_gap", "source_arb_potential_score", "option_similarity"],
        ascending=[False, False, False],
    ).drop_duplicates(
        subset=["kalshi.question", "kalshi_option", "recommended_kalshi_bet"],
        keep="first",
    ).head(TOP_KALSHI_BETS).reset_index(drop=True)

final_bets_df.to_csv(KALSHI_BET_OUTPUT_PATH, index=False)
print(f"Saved {len(final_bets_df)} Kalshi-only bet recommendations to {KALSHI_BET_OUTPUT_PATH}")
final_bets_df.head(20)

Saved 996 unique open Polymarket event questions to results\polymarket_open_questions_prices.csv
Saved 228 fuzzy-matched pairs to results\kalshi_polymarket_question_join.csv
Saved 40 probable arbitrage opportunities to results\kalshi_polymarket_top_arb_opportunities.csv


TypeError: Cannot subtract tz-naive and tz-aware datetime-like objects.