# 🎾 Player Match Analysis
This notebook fetches:
1. Historical head‑to‑head matches  
2. A model prediction for the next match  
3. Recent news articles for each player


In [55]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
import os, re, time, json, pathlib, uuid, pandas as pd
from utils.predict_between_two_players import predict_match
from dotenv import load_dotenv
from tavily import TavilyClient
from concurrent.futures import ThreadPoolExecutor, as_completed
import textwrap



In [56]:
# --- Configuration ---
PLAYER1_ID = 207989   # Carlos Alcaraz
PLAYER2_ID = 206173   # Jannick Sinner

SURFACE   = "Clay"   # Hard | Clay | Grass | Carpet
BEST_OF   = 3        # 3 or 5
DRAW_SIZE = 128      # Optional tournament draw size

TIME_RANGE = "week"  #  day / week / month / year
NEWS_MAX   = 10
SCORE_CUT  = 0.4


In [43]:

DATA_PATH = "../data/cleanedDataset.csv"

# Helper: historical matches
def get_matches_between_players(p1_id: int, p2_id: int, csv_path: str = DATA_PATH) -> pd.DataFrame:
    df = pd.read_csv(csv_path)
    return df[((df['p1_id'] == p1_id) & (df['p2_id'] == p2_id)) |
              ((df['p1_id'] == p2_id) & (df['p2_id'] == p1_id))]

# Helper: Tavily news
load_dotenv()
tavily = TavilyClient(os.getenv("TAVILY_API_KEY"))

def tavily_news(player_name: str,
                time_range: str,
                max_results: int,
                score_cut: float) -> list[dict]:
    query = f"What are the latest news for tennis player {player_name}?"
    search = tavily.search(
        query=query,
        search_depth="advanced",
        time_range=time_range,
        max_results=max_results,
        include_answer="basic"
    )
    urls = [r['url'] for r in search['results'] if r.get('score',0) >= score_cut]
    return tavily.extract(urls=urls)['results'] if urls else []


In [44]:
matches_df = get_matches_between_players(PLAYER1_ID, PLAYER2_ID)
print(f"Found {len(matches_df)} historical matches")
matches_df


Found 10 historical matches


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,p1_id,p1_seed,p1_entry,...,p2_1stWon,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,RESULT
84593,2021-0352,Paris Masters,Hard,64,M,20211101,273,207989,,,...,33.0,18.0,12.0,9.0,11.0,35.0,1529.0,9.0,3395.0,1
88540,2022-540,Wimbledon,Grass,128,G,20220627,213,207989,5.0,,...,61.0,40.0,21.0,7.0,7.0,7.0,4890.0,13.0,3185.0,0
88718,2022-0439,Umag,Clay,32,A,20220725,300,206173,2.0,,...,31.0,14.0,13.0,4.0,9.0,10.0,3185.0,5.0,4895.0,1
89097,2022-560,Us Open,Hard,128,G,20220829,222,207989,3.0,,...,75.0,43.0,27.0,15.0,26.0,4.0,5100.0,13.0,3020.0,1
90245,2023-0404,Indian Wells Masters,Hard,128,M,20230306,299,206173,11.0,,...,38.0,9.0,11.0,1.0,2.0,13.0,2655.0,2.0,6780.0,0
90340,2023-0403,Miami Masters,Hard,128,M,20230320,299,206173,10.0,,...,41.0,21.0,15.0,6.0,12.0,11.0,2925.0,1.0,7420.0,1
91952,2023-0747,Beijing,Hard,32,A,20230927,299,206173,6.0,,...,27.0,5.0,10.0,3.0,8.0,7.0,4465.0,2.0,8535.0,1
93057,2024-0404,Indian Wells Masters,Hard,128,M,20240304,298,207989,2.0,,...,30.0,15.0,12.0,3.0,6.0,2.0,8805.0,3.0,8270.0,1
93720,2024-520,Roland Garros,Clay,128,G,20240527,400,206173,2.0,,...,68.0,25.0,23.0,4.0,10.0,2.0,8770.0,3.0,7300.0,0
94808,2024-0747,Beijing,Hard,32,A,20240926,400,207989,2.0,,...,47.0,31.0,17.0,12.0,15.0,3.0,6690.0,1.0,11180.0,1


In [45]:
prediction_str = predict_match(
    player1_id=PLAYER1_ID,
    player2_id=PLAYER2_ID,
    surface=SURFACE,
    best_of=BEST_OF,
    draw_size=DRAW_SIZE
)
prediction_str


'Carlos Alcaraz is predicted to win — Carlos Alcaraz 51.4% | Jannik Sinner 48.6%'

In [46]:
# Get player names (fallback to IDs if the H2H dataframe is empty)
p1_name = matches_df.iloc[0]['p1_name'] if not matches_df.empty else str(PLAYER1_ID)
p2_name = matches_df.iloc[0]['p2_name'] if not matches_df.empty else str(PLAYER2_ID)

# Tavily news search
news_p1 = tavily_news(p1_name, TIME_RANGE, NEWS_MAX, SCORE_CUT)
news_p2 = tavily_news(p2_name, TIME_RANGE, NEWS_MAX, SCORE_CUT)

# Extract raw article text
raw_p1_texts = [article.get("raw_content", "") for article in news_p1]
raw_p2_texts = [article.get("raw_content", "") for article in news_p2]

print(f"{p1_name}: {len(news_p1)} articles (raw texts: {len(raw_p1_texts)})")
print(f"{p2_name}: {len(news_p2)} articles (raw texts: {len(raw_p2_texts)})")


Carlos Alcaraz: 7 articles (raw texts: 7)
Jannik Sinner: 6 articles (raw texts: 6)


In [47]:
raw_p1_texts

['Bolavip, like Futbol Sites, is a company owned by Better Collective. All rights reserved.\n\nRafael Nadal’s former coach makes something clear about Alcaraz’s ambitions to be the best in history\n\nRafael Nadal’s former coach, Carlos Moyá, shared his thoughts on Carlos Alcaraz’s ambitions to become the greatest player in tennis history, highlighting the challenges of his unique approach.\n\nUpdated on May 07, 2025 05:46PM EDT\n\nBy Gianni Taina\n\nWhen many believed no one in Spain could replicate Rafael Nadal’s extraordinary career, Carlos Alcaraz emerged, breaking records and becoming the youngest player in history to achieve the World No. 1 ranking. Now 22, Alcaraz has made it clear that his ultimate goal is to be the best player in tennis history—but on his own terms.\n\nAlcaraz’s philosophy has sparked debate, especially following the release of his documentary “Carlos Alcaraz: My Way,” where he spoke candidly about his ambitions and challenges. “I’m paving my way to becoming th

In [48]:
raw_p2_texts

['Published Time: 2025-05-05T09:26:36Z\nJannik Sinner\'s doping ban has expired, Italian Open welcomes him back in style\n\n\nWatch\nSearch\n\nLogin\nEdit Profile\nLogout\n\n\nLive Scores\n\nNews\nTournaments\nPlayers & Rankings\nINTERVIEWS\nBaseline\n\nATP Challenger Tour\n\n\nLive Scores\n\nNews\nTournaments\nPlayers & Rankings\nINTERVIEWS\nBaseline\n\nATP Challenger Tour\n\n\n\n\n\n\n\n\nAdvertising\nRome, Italy Jannik Sinner\'s doping ban has expired, Italian Open welcomes him back in style ------------------------------------------------------------------------------- By Associated Press May 05, 2025Rome, Italy Novak Djokovic withdraws from Rome after early exits in Madrid and Monte-Carlo ------------------------------------------------------------------------------ By TENNIS.com Apr 29, 2025Rome, Italy Andy Roddick: Jannik Sinner “has to set the table in different ways” on clay ---------------------------------------------------------------------------- By David Kane Apr 27, 2025

In [49]:
load_dotenv()

llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0)  # 0 for deterministic cleaning

clean_prompt = PromptTemplate(
    input_variables=["article"],
    template="""You are a meticulous copy-editor. Given a raw scrape of a news article,
remove everything that is not the core story text (ads, navigation links, repeated lists,
social-media prompts, newsletters, quizzes, etc.). Return only clean, well-formatted
paragraphs of the article in its original language—no commentary, no summary.

RAW ARTICLE:
----------------
{article}
----------------
CLEAN ARTICLE:"""
)
clean_chain = LLMChain(llm=llm, prompt=clean_prompt)

# --- quick regex to drop obvious junk before the LLM ---------------------------
_noise_re = re.compile(r"(Advertising|Follow (Us|us)|Trending News|Free Newsletters"
                       r"|Popular Topics|Subscribe|Latest News).*", re.I)

def _pre_clean(text: str) -> str:
    return "\n".join(
        ln for ln in text.splitlines() if ln.strip() and not _noise_re.match(ln)
    )

# --- worker that one thread will run ------------------------------------------
def _clean_single(raw: str) -> str:
    # LLMChain.invoke returns a dict {'text': ...}
    result = clean_chain.invoke({"article": _pre_clean(raw)})
    return result["text"] if isinstance(result, dict) else result

# --- public helper -------------------------------------------------------------
def clean_articles_fast(raw_list: list[str],
                        *,
                        max_workers: int = 8,
                        rpm_cap: int = 90) -> list[str]:
    """
    Clean many articles in parallel.
      • max_workers  – number of concurrent threads (tweak for your CPU/rate limit)
      • rpm_cap      – OpenAI requests-per-minute cap to avoid 429s
    """
    delay = 60 / rpm_cap if rpm_cap else 0
    cleaned = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(_clean_single, art): i
                   for i, art in enumerate(raw_list) if art.strip()}
        for fut in as_completed(futures):
            try:
                cleaned.append(fut.result())
            except Exception as e:
                print(f"LLM error: {e}")
            # crude rate-limit pacing
            if delay:
                time.sleep(delay)
    return cleaned

# ─── Run it ────────────────────────────────────────────────────────────────────
clean_p1_texts = clean_articles_fast(raw_p1_texts)
clean_p2_texts = clean_articles_fast(raw_p2_texts)

print(f"P1 cleaned: {len(clean_p1_texts)}  |  P2 cleaned: {len(clean_p2_texts)}")


P1 cleaned: 7  |  P2 cleaned: 6


In [50]:
clean_p1_texts

["Carlos Alcaraz has already overcome the physical discomfort that prevented him from playing in the Mutua Madrid Open. It all happened in the final of the Conde de Godó against Holger Rune, and despite the efforts of the Murcian to play in the Spanish capital, he decided not to take risks and focus on what was to come. After a few rest days at home and several training sessions, the El Palmar native is ready to play at the ATP Rome, as a prelude to his main goal of this clay court tour: Roland Garros.\n\nThe current world number three will make an appearance at a tournament in which he has only participated once, in 2023, and fell in the third round against Fabian Marozsan. Additionally, he will coincide at the Foro Italico with Jannik Sinner, who is returning to the circuit after serving his three-month suspension and with whom he could face in a hypothetical final. The Spaniard will start his journey against Dusan Lajovic, a Serbian player whom he has faced four times and defeated o

In [51]:
clean_p2_texts

['Jannik Sinner’s doping case has been one of the biggest stories in tennis since shortly before the 2024 U.S. Open. On August 20 2024, the International Tennis Integrity Agency (ITIA) — the worldwide anti-doping agency for the sport — announced that Sinner, the men’s world No. 1, had twice tested positive for clostebol, a banned anabolic steroid, in March of that year.\n\nIt also announced that an independent hearing, convened by the ITIA, had found Sinner bore “no fault or negligence” for those positive tests, accepting his explanation that he had been contaminated by a healing spray purchased by his physio, Umberto Ferrara. Sinner’s physiotherapist, Giacomo Naldi, used the spray on a cut on his hand and then subsequently gave Sinner a massage on his back and applied treatments to his feet.\n\nSinner parted company with Ferrara and Naldi, part of the team that took him to world No. 1, on the eve of the U.S. Open, which Sinner won against Taylor Fritz.\n\nJust over a month after the I

In [52]:
docs: list[Document] = []
seen_urls = set()


def _make_docs(clean_texts, raw_articles, player_name, player_id):
    for i, (body, raw) in enumerate(zip(clean_texts, raw_articles)):
        url = raw.get("url")
        if not url or url in seen_urls:
            continue                      # skip duplicate
        seen_urls.add(url)
        docs.append(
            Document(
                page_content=body,
                metadata={
                    "player_name": player_name,
                    "player_id": player_id,
                    "url": raw.get("url"),
                    "uuid": str(uuid.uuid4())[:8]  # unique key
                }
            )
        )

_make_docs(clean_p1_texts, news_p1, p1_name, PLAYER1_ID)
_make_docs(clean_p2_texts, news_p2, p2_name, PLAYER2_ID)

print(f"Total docs to store: {len(docs)}")

Total docs to store: 13


In [53]:
persist_dir = "chroma_store"
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vectordb = Chroma.from_documents(
    documents        = docs,
    embedding        = embeddings,
    persist_directory= persist_dir
)
print("Vector DB saved to", pathlib.Path(persist_dir).resolve())


Vector DB saved to C:\Users\iosif\PycharmProjects\TennisCenter\LLMs\chroma_store


In [None]:
ruud_docs = vectordb.similarity_search(
    query   = "Rome",
    k       = 3,
    filter  = {"player_name": p1_name}   # filter by metadata
)
for d in ruud_docs:
    print("-" * 60)
    print("URL:", d.metadata["url"])
    print(d.page_content)

In [68]:
def head2head_summary(df: pd.DataFrame, p1: str, p2: str) -> str:
    if df.empty:
        return "These players have never met on tour."

    # Count wins, regardless of who is p1/p2 in each row
    wins = {p1: 0, p2: 0}
    for _, r in df.iterrows():
        # RESULT == 1 if p1_name won; RESULT == 0 if p2_name won
        winner = r["p1_name"] if r["RESULT"] == 1 else r["p2_name"]
        if winner == p1:
            wins[p1] += 1
        elif winner == p2:
            wins[p2] += 1

    # Last 3 meetings
    last3 = df.sort_values("tourney_date", ascending=False).head(3)
    lines = []
    for _, r in last3.iterrows():
        if r["RESULT"] == 1:
            winner, loser = r["p1_name"], r["p2_name"]
        else:
            winner, loser = r["p2_name"], r["p1_name"]
        lines.append(
            f"{r['tourney_date']} | {r['surface']} | {r['tourney_name']} | "
            f"{r['round']} | {winner} d. {loser} {r['score']}"
        )

    return (
        f"Total meetings: {len(df)}  —  {p1} {wins[p1]}W vs {p2} {wins[p2]}W\n"
        "Last 3 meetings:\n" + "\n".join(lines)
    )

h2h_summary = head2head_summary(matches_df, p1_name, p2_name)
print("Head-to-Head Summary:\n", h2h_summary)


Head-to-Head Summary:
 Total meetings: 10  —  Carlos Alcaraz 6W vs Jannik Sinner 4W
Last 3 meetings:
20240926 | Hard | Beijing | F | Carlos Alcaraz d. Jannik Sinner 6-7(6) 6-4 7-6(3)
20240527 | Clay | Roland Garros | SF | Carlos Alcaraz d. Jannik Sinner 2-6 6-3 3-6 6-4 6-3
20240304 | Hard | Indian Wells Masters | SF | Carlos Alcaraz d. Jannik Sinner 1-6 6-3 6-2


In [69]:
def head2head_surface_summary(df: pd.DataFrame, p1: str, p2: str, surface: str) -> str:
    sf = df[df["surface"].str.lower() == surface.lower()]
    if sf.empty:
        return f"No prior meetings on {surface}."

    wins = {p1: 0, p2: 0}
    for _, r in sf.iterrows():
        winner = r["p1_name"] if r["RESULT"] == 1 else r["p2_name"]
        if winner == p1:
            wins[p1] += 1
        elif winner == p2:
            wins[p2] += 1

    last3 = sf.sort_values("tourney_date", ascending=False).head(3)
    lines = []
    for _, r in last3.iterrows():
        if r["RESULT"] == 1:
            winner, loser = r["p1_name"], r["p2_name"]
        else:
            winner, loser = r["p2_name"], r["p1_name"]
        lines.append(
            f"{r['tourney_date']} | {r['tourney_name']} | {r['round']} | "
            f"{winner} d. {loser} {r['score']}"
        )

    return (
        f"{surface} meetings: {len(sf)}  —  {p1} {wins[p1]}W vs {p2} {wins[p2]}W\n"
        "Last on surface:\n" + "\n".join(lines)
    )

h2h_surface_summary = head2head_surface_summary(matches_df, p1_name, p2_name, SURFACE)
print("Surface-Specific Head-to-Head:\n", h2h_surface_summary)


Surface-Specific Head-to-Head:
 Clay meetings: 2  —  Carlos Alcaraz 1W vs Jannik Sinner 1W
Last on surface:
20240527 | Roland Garros | SF | Carlos Alcaraz d. Jannik Sinner 2-6 6-3 3-6 6-4 6-3
20220725 | Umag | F | Jannik Sinner d. Carlos Alcaraz 6-7(5) 6-1 6-1


In [59]:
def gather_news_snippets(vdb, player_name: str, k: int = 3) -> str:
    docs = vdb.similarity_search(
        query="recent form and news",
        k=k,
        filter={"player_name": player_name}
    )
    if not docs:
        return "No recent news snippets found."
    snippets = []
    for d in docs:
        # shorten to 500 chars 
        snippets.append(textwrap.shorten(d.page_content, width=500, placeholder="…"))
    return "\n\n".join(snippets)

news_p1_snip = gather_news_snippets(vectordb, p1_name, k=3)
news_p2_snip = gather_news_snippets(vectordb, p2_name, k=3)

print(f"Snippets for {p1_name}:\n", news_p1_snip, "\n\n")
print(f"Snippets for {p2_name}:\n", news_p2_snip)


Snippets for Carlos Alcaraz:
 Carlos Alcaraz 2025 Stats 2 | 0 | 24-5 Career 18 | 0 | 233-61 Career CURRENT TOURNAMENT Internazionali BNL d'Italia- Rome, Italy May 5, 2025 to May 18, 2025 ROUND | OPPONENT | RESULT | SCORE Men's Singles 1st | | W | 2nd | Dusan Lajovic | - | May 9 7:00 AM ET STATS YEAR | PRIZE MONEY | SINGLES TITLES | DOUBLES TITLES | SINGLES W-L 2025 | $2,700,922 | 2 | 0 | 24-5 2024 | $10,358,429 | 4 | 0 | 54-13 2023 | $15,196,504 | 6 | 0 | 65-12 2022 | $10,102,330 | 5 | 0 | 57-13 2021 | $1,632,676 | 1 | 0 |…

Carlos Alcaraz is back in action at the Italian Open, following time out due to his adductor injury. Alcaraz suffered the injury in Barcelona, which he received treatment for during his final against Holger Rune, where the Dane emerged victorious. After withdrawing from the Madrid Open, the Spaniard is now healthy enough to compete at the Italian Open in Rome, which he missed last year due to an arm injury. Alcaraz is looking for his third title of the season, havi

In [70]:
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0.2)

prompt = PromptTemplate(
    input_variables=[
        "p1_name", "p2_name", "prediction",
        "h2h", "h2h_surface", "surface",
        "news_p1", "news_p2"
    ],
    template="""
You are a professional tennis analyst AI. Using the data below, provide:

1. A concise narrative of each player’s recent form.
2. Key tactical factors likely to decide the match.
3. Your own predicted winner and confidence (0–100%).
4. A brief rationale (2–3 sentences).

Model prediction:
{prediction}

Head-to-Head (All surfaces):
{h2h}

Head-to-Head on {surface}:
{h2h_surface}

Recent news about {p1_name}:
{news_p1}

Recent news about {p2_name}:
{news_p2}

Respond as an expert analyst, not the model itself.
"""
)

analysis_chain = LLMChain(llm=llm, prompt=prompt)

analysis = analysis_chain.invoke({
    "p1_name":     p1_name,
    "p2_name":     p2_name,
    "prediction":  prediction_str,
    "h2h":         h2h_summary,
    "h2h_surface": h2h_surface_summary,
    "surface":     SURFACE,
    "news_p1":     news_p1_snip,
    "news_p2":     news_p2_snip
})

print("Analyst Insight:\n", analysis if isinstance(analysis, str) else analysis["text"])

Analyst Insight:
 ### 1. Recent Form Narrative

**Carlos Alcaraz:**  
Carlos Alcaraz has recently returned to competitive tennis after recovering from an adductor injury that sidelined him for the Madrid Open. Prior to his injury, he was performing well, with a solid record of 24 wins and 5 losses in 2025, including two titles. His recent matches show resilience, particularly in his last encounter against Jannik Sinner at the Beijing final, where he triumphed in a closely contested match. Alcaraz's ability to bounce back from injury and his determination to reclaim his top form make him a formidable opponent.

**Jannik Sinner:**  
Jannik Sinner has just made his return to the tour after a three-month suspension due to a positive drug test. His absence from the circuit raises questions about his match fitness and mental readiness. Despite being a three-time Grand Slam champion, the break could impact his rhythm and confidence. Sinner's recent comments indicate a focus on his game rather