In [1]:
# 📋 Cell 0 – Prereqs & env-vars
# - Install deps:  pip install pandas numpy pinecone-client langchain-openai langchain
# - Export keys **before** running:
#     export OPENAI_API_KEY="sk-…"
#     export PINECONE_API_KEY="pc-…"
#     export PINECONE_CLOUD="aws"        # or "gcp"
#     export PINECONE_REGION="us-east-1" # e.g. us-east-1, us-east1

In [2]:
# %% Cell 1 – Imports & config constants
import os, time, math, numpy as np, pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone, ServerlessSpec

CSV_PATH   = "twtc.csv"
INDEX_NAME = "scout-reports-index"
BATCH_SIZE = 100                               # vectors per upsert
CLOUD      = os.getenv("PINECONE_CLOUD",  "aws")
REGION     = os.getenv("PINECONE_REGION", "us-east-1")
API_KEY    = os.getenv("PINECONE_API_KEY")


In [3]:
# %% Cell 2 – Load dataset & show shape
df = pd.read_csv(CSV_PATH)
print("Rows, columns:", df.shape)


Rows, columns: (9175, 26)


In [4]:
# %% Cell 3 – Row → langchain Document (narrative + grades)
def make_doc(row) -> Document:
    base = row["report"] if pd.notna(row["report"]) else row["text"]
    graded = [f"{col} {int(row[col])}" for col in
              ("Fastball","Slider","Curveball","Changeup","Control",
               "Power","Run","Field","Hit","Arm") if row[col] > 0]
    text = " | ".join([
        f"SCOUTING REPORT: {row['name']} — {row['primary_position']} ({int(row['year'])})",
        base,
        " ".join(graded),
    ])
    meta = {
        "name":   row["name"],
        "pos":    row["primary_position"],
        "year":   int(row["year"]),
        "eta":    int(row["eta"]),
        "source": row["source"],
        "text":   text,      # store full blob for retrieval display
    }
    return Document(page_content=text, metadata=meta)

docs = [make_doc(r) for _, r in df.iterrows()]
print(f"Built {len(docs)} documents")


Built 9175 documents


In [5]:
# %% Cell 4 – Split only long blobs (>1 600 chars ≈ 400 tokens)
splitter   = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)
THRESHOLD  = 1600

docs_final = []
for d in docs:
    docs_final.extend(splitter.split_documents([d])) \
        if len(d.page_content) > THRESHOLD else docs_final.append(d)

print(f"After splitting: {len(docs_final)} chunks")


After splitting: 9316 chunks


In [6]:
# %% Cell 5 – Stats check on chunk lengths
lengths = np.array([len(d.page_content) for d in docs_final])
print("Max chars:", lengths.max(), "| Median tokens ≈", int(np.median(lengths)/4))


Max chars: 1957 | Median tokens ≈ 280


In [7]:
# %% Cell 6 – Init OpenAI embeddings (3-large) & Pinecone index (dim 3 072)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
EMBED_DIM  = len(embeddings.embed_query("test"))      # 3 072 for 3-large

pc = Pinecone(api_key=API_KEY)
if INDEX_NAME in pc.list_indexes().names():
    idx_dim = pc.describe_index(INDEX_NAME)["dimension"]
    if idx_dim != EMBED_DIM:
        print(f"Deleting old index (dim {idx_dim}) to match model dim {EMBED_DIM}…")
        pc.delete_index(INDEX_NAME)
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name      = INDEX_NAME,
        dimension = EMBED_DIM,
        metric    = "cosine",
        spec      = ServerlessSpec(cloud=CLOUD, region=REGION),
    )
    print(f"Created index {INDEX_NAME} with dim {EMBED_DIM}")
index = pc.Index(INDEX_NAME)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# %%
def chunk(iterable, size):
    for i in range(0, len(iterable), size):
        yield iterable[i:i+size]

start = time.time()
for batch_start, batch_docs in enumerate(chunk(docs_final, BATCH_SIZE)):
    texts     = [d.page_content for d in batch_docs]
    metadatas = [d.metadata      for d in batch_docs]
    vecs      = embeddings.embed_documents(texts)
    to_upsert = [
        (f"id-{batch_start*BATCH_SIZE + j}", vecs[j], metadatas[j])
        for j in range(len(batch_docs))
    ]
    index.upsert(vectors=to_upsert)
    if batch_start % 10 == 0:
        print(f"Upserted {(batch_start+1)*BATCH_SIZE} vectors…")

print("Done. Total vectors in index:",
      pc.describe_index(INDEX_NAME)["total_vector_count"],
      "| Time:", round(time.time()-start, 1), "s")


Upserted 100 vectors…
Upserted 1100 vectors…
Upserted 2100 vectors…
Upserted 3100 vectors…
Upserted 4100 vectors…
Upserted 5100 vectors…
Upserted 6100 vectors…
Upserted 7100 vectors…
Upserted 8100 vectors…
Upserted 9100 vectors…
Done. Total vectors in index: None | Time: 234.7 s


In [15]:
stats = index.describe_index_stats()
total = stats.get("total_vector_count") \
        or stats.get("total_record_count") \
        or sum(ns["vector_count"] for ns in stats["namespaces"].values())

print("Total vectors now in Pinecone:", total)


Total vectors now in Pinecone: 9316


In [None]:
# %% Cell 8 – Helper: similarity_search(query, k)
def similarity_search(query, k=4):
    vec = embeddings.embed_query(query)
    return index.query(vector=vec, top_k=k, include_metadata=True)["matches"]

matches = similarity_search("What grade is Cade Cavalli's slider?")
print(matches[0]["metadata"]["text"][:400], "…")


SCOUTING REPORT: Cade Cavalli — RHP (2017) | Cavalli showed a cannon arm as a shortstop on the summer showcase circuit in 2015, leading scouts to wonder how hard he might throw off the mound. He missed much of last summer battling an ankle injury, and then much of his senior season with back problems. He did sit at 94 mph with his fastball early in a heavily scouted Oklahoma 6-A playoff game again …


In [35]:
# %% 📞 Cell 10 – ask(): answer first, sources after
# Usage examples:
#   ask("Who are the top 70-run catchers?", show_sources=True)   # prints answer + sources
#   ans = ask("…")                                              # returns answer string only

import textwrap
from langchain.schema import SystemMessage, HumanMessage

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.2)

def _fmt_source(idx, hit, width=90):
    text  = hit["metadata"]["text"].replace("\n", " ").strip()
    first = textwrap.shorten(text, width=width, placeholder="…")
    return f"{idx:02d}. [score {hit['score']:.3f}]  {first}"

def ask(question: str, k: int = 6, show_sources: bool = False) -> str:
    """Return answer string; optionally print nicely formatted sources."""
    # Retrieve context
    vec  = embeddings.embed_query(question)
    hits = index.query(vector=vec, top_k=k, include_metadata=True)["matches"]
    if not hits:
        return "No relevant information found in the vector database."

    context = "\n\n---\n\n".join(h["metadata"]["text"] for h in hits)

    # LLM call
    msgs = [
        SystemMessage(content= "You are Scout-RAG, a baseball scouting assistant. "
        "The user will ask questions about prospects; the only facts you may use "
        "come from the **Context** section below, which consists of one-or-more "
        "scouting-report excerpts. Each excerpt follows this pattern:\n"
        "  SCOUTING REPORT: <Name> — <Pos> (<Year>) | <Narrative> | <Grades>\n\n"

        "RULES you must follow when you craft an answer:\n"
        "1. **Grounding** – Never add information that is not present in the "
        "   context. If the answer is not found, say so briefly.\n"
        "2. **Numeric constraints** – When a question specifies cut-offs "
        "   (e.g., ‘changeup ≥ 55’, ‘sub-20 % strike-out rate’), list ONLY "
        "   players whose grades or stats in the context satisfy EVERY "
        "   condition. Ignore partial matches.\n"
        "3. **Filters** – Obey filters the user implies (handedness, year, "
        "   position, ETA, league, etc.). If a chunk lacks the field, treat "
        "   it as non-qualifying.\n"
        "4. **One row per player** – Mention a player at most once. If the "
        "   context contains duplicates or multiple years, use the one that "
        "   best satisfies the query.\n"
        "5. **Concise output** – Prefer short bullet lists or numbered lists "
        "   (Name – key grades/traits). No paragraphs of fluff.\n"
        "6. **Unambiguous grades** – Quote numeric grades exactly as they "
        "   appear (e.g., ‘Changeup 60’, not ‘plus changeup’).\n"
        "7. **Hallucination check** – After drafting your answer, quickly "
        "   verify each fact against the context. Remove any item that is "
        "   not explicitly supported.\n\n"

        "Respond with the answer first. If the caller passes "
        "`show_sources=True`, the code will print the source snippets "
        "after your answer, so do NOT embed citations or excerpts yourself."
),
        HumanMessage(content=f"Context:\n{context}\n\nQuestion:\n{question}")
    ]
    answer = llm(msgs).content.strip()

    # Display
    if show_sources:
        print("\nAnswer:\n", answer, "\n")
        print("### Sources")
        for i, h in enumerate(hits, 1):
            print(_fmt_source(i, h))



# Example call (prints answer first, then sources)
ask("What grade is Cade Cavalli's slider?", show_sources=True)



Answer:
 Cade Cavalli's slider grade is 50. 

### Sources
01. [score 0.712]  SCOUTING REPORT: Cade Cavalli — RHP (2017) | Cavalli showed a cannon arm as a shortstop…
02. [score 0.584]  SCOUTING REPORT: Ethan Hankins — RHP (2018) | Hankins thrust himself into consideration…
03. [score 0.581]  SCOUTING REPORT: Ethan Small — LHP (2019) | Small reached 96 mph with his fastball as a…
04. [score 0.575]  SCOUTING REPORT: Jonathan Gray — RHP (2013) | While many felt his teammate Dillon Overton…
05. [score 0.566]  SCOUTING REPORT: Jonathan Gray — RHP (2013) | Gray didn't enter the spring with much…
06. [score 0.566]  SCOUTING REPORT: Michael Clevinger — RHP (2013) | The junior college product was having a…


In [36]:
ask("Which left-handed pitchers show plus command and a changeup grade of at least 50?", show_sources=True)


Answer:
 1. Dillon Peters – Fastball 60, Changeup 50, Control 55
2. Julio Urias – Fastball 70, Changeup 60, Control 55
3. Stephen Gonsalves – Fastball 60, Changeup 50, Control 50 

### Sources
01. [score 0.625]  SCOUTING REPORT: Dillon Peters — LHP (2017) | Scouts viewed Peters as a potential day-one…
02. [score 0.591]  SCOUTING REPORT: Matthew Liberatore — LHP (2018) | Liberatore stood out on the summer…
03. [score 0.585]  SCOUTING REPORT: Brendon Little — LHP (2015) | Little isn't the sexiest prospect, but the…
04. [score 0.581]  SCOUTING REPORT: Julio Urias — LHP (2015) | Not only is Urias the top left-handed…
05. [score 0.580]  SCOUTING REPORT: Brailyn Marquez — LHP (2019) | Name another teenage lefty who touches…
06. [score 0.580]  SCOUTING REPORT: Stephen Gonsalves — LHP (2013) | A high school lefty, Gonsalves had a…


In [43]:
ask("Provide cases (2013-2019 only) where two scouting sources disagreed on a player’s future role—starter vs. reliever or everyday vs. utility—and briefly outline each side’s argument.", show_sources=True)


Answer:
 1. **Brendan McKay — 1B (2019)**
   - **Starter Argument**: McKay showed above-average to plus command with a low-to-mid-90s fastball and a plus-flashing curveball, indicating No. 2 or 3 starter upside. His development as a pitcher was highlighted by his ability to navigate lineups effectively.
   - **Reliever Argument**: Some scouts viewed his future as a standout, playoff rotation-caliber starter but acknowledged the potential for him to be valued more on the mound than as a hitter, suggesting a dual role might limit his effectiveness.

2. **Kyle Bradish — RHP (2019)**
   - **Reliever Argument**: Bradish's max-effort delivery and command issues were cited as reasons he would likely not succeed as a starter long-term. His traits were projected to play best in short stints, making him a good middle reliever.
   - **Starter Argument**: While not explicitly stated, the mention of his ability to generate mid-90s fastballs early in games suggests that some scouts might have seen 

In [44]:
ask("List lefties who debuted in MLB at age 21 or younger (per debut_age) and owned a changeup grade ≥ 60.", show_sources=True)


Answer:
 1. Jesus Luzardo – Changeup 60
2. Julio Urias – Changeup 60 

### Sources
01. [score 0.562]  SCOUTING REPORT: Jesus Luzardo — LHP (2019) | â¢ 7/4: Luzardo out with lat strainWhen…
02. [score 0.533]  SCOUTING REPORT: Sixto Sanchez — RHP (2019) | Sanchez's first 2019 start -- during which…
03. [score 0.532]  SCOUTING REPORT: Julio Urias — LHP (2015) | Not only is Urias the top left-handed…
04. [score 0.528]  SCOUTING REPORT: Alex Young — LHP (2015) | Young stood out on the Cape for his three…
05. [score 0.523]  SCOUTING REPORT: Kyle Young — LHP (2019) | Purely on stuff, Young belongs in the bottom…
06. [score 0.521]  SCOUTING REPORT: Alex Young — LHP (2018) | After taking Dansby Swanson with the No. 1…


In [46]:
ask("Give examples of arms flagged for “high-effort deliveries” that might force a bullpen role; include their primary pitches.", show_sources=True)


Answer:
 1. **Kyle Bradish** – Fastball 55, Slider 55, Curveball 60, Changeup 50
2. **Braden Webb** – Fastball 60, Curveball 60, Changeup 55
3. **Shawn Armstrong** – Fastball 60, Slider 55, Curveball 40 

### Sources
01. [score 0.514]  SCOUTING REPORT: Deivi Garcia — RHP (2019) | At this time last year, Garcia was a 40 FV…
02. [score 0.513]  SCOUTING REPORT: Kyle Bradish — RHP (2019) | It's exciting to consider how Bradish's…
03. [score 0.511]  SCOUTING REPORT: Braden Webb — RHP (2019) | Webb opened his senior season at Owasso…
04. [score 0.510]  SCOUTING REPORT: Drew Rasmussen — RHP (2019) | Rasmussen was a big part of Oregon State's…
05. [score 0.501]  SCOUTING REPORT: Ethan Hankins — RHP (2018) | Hankins made himself a candidate to become…
06. [score 0.501]  SCOUTING REPORT: Shawn Armstrong — RHP (2016) | The Indians went well above slot to…


In [47]:
ask("Which switch-hitters in the corpus have scouts warning about extreme platoon splits?", show_sources=True)


Answer:
 - Carlos Cortes – Scouts note he will have to be an elite offensive force to be more than a platoon corner bat.
- Jhon Solarte – Lacks projection on the body, has to hit enough for a corner, implying potential platoon concerns. 

### Sources
01. [score 0.543]  SCOUTING REPORT: Danny Lantigua — OF (2019) | There are ways to nitpick Lantigua -- he's…
02. [score 0.529]  SCOUTING REPORT: Drew Vettleson — OF (2014) | Vettleson made headlines in high school for…
03. [score 0.528]  SCOUTING REPORT: Carlos Cortes — OF (2019) | Cortes was a bit of an oddity as a prep…
04. [score 0.503]  SCOUTING REPORT: Jhon Solarte — OF (2019) | Physical, switch-hitting outfielder with…
05. [score 0.502]  SCOUTING REPORT: Chris Shaw — 1B (2019) | Shaw is one of many whiff-heavy corner bats…
06. [score 0.499]  SCOUTING REPORT: Peter O'Brien — OF (2016) | Few prospects have more raw power than…


In [48]:
ask("Retrieve reports noting pitchers who throw a true screwball, forkball, or eephus pitch. Quote the key sentence.", show_sources=True)


Answer:
 There are no reports in the provided context that mention pitchers who throw a true screwball, forkball, or eephus pitch. 

### Sources
01. [score 0.486]  SCOUTING REPORT: Devin Smeltzer — LHP (2019) | Most pitchers tuck their gloves in by…
02. [score 0.479]  SCOUTING REPORT: Carson Smith — RHP (2014) | Relievers who throw from an unorthodox arm…
03. [score 0.476]  SCOUTING REPORT: Zeke Spruill — RHP (2013) | Another prospect in the Justin Upton haul,…
04. [score 0.469]  SCOUTING REPORT: Garrett Whitlock — RHP (2019) | When you watch Whitlock (video link…
05. [score 0.468]  SCOUTING REPORT: Dustin May — RHP (2019) | May's flamboyant ginger curls and Bronson…
06. [score 0.468]  SCOUTING REPORT: MacKenzie Gore — LHP (2019) | Blisters and fingernail issues were a…


In [49]:
ask("Are there prospects between 2013-2019 who pair a 70 run tool with a 40-or-lower fielding grade? Summarize the defensive concern.", show_sources=True)


Answer:
 - **None found** – All prospects with a 70 run tool have fielding grades above 40. 

### Sources
01. [score 0.607]  SCOUTING REPORT: VÃ­ctor VÃ­ctor Mesa — OF (2019) | It took four trades for bonus space…
02. [score 0.592]  SCOUTING REPORT: Troy Stokes Jr. — OF (2019) | The Brewers selected Stokes in the fourth…
03. [score 0.585]  SCOUTING REPORT: Cristian Pache — OF (2019) | If we told you a prospect seemed like an…
04. [score 0.580]  SCOUTING REPORT: Jorge Mateo — SS (2019) | Not only was Mateo unable to carry his torrid…
05. [score 0.579]  SCOUTING REPORT: Nick Allen — SS (2019) | Even among a historically talented group of…
06. [score 0.578]  SCOUTING REPORT: Jose Garcia — SS (2019) | Between his lack of reps during the '16-'17…
