In [None]:
# 📋 Cell 0 – Prereqs & env-vars
# - Install deps:  pip install pandas numpy pinecone-client langchain-openai langchain
# - Export keys **before** running:
#     export OPENAI_API_KEY="sk-…"
#     export PINECONE_API_KEY="pc-…"
#     export PINECONE_CLOUD="aws"        # or "gcp"
#     export PINECONE_REGION="us-east-1" # e.g. us-east-1, us-east1

In [34]:
# %% Cell 1 – Imports & config constants
import os, time, math, numpy as np, pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from pinecone import Pinecone, ServerlessSpec

CSV_PATH   = "twtc.csv"
INDEX_NAME = "scout-reports-index"
BATCH_SIZE = 100                               # vectors per upsert
CLOUD      = os.getenv("PINECONE_CLOUD",  "aws")
REGION     = os.getenv("PINECONE_REGION", "us-east-1")
API_KEY    = os.getenv("PINECONE_API_KEY")


In [35]:
# %% Cell 2 – Load dataset & show shape
df = pd.read_csv(CSV_PATH)
print("Rows, columns:", df.shape)


Rows, columns: (9175, 26)


In [36]:
# %% Cell 3 – Row → langchain Document (narrative + grades)
def make_doc(row) -> Document:
    base = row["report"] if pd.notna(row["report"]) else row["text"]
    graded = [f"{col} {int(row[col])}" for col in
              ("Fastball","Slider","Curveball","Changeup","Control",
               "Power","Run","Field","Hit","Arm") if row[col] > 0]
    text = " | ".join([
        f"SCOUTING REPORT: {row['name']} — {row['primary_position']} ({int(row['year'])})",
        base,
        " ".join(graded),
    ])
    meta = {
        "name":   row["name"],
        "pos":    row["primary_position"],
        "year":   int(row["year"]),
        "eta":    int(row["eta"]),
        "source": row["source"],
        "text":   text,      # store full blob for retrieval display
    }
    return Document(page_content=text, metadata=meta)

docs = [make_doc(r) for _, r in df.iterrows()]
print(f"Built {len(docs)} documents")


Built 9175 documents


In [37]:
# %% Cell 4 – Split only long blobs (>1 600 chars ≈ 400 tokens)
splitter   = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=400, chunk_overlap=50
)
THRESHOLD  = 1600

docs_final = []
for d in docs:
    docs_final.extend(splitter.split_documents([d])) \
        if len(d.page_content) > THRESHOLD else docs_final.append(d)

print(f"After splitting: {len(docs_final)} chunks")


After splitting: 9316 chunks


In [38]:
# %% Cell 5 – Stats check on chunk lengths
lengths = np.array([len(d.page_content) for d in docs_final])
print("Max chars:", lengths.max(), "| Median tokens ≈", int(np.median(lengths)/4))


Max chars: 1957 | Median tokens ≈ 280


In [39]:
# %% Cell 6 – Init OpenAI embeddings (3-large) & Pinecone index (dim 3 072)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
EMBED_DIM  = len(embeddings.embed_query("test"))      # 3 072 for 3-large

pc = Pinecone(api_key=API_KEY)
if INDEX_NAME in pc.list_indexes().names():
    idx_dim = pc.describe_index(INDEX_NAME)["dimension"]
    if idx_dim != EMBED_DIM:
        print(f"Deleting old index (dim {idx_dim}) to match model dim {EMBED_DIM}…")
        pc.delete_index(INDEX_NAME)
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name      = INDEX_NAME,
        dimension = EMBED_DIM,
        metric    = "cosine",
        spec      = ServerlessSpec(cloud=CLOUD, region=REGION),
    )
    print(f"Created index {INDEX_NAME} with dim {EMBED_DIM}")
index = pc.Index(INDEX_NAME)


In [33]:
# %%
def chunk(iterable, size):
    for i in range(0, len(iterable), size):
        yield iterable[i:i+size]

start = time.time()
for batch_start, batch_docs in enumerate(chunk(docs_final, BATCH_SIZE)):
    texts     = [d.page_content for d in batch_docs]
    metadatas = [d.metadata      for d in batch_docs]
    vecs      = embeddings.embed_documents(texts)
    to_upsert = [
        (f"id-{batch_start*BATCH_SIZE + j}", vecs[j], metadatas[j])
        for j in range(len(batch_docs))
    ]
    index.upsert(vectors=to_upsert)
    if batch_start % 10 == 0:
        print(f"Upserted {(batch_start+1)*BATCH_SIZE} vectors…")

print("Done. Total vectors in index:",
      pc.describe_index(INDEX_NAME)["total_vector_count"],
      "| Time:", round(time.time()-start, 1), "s")


Upserted 100 vectors…
Upserted 1100 vectors…
Upserted 2100 vectors…
Upserted 3100 vectors…
Upserted 4100 vectors…
Upserted 5100 vectors…
Upserted 6100 vectors…
Upserted 7100 vectors…
Upserted 8100 vectors…
Upserted 9100 vectors…
Done. Total vectors in index: None | Time: 220.3 s


In [42]:
# %% 📊 Cell X – verify vector count (v3-style)
import time
time.sleep(5)                                   # <- give the service a moment

stats = index.describe_index_stats()
# v3 returns counts inside 'namespaces'
total = stats.get("total_vector_count") \
        or stats.get("total_record_count") \
        or sum(ns["vector_count"] for ns in stats["namespaces"].values())

print("Total vectors now in Pinecone:", total)


Total vectors now in Pinecone: 9316


In [40]:
# %% Cell 8 – Helper: similarity_search(query, k)
def similarity_search(query, k=4):
    vec = embeddings.embed_query(query)
    return index.query(vector=vec, top_k=k, include_metadata=True)["matches"]

matches = similarity_search("What grade is Cade Horton's slider?")
print(matches[0]["metadata"]["text"][:400], "…")


SCOUTING REPORT: Cade Cavalli — RHP (2017) | Cavalli showed a cannon arm as a shortstop on the summer showcase circuit in 2015, leading scouts to wonder how hard he might throw off the mound. He missed much of last summer battling an ankle injury, and then much of his senior season with back problems. He did sit at 94 mph with his fastball early in a heavily scouted Oklahoma 6-A playoff game again …


In [45]:
# %% Cell 9 – RAG answer function using GPT-4o
from langchain.schema import SystemMessage, HumanMessage
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.2)

def rag_answer(question, k=4):
    ctx = "\n\n---\n\n".join(m["metadata"]["text"] for m in similarity_search(question, k))
    msgs = [
        SystemMessage(content="You are a helpful baseball scouting assistant. "
                              "Answer only using the provided context."),
        HumanMessage(content=f"Context:\n{ctx}\n\nQuestion:\n{question}")
    ]
    return llm(msgs).content

print("\nAnswer:\n", rag_answer("What grade is Cade Cavali slider?"))



Answer:
 Cade Cavali's slider is graded at 50.


In [47]:
# %% 📞 Cell 10 – Define a single helper `ask()` for Q&A
# Usage: answer = ask("What grade is Cade Horton's slider?")
#
# - k: # of nearest chunks to fetch
# - show_sources=True will print the source texts that fed the answer

from langchain.schema import SystemMessage, HumanMessage

llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0.2)

def ask(question: str, k: int = 6, show_sources: bool = False) -> str:
    """Query Pinecone, build context, and return an LLM answer."""
    # Retrieve top-k similar chunks
    vec   = embeddings.embed_query(question)
    hits  = index.query(vector=vec, top_k=k, include_metadata=True)["matches"]

    # If nothing retrieved, short-circuit
    if not hits:
        return "No relevant information found in the vector database."

    # Build context string
    context = "\n\n---\n\n".join(hit["metadata"]["text"] for hit in hits)

    # Compose and run LLM prompt
    messages = [
        SystemMessage(content="You are a helpful baseball scouting assistant. "
                              "Answer strictly from the provided context."),
        HumanMessage(content=f"Context:\n{context}\n\nQuestion:\n{question}")
    ]
    answer = llm(messages).content

    # Optionally show sources
    if show_sources:
        for i, h in enumerate(hits, 1):
            print(f"\n# Source {i} | score={h['score']:.4f}\n{h['metadata']['text'][:400]}...\n")

    return answer

# Example
print(ask("What grade is Cade Cavali's slider?", show_sources=True))



# Source 1 | score=0.7020
SCOUTING REPORT: Cade Cavalli — RHP (2017) | Cavalli showed a cannon arm as a shortstop on the summer showcase circuit in 2015, leading scouts to wonder how hard he might throw off the mound. He missed much of last summer battling an ankle injury, and then much of his senior season with back problems. He did sit at 94 mph with his fastball early in a heavily scouted Oklahoma 6-A playoff game again...


# Source 2 | score=0.5893
SCOUTING REPORT: Garrett Cave — RHP (2017) | Cave began his college career at Florida International, struggling there for two years, especially as a starter during his sophomore season. He starred in the Cape Cod League as a closer last summer, ahead of his transfer to Division II Tampa, where he has continued to show off tremendous arm strength.The 6-foot-4 right-hander shows a premium fastball t...


# Source 3 | score=0.5884
SCOUTING REPORT: Garrett Cave — RHP (2017) | After struggling to throw strikes and having only sporadic succe

In [48]:
ask("Which left-handed pitchers in the 2018 class show plus command and a changeup grade of at least 55?", show_sources=True)


# Source 1 | score=0.5995
SCOUTING REPORT: Matthew Liberatore — LHP (2018) | In an age when radar gun readings often rule the day, Liberatore stood out on the summer showcase circuit more with his feel for pitching than plus velocity. He threw 12 shutout innings for the 18U U.S. National Team, and there was a reason he got the ball for their gold medal game against Korea, won by Liberatore thanks to his six scoreless frame...


# Source 2 | score=0.5935
SCOUTING REPORT: Dillon Peters — LHP (2017) | Scouts viewed Peters as a potential day-one Draft pick in 2011, but a strong commitment to Texas, as well as a seven-figure price tag, ultimately caused him to fall to Cleveland in the 20th round. He honored his college commitment and turned in three consistent seasons as a starter for the Longhorns, setting him up once again to come off the board in t...


# Source 3 | score=0.5920
SCOUTING REPORT: Steven Hajjar — LHP (2018) | New England isn't typically a hotbed for high school pitching t

"The left-handed pitchers in the 2018 class that show plus command and a changeup grade of at least 55 are:\n\n1. **Matthew Liberatore** - Fastball 55, Slider 55, Curveball 60, Changeup 55, Control 55.\n2. **Garrett Wade** - Fastball 55, Slider 55, Curveball 50, Changeup 50, Control 55. (Note: Wade's changeup does not meet the 55 grade requirement.)\n\nTherefore, only **Matthew Liberatore** meets both criteria."

In [49]:
ask(" List any catchers with sub-20% strike-out rates whose pop-time is mentioned as “1.90 s or better.” ", show_sources=True)


# Source 1 | score=0.5664
SCOUTING REPORT: Daulton Varsho — C (2019) | The list of catchers with speed comparable to Varsho's is pretty short. J.T. Realmuto and Jorge Alfaro are two current examples, but Varsho reaches a top speed faster than either of them, and is an actual threat to steal bases. He's also a doubles machine with natural feel for gap-to-gap contact, and if soft liners trickle between outfielders, Varsho ca...


# Source 2 | score=0.5574
SCOUTING REPORT: Daniel Flores — C (2017) | As twitchy as catching prospects get, Flores posts plus-plus pop times thanks to a combination of plus raw arm strength, efficient footwork and a transfer so quick it qualifies as slight of hand. He's also a promising receiver and, while physically projectable, scouts don't anticipate Flores will get so big that he needs to abandon the position. He also ...


# Source 3 | score=0.5445
SCOUTING REPORT: Will Smith — C (2016) | In a year where a lot of the best college catching prospects are ske

'The catchers with sub-20% strike-out rates whose pop-time is mentioned as “1.90 s or better” are:\n\n1. Will Smith - Consistently records pop times of 1.9 seconds or less.\n2. Daniel Flores - Posts plus-plus pop times thanks to his quick transfer. \n\n(Note: The specific strike-out rates are not provided in the context, but these two catchers are noted for their pop times.)'