In [1]:
# Set the USER_AGENT environment variable for this session
# (presumably read by the web loader used in the next cell - confirm).
import os

USER_AGENT = "sports-buddy-query-analysis"
os.environ["USER_AGENT"] = USER_AGENT

In [2]:
import datetime
import math
from langchain_community.document_loaders import WebBaseLoader

# Wikipedia pages that make up the sports-article corpus for this notebook.
article_urls = (
    "https://en.wikipedia.org/wiki/2022_Ballon_d%27Or",
    "https://en.wikipedia.org/wiki/2023_Ballon_d%27Or",
    "https://en.wikipedia.org/wiki/2022%E2%80%9323_NBA_season",
    "https://en.wikipedia.org/wiki/2021%E2%80%9322_NBA_season",
    "https://en.wikipedia.org/wiki/2022%E2%80%9323_Premier_League",
    "https://en.wikipedia.org/wiki/2021%E2%80%9322_Premier_League",
    "https://en.wikipedia.org/wiki/2021%E2%80%9322_UEFA_Champions_League",
    "https://en.wikipedia.org/wiki/2022%E2%80%9323_UEFA_Champions_League",
    "https://en.wikipedia.org/wiki/2023_Cricket_World_Cup",
)

# Load all of the pages above; later cells read `title` from each result's metadata.
loader = WebBaseLoader(web_paths=article_urls)
docs = loader.load()

# Round a number up or down to the nearest multiple of 1000.
def round_to_nearest_thousand(number):
    """Return ``number`` rounded to the nearest multiple of 1000 as an ``int``.

    Ties are always rounded up (500 -> 1000, 2500 -> 3000). The built-in
    ``round()`` rounds ties to the nearest even value instead, which would
    give 500 -> 0 and 2500 -> 2000 but 1500 -> 2000.

    Args:
        number: An int or float to round.

    Returns:
        The nearest multiple of 1000, as an int.
    """
    # Shifting by half a step and flooring gives round-half-up; int() keeps the
    # return type an int even when a float is passed in.
    return int((number + 500) // 1000) * 1000

def print_summary(doc):
    """Print the title and approximate word count stored in a document's metadata.

    Args:
        doc: An object with a ``metadata`` mapping containing the keys
            ``"title"`` and ``"words"``.

    Raises:
        KeyError: If either key is missing from a dict ``metadata``.
    """
    # Single quotes inside the double-quoted f-strings: reusing the same quote
    # type inside an f-string is a SyntaxError before Python 3.12.
    print(f"Title: {doc.metadata['title']}")
    print(f"Approximate Word Count: {doc.metadata['words']}")

# Attach an approximate word count (rounded to the nearest thousand) to each
# article's metadata, then print its summary followed by a blank line.
# NOTE(review): the count splits on a single space only, so newlines and tabs
# do not separate words - treat the number as a rough estimate.
for doc in docs:
    space_separated_tokens = doc.page_content.split(" ")
    doc.metadata["words"] = round_to_nearest_thousand(len(space_separated_tokens))
    print_summary(doc)
    print()


Title: 2022 Ballon d'Or - Wikipedia
Approximate Word Count: 4000

Title: 2023 Ballon d'Or - Wikipedia
Approximate Word Count: 2000

Title: 2022–23 NBA season - Wikipedia
Approximate Word Count: 10000

Title: 2021–22 NBA season - Wikipedia
Approximate Word Count: 14000

Title: 2022–23 Premier League - Wikipedia
Approximate Word Count: 8000

Title: 2021–22 Premier League - Wikipedia
Approximate Word Count: 7000

Title: 2021–22 UEFA Champions League - Wikipedia
Approximate Word Count: 6000

Title: 2022–23 UEFA Champions League - Wikipedia
Approximate Word Count: 4000

Title: 2023 Cricket World Cup - Wikipedia
Approximate Word Count: 6000



In [3]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split every loaded article into overlapping chunks (chunk_size=1000,
# chunk_overlap=200) before indexing.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

# Index the chunks in a Chroma vector store using OpenAI embeddings, and
# expose the store as a retriever as well.
embeddings = OpenAIEmbeddings()
database = Chroma.from_documents(documents=splits, embedding=embeddings)
retriever = database.as_retriever()

In [4]:
# Baseline: plain similarity search for a factual question; summarise the top hit.
factual_question = "Who won the 2022 ballon d'or?"
search_results = database.similarity_search(factual_question)
top_hit = search_results[0]
print_summary(top_hit)

Title: 2022 Ballon d'Or - Wikipedia
Approximate Word Count: 4000


In [6]:
# Same plain similarity search, but the question carries a word-count
# constraint; no metadata filter is applied here, only the raw text query.
word_count_question = "Suggest a sports article with approximately 14000 words"
search_results = database.similarity_search(word_count_question)
top_hit = search_results[0]
print_summary(top_hit)

Title: 2023 Cricket World Cup - Wikipedia
Approximate Word Count: 6000


In [7]:
from typing import Optional
from pydantic import BaseModel, Field


class SportsSearch(BaseModel):
    """Search over a database of sports articles."""

    # Required free-text query used for the similarity search.
    query: str = Field(
        default=...,
        description="Similarity search query applied to sports articles.",
    )
    # Optional word-count constraint; None means no constraint was extracted.
    words: Optional[int] = Field(
        default=None,
        description="Number of words in article",
    )

In [8]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# System instructions for the query-analysis model.
# NOTE(review): the text asks for "a list of database queries" while the
# structured output below is a single SportsSearch - confirm this is intended.
system = (
    "You are an expert at converting user questions into database queries. "
    "You have access to a database of sports articles. "
    "Given a question, return a list of database queries optimized to retrieve the most relevant results.\n"
    "\n"
    "If there are acronyms or words you are not familiar with, do not try to rephrase them."
)
prompt = ChatPromptTemplate.from_messages(
    [("system", system), ("human", "{question}")]
)

# Chat model constrained to emit a SportsSearch object.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_llm = llm.with_structured_output(SportsSearch)

# The raw input is passed through unchanged as the prompt's `question` variable.
query_analyzer = (
    {"question": RunnablePassthrough()}
    | prompt
    | structured_llm
)

In [10]:
from typing import List
from langchain_core.documents import Document

def retrieve_by_metadata(search: SportsSearch) -> List[Document]:
    """Run a similarity search, optionally restricted by approximate word count.

    Args:
        search: Structured query. ``search.query`` is the similarity-search
            text; ``search.words`` is an optional word-count constraint.

    Returns:
        The documents returned by ``database.similarity_search`` for the query,
        filtered on the ``words`` metadata field when a word count was given.
    """
    if search.words is None:
        _filter = None
    else:
        # The stored "words" metadata is rounded to the nearest thousand, so
        # normalise the requested count the same way before the exact-equality
        # filter; otherwise a request such as 13800 could never match.
        _filter = {"words": {"$eq": round_to_nearest_thousand(search.words)}}
    return database.similarity_search(search.query, filter=_filter)

In [11]:
# Inspect the structured query produced for a purely factual question.
factual_question = "Who won the 2022 ballon d'or?"
query_analyzer.invoke(factual_question)

SportsSearch(query="2022 Ballon d'Or winner", words=None)

In [12]:
# Search with the rewritten query text the analyzer produced for the factual question.
rewritten_query = "2022 Ballon d'Or winner"
search_results = database.similarity_search(rewritten_query)
top_hit = search_results[0]
print_summary(top_hit)

Title: 2022 Ballon d'Or - Wikipedia
Approximate Word Count: 4000


In [13]:
# Inspect the structured query produced when the question includes a word count.
word_count_question = "Suggest a sports article with approximately 14000 words"
query_analyzer.invoke(word_count_question)

SportsSearch(query='sports article', words=14000)

In [14]:
# Pipe the analyzer's structured output into the metadata-aware retriever.
retrieval_chain = query_analyzer | retrieve_by_metadata

# End-to-end check: ask for an article of roughly 14000 words and summarise the top hit.
word_count_question = "Suggest a sports article with approximately 14000 words"
search_results = retrieval_chain.invoke(word_count_question)
top_hit = search_results[0]
print_summary(top_hit)

Title: 2021–22 NBA season - Wikipedia
Approximate Word Count: 14000


In [15]:
# Combine a topical constraint (football) with a word-count constraint.
football_question = "Suggest a football article with approximately 6000 words"
search_results = retrieval_chain.invoke(football_question)
top_hit = search_results[0]
print_summary(top_hit)

Title: 2021–22 UEFA Champions League - Wikipedia
Approximate Word Count: 6000
