# **Building a smart Football Chatbot with Retrieval Augmented Generation**

In [None]:
# Mount Google Drive inside the Colab runtime. The article dataset and the
# Milvus database file used below are both read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Load the Cahiers du football article dataset from Google Drive and preview
# the first rows (columns: id, title, url, article_summary, article_body).
# In the cells shown here `df` is only previewed; retrieval goes through Milvus.
df = pd.read_parquet("/content/drive/MyDrive/cahiers-du-foot.parquet")
df.head()

Unnamed: 0,id,title,url,article_summary,article_body
0,e87eab6f1590ae84ec703e69560050a27520c3c7,Tourner les pages du football brésilien - Les ...,https://www.cahiersdufootball.net/article/tour...,"Bibliothèque – Éloge de l’esquive et Futebol, ...",Éloge de l'esquive Comme beaucoup de personnes...
1,7da6704ac94ca568abd5672f5ea9b69c97459905,Materazzi sans les lazzi - Les Cahiers du foot...,https://www.cahiersdufootball.net/article/marc...,"Passe en retraite - À quarante-et-un ans, Marc...",Il pouvait difficilement en être autrement. Le...
2,d18d429b29215e22226fe45b41f4c9da7f96fbe5,Calcio : A History of Italian Football - Les C...,https://www.cahiersdufootball.net/article/calc...,Bibliothèque – Le portrait précis et passionné...,S’il est un football auquel on prête des stéré...
3,0be54a662620cf8eceabbaab5bf93c5ef2b84cca,"Madjer 1987, maître talon - Les Cahiers du foo...",https://www.cahiersdufootball.net/article/madj...,"Un jour un but – Le 27 mai 1987 à Vienne, l'Al...",La logique voulait que cette trente-deuxième f...
4,a9c29f8754bfd657d16f2c66be05ed5b22aec032,"Hateley 1984, une tête au-dessus - Les Cahiers...",https://www.cahiersdufootball.net/article/un-j...,"Un jour, un but – Le 28 octobre 1984, Mark Hat...","Mark Hateley arrive au Milan AC en 1984, en pr..."


In [None]:
!pip install langchain-azure-ai langchain-milvus

### **Connecting to Azure AI Service**

In [None]:
import os

# API configuration.
# The endpoint and key are read from the environment so that no secret is
# stored in the notebook; either value is None when its variable is not set.
inference = os.getenv("INFERENCE_ENDPOINT")
apikey = os.getenv("API_KEY")

# Model names passed to the Azure AI clients below.
embedding_model_name = "text-embedding-3-small"
chat_model_name = "gpt-4o"  # one of: "Phi-4", "gpt-4o", "DeepSeek-R1", "Mistral-Nemo"

### **Loading Azure AI Models**

In [None]:
from langchain_azure_ai.chat_models import AzureAIChatCompletionsModel
from langchain_azure_ai.embeddings import AzureAIEmbeddingsModel

# --- 1. Initialize the Azure AI Models ---

try:
    # --- 1.a) Initialize Chat Model ---
    chat_model = AzureAIChatCompletionsModel(
        endpoint=inference,
        credential=apikey,
        model_name=chat_model_name,
    )
    print("Chat Model initialized successfully.")

    # --- 1.b) Initialize Embeddings Model ---
    embeddings_model = AzureAIEmbeddingsModel(
        endpoint=inference,
        credential=apikey,
        model_name=embedding_model_name,
    )
    print("Embeddings Model initialized successfully.")

except Exception as e:
    print(f"An error occurred during initialization: {e}")
    # Re-raise instead of swallowing: every later cell needs `chat_model` and
    # `embeddings_model`, so continuing would only surface as a confusing
    # NameError far away from the real cause.
    raise

Chat Model initialized successfully.
Embeddings Model initialized successfully.


### **Initializing Milvus vector database**

In [None]:
from langchain_milvus import Milvus

# --- 2. Connect to Milvus ---

# Open the Milvus database file kept on Google Drive, using the embeddings
# model created above as the store's embedding function.
vectorstore = Milvus(
    connection_args={"uri": "/content/drive/MyDrive/milvus-cahiers-du-foot.db"},
    embedding_function=embeddings_model,
)

print("Milvus vectorstore loaded successfully.")

Milvus vectorstore loaded successfully.


### **Implementing the RAG Architecture**

In [None]:
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# --- 3. RAG Architecture ---

# --- 3.a) Retrieves relevant documents from the vectorstore for a given query. ---
def get_relevant_documents(vectorstore, query, k=7):
    """Return the documents the vector store considers most similar to `query`.

    Args:
        vectorstore: Object exposing `as_retriever(...)` (the Milvus store here).
        query: Text to search for.
        k: Number of documents requested from the similarity search.

    Returns:
        Whatever the retriever's `invoke(query)` returns.
    """
    search_options = {"search_type": "similarity", "search_kwargs": {"k": k}}
    return vectorstore.as_retriever(**search_options).invoke(query)


# --- 3.b) Assembles the chat prompt with the system message and documents. ---
def create_chat_prompt(retrieved_docs, question):
    """Build the chat messages sent to the model for one question.

    The instructions, the retrieved excerpts and the user question are all
    rendered from a single template via `ChatPromptTemplate.from_template`.

    Args:
        retrieved_docs: Iterable of documents exposing `page_content`.
        question: The user's question, inserted verbatim into the prompt.

    Returns:
        The message list produced by `format_messages`.
    """
    instructions = """You are a document analyst specializing in football (soccer) articles.
        The user will submit a question and provide you with relevant excerpts from football articles.
        Your task is to answer the question based ONLY on the information in these excerpts. Be concise and clear and list your sources at the end of each section.
        If the provided excerpts do not contain the answer, or if the question is not related to football,
        respond with "I don't know".
        Assume the user knows basic football terminology, no need to explain basic concepts.
        Answer in English.

        Specifically, for questions asking for a summary of different viewpoints, you MUST identify the various opinions expressed in the documents and present them as bullet points. Each bullet point should clearly state the opinion and be supported by information from the provided excerpts. Be as objective as possible, representing each viewpoint accurately.

        Here are excerpts from football articles:\n\n{context}

        Here is the question:\n\n{question}
    """

    # Excerpts are separated by a blank line so the model can tell them apart.
    excerpts = "\n\n".join([doc.page_content for doc in retrieved_docs])

    return ChatPromptTemplate.from_template(instructions).format_messages(
        context=excerpts,
        question=question,
    )


# --- 3.c) Invokes the chat model and returns the response. ---
def generate_chat_response(model, messages):
    """Ask the chat model for a reply and return its text.

    Args:
        model: Object exposing `invoke(messages)` whose result has `.content`.
        messages: Prompt messages to send.

    Returns:
        The reply's `content`, or the string "Error generating response." if
        invoking the model (or reading its content) raises.
    """
    try:
        reply_text = model.invoke(messages).content
    except Exception as err:
        print(f"Error during model invocation: {err}")
        return "Error generating response."
    return reply_text


# --- 3.d) Formats the answer to include the sources (article titles/URLs). ---
def format_response(answer, retrieved_docs):
    """Append the list of source articles (title / URL) to the model's answer.

    Several retrieved chunks can come from the same article, so identical
    source lines are listed only once, in first-seen order.

    Args:
        answer: Text returned by the chat model.
        retrieved_docs: Documents whose `metadata` may hold 'title' and 'url'.

    Returns:
        `answer` unchanged when it contains "I don't know"; otherwise the
        answer followed by a "Sources:" section, or by "No Sources Found"
        when there are no documents.
    """
    if "I don't know" in answer:
        return answer

    source_list = []
    seen = set()
    for doc in retrieved_docs:
        # Tolerate documents that carry no metadata at all.
        metadata = getattr(doc, "metadata", None) or {}
        title = metadata.get('title')
        url = metadata.get('url')

        title_part = f"Title: {title}" if title else "Title not found"
        url_part = f"URL: {url}" if url else "URL not found"
        source_str = f"{title_part}, {url_part}"

        # Skip sources already listed (chunks of the same article).
        if source_str not in seen:
            seen.add(source_str)
            source_list.append(source_str)

    if not source_list:
        return f"{answer}\nNo Sources Found"

    sources = "\nSources:\n" + "\n".join(source_list)
    return f"{answer}\n{sources}"

### **Executing the RAG Pipeline**

In [None]:
# --- 4. Main RAG Pipeline Function ---
def rag_pipeline(vectorstore, question):
    """Run the basic RAG flow for one question.

    Steps: retrieve documents for `question`, build the prompt from them,
    invoke the notebook-level `chat_model`, and append the sources.

    Args:
        vectorstore: Vector store to retrieve documents from.
        question: The user's question.

    Returns:
        The formatted answer, or "Error generating response." if any step
        raises (so callers such as the Gradio UI always receive a string).
    """
    try:
        retrieved_docs = get_relevant_documents(vectorstore, question)
        messages = create_chat_prompt(retrieved_docs, question)
        answer = generate_chat_response(chat_model, messages)
        return format_response(answer, retrieved_docs)

    except Exception as e:
        print(f"Error in RAG pipeline: {e}")
        # Return an explicit message instead of falling through with None.
        return "Error generating response."

### **Testing the Chatbot**

In [None]:
# --- 5. Testing the RAG Architecture ---

# --- Example Usage 1 ---
# Viewpoint-summary question: the prompt built by create_chat_prompt tells the
# model to present the differing opinions as bullet points for such requests.
question = "Summarize in bullet points the point of view on the usage of video assisted refereeing ?"
answer = rag_pipeline(vectorstore, question)
print(answer)

- **Video Assisted Refereeing (VAR) usage is inconsistent and problematic**: Either VAR is used excessively, interrupting the flow of matches, or it is scarcely employed, making its expanded implementation seem unnecessary. This inconsistency creates a fundamental issue in collective sports.  
  - Source: Provided excerpt.

- **Training of video referees is questioned**: There is a suggestion that video referees may require better training to address the challenges and inconsistencies in VAR application.  
  - Source: Provided excerpt.

- **The issue of VAR is deemed insoluble**: The article expresses skepticism regarding finding a viable solution to VAR's inconsistent application in football.  
  - Source: Provided excerpt.

Sources:
Title: Vidéo : le rugby dans l’engrenage - Les Cahiers du football || magazine de foot et d'eau fraîche, URL: https://www.cahiersdufootball.net/article/arbitrage-video-le-rugby-dans-l-engrenage-5005?page=2
Title: Vidéo : le rugby dans l’engrenage - Les Ca

In [None]:
# --- Example Usage 2 ---
# Broad question about the site as a whole; the answer is still built only
# from the retrieved excerpts (7 documents by default).
question = "What are the main topics or themes covered in the articles on Les Cahiers du Football website?"
answer = rag_pipeline(vectorstore, question)
print(answer)

Based on the provided excerpts, the main topics or themes covered in the articles on *Les Cahiers du Football* website include:

- **Seasonal Journal and Football News**: Chronicling small and major events (“faits et méfaits”) in football throughout the season. This includes humorous and critical analysis of football happenings.
  - Source: "En fil rouge, un journal de la saison , chronique des petits et des grands faits (méfaits compris) de la trépidante actu du football."

- **Stats and Categorization**: Focus on intriguing statistics and “footballégories,” offering data-driven reflections and classifications regarding football.
  - Source: "des stats choisies, des footballégories."

- **Special Euro Coverage**: Dedicated sections focusing on the Euro football tournament.
  - Source: "Bien sûr, un cahier spécial Euro achève l'ouvrage."

- **Critical Opinions on Players and Teams**: Articles that critically analyze players (e.g., Bafétimbi Gomis described as "l’escroquerie à 15 millio

In [None]:
# --- Example Usage 3 ---
# Off-topic (non-football) question: the prompt instructs the model to reply
# "I don't know" in this case, and format_response then adds no sources.
question = "Explain the concept of manifest destiny"
answer = rag_pipeline(vectorstore, question)
print(answer)

I don't know.


### **EXTRA --> Advanced RAG Architecture: Query Expansion and Relevance Implementation**

In [None]:
# --- 6. Advanced RAG Architecture ---

from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# --- 6.a) Query Expansion Implementation (using LLM) ---
def create_query_expansion_chain(llm):
    """Build the LLMChain that rewrites a football query with extra keywords.

    Args:
        llm: Language model the chain will call.

    Returns:
        An LLMChain taking a `query` input and exposing its result under the
        `expanded_query` output key.
    """
    expansion_template = """You are an expert at expanding search queries.
    The user will provide a search query related to football (soccer).
    Your task is to expand the query by adding relevant keywords and synonyms to improve search results.
    Return the expanded query.

    Original Query: {query}
    Expanded Query:"""

    expansion_prompt = PromptTemplate(
        input_variables=["query"],
        template=expansion_template,
    )
    return LLMChain(output_key="expanded_query", prompt=expansion_prompt, llm=llm)


# --- 6.b) Relevance Scoring Implementation (using LLM) ---
def score_document_relevance(document, query, llm):
    """Build the LLMChain that rates how relevant a document is to a query.

    Note that this returns the chain, not a score: the caller runs the chain
    with `document` and `query` inputs and parses the reply. The `document`
    and `query` arguments are accepted for interface compatibility but are
    not used when building the chain.

    The prompt asks the model to finish with a line of the exact form
    "(Score: N)", which is the marker the pipeline's parser looks for; the
    previous wording ("Relevance Score: (Score)") did not pin the reply
    format to that marker.

    Args:
        document: Unused here; supplied again when the chain is run.
        query: Unused here; supplied again when the chain is run.
        llm: Language model the chain will call.

    Returns:
        An LLMChain with inputs `document` and `query` and output key
        `relevance_score`.
    """
    prompt_template = """You are an expert at determining the relevance of a document to a search query.
    The user will provide a document and a query.
    Your task is to score the relevance of the document to the query on a scale of 1 to 10, where 1 is not relevant and 10 is highly relevant.
    First give a one-sentence justification for the score.
    Then end your reply with a final line in exactly this format, where N is the integer score:
    (Score: N)

    Document: {document}
    Query: {query}
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["document", "query"])
    chain = LLMChain(llm=llm, prompt=prompt, output_key="relevance_score")
    return chain


# --- 6.c) Modified Main RAG Pipeline Function ---
def _parse_relevance_score(score_text, default=5):
    """Extract the integer score from the scoring model's free-text reply.

    Accepts the common shapes of the reply, e.g. "Relevance Score: 8",
    "(Score: 8)", "Relevance Score: (8)" or "**Score:** 8/10". When several
    matches exist, the last one is used because the prompt asks for the score
    after the justification. Returns `default` when no score is found.
    """
    import re  # local import keeps this cell independent of the import cells

    matches = re.findall(r"score[*\s]*:[*\s(\[]*(\d+)", str(score_text), flags=re.IGNORECASE)
    if not matches:
        return default
    return int(matches[-1])


def advanced_rag_pipeline(vectorstore, question, llm, min_score=7):
    """Run RAG with LLM-based query expansion and per-document relevance filtering.

    Steps: expand the question into a richer search query, retrieve documents
    for the expanded query, have the LLM score each document against the
    original question, keep those scoring at least `min_score`, then generate
    and format the answer from the kept documents only.

    Args:
        vectorstore: Vector store to retrieve documents from.
        question: The user's question.
        llm: Model used for expansion, scoring and the final answer.
        min_score: Minimum relevance score (inclusive) a document needs to be
            kept. A reply whose score cannot be parsed counts as 5.

    Returns:
        The formatted answer (sources are appended once, by format_response),
        or "Error generating response." if any step raises.
    """
    try:
        expansion_chain = create_query_expansion_chain(llm)
        # `invoke` returns a dict that includes the chain's output key.
        expanded_query = expansion_chain.invoke({"query": question})["expanded_query"]
        print(f"Expanded Query: {expanded_query}")

        retrieved_docs = get_relevant_documents(vectorstore, expanded_query)

        relevant_docs = []
        for doc in retrieved_docs:
            scoring_chain = score_document_relevance(document=doc.page_content, query=question, llm=llm)
            score_results = scoring_chain.invoke(
                {"document": doc.page_content, "query": question}
            )["relevance_score"]
            if _parse_relevance_score(score_results) >= min_score:
                relevant_docs.append(doc)

        messages = create_chat_prompt(relevant_docs, question)
        answer = generate_chat_response(llm, messages)
        # format_response already appends the sources (and deliberately omits
        # them for "I don't know" answers), so nothing is added afterwards.
        return format_response(answer, relevant_docs)

    except Exception as e:
        print(f"Error in RAG pipeline: {e}")
        return "Error generating response."

In [None]:
# --- 7. Example Usage of Advanced RAG Pipeline ---
# The same chat model is passed as `llm`, so it performs the query expansion,
# the per-document relevance scoring and the final answer generation.
question = "What are the controversies around VAR in football?"
advanced_answer = advanced_rag_pipeline(vectorstore, question, chat_model)
print(advanced_answer)

  return LLMChain(llm=llm, prompt=prompt, output_key="expanded_query")
  expanded_query = expansion_chain.run(question)


Expanded Query: What are the controversies around VAR in football? | Video Assistant Referee debates in soccer | VAR issues and criticisms in football | VAR problems in soccer matches | Controversial VAR decisions in football | Arguments for and against VAR in soccer | Impact of Video Assistant Referees on football games | Disputed VAR calls in football tournaments | Technology in football refereeing challenges | VAR and refereeing controversies in soccer | Football officiating debates surrounding VAR
- **Claim that VAR undermines the referee's authority**  
  Some argue that VAR diminishes the authority of referees on the pitch and creates confusion about decision-making responsibilities. Referees may rely too heavily on video technology, which can lead to delays and erode trust in their judgment.

- **Inconsistency in decisions made using VAR**  
  Critics point out that VAR decisions lack consistency, with similar incidents often leading to different outcomes due to subjective inter

## **Creating a Chatbot Interface**

In [None]:
!pip install gradio
import gradio as gr
from functools import partial

### **Using the Basic RAG Architecture**



In [None]:
# Gradio UI for the basic pipeline. `partial` pre-binds the vector store, so
# the text typed by the user is passed to rag_pipeline as `question`.
iface = gr.Interface(
    title="Football Article Chatbot",
    description="Ask questions about football (soccer) articles. The chatbot will answer based on the provided excerpts.",
    fn=partial(rag_pipeline, vectorstore),
    inputs=gr.Textbox(label="Your Question:", placeholder="Ask me anything about football!", lines=3),
    outputs=gr.Textbox(show_label=True, label="Chatbot's Answer:"),
    theme="soft",
    css=""".gradio-container {background-color: #f0f2f5;}.output-textbox {font-size: 18px;}""",
)

In [None]:
# Launch the interface.
# share=True makes Gradio publish a temporary public URL (see the output
# below), so anyone with the link can query the chatbot while it is running.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8dd4086f5cadaa6709.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




### **Using the Advanced RAG Architecture**

In [None]:
# Gradio UI for the advanced pipeline. `partial` pre-binds the vector store
# and the LLM, so the text typed by the user is passed as `question`.
iface = gr.Interface(
    title="Football Article Chatbot (Advanced RAG)",
    description="Ask questions about football (soccer) articles. The chatbot uses query expansion and relevance scoring.",
    fn=partial(advanced_rag_pipeline, vectorstore, llm=chat_model),
    inputs=gr.Textbox(label="Your Question:", placeholder="Ask me anything about football!", lines=3),
    outputs=gr.Textbox(show_label=True, label="Chatbot's Answer:"),
    theme="soft",
    css=""".gradio-container {background-color: #f0f2f5;}.output-textbox {font-size: 18px;}""",
)

# Start the app; share=True also publishes a temporary public URL.
iface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2990721dc49cc4a00d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


