In [2]:
!pip install -q requests beautifulsoup4 tqdm

You should consider upgrading via the 'C:\Users\Eunha\Documents\Personal work\ValorantAssistant\myenv\Scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
from bs4 import BeautifulSoup
import requests
import json
from tqdm import tqdm


## URLS

In [2]:
BASE_URL = "https://valorant.fandom.com"
AGENTS_PAGE = "https://valorant.fandom.com/wiki/Agents"

In [3]:
def get_agent_links():
    res = requests.get(AGENTS_PAGE)
    soup = BeautifulSoup(res.text, "html.parser")
    agent_links = []
    # Find the main agents table
    table = soup.find("table", class_="wikitable sortable")
    if not table:
        print("Could not find agents table")
        return []

    rows = table.find_all("tr")
    for row in rows:
        # Get the second cell which has agent link
        cells = row.find_all("td")
        if len(cells) > 1:
            cell = cells[1]
            link = cell.find("a")
            if link and link.get("href") and "/wiki/" in link.get("href"):
                agent_links.append(BASE_URL + link.get("href"))
    return list(set(agent_links))


def parse_agent_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")
    
    try:
        # Get basic info
        name = soup.find("h1", id="firstHeading").text.strip()

        info_box = soup.find_all('section', class_='pi-item pi-group pi-border-color pi-collapse pi-collapse-open')
        bio_box = info_box[0]
        game_details_box= info_box[1]
        bio_info = {}
        if bio_box:
            bio_field_mapping = {
                'realname': 'real_name',
                'aliases': 'aliases',
                'pronouns': 'pronouns',
                'origin': 'origin',
                'race': 'race',
                'number': 'agent_number',
                
            }

            # Find all data items in the info box
            for data_item in bio_box.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
           
                source = data_item.get('data-source')
                if source in bio_field_mapping:
                    value_div = data_item.find('div', class_='pi-data-value')
                    if value_div:
                        bio_info[bio_field_mapping[source]] = ' '.join(value_div.stripped_strings)

        #game details box
        game_info = {}
        if game_details_box:
            game_field_mapping = {
                'role': 'role',
                'role_link': 'role_link',
                'basic': 'basic',
                'basic_link': 'basic_link',
                'signature': 'signature',
                'signature_link': 'signature_link',
                'ultimate': 'ultimate',    
                'ultimate_link': 'ultimate_link',
            }
    
            for data_item in game_details_box.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
                source = data_item.get('data-source')

                if source in game_field_mapping:
                    value_div = data_item.find('div', class_='pi-data-value')
                    if value_div:
                        game_info[game_field_mapping[source]] = ' '.join(value_div.stripped_strings)
    
                    link = data_item.find("a")
                    if link and link.get("href") and "/wiki/" in link.get("href"):
                        game_info[game_field_mapping[source+'_link']] = BASE_URL + link.get("href")
        return {
            "agent": name,
            # "role": role,
            "biography": bio_info,  # Add the biographical information
            "abilities": game_info,
            "url": url
        }

    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return None

In [4]:
def main():
    agent_links = get_agent_links()
    
    all_agents = []

    for link in tqdm(agent_links):
        agent_data = parse_agent_page(link)
        if agent_data:
            all_agents.append(agent_data)

    with open("data/valorant_agents.json", "w", encoding="utf-8") as f:
        json.dump(all_agents, f, indent=2, ensure_ascii=False)

    print(f"✅ Done! Scraped {len(all_agents)} agents.")

main()

100%|██████████| 27/27 [00:09<00:00,  2.87it/s]

✅ Done! Scraped 27 agents.





In [88]:
!pip -q install langchain langchain-openai chromadb python-dotenv langchain-chroma



You should consider upgrading via the 'C:\Users\Eunha\Documents\Personal work\ValorantAssistant\myenv\Scripts\python.exe -m pip install --upgrade pip' command.


# Convert JSON to plain text chunks

In [5]:
import json

with open("data/valorant_agents.json", "r", encoding="utf-8") as f:
    agents = json.load(f)

from langchain.schema.document import Document

# Your agent loop
docs = []
for agent in agents:
    name = agent["agent"]
    bio = agent["biography"]
    ab = agent["abilities"]

    text = f"""Agent: {name}
    Real Name: {bio.get("real_name", "Unknown")}
    Pronouns: {bio.get("pronouns", "Unknown")}
    Origin: {bio.get("origin", "Unknown")}
    Race: {bio.get("race", "Unknown")}
    Agent Number: {bio.get("agent_number", "Unknown")}
    Role: {ab.get("role", "Unknown")}

    Abilities:
    - Basic: {ab.get("basic", "N/A")}
    - Signature: {ab.get("signature", "N/A")}
    - Ultimate: {ab.get("ultimate", "N/A")}
    Wiki: {agent.get("url", "")}
    """


    # Merge all fields into metadata
    metadata = {
        "agent": name,
        "role": ab.get("role"),
        **bio,  # flatten biography into metadata
        "agent_url": agent.get("url"),
        "basic_link": ab.get("basic_link"),
        "signature_link": ab.get("signature_link"),
        "ultimate_link": ab.get("ultimate_link")
    }
    docs.append(Document(page_content=text.strip(), metadata=metadata))

    
import json
from langchain.schema.document import Document
exportable_docs = []
for doc in docs:
    exportable_docs.append({
        "content": doc.page_content,
        "metadata": doc.metadata
    })

# Save to file
with open("data/valorant_chunks.json", "w", encoding="utf-8") as f:
    json.dump(exportable_docs, f, indent=2, ensure_ascii=False)

# Embed the text chunks

In [6]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os

# Load OpenAI API key
load_dotenv()

# Set up embedding model
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Create or load Chroma vector store
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding,
    persist_directory="data/chroma_store"
)

print("✅ Data embedded and stored in Chroma!")


✅ Data embedded and stored in Chroma!


# BACKUP, DELETE IF NEEDED

In [125]:
# Close existing Chroma connection
if 'vectorstore' in globals():
    del vectorstore
    print("Closed existing Chroma connection")
# Delete existing Chroma store
import shutil
import os

# Check if the directory exists before trying to delete it
if os.path.exists("data/chroma_store"):
    print("Deleting existing Chroma store...")
    shutil.rmtree("data/chroma_store")
    print("✅ Existing store deleted")

Deleting existing Chroma store...


PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'data/chroma_store\\20d75822-54f4-40ff-8f96-5977f852dd7e\\data_level0.bin'

In [36]:
from collections import Counter

# Get all documents from the store
all_docs = vectorstore.get()
if all_docs and 'metadatas' in all_docs:
    # Count roles
    roles = [doc['role'] for doc in all_docs['metadatas']]
    role_counts = Counter(roles)
    print("Distribution of agents by role:")
    for role, count in role_counts.items():
        print(f"{role}: {count} agents")
    
    # Count unique agents
    agents = [doc['agent'] for doc in all_docs['metadatas']]
    unique_agents = set(agents)
    print(f"\nTotal unique agents: {len(unique_agents)}")
    
    # Test a few queries to verify embeddings
    test_queries = [
        "which agent is from Australia?",
        "Which agent is from Korea?"
    ]
    
    print("\nTesting queries:")
    for query in test_queries:
        print(f"\nQuery: {query}")
        results = vectorstore.similarity_search_with_score(query, k=3)
        for doc, score in results:
            print(f"Score: {score:.4f} - {doc.metadata['agent']} ({doc.metadata['role']})")

Distribution of agents by role:
Controller: 6 agents
Initiator: 7 agents
Sentinel: 6 agents
Duelist: 8 agents

Total unique agents: 27

Testing queries:

Query: which agent is from Australia?
Score: 1.1419 - Skye (Initiator)
Score: 1.2342 - Phoenix (Duelist)
Score: 1.2455 - Chamber (Sentinel)

Query: Which agent is from Korea?
Score: 1.1517 - KAY/O (Initiator)
Score: 1.1697 - Iso (Duelist)
Score: 1.1821 - Yoru (Duelist)


# Query the DB

In [41]:
# Load existing Chroma
vectorstore = Chroma(
    persist_directory="data/chroma_store",
    embedding_function=embedding
)

# Query it
# Example queries you could try:

query = "Which agent is from China?"
results = vectorstore.similarity_search(query, k=1)

# Print results
for doc in results:
    print("Matched Agent:", doc.metadata["agent"])
    print(doc.page_content)

    #This is any metadata that is not part of the content that i want to include for the agent
    print("🔗 Link:", doc.metadata["basic_link"])


Matched Agent: Iso
Agent: Iso
    Real Name: Li Zhao Yu
    Pronouns: He/Him
    Origin: Chongqing, China
    Race: Radiant
    Agent Number: 24
    Role: Duelist

    Abilities:
    - Basic: Contingency Undercut
    - Signature: Double Tap
    - Ultimate: Kill Contract
    Wiki: https://valorant.fandom.com/wiki/Iso
🔗 Link: https://valorant.fandom.com/wiki/Contingency


# Query with metadata filtering

In [27]:
filtered_docs = vectorstore.similarity_search(
    "Which agent uses knives?",
    k=2,
    filter={"role": "Duelist"}
)

for doc in filtered_docs:
    print(doc.metadata["agent"], "-", doc.metadata["origin"])

Iso - Chongqing, China
Waylay - Thailand


In [53]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
import os
from dotenv import load_dotenv
load_dotenv()
from langchain.prompts import PromptTemplate

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=OPENAI_API_KEY,
)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    # search_kwargs={"k": 3}  # Increased to get more context
)



custom_prompt = PromptTemplate.from_template("""
You are a Valorant assistant that provides detailed information about agents and their abilities.

When answering questions:
1. Use the context to provide a comprehensive answer
2. If the question is about abilities, include the relevant URLs from the metadata. If the metadata doesn't contain the requested URL, say "URL not found in metadata"
3. Format your response in a clear, organized way
4. If the question is not about Valorant, respond with "Sorry, please ask a question related to Valorant."

For example, if asked about an agent's abilities, include both the description from the context AND the relevant URLs from the metadata.

Context:
{context}

Question:
{question}

Answer: Provide a detailed answer and include relevant URLs from the metadata when discussing abilities.
""")
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": custom_prompt},
    return_source_documents=True
)

query = "What is jett's basic ability?"
result = qa_chain.invoke({"query": query})

print("💬 Answer:\n", result["result"])
# print("🔍 Metadata:", result["source_documents"][0].metadata["basic_link"])  # This will show us the actual metadata value
# Optional: view source docs used
for doc in result["source_documents"]:
    print("📄 Source:", doc.metadata["agent"])


💬 Answer:
 Jett's basic ability is **Cloudburst**. This ability allows her to throw a projectile that expands into a brief vision-blocking cloud on impact with the ground. It is particularly useful for obscuring visibility, aiding in her mobility, and creating opportunities to engage or disengage in fights.

Additionally, Jett has another basic ability called **Updraft** which allows her to propel herself upward, giving her access to high ground or allowing her to escape.

For more detailed information about Jett and her abilities, you can visit her wiki page here: [Jett Wiki](https://valorant.fandom.com/wiki/Jett).
📄 Source: Jett
📄 Source: Killjoy
📄 Source: Tejo
📄 Source: Deadlock
