# HW4: QA Agent

## Dependencies and LLM Backbone

In [124]:
# !pip install langchain==1.0.5
# !pip install langchain-core
# !pip install langchain-community
# !pip install faiss-cpu
# !pip install kagglehub
# !Install DuckDuckGo search dependency
# !pip install -U ddgs
device = "cuda"  # "cpu" or "cuda"

In [125]:
# This is the list of countries we are using (with their official languages)
# Feel free to use it in your code
list_of_countries = {}
with open("countries_with_languages.tsv", "r"  ) as f:
    for line in f.readlines():
        country, langs = line.strip().split("\t")
        list_of_countries[country] = langs.split(",")

### Choice 1: OpenAI API

The notebook's implementation is based on this.
Feel free to change the model, and please keep track of your usage on the "Usage" page on [LiteLLM API webpage](https://ai-gateway.andrew.cmu.edu/ui/).

In [126]:
# !pip install langchain-openai

In [127]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import getpass, os

os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API Key: ")
openai_model_id = "gpt-5"
openai_embmodel_id = "azure/text-embedding-3-small"

llm = ChatOpenAI(
    model=openai_model_id,
    api_key=os.environ["OPENAI_API_KEY"],
    base_url="https://ai-gateway.andrew.cmu.edu/"
)
embeddings = OpenAIEmbeddings(
    model=openai_embmodel_id,
    api_key=os.environ['OPENAI_API_KEY'],
    base_url='https://ai-gateway.andrew.cmu.edu/'
)

### Choice 2: Hugging Face Models

You may also use Hugging Face models without API credits if you have available GPU resource. You might have to the change prompt templates according to your model choice.

In [128]:
# !pip install langchain-huggingface text-generation transformers google-search-results 
# !pip install numexpr langchainhub sentencepiece sentence-transformers jinja2 bitsandbytes accelerate

In [129]:
# import getpass, os
# from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline, HuggingFaceEmbeddings

# os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Enter your Hugging Face API key: ")
# hgf_model_id = "Qwen/Qwen3-0.6B"
# hgf_embmodel_id = "sentence-transformers/all-mpnet-base-v2"

# hgf_model = HuggingFacePipeline.from_model_id(
#     model_id=hgf_model_id,
#     task="text-generation",
#     pipeline_kwargs=dict(
#         max_new_tokens=128,
#         do_sample=False,
#     ),
# )
# hgf_llm = ChatHuggingFace(hgf_model)
# hgf_embeddings = HuggingFaceEmbeddings(model_name=hgf_embmodel_id)

## Handling different type of questions

Implement the answer formatting and extraction for each question type. You may change the prompt to fit your processing function.

In [130]:
from langchain.agents import create_agent

### üó∫Ô∏èGlobal Trekker

In [131]:
# # Hardened web_search: import inside try to avoid ddgs crash
# from langchain.tools import tool

# @tool
# def web_search(query: str) -> str:
#     """
#     Searches the web for information about locations, landmarks, and geographic features.
#     Useful for identifying cities and countries from descriptive clues.
#     """
#     try:
#         from langchain_community.tools import DuckDuckGoSearchRun
#         search = DuckDuckGoSearchRun()
#         try:
#             results = search.run(query)
#             return results
#         except Exception as e:
#             return f"Search error: {str(e)}"
#     except Exception as e:
#         return (
#             "DuckDuckGo search dependency missing or failed to load. "
#             "Install with: pip install -U ddgs. Error: " + str(e)
#         )

In [132]:
global_trekker_messages = [
    {"role": "system", "content": "You are an expert in world knowledge."},
    {"role": "user", "content": """Given the following paragraph, guess the most likely country and city. Answer in the format of [country], [city]. If the paragraph is generic or only country-level, return "Unknown" for city."""},
]
global_trekker = create_agent(model=llm, tools=[])

In [133]:
def extract_global_trekker_answer(response: str) -> tuple[str, str]:
    # TODO: Extract the country and city from the response
    # return country, city
    import re
    
    chars = r'\[([^\]]+)\]\s*,\s*\[([^\]]+)\]'
    match = re.search(chars, response)
    
    if match:
        country = match.group(1).strip()
        city = match.group(2).strip()
        return country, city
    
    # Fallback: look for two items separated by comma
    # Often LLMs will say something like "United States, Pittsburgh"
    lines = response.strip().split('\n')
    for line in lines:
        # Skip lines that are too long (likely explanations)
        if len(line) > 100:
            continue
        # Look for comma-separated values
        if ',' in line:
            parts = line.split(',')
            if len(parts) >= 2:
                country = parts[0].strip().strip('[]"\'')
                city = parts[1].strip().strip('[]"\'')
                # Clean up common prefixes
                for prefix in ['The answer is', 'Answer:', 'Location:', 'Country:', 'City:']:
                    country = country.replace(prefix, '').strip()
                    city = city.replace(prefix, '').strip()
                return country, city
    
    # Last resort: return empty strings
    return "", ""

In [134]:
# Test run your extration function before using it in the main loop!
extract_global_trekker_answer("AAA, BBB")

('AAA', 'BBB')

### üçΩÔ∏èCulinary Detective

In [135]:
import kagglehub
import pandas as pd

def gather_recipe_data(kaggledataset: str) -> list[str]:
    dataset_path = kagglehub.dataset_download(kaggledataset)
    df = pd.read_csv(f"{dataset_path}/Receipes from around the world.csv", encoding='latin-1')
    
    # Process the dataframe to list of text entries for retrieval
    # Format each recipe as structured text for better retrieval
    recipes = []
    for _, row in df.iterrows():
        # Create a readable text representation of each recipe
        recipe_parts = []
        for col in df.columns:
            value = row[col]
            # Skip NaN values and format nicely
            if pd.notna(value) and str(value).strip():
                recipe_parts.append(f"{col}: {value}")
        
        # Join all parts into a single text entry
        recipe_text = ". ".join(recipe_parts)
        recipes.append(recipe_text)
    
    return recipes

In [136]:
# from langchain_community.vectorstores import FAISS
# from langchain_core.documents import Document
# from langchain.tools import tool

# recipes = gather_recipe_data("prajwaldongre/collection-of-recipes-around-the-world")
# docs = [Document(page_content=recipe) for recipe in recipes]
# vector = FAISS.from_documents(docs, embeddings)
# retriever = vector.as_retriever(search_kwargs={"k": 2})

## RAG Tool
I created the following:
- a folder to store embeddings and faiss index
- a rag pipeline file
- a file that exposes the rag pipeline as a tool. 
- I am importing the tool in here. 

In [137]:
from langchain.tools import tool
from rag_system.rag_pipeline import CulinaryRAG

# Load the RAG engine and retriever
rag = CulinaryRAG()
retriever = rag.load_index()  # returns vectorstore.as_retriever()

@tool
def retrieve_culinary_context(query: str):
    """
    Retrieves culinary information relevant to country/region origin detection.
    Takes a descriptive query (ingredients, cooking method, spices) and performs vector retrieval.
    """
    docs = retriever.invoke(query)
    return "\n".join([d.page_content for d in docs])

In [138]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("prajwaldongre/collection-of-recipes-around-the-world")

print("Path to dataset files:", path)

Path to dataset files: /home/sagemaker-user/.cache/kagglehub/datasets/prajwaldongre/collection-of-recipes-around-the-world/versions/1


In [139]:
@tool
def retrieve_recipes(query: str):
  """
  Retrieves recipes based on a search query.
  """
  return retriever.invoke(query)

culinary_detective_messages = [
    {"role": "system", "content": """
You are an expert culinary anthropologist. From ingredients and a short description, identify the country and the SPECIFIC region within that country where the dish is most associated.

You can consult a retrieval tool bound to this agent (retrieve_culinary_context). Use it when helpful; otherwise reason from your knowledge of ingredients, techniques, and named dishes.
"""},
    {"role": "human", "content": """
Task:
Return ONLY a single line in the exact format: [Country], [Region]
- Country: full official name (e.g., "United States", not "USA").
- Region: a specific intra-country region (e.g., North, South, East, West, Central, Northeast, etc.).
- Use "All" only if the dish is truly nationwide.
- If no region is identifiable, write "Unknown".
- Do NOT include any explanation before or after the bracketed answer.

Cues to consider:
- Ingredients (grains, staple flours, spice blends), cooking methods, named dishes, iconic sides.
- Example mappings:
  * Brazil: p√£o de queijo / tapioca flour / queijo minas ‚Üí [Brazil], [South]
  * Japan: sushi / rice vinegar / tempura ‚Üí [Japan], [All]
  * India: dosa / idli / coconut ‚Üí [India], [South]; naan / paneer / tandoor ‚Üí [India], [North]
  * Ethiopia: teff / injera / wat / berbere ‚Üí [Ethiopia], [Unknown]
  * China: dim sum ‚Üí [China], [South]; hot pot (Sichuan/Chongqing-style) ‚Üí [China], [West]
  * Thailand: khao soi ‚Üí [Thailand], [North]; som tam ‚Üí [Thailand], [Northeast]

Few-shot examples:
Input: "Fermented teff flatbread served with spicy stews (wat) and berbere."
Output: [Ethiopia], [Unknown]

Input: "Cheese bread made with tapioca starch, typical with churrasco in the south."
Output: [Brazil], [South]

Input: "Batter of rice and urad dal, steamed into soft cakes, served with coconut chutney."
Output: [India], [South]

Input: "Assorted small bites with tea in bamboo steamers, hallmark of Cantonese cuisine."
Output: [China], [South]

Now produce ONLY the answer for the current input as [Country], [Region].
"""},
]

culinary_detective = create_agent(model=llm, tools=[retrieve_culinary_context])

In [140]:
result = culinary_detective.invoke({
    "messages": culinary_detective_messages + [
        {"role": "human", "content": "The dish is spicy, coconut-based, and served with rice."}
    ]
})
print(result)

{'messages': [SystemMessage(content='\nYou are an expert culinary anthropologist. From ingredients and a short description, identify the country and the SPECIFIC region within that country where the dish is most associated.\n\nYou can consult a retrieval tool bound to this agent (retrieve_culinary_context). Use it when helpful; otherwise reason from your knowledge of ingredients, techniques, and named dishes.\n', additional_kwargs={}, response_metadata={}, id='63b22b05-4c3b-4d66-8b75-3c4383d21380'), HumanMessage(content='\nTask:\nReturn ONLY a single line in the exact format: [Country], [Region]\n- Country: full official name (e.g., "United States", not "USA").\n- Region: a specific intra-country region (e.g., North, South, East, West, Central, Northeast, etc.).\n- Use "All" only if the dish is truly nationwide.\n- If no region is identifiable, write "Unknown".\n- Do NOT include any explanation before or after the bracketed answer.\n\nCues to consider:\n- Ingredients (grains, staple fl

In [141]:
def extract_culinary_detective_answer(response: str) -> tuple[str, str]:
    # TODO: Extract the country and region from the response
    # return country, region
    import re
    
    chars = r'\[([^\]]+)\]\s*,\s*\[([^\]]+)\]'
    match = re.search(chars, response)
    
    if match:
        country = match.group(1).strip()
        region = match.group(2).strip()
        return country, region
    
    lines_split = response.strip().split('\n')
    for line in lines_split:
        if len(line) > 100:
            continue
        if ',' in line:
            parts = line.split(',')
            if len(parts) >= 2:
                country = parts[0].strip().strip('[]"\'')
                region = parts[1].strip().strip('[]"\'')
                for prefix in ['The answer is', 'Answer:', 'Country:', 'Region:']:
                    country = country.replace(prefix, '').strip()
                    region = region.replace(prefix, '').strip()
                return country, region
    
    return "", ""

In [142]:
# extract_culinary_detective_answer(result)

### üëÑLingua Locale

In [143]:
lingua_locale_messages = [
    {"role": "system", "content": """
You are an expert in languages, scripts, orthography, and regional vocabulary. Determine which country's website or text a sentence most likely comes from.
"""},
    {"role": "user", "content": """
You must ONLY return a single line: [Country]. You should use the full official country name (e.g., "United States", not "USA"). 
- You should not add any other explanations. Only output the name of the country.

Here are some heuristics:
- English: colour, organise, centre ‚Üí [United Kingdom]; sidewalk, color, organize ‚Üí [United States].
- Portuguese: autocarro, factura, telem√≥vel ‚Üí [Portugal]; √¥nibus, nota fiscal, celular ‚Üí [Brazil].
- Spanish: vos, colectivo, pileta ‚Üí [Argentina]; ordenador ‚Üí [Spain]; autob√∫s ‚Üí [Mexico/Spain], coche (Spain) vs carro (LatAm).
- Chinese: Traditional characters (Ëá∫ÁÅ£„ÄÅËá∫Âåó„ÄÅÁπÅÈ´î) ‚Üí [Taiwan]; Simplified (‰∏≠ÂõΩ„ÄÅÂπøÂ∑û„ÄÅÁÆÄ‰Ωì) ‚Üí [China].
- Cyrillic specifics: –¥—ò / —í / —õ / —ô / —ö ‚Üí [Montenegro]; –¥–µ–≤–æ—ò–∫–∞ / —í–∞–∫ more typical of [Serbia].
- French: anglicisms + CAD context ‚Üí [Canada]; m√©tropolitain cues ‚Üí [France].
- Haitian Creole ‚Üí [Haiti].

Examples:
Input: "Please, colour is the preferred spelling in our centre."
Output: [United Kingdom]

Input: "Clique para imprimir a fatura no seu telem√≥vel."
Output: [Portugal]

Input: "–°–∏–Ω–æ—õ —Å–∞–º –≤–∏–¥–∏–æ –¥—ò–µ–≤–æ—ò–∫—É —É –ü–æ–¥–≥–æ—Ä–∏—Ü–∏."
Output: [Montenegro]

Input: "ÈÄôÊòØËá∫ÁÅ£Êú¨Âú∞ÁöÑÈ†ÅÈù¢„ÄÇ"
Output: [Taiwan]

Now produce ONLY [Country] for the current sentence.
"""},
]

lingua_locale = create_agent(model=llm, tools=[])

In [144]:
def extract_lingua_locale_answer(response: str) -> tuple[str, str]:
    # TODO: Extract the country and "none" from the response
    # only the first field is used, the second is a dummy field to make the return type consistent
    # return country, "none"
    import re
    
    chars = r'\[([^\]]+)\]'
    match = re.search(chars, response)
    
    if match:
        country = match.group(1).strip()
        return country, "none"
    
    lines_split = response.strip().split('\n')
    for line in lines_split:
        if len(line) > 100:
            continue
        clean_line = line.strip().strip('[]"\'')
        for prefix in ['The answer is', 'Answer:', 'Country:', 'The country is', 'This is from']:
            clean_line = clean_line.replace(prefix, '').strip()
        
        if clean_line and len(clean_line) < 50:
            clean_line = clean_line.rstrip('.,;:')
            return clean_line, "none"
    
    return "", "none"

## Answering questions
This part includes how we load the questions and generate the prediction in desired format. 

In [145]:
def geoguesser(q: dict, print_raw_response=False) -> tuple[str, str]:
    if q["type"] == "GlobalTrekker":
        query = {"role": "user", "content": f"Paragraph: {q['paragraph']}"}
        messages, agent, extractor = global_trekker_messages, global_trekker, extract_global_trekker_answer
    elif q["type"] == "CulinaryDetective":
        query = {"role": "user", "content": f"Ingredients: {q['ingredient']}. Description: {q['description']}"}
        messages, agent, extractor = culinary_detective_messages, culinary_detective, extract_culinary_detective_answer
    else: #q["type"] == "LinguaLocale":
        query = {"role": "user", "content": f"Sentence: {q['sentence']}"}
        messages, agent, extractor = lingua_locale_messages, lingua_locale, extract_lingua_locale_answer

    response_all = agent.invoke({"messages": messages + [query]})
    response = response_all["messages"][-1].content
    if print_raw_response: print(f"{q['type']}: {response_all}")
    return extractor(response)

In [146]:
import json

# Here, we load the examples questions. Public/private set will be in the same format
dataset_name = "public.jsonl"
questions = []
with open(dataset_name, "r") as f:
    for line in f.readlines():
        questions.append(json.loads(line))

In [147]:
# Test run on one question
# You might want to save the raw response for debugging answer formatting/extraction
# If the extracted answer seems off, check the raw response instead of running inference repeatedly
geoguesser(questions[0], print_raw_response=True)

GlobalTrekker: {'messages': [SystemMessage(content='You are an expert in world knowledge.', additional_kwargs={}, response_metadata={}, id='920f974d-322d-4477-8d17-b5e4921630d6'), HumanMessage(content='Given the following paragraph, guess the most likely country and city. Answer in the format of [country], [city]. If the paragraph is generic or only country-level, return "Unknown" for city.', additional_kwargs={}, response_metadata={}, id='2d6bb0e2-5419-430c-a521-92b1284f829c'), HumanMessage(content='Paragraph: Uluru, also known as Ayers Rock, is a large sandstone monolith that is sacred to the Pitjantjatjara, the Aboriginal people of the area, known as the A·πâangu. It is one of the most important indigenous sites of the country, being a popular destination for tourists since the 1930s. The area around the formation is home to many springs, waterholes, rock caves and ancient paintings. Uluru is listed as a UNESCO World Heritage Site.', additional_kwargs={}, response_metadata={}, id='6

('Australia', 'Yulara')

In [148]:
# Sample script to generate answers
from tqdm import tqdm
answers = []
for q in tqdm(questions):
    try:
        country, category = geoguesser(q)
        answers.append(f"{q['type']}\t{country}\t{category}")
    except Exception as e:
        print(f"Error processing question {q}: {e}")
        answers.append(f"{q['type']}\tUnknown\tUnknown")

with open("public.txt", "w") as f:
    for answer in answers:
        f.write(answer + "\n")

print("Saved predictions to public.txt (private set)")

 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 144/170 [15:32<05:12, 12.03s/it]

Error processing question {'type': 'LinguaLocale', 'sentence': 'ÏßÄÎÇúÏ£º Í≤ΩÍ∏∞ÎèÑ Ïù∏Ï≤úÏóêÏÑú Ï∞®Îüâ Ïö¥Ï†ÑÏûêÍ∞Ä Í∞ÄÏÜç ÌéòÎã¨ÏùÑ ÏûòÎ™ª Î∞üÏïÑ Í∏∏ÏùÑ ÏßÄÎÇòÎçò Î≥¥ÌñâÏûêÎ•º ÏπòÏñ¥ Ïà®ÏßÄÍ≤å Ìïú ÌòêÏùòÎ°ú Í≤ΩÏ∞∞ ÏàòÏÇ¨Î•º Î∞õÍ≥† ÏûàÎäî Í≤ÉÏúºÎ°ú ÌååÏïÖÎêêÏäµÎãàÎã§.', 'country': 'Korea, South', 'info': {'reference': 'https://imnews.imbc.com/news/2025/society/article/6776876_36718.html', 'hints': 'orthography, "Í≤ΩÍ∏∞ÎèÑ Ïù∏Ï≤ú" (Gyeonggi-do, Incheon)'}}: Error code: 400 - {'error': {'message': "litellm.BadRequestError: litellm.ContentPolicyViolationError: litellm.ContentPolicyViolationError: AzureException - The response was filtered due to the prompt triggering Azure OpenAI's content management policy. Please modify your prompt and retry. To learn more about our content filtering policies please read our documentation: https://go.microsoft.com/fwlink/?linkid=2198766\nmodel=gpt-5. content_policy_fallback=None. fallbacks=None.\n\nSet 'content_policy_fallback' - https://docs.litellm.a

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 170/170 [19:03<00:00,  6.73s/it]

Saved predictions to public.txt (private set)





## Evaluation
This is how we calculate the scores on Gradescope (details subject to change, but the general logic will stay the same).

In [149]:
def soft_match(answer, expectedAnswer):
    score = 0.0
    if expectedAnswer in answer:
        score = len(expectedAnswer) / len(answer)
    return score

def exact_match(answer, expectedAnswer):
    score = 0.0
    if expectedAnswer == answer:
        score = 1.0
    return score    

In [150]:
answers = []
for q in questions:
    answers.append((q["type"], q["country"], q.get("city", q.get("region", ""))))
with open("public.txt", "r") as f:
    preds = [line.split("\t") for line in f.readlines()]

scores = {"GlobalTrekker": [], "CulinaryDetective": [], "LinguaLocale": []}
for (q_type, exp_country, exp_place), (p_type, pred_country, pred_place) in zip(answers, preds):
    assert q_type == p_type
    country_score = soft_match(pred_country, exp_country)
    category_score = 0.0
    weights = [0.0, 0.0]
    if q_type == "GlobalTrekker":
        #  correct country -> 80%, correct country and city -> +20%
        weights = [0.8, 0.2]
        if country_score > 0:
            if exp_place == "None": category_score = 1.0
            else: category_score = soft_match(pred_place, exp_place)
    elif q_type == "CulinaryDetective":
        # correct country -> 60%, correct country and region -> +40%
        weights = [0.6, 0.4]
        if country_score > 0:
            if exp_place == "None": category_score = 1.0
            else: category_score = exact_match(pred_place, exp_place)
    else: # LinguaLocale
        # correct country -> 60%, matched official language -> +40%
        weights = [0.6, 0.4]
        if country_score > 0:
            category_score = 1.0
        else: # incorrect country. language match works only if pred_country is a clean answer
            exp_langs = list_of_countries.get(exp_country, [])
            pred_langs = list_of_countries.get(pred_country, [])
            if any(lang in exp_langs for lang in pred_langs):
                category_score = 1.0

    score = weights[0] * country_score + weights[1] * category_score
    scores[q_type].append(score)

for q_type, score_list in scores.items():
    avg_score = sum(score_list) / len(score_list)
    print(f"{q_type} Average Score: {avg_score:.4f}")

GlobalTrekker Average Score: 0.8684
CulinaryDetective Average Score: 0.5455
LinguaLocale Average Score: 0.9140


In [151]:
def accuracy_score(num_exact: int, num_total: int) -> float:
    acc_score = 0.0
    if num_total > 0:
        score = (num_exact / num_total)*100
    return score

# Let's calculate Precision, Recall, F1
def calculate_metrics(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1

In [152]:
# Per-type exact-match accuracy (robust)
from collections import Counter

# 1) Build expected answers per question with safe defaults
exp = []
for q in questions:
    q_type = q.get("type", "Unknown")
    exp_country = q.get("country", "Unknown")
    if q_type == "GlobalTrekker":
        exp_place = q.get("city", "Unknown")
    elif q_type == "CulinaryDetective":
        exp_place = q.get("region", "Unknown")
    else:
        exp_place = ""
    exp.append((q_type, exp_country, exp_place))

# 2) Read predictions and pad/truncate to 3 fields
preds_raw = []
with open("public.txt", "r") as f:
    for line in f.readlines():
        parts = line.rstrip("\n").split("\t")
        parts = (parts + ["Unknown", "Unknown"])[:3]
        preds_raw.append(parts)

# 3) Count totals per type
type_counts = Counter([t for t,_,_ in exp])

# 4) Exact-match counters per type
exact_counts = {"GlobalTrekker": 0, "CulinaryDetective": 0, "LinguaLocale": 0}

# 5) Iterate and count exact matches according to task rules
for (q_type, exp_country, exp_place), (p_type, pred_country, pred_place) in zip(exp, preds_raw):
    if q_type != p_type:
        # Skip mismatches to avoid misaligned files
        continue
    if q_type == "GlobalTrekker":
        if exact_match(pred_country, exp_country) == 1.0 and exact_match(pred_place, exp_place) == 1.0:
            exact_counts[q_type] += 1
    elif q_type == "CulinaryDetective":
        if exact_match(pred_country, exp_country) == 1.0 and exact_match(pred_place, exp_place) == 1.0:
            exact_counts[q_type] += 1
    else:  # LinguaLocale (country only)
        if exact_match(pred_country, exp_country) == 1.0:
            exact_counts[q_type] += 1

# 6) Compute accuracy (%) per type
def _acc(num_exact: int, num_total: int) -> float:
    return (num_exact / num_total) * 100 if num_total > 0 else 0.0

for t in ["GlobalTrekker", "CulinaryDetective", "LinguaLocale"]:
    total = type_counts.get(t, 0)
    acc = _acc(exact_counts.get(t, 0), total)
    print(f"{t} Accuracy (% exact): {acc:.4f} | exact={exact_counts.get(t,0)} / total={total}")

GlobalTrekker Accuracy (% exact): 60.6557 | exact=37 / total=61
CulinaryDetective Accuracy (% exact): 43.6364 | exact=24 / total=55
LinguaLocale Accuracy (% exact): 87.0370 | exact=47 / total=54


In [153]:
from collections import defaultdict

def calculate_metrics(tp, fp, fn):
    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1

# Containers
tp = defaultdict(int)
fp = defaultdict(int)
fn = defaultdict(int)
counts = defaultdict(int)

task_types = ["GlobalTrekker", "CulinaryDetective", "LinguaLocale"]

for (q_type, exp_country, exp_place), (p_type, pred_country, pred_place) in zip(exp, preds_raw):

    for T in task_types:

        # Ground truth condition
        gt_positive = (q_type == T)
        # Predicted condition
        pred_positive = (p_type == T)

        # Define exact match under that type
        if T == "GlobalTrekker":
            exact = (pred_country == exp_country) and (pred_place == exp_place)
        elif T == "CulinaryDetective":
            exact = (pred_country == exp_country) and (pred_place == exp_place)
        else:  # LinguaLocale
            exact = (pred_country == exp_country)

        if gt_positive:
            counts[T] += 1

        # Evaluate classification outcomes
        if gt_positive and pred_positive:
            # Either TP or FN
            if exact:
                tp[T] += 1
            else:
                fn[T] += 1
        elif (not gt_positive) and pred_positive:
            # FP
            fp[T] += 1
        elif gt_positive and (not pred_positive):
            # FN ‚Äî model failed to predict this type
            fn[T] += 1
        # TN is ignored

# Print results
for T in task_types:
    precision, recall, f1 = calculate_metrics(tp[T], fp[T], fn[T])
    print(f"{T}:  P={precision:.4f}  R={recall:.4f}  F1={f1:.4f}   "
          f"(TP={tp[T]}, FP={fp[T]}, FN={fn[T]}, Total positives={counts[T]})")


GlobalTrekker:  P=1.0000  R=0.6066  F1=0.7551   (TP=37, FP=0, FN=24, Total positives=61)
CulinaryDetective:  P=1.0000  R=0.4364  F1=0.6076   (TP=24, FP=0, FN=31, Total positives=55)
LinguaLocale:  P=1.0000  R=0.8704  F1=0.9307   (TP=47, FP=0, FN=7, Total positives=54)
