In [1]:
# Get the combined unique queries from the test collection as a txt file
import os

# Monthly query files of the test collection, listed in chronological order.
# The order matters: when a qid occurs in several files, the first file wins.
QUERIES_DIR = "datasets/LongEval-Web/LongEval Test Collection/queries"
MONTHS = ["2023-03", "2023-04", "2023-05", "2023-06", "2023-07", "2023-08"]
file_paths = [f"{QUERIES_DIR}/{month}_queries.txt" for month in MONTHS]

# Use a dictionary to ensure unique qid (keeps the first query seen for each qid)
qid_to_query = {}

for file_path in file_paths:
    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            # Each non-empty line must look like "<qid>\t<query>".
            qid, tab, query = line.partition("\t")
            if not tab:
                # Fail with the exact location instead of an opaque
                # "not enough values to unpack" error.
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<qid>\\t<query>', got {line!r}"
                )
            # setdefault only stores the value when the qid is new (first seen wins).
            qid_to_query.setdefault(qid.strip(), query.strip())

# Optional: sort by numeric qid (raises ValueError if a qid is not an integer)
sorted_entries = sorted(qid_to_query.items(), key=lambda entry: int(entry[0]))

# Write to output file, one "<qid>\t<query>" line per unique qid
with open("combined_unique_queries.txt", "w", encoding="utf-8") as f:
    for qid, query in sorted_entries:
        f.write(f"{qid}\t{query}\n")


In [None]:
# Convert the combined tab-separated query file into a CSV with a header row.
# The imports live in this cell so it also works after "Restart Kernel -> Run All"
# (pandas is otherwise only imported further down in the notebook).
import csv

import pandas as pd

(
    pd.read_csv(
        "combined_unique_queries.txt",
        sep="\t",
        names=["qid", "query"],
        dtype=str,               # keep qids exactly as written (no int/float inference)
        keep_default_na=False,   # queries such as "null", "NA" or "nan" are real text, not missing values
        quoting=csv.QUOTE_NONE,  # the input is plain TSV: a '"' inside a query is just a character
    )
    .to_csv("unique_queries_2023-03_to_2023-08.csv", index=False)
)

In [9]:
import csv
import os
import time

import requests

# --- Configuration ---
# The API key is read from the environment so it never gets saved in the notebook.
api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    # Without a key every single request would fail; stop before sending thousands of them.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
csv_file_path = 'unique_queries_2023-03_to_2023-08.csv'
api_url = "https://api.openai.com/v1/chat/completions"
model_name = "gpt-4o-mini"
max_length = 200            # max characters kept from an answer; half of it is used as max_tokens
REQUEST_TIMEOUT = 60        # seconds to wait for one API response before giving up on that attempt
MAX_RETRIES = 3             # extra attempts after a network error, HTTP 429 or HTTP 5xx
RETRY_BACKOFF_SECONDS = 2   # first retry delay; doubled on every further retry
rows = []
base, ext = os.path.splitext(csv_file_path)
new_csv_file_path = f"{base}_updated{ext}"
context = "Try to be sure about it. Categorize the request into one of the following categories, responding only with the category name: time-independent (timeless information not tied to a specific time or event, e.g., definitions, recipes, general rules), explicit-time (requests with explicit time references, e.g., years, dates, specific periods), event (requests about specific events, e.g., Named public events, Scheduled institutional events, Historical events), or timeliness (time-sensitive or current information where up-to-date info or availability matters e.g., weather, stock prices, live updates, buying intent, tax rates). Categorization Process: 1. Look for explicit time references (e.g., years, dates). Assign to explicit-time if present. 2. Check for event-related terms. Assign to event if applicable. 3. If the request requires real-time or current information, assign to timeliness. 4. If the request is timeless and not tied to time or events, assign to time-independent. Only respond with the category name. Examples: definition of gravity, chess rules → time-independent; World War II, US president 1990 → explicit-time; Cannes festival 2025, French Revolution → event; Apple stock price, weather Lyon → timeliness"
#context = "Categorize the query as one of the following and respond only with the category name: not-temporal (no direct or indirect time reference; relevance is static—e.g., definitions, rules, recipes), explicit-temporal (mentions a specific date, year, or defined time period—e.g., 'October 2024', 'last election'), or implicit-temporal (no explicit date, but relevance depends on current or recent context—e.g., weather, stock prices, travel, or buying intent, where up-to-date info or availability matters)."
QUERY_LIMIT = None  # Set to None to process all

# Request headers are identical for every call, so they are built once.
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}


def classify_query(session, query):
    """Ask the chat-completions endpoint to categorize one query.

    Args:
        session: requests.Session used to send the request (carries the headers).
        query: the raw query text taken from the CSV.

    Returns:
        The stripped model answer truncated to ``max_length`` characters, or a
        string starting with "Error: " when no successful response was obtained.
    """
    payload = {
        "model": model_name,
        # Separate the query from the instructions; without a separator the
        # last word of the query is glued to the first word of the instructions.
        "messages": [{"role": "user", "content": f"{query}\n\n{context}"}],
        "max_tokens": max_length // 2  # Approximate token count for response
    }
    last_error = "Error: no response"
    for attempt in range(MAX_RETRIES + 1):
        try:
            response = session.post(api_url, json=payload, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Network-level failure (timeout, connection reset, ...): retry below.
            last_error = f"Error: {exc}"
        else:
            if response.status_code == 200:
                content = response.json()['choices'][0]['message']['content']
                # content may be null in the JSON response; treat that as an empty answer.
                return (content or "").strip()[:max_length]
            last_error = "Error: " + response.text
            if response.status_code != 429 and response.status_code < 500:
                # Other client errors (bad key, bad request) will not succeed on retry.
                return last_error
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_BACKOFF_SECONDS * 2 ** attempt)
    return last_error


# Read CSV file (explicit UTF-8 so accented queries do not depend on the OS locale)
with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    fieldnames = list(reader.fieldnames or []) + ['Answer']  # Add 'Answer' column
    rows.extend(reader)

# Apply limit if set
if QUERY_LIMIT:
    rows = rows[:QUERY_LIMIT]

# Process each row and send request to OpenAI API
try:
    with requests.Session() as session:
        session.headers.update(headers)
        for row_counter, row in enumerate(rows, start=1):
            row['Answer'] = classify_query(session, row['query'])

            if row_counter % 100 == 0:
                print(f"{row_counter} rows processed...")
finally:
    # Always write what has been collected so far: an interruption or an unexpected
    # error must not throw away the answers already obtained. Rows that were not
    # reached have no 'Answer' key and are written with an empty Answer field.
    with open(new_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"Updated CSV file created at: {new_csv_file_path}")

100 rows processed...
200 rows processed...
300 rows processed...
400 rows processed...
500 rows processed...
600 rows processed...
700 rows processed...
800 rows processed...
900 rows processed...
1000 rows processed...
1100 rows processed...
1200 rows processed...
1300 rows processed...
1400 rows processed...
1500 rows processed...
1600 rows processed...
1700 rows processed...
1800 rows processed...
1900 rows processed...
2000 rows processed...
2100 rows processed...
2200 rows processed...
2300 rows processed...
2400 rows processed...
2500 rows processed...
2600 rows processed...
2700 rows processed...
2800 rows processed...
2900 rows processed...
3000 rows processed...
3100 rows processed...
3200 rows processed...
3300 rows processed...
3400 rows processed...
3500 rows processed...
3600 rows processed...
3700 rows processed...
3800 rows processed...
3900 rows processed...
4000 rows processed...
4100 rows processed...
4200 rows processed...
4300 rows processed...
4400 rows processed.

In [None]:
#Individual csv for test subcollections
import pandas as pd

# Paths
combined_csv_path = "unique_queries_2023-03_to_2023-08_updated.csv"  # big CSV with all queries & categories
specific_queries_txt = "datasets/LongEval-Web/LongEval Test Collection/queries/2023-08_queries.txt"  # the txt file for specific timeframe
output_csv_path = "2023-08_categorized_queries.csv"     # output filtered CSV

# Step 1: Load combined CSV
# Everything is read as plain text: qids keep their exact spelling, and queries or
# answers such as "null", "NA" or "nan" are not turned into missing values
# (which would be written back as empty cells).
combined_df = pd.read_csv(combined_csv_path, dtype=str, keep_default_na=False)

# Step 2: Load qids from the specific txt file (first tab-separated field of each non-empty line)
with open(specific_queries_txt, "r", encoding="utf-8") as f:
    qids_in_file = [line.split("\t")[0].strip() for line in f if line.strip()]

# Step 3: Filter combined_df by qids present in txt file
filtered_df = combined_df[combined_df["qid"].isin(qids_in_file)]

# Sanity check: report qids of this subcollection that have no row in the combined CSV,
# since they would otherwise be dropped without any notice.
missing_qids = set(qids_in_file) - set(filtered_df["qid"])
if missing_qids:
    print(f"Warning: {len(missing_qids)} qids from {specific_queries_txt} are missing in {combined_csv_path}")

# Step 4: Save filtered dataframe to new CSV
filtered_df.to_csv(output_csv_path, index=False)

print(f"Filtered CSV saved with {len(filtered_df)} entries for {specific_queries_txt}")
