In [None]:
import json
import csv

#
# --- Process Documents ---
#

# Set your file paths
input_file = "data/dragonball/jsonl/dragonball_docs.jsonl"
output_file = "data/dragonball/docs_en.csv"

data = []
field_names = set()

# Read the JSONL file (one JSON object per line) and keep only the records
# whose "language" field is "en". The union of all keys seen across the kept
# records becomes the CSV header.
with open(input_file, "r", encoding="utf-8") as jsonl_file:
    for line in jsonl_file:
        obj = json.loads(line)
        if obj["language"] == "en":
            data.append(obj)
            field_names.update(obj.keys())

# Write to CSV
if data:
    # A set has no stable iteration order (string hashing is randomized per
    # interpreter run), so passing it directly as `fieldnames` would shuffle
    # the CSV columns from run to run. Sorting gives a reproducible layout.
    # Mode "x" is exclusive creation: open() raises FileExistsError instead of
    # overwriting an existing output file.
    with open(output_file, "x", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=sorted(field_names))
        writer.writeheader()
        writer.writerows(data)
else:
    print("No data found in the input file.")

In [None]:
import json
import csv

#
# --- Process Queries ---
#


# Set your file paths
input_file = "data/dragonball/jsonl/dragonball_queries.jsonl"
output_file = "data/dragonball/queries_en.csv"

data = []
field_names = set()

# Read the JSONL file (one JSON object per line) and keep only the records
# whose "language" field is "en". The union of all keys seen across the kept
# records becomes the CSV header.
with open(input_file, "r", encoding="utf-8") as jsonl_file:
    for line in jsonl_file:
        obj = json.loads(line)
        if obj["language"] == "en":
            data.append(obj)
            field_names.update(obj.keys())

# Write to CSV
if data:
    # A set has no stable iteration order (string hashing is randomized per
    # interpreter run), so passing it directly as `fieldnames` would shuffle
    # the CSV columns from run to run. Sorting gives a reproducible layout.
    # Mode "x" is exclusive creation: open() raises FileExistsError instead of
    # overwriting an existing output file.
    with open(output_file, "x", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=sorted(field_names))
        writer.writeheader()
        writer.writerows(data)
else:
    print("No data found in the input file.")

In [None]:
#
# --- Flatten Queries ---
#


# Set your file paths
input_file = "data/dragonball/jsonl/dragonball_queries.jsonl"
output_file = "data/dragonball/queries_en_flattened.csv"

data = []
field_names = set()

# Stream the JSONL input: every line is parsed as one JSON record.
with open(input_file, "r", encoding="utf-8") as source:
    for raw_line in source:
        record = json.loads(raw_line)
        if record["language"] == "en":
            # Detach the two nested objects from the record ...
            nested_truth = record.pop("ground_truth")
            nested_query = record.pop("query")

            # ... and re-attach their entries at the top level under
            # dotted "<parent>.<key>" column names.
            record.update(
                {f"ground_truth.{name}": item for name, item in nested_truth.items()}
            )
            record.update(
                {f"query.{name}": item for name, item in nested_query.items()}
            )

            data.append(record)
            # Iterating a dict yields its keys, so this collects the column names.
            field_names.update(record)

# Emit the CSV with alphabetically ordered columns; mode "x" makes open()
# fail rather than overwrite an existing file.
if not data:
    print("No data found in the input file.")
else:
    field_names_list = sorted(field_names)
    with open(output_file, "x", newline="", encoding="utf-8") as sink:
        csv_writer = csv.DictWriter(sink, fieldnames=field_names_list)
        csv_writer.writeheader()
        csv_writer.writerows(data)

In [None]:
#
# --- add token count to docs ---
#

import pandas as pd
import tiktoken

# Tokenizer setup. The encoding for gpt-4.1-mini was not published when this was
# written; "o200k_base" is an assumption — TODO confirm.
encoding = tiktoken.get_encoding("o200k_base")

# Load the documents table.
# NOTE(review): the documents cell of this notebook writes
# "data/dragonball/docs_en.csv", while this cell reads
# "data/dragonball/en/docs.csv" — confirm the file is moved/renamed in between.
df = pd.read_csv("data/dragonball/en/docs.csv")

# Count tokens per row of the 'content' column: cast each value to str,
# encode it, and store the length of the resulting token list.
content_text = df["content"].astype(str)
df["num_tokens"] = content_text.apply(lambda text: len(encoding.encode(text)))

# Write the augmented table to a new CSV file, without the index column.
df.to_csv("data/dragonball/en/docs_with_tokens.csv", index=False)