## Clean TechQA

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import json

In [None]:
# Root directory of the local TechQA copy. Every split path below is derived
# from it, so relocating the dataset only requires editing this one line.
TECHQA_ROOT = "/home/brachmat/phd/datasets/TechQA"

# Input files per split: "qa" is the question/answer JSON, "doc" is the
# technote JSON that load_split() uses to look up each question's document.
# "train" and "dev" point at the same technote file.
qa_paths = {
    "train": {
        "qa": f"{TECHQA_ROOT}/training_and_dev/training_Q_A.json",
        "doc": f"{TECHQA_ROOT}/training_and_dev/training_dev_technotes.json",
    },
    "dev": {
        "qa": f"{TECHQA_ROOT}/training_and_dev/dev_Q_A.json",
        "doc": f"{TECHQA_ROOT}/training_and_dev/training_dev_technotes.json",
    },
    "validation": {
        "qa": f"{TECHQA_ROOT}/validation/validation_reference.json",
        "doc": f"{TECHQA_ROOT}/validation/validation_technotes.json",
    },
}

def safe_int(x):
    """Convert ``x`` to ``int``, falling back to ``-1`` when that is impossible.

    Args:
        x: Any value. Callers in this notebook pass raw JSON fields, which may
            be ``None`` when the key is absent.

    Returns:
        ``int(x)`` on success, otherwise the sentinel ``-1``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or an unsupported type.
        # ValueError: non-numeric text, or float NaN.
        # OverflowError: float infinity, which int() refuses to convert.
        return -1

def _clean_str(value):
    """Return ``value`` as a whitespace-stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def load_split(name, qa_path, doc_path):
    """Load one split into a flat DataFrame with one row per question.

    Args:
        name: Label stored in the ``split`` column of every returned row.
        qa_path: Path to a JSON file holding a list of question records. The
            keys read are ``QUESTION_ID``, ``QUESTION_TEXT``, ``ANSWER``,
            ``DOCUMENT``, ``START_OFFSET``, ``END_OFFSET`` and ``ANSWERABLE``.
        doc_path: Path to a JSON file holding a mapping from document id to a
            document entry (``title`` plus ``text`` or ``content``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``split``, ``id``, ``context``,
        ``title``, ``question``, ``answer``, ``answer_start``, ``answer_end``
        and ``answerable``. The columns are present even when the split
        contains no questions.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    columns = [
        "split", "id", "context", "title", "question",
        "answer", "answer_start", "answer_end", "answerable",
    ]

    with open(qa_path, "r", encoding="utf-8") as f:
        qa_data = json.load(f)
    with open(doc_path, "r", encoding="utf-8") as f:
        doc_data = json.load(f)

    records = []
    for q in qa_data:
        # dict.get(key, "") only covers a *missing* key; a key that is present
        # with a JSON null yields None, so every text field is normalised
        # through _clean_str instead of calling .strip() directly.
        passage_id = _clean_str(q.get("DOCUMENT"))
        passage_entry = doc_data.get(passage_id) or {}

        # Prefer "text", fall back to "content", and never propagate None.
        passage_text = passage_entry.get("text") or passage_entry.get("content") or ""

        records.append({
            "split": name,
            "id": _clean_str(q.get("QUESTION_ID")),
            "context": passage_text,
            "title": _clean_str(passage_entry.get("title")),
            "question": _clean_str(q.get("QUESTION_TEXT")),
            "answer": _clean_str(q.get("ANSWER")),
            # Offsets that are missing or non-numeric become -1 (see safe_int).
            "answer_start": safe_int(q.get("START_OFFSET")),
            "answer_end": safe_int(q.get("END_OFFSET")),
            # 1 only when the flag is "Y" (case-insensitive), otherwise 0.
            "answerable": int(_clean_str(q.get("ANSWERABLE")).upper() == "Y"),
        })

    # Passing the column list keeps the schema stable for an empty split.
    return pd.DataFrame(records, columns=columns)

# Load every configured split and stack the per-split frames into one table
# with a fresh 0..n-1 index.
df = pd.concat(
    [
        load_split(split_name, split_paths["qa"], split_paths["doc"])
        for split_name, split_paths in qa_paths.items()
    ],
    ignore_index=True,
)

# Keep only rows whose answer AND question are not the empty string.
df = df.loc[(df["answer"] != "") & (df["question"] != "")]

# Disabled: stratified 70/30 re-split of "dev" into train/validation.
# Kept for reference only; none of it runs.
#
# dev_df = df[df["split"] == "dev"]
# dev_train, dev_val = train_test_split(dev_df, test_size=0.3, random_state=42, stratify=dev_df["answerable"])
# dev_train = dev_train.copy()
# dev_val = dev_val.copy()
# dev_train["split"] = "train"
# dev_val["split"] = "validation"
# df = pd.concat([df[df["split"] != "dev"], dev_train, dev_val], ignore_index=True)
# df["split"] = df["split"].replace("dev", "validation")

# Persist the cleaned table without the index column.
df.to_csv("raw/techqa.csv", index=False)

In [None]:
# --- Load data ---
# df_augmented: the augmented TechQA rows.
# ds: the cleaned TechQA table saved to raw/techqa.csv earlier in this notebook.
df_augmented = pd.read_csv(filepath_or_buffer="raw/augmented_techqa.csv")
ds = pd.read_csv(filepath_or_buffer="raw/techqa.csv")

In [2]:
# Number of augmented rows every original question must have to be kept.
AUGS_PER_ORIGINAL = 9
# Size of the small review sample: 24 originals x 9 augmentations = 216 rows.
SAMPLE_ROWS = 24 * AUGS_PER_ORIGINAL

# --- Filter augmented rows to those whose orig_id exists in the cleaned TechQA table ---
sampled_ids = set(ds["id"])
df_filtered = df_augmented[df_augmented["orig_id"].isin(sampled_ids)]

# --- Drop rows where aug_id contains 'ERR' (handle NaNs safely) ---
mask_err = df_filtered["aug_id"].astype(str).str.contains("ERR", na=False)
df_no_err = df_filtered[~mask_err]

# --- Keep only orig_id that appear exactly AUGS_PER_ORIGINAL (9) times ---
counts = df_no_err["orig_id"].value_counts()
valid_ids = counts[counts == AUGS_PER_ORIGINAL].index
df_clean = df_no_err[df_no_err["orig_id"].isin(valid_ids)].copy()

assert (df_clean["orig_id"].value_counts() == AUGS_PER_ORIGINAL).all(), "Some orig_id do not have exactly 9 rows."

# --- Build mapping: orig_id -> original answer text from the TechQA table (ds) ---
want_ids = set(df_clean["orig_id"])

id2answer = {}
# Walk the two columns in row order; this visits the same rows in the same
# order as DataFrame.iterrows() without building a Series per row.
for qid, answer in zip(ds["id"], ds["answer"]):
    if qid in want_ids:
        # read_csv parses empty / NA-like cells as float NaN, and len(NaN)
        # would raise TypeError. Missing answers are stored as "".
        id2answer[qid] = "" if pd.isna(answer) else answer
        if len(id2answer) == len(want_ids):
            break  # every wanted id has been resolved

df_clean["orig_answer"] = df_clean["orig_id"].map(id2answer)

missing = df_clean["orig_answer"].isna().sum()
if missing:
    print(f"Warning: {missing} rows had no matching orig_answer in ds.")

# Empty annotation columns, presumably filled in during manual review.
df_clean['is_valid'] = None
df_clean['reason'] = None
df_clean['notes'] = None

df_clean.to_csv("final/ds_techqa_aug.csv", index=False)
# NOTE(review): this takes the first 216 rows. It equals 24 complete originals
# only if each orig_id's rows are contiguous in df_clean; confirm.
df_clean.iloc[0:SAMPLE_ROWS].to_csv("final/ds_techqa_aug_sample24.csv", index=False)

print(f"df_augmented: {len(df_augmented):,} rows")
print(f"df_filtered (in sampled ids): {len(df_filtered):,} rows")
print(f"dropped ERR rows: {mask_err.sum():,}")
print(f"df_clean (only orig_id with exactly 9 rows): {len(df_clean):,} rows")
print(f"unique orig_id in df_clean: {df_clean['orig_id'].nunique():,}")

df_augmented: 5,048 rows
df_filtered (in sampled ids): 5,021 rows
dropped ERR rows: 5
df_clean (only orig_id with exactly 9 rows): 4,986 rows
unique orig_id in df_clean: 554


## Clean SQuADv2

In [10]:
import pandas as pd
from datasets import load_dataset

# --- Load data ---
# df_augmented: the augmented SQuAD v2 rows.
# ds: the "train" split of the local SQuAD v2 dataset.
df_augmented = pd.read_csv(filepath_or_buffer="raw/augmented_squadv2.csv")
ds = load_dataset(path="/home/brachmat/phd/datasets/squad_v2", split="train")

In [11]:
# Sampling parameters; the fixed seed keeps the shuffle reproducible.
SQUAD_HEAD_ROWS = 91_000
SQUAD_SAMPLE_SIZE = 600
SQUAD_SAMPLE_SEED = 42
# Number of augmented rows every original question must have to be kept.
AUGS_PER_ORIGINAL = 9
# Size of the small review sample: 24 originals x 9 augmentations = 216 rows.
SAMPLE_ROWS = 24 * AUGS_PER_ORIGINAL

# --- Take the first 91k examples, shuffle them, then keep 600 ---
ds_sampled = (
    ds.select(range(SQUAD_HEAD_ROWS))
    .shuffle(seed=SQUAD_SAMPLE_SEED)
    .select(range(SQUAD_SAMPLE_SIZE))
)

# --- Filter augmented rows to only those with orig_id in sampled ids ---
sampled_ids = set(ds_sampled["id"])
df_filtered = df_augmented[df_augmented["orig_id"].isin(sampled_ids)]

# --- Drop rows where aug_id contains 'ERR' (handle NaNs safely) ---
mask_err = df_filtered["aug_id"].astype(str).str.contains("ERR", na=False)
df_no_err = df_filtered[~mask_err]

# --- Keep only orig_id that appear exactly AUGS_PER_ORIGINAL (9) times ---
counts = df_no_err["orig_id"].value_counts()
valid_ids = counts[counts == AUGS_PER_ORIGINAL].index
df_clean = df_no_err[df_no_err["orig_id"].isin(valid_ids)].copy()

assert (df_clean["orig_id"].value_counts() == AUGS_PER_ORIGINAL).all(), "Some orig_id do not have exactly 9 rows."

# --- Build mapping: orig_id -> original answer text from SQuAD v2 ---
want_ids = set(df_clean["orig_id"])

id2answer = {}
# Every id in want_ids passed the sampled_ids filter above, so the 600 sampled
# examples contain all of them and the full training split need not be scanned.
# This relies on question ids being unique in ds.
for ex in ds_sampled:
    qid = ex["id"]
    if qid in want_ids:
        texts = ex["answers"]["text"]
        id2answer[qid] = texts[0] if len(texts) > 0 else ""  # "" for unanswerable

df_clean["orig_answer"] = df_clean["orig_id"].map(id2answer)

missing = df_clean["orig_answer"].isna().sum()
if missing:
    print(f"Warning: {missing} rows had no matching orig_answer in ds.")

# Empty annotation columns, presumably filled in during manual review.
df_clean['is_valid'] = None
df_clean['reason'] = None
df_clean['notes'] = None


df_clean.to_csv("final/ds_squadv2_aug.csv", index=False)
ds_sampled.to_pandas().to_csv("final/ds_squadv2.csv", index=False)
# NOTE(review): this takes the first 216 rows. It equals 24 complete originals
# only if each orig_id's rows are contiguous in df_clean; confirm.
df_clean.iloc[0:SAMPLE_ROWS].to_csv("final/ds_squadv2_aug_sample24.csv", index=False)

print(f"df_augmented: {len(df_augmented):,} rows")
print(f"df_filtered (in sampled ids): {len(df_filtered):,} rows")
print(f"dropped ERR rows: {mask_err.sum():,}")
print(f"df_clean (only orig_id with exactly 9 rows): {len(df_clean):,} rows")
print(f"unique orig_id in df_clean: {df_clean['orig_id'].nunique():,}")

df_augmented: 820,401 rows
df_filtered (in sampled ids): 5,396 rows
dropped ERR rows: 2
df_clean (only orig_id with exactly 9 rows): 5,382 rows
unique orig_id in df_clean: 598
