In [25]:
import json
import os
import re
import spacy

# Blank English pipeline: no trained components are loaded, only what is
# added explicitly below.
nlp = spacy.blank("en")
# Add the "sentencizer" component so that `doc.sents` (used by
# get_sentences) is available. As the last expression of the cell, this call
# also displays the returned component's repr as cell output.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x16ab55750>

In [32]:
def get_sentences(doc):
    """Segment ``doc`` into sentences with the module-level ``nlp`` pipeline.

    Args:
        doc: Raw text to segment.

    Returns:
        A list with one whitespace-stripped string per detected sentence,
        in document order.
    """
    parsed = nlp(doc)
    stripped = []
    for span in parsed.sents:
        stripped.append(span.text.strip())
    return stripped

def filter_document(doc, min_sentence_length=None):
    """Rebuild ``doc`` from its sentences, optionally dropping short ones.

    Args:
        doc: Raw text; it is split into sentences by ``get_sentences``.
        min_sentence_length: When truthy, sentences with fewer
            whitespace-separated tokens than this value are discarded.

    Returns:
        The kept sentences joined by single spaces.
    """
    kept = get_sentences(doc)
    if min_sentence_length:
        kept = list(
            filter(lambda s: len(s.split()) >= min_sentence_length, kept)
        )
    return " ".join(kept)

def preprocess_case_data(
    text,
    max_length=None,
    min_sentence_length=None,
    uncased=False,
    filter_min_length=None,
):
    """Clean one case fragment or paragraph into a single line of text.

    Processing order:
      1. Strip, turn newlines into spaces and delete the literal markers
         ``FRAGMENT_SUPPRESSED``, ``FACTUAL``, ``BACKGROUND`` and ``ORDER``.
      2. Optionally lowercase.
      3. Collapse every whitespace run to one space.
      4. Remove the first ``[<digits>]`` marker, if any.
      5. Optionally reject short texts, drop short sentences, and truncate.
      6. Make sure the result ends with a period.

    Args:
        text: Raw input text.
        max_length: When truthy, keep at most this many whitespace-separated
            words.
        min_sentence_length: When truthy, sentences with fewer words than
            this are removed via ``filter_document``.
        uncased: When true, lowercase the text.
        filter_min_length: When truthy, return ``None`` for texts whose word
            count is less than or equal to this value.

    Returns:
        The cleaned text ending in ``"."``, or ``None`` when the text is
        rejected by ``filter_min_length``.
    """
    # Marker removal runs before lowercasing, so it is case-sensitive.
    # NOTE(review): these are plain substring replacements, so an uppercase
    # word that merely contains a marker (e.g. "ORDERS") is also altered -
    # confirm this is intended.
    text = (
        text.strip()
        .replace("\n", " ")
        .replace("FRAGMENT_SUPPRESSED", "")
        .replace("FACTUAL", "")
        .replace("BACKGROUND", "")
        .replace("ORDER", "")
    )
    if uncased:
        text = text.lower()

    text = re.sub(r"\s+", " ", text).strip()

    # Only the first bracketed number is removed; later ones are kept.
    cite_number = re.search(r"\[[0-9]+\]", text)
    if cite_number:
        start, end = cite_number.span()
        before = text[:start].strip()
        after = text[end:].strip()
        # Strip the joined result: when the marker sits at the very start or
        # end, one side is empty and the joining space would otherwise be
        # left dangling (" text" or "text .").
        text = (before + " " + after).strip()

    if filter_min_length:
        words = text.split()
        if len(words) <= filter_min_length:
            return None

    if min_sentence_length:
        text = filter_document(text, min_sentence_length)
    if max_length:
        words = text.split()[:max_length]
        text = " ".join(words)
    if not text.endswith("."):
        text = text + "."
    return text

def create_data():
    """Build one record per training case from the on-disk task 2 data.

    Reads ``data/task2_train_labels_2025.json`` and every case directory
    under ``data/task2_train_files_2025``.

    Returns:
        A list of dicts, one per case directory in sorted order. Each dict
        has the keys ``query_id`` (the case directory name),
        ``entailed_fragment`` (raw file contents), ``doc_ids`` (sorted
        paragraph file names), ``docs`` (preprocessed paragraph texts,
        aligned with ``doc_ids``) and ``qrels`` (one dict per paragraph whose
        ``relevance`` is 1 when the paragraph id is listed under the case in
        the labels file, else 0). A case directory that contains no entries
        yields an empty dict.
    """
    with open("data/task2_train_labels_2025.json", "r") as f:
        labels = json.load(f)

    dataset = []
    for case in sorted(os.listdir("data/task2_train_files_2025")):
        case_dir = f"data/task2_train_files_2025/{case}"
        data = {}
        # The record only depends on the case, so it is built once per case
        # directory; an empty directory keeps its empty record.
        if os.listdir(case_dir):
            with open(f"{case_dir}/entailed_fragment.txt", "r") as f:
                entailed_fragment = f.read()

            doc_ids = sorted(os.listdir(f"{case_dir}/paragraphs"))
            docs = []
            for doc_id in doc_ids:
                with open(f"{case_dir}/paragraphs/{doc_id}", "r") as f:
                    docs.append(preprocess_case_data(f.read()))

            data["query_id"] = case
            data["entailed_fragment"] = entailed_fragment
            data["doc_ids"] = doc_ids
            data["docs"] = docs
            data["qrels"] = [{
                "query_id": case,
                "doc_id": doc_id,
                "relevance": int(doc_id in labels[case])
            } for doc_id in doc_ids]
        dataset.append(data)
    return dataset

def create_data_2():
    """Build tab-separated docs, queries and qrels lines for the training set.

    Reads ``data/task2_train_labels_2025.json`` and every case directory
    under ``data/task2_train_files_2025``. Cases whose preprocessed query is
    empty/``None`` are skipped entirely (with a printed warning).

    Returns:
        A ``(docs, queries, qrels)`` tuple of string lists:
        ``docs`` holds ``"<paragraph file>\\t<text>"``, ``queries`` holds
        ``"<case>\\t<text>"`` and ``qrels`` holds
        ``"<case>\\t<paragraph file>\\t<0|1>"``.
    """
    docs, queries, qrels = [], [], []
    with open("data/task2_train_labels_2025.json", "r") as f:
        labels = json.load(f)

    for case in sorted(os.listdir("data/task2_train_files_2025")):
        case_path = os.path.join("data/task2_train_files_2025", case)

        # Query: the entailed fragment of the case is the query text.
        with open(os.path.join(case_path, "entailed_fragment.txt"), "r") as f:
            query_text = preprocess_case_data(f.read())
        if query_text:  # Ensure query text is not None after preprocessing
            queries.append(f"{case}\t{query_text}")
        else:
            print(f"Warning: Skipping query for case {case} due to preprocessing returning None")
            continue

        # Docs: every paragraph file of the case is one candidate document.
        # NOTE(review): the doc id is the bare paragraph file name; if file
        # names repeat across cases, ids in `docs` will repeat too - confirm
        # that downstream tooling expects this.
        paragraph_dir = os.path.join(case_path, "paragraphs")
        for candidate in sorted(os.listdir(paragraph_dir)):
            with open(os.path.join(paragraph_dir, candidate), "r") as f:
                doc_text = preprocess_case_data(f.read())
            if doc_text:  # Ensure doc text is not None
                docs.append(f"{candidate}\t{doc_text}")
            else:
                print(f"Warning: Skipping doc {candidate} in case {case} due to preprocessing returning None")

            # Qrels: map query_id (case) to doc_id (candidate) with relevance.
            # This line is written even when the doc itself was skipped above.
            relevance = 1 if candidate in labels[case] else 0
            qrels.append(f"{case}\t{candidate}\t{relevance}")

    return docs, queries, qrels

# Build the in-memory docs / queries / qrels line lists; the next cell writes
# them to disk.
docs, queries, qrels = create_data_2()


In [33]:
# Write each line list to its own file, one entry per line (no trailing
# newline). The encoding is pinned to UTF-8 because validate_tsv reads these
# files back as UTF-8; the platform default is not UTF-8 everywhere.
for out_path, lines in (
    ("data/task2-collie-2025-docs.tsv", docs),
    ("data/task2-collie-2025-queries.tsv", queries),
    ("data/task2-collie-2025-qrels.qrels", qrels),
):
    with open(out_path, 'w', encoding="utf-8") as f:
        f.write("\n".join(lines))

In [34]:
import csv
def validate_tsv(file_path, expected_columns=2):
    """Print a diagnostic for every malformed line of a tab-separated file.

    Checks, per line: the number of tab-separated columns, a non-empty first
    column (doc id) and, when present, a non-empty second column (text).

    Args:
        file_path: Path of the UTF-8 encoded TSV file to check.
        expected_columns: Number of columns every line must have.

    Returns:
        None. Problems are reported with ``print`` only.
    """
    # newline='' is the mode the csv module asks for. QUOTE_NONE makes double
    # quotes ordinary characters: with the default quoting, a field that
    # starts with '"' is read as a quoted field and an unbalanced quote
    # swallows the following lines.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for i, row in enumerate(reader, 1):
            if len(row) != expected_columns:
                print(f"Error on line {i} in {file_path}: Expected {expected_columns} columns, got {len(row)}: {row}")
            if not row:
                # Blank line: already reported above, nothing left to index.
                continue
            if not row[0]:
                print(f"Error on line {i} in {file_path}: Missing doc_id")
            if len(row) > 1 and not row[1]:
                print(f"Error on line {i} in {file_path}: Missing text")
# Check the docs file written in the previous cell, using the default of two
# tab-separated columns per line.
validate_tsv("data/task2-collie-2025-docs.tsv")

In [6]:
import json

def _load_json(path):
    """Read and parse a single JSON file."""
    with open(path, "r") as handle:
        return json.load(handle)


# NOTE(review): these are absolute, machine-specific paths; the cell only
# runs where the files exist at exactly these locations.
train_bm25_scores = _load_json("/Users/hieungo/Downloads/train_bm25_scores.json")
dev_bm25_scores = _load_json("/Users/hieungo/Downloads/dev_bm25_scores.json")
test_bm25_scores = _load_json("/Users/hieungo/Downloads/test_bm25_scores.json")

In [14]:
# Merge the three splits into one mapping keyed by case; when the same key
# appears in more than one split, the later one (dev, then test) wins.
# NOTE(review): this rebinds the builtin `all` for the rest of the session.
# A later cell reads this name, so any rename has to be made in both places.
all = {**train_bm25_scores, **dev_bm25_scores, **test_bm25_scores}

In [22]:
# One "case<TAB>candidate<TAB>score" line per (case, candidate) pair, in the
# iteration order of the merged mapping.
scores = []

for case, candidates in all.items():
    scores.extend(
        f"{case}\t{candidate}\t{score}"
        for candidate, score in candidates.items()
    )
    


In [23]:
# One scored line per entry, no trailing newline. The encoding is pinned to
# UTF-8 so the output does not depend on the platform's default encoding.
with open("data/task2-collie-2025-scoreddocs.tsv", 'w', encoding="utf-8") as f:
    f.write("\n".join(scores))