In [2]:
import json
import os
import re
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1232fb2d0>

In [3]:
def get_sentences(doc):
    doc = nlp(doc)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

def filter_document(doc, min_sentence_length=None):
    sentences = get_sentences(doc)
    if min_sentence_length:
        sentences = [sent for sent in sentences
                     if len(sent.split()) >= min_sentence_length]
    doc = " ".join(sentences)
    return doc

def preprocess_case_data(
    text,
    max_length=None,
    min_sentence_length=None,
    uncased=False,
    filter_min_length=None,
):

    text = (
        text.strip()
        .replace("\n", " ")
        .replace("FRAGMENT_SUPPRESSED", "")
        .replace("FACTUAL", "")
        .replace("BACKGROUND", "")
        .replace("ORDER", "")
    )
    if uncased:
        text = text.lower()
        
    text = re.sub(r"\s+", " ", text).strip()
    cite_number = re.search(r"\[[0-9]+\]", text)

    if cite_number:
        text = text[0: cite_number.span()[0]].strip() + ' ' + text[cite_number.span()[1] :].strip()
        
    if filter_min_length:
        words = text.split()
        if len(words) <= filter_min_length:
            return None

    if min_sentence_length:
        text = filter_document(text, min_sentence_length)
    if max_length:
        words = text.split()[:max_length]
        text = " ".join(words)
    if not text.endswith("."):
        text = text + "."
    return text

def create_data():
    dataset = []
    labels = json.load(open("data/task2_train_labels_2025.json", "r"))

    for case in sorted(os.listdir("data/task2_train_files_2025")):
        data = {}
        for candidate in os.listdir(f"data/task2_train_files_2025/{case}"):
            data["query_id"] = case
            data["entailed_fragment"] = open(f"data/task2_train_files_2025/{case}/entailed_fragment.txt", "r").read()
            data["doc_ids"] = sorted(os.listdir(f"data/task2_train_files_2025/{case}/paragraphs"))
            data["docs"] = [preprocess_case_data(open(f"data/task2_train_files_2025/{case}/paragraphs/{doc_id}", "r").read()) for doc_id in data["doc_ids"]]
            data["qrels"] = [{
                "query_id": case,
                "doc_id": doc_id,
                "relevance": int(doc_id in labels[case])
            } for doc_id in data["doc_ids"]]
        dataset.append(data)
    return dataset

def create_data_2():
    docs, queries, qrels = [], [], []
    labels = json.load(open("data/task2_train_labels_2025.json", "r"))

    for case in sorted(os.listdir("data/task2_train_files_2025")):
        content = open(os.path.join("data/task2_train_files_2025", case, "entailed_fragment.txt"), "r").read()
        docs.append({
            "query_id": case,
            "text": content
        })
        
        queries.extend([{"query_id": candidate, "text": preprocess_case_data(open(os.path.join("data/task2_train_files_2025", case, "paragraphs", candidate), "r").read())} for candidate in sorted(os.listdir(os.path.join("data/task2_train_files_2025", case, "paragraphs")))])
        qrels.extend([f"{case}\t0\t{candidate}\t{1 if candidate in labels[case] else 0}" for candidate in sorted(os.listdir(os.path.join("data/task2_train_files_2025", case, "paragraphs")))])
    return docs, queries, qrels

docs, queries, qrels = create_data_2()


In [4]:
qrels

['001\t0\t001.txt\t0',
 '001\t0\t002.txt\t0',
 '001\t0\t003.txt\t0',
 '001\t0\t004.txt\t0',
 '001\t0\t005.txt\t0',
 '001\t0\t006.txt\t0',
 '001\t0\t007.txt\t0',
 '001\t0\t008.txt\t0',
 '001\t0\t009.txt\t0',
 '001\t0\t010.txt\t0',
 '001\t0\t011.txt\t0',
 '001\t0\t012.txt\t0',
 '001\t0\t013.txt\t0',
 '001\t0\t014.txt\t0',
 '001\t0\t015.txt\t0',
 '001\t0\t016.txt\t0',
 '001\t0\t017.txt\t0',
 '001\t0\t018.txt\t0',
 '001\t0\t019.txt\t0',
 '001\t0\t020.txt\t0',
 '001\t0\t021.txt\t0',
 '001\t0\t022.txt\t0',
 '001\t0\t023.txt\t0',
 '001\t0\t024.txt\t0',
 '001\t0\t025.txt\t0',
 '001\t0\t026.txt\t0',
 '001\t0\t027.txt\t1',
 '001\t0\t028.txt\t0',
 '001\t0\t029.txt\t0',
 '001\t0\t030.txt\t0',
 '001\t0\t031.txt\t0',
 '001\t0\t032.txt\t0',
 '002\t0\t001.txt\t0',
 '002\t0\t002.txt\t0',
 '002\t0\t003.txt\t0',
 '002\t0\t004.txt\t0',
 '002\t0\t005.txt\t0',
 '002\t0\t006.txt\t0',
 '002\t0\t007.txt\t0',
 '002\t0\t008.txt\t0',
 '002\t0\t009.txt\t0',
 '002\t0\t010.txt\t0',
 '002\t0\t011.txt\t0',
 '002\t0\t0

In [5]:
with open("data/task2-collie-2025-docs.jsonl", 'w') as f:
    json.dump(docs, f, indent=4, ensure_ascii=False)
    
with open("data/task2-collie-2025-queries.jsonl", 'w') as f:
    json.dump(queries, f, indent=4, ensure_ascii=False)

with open("data/task2-collie-2025-qrels.qrels", 'w') as f:
    f.write("\n".join(qrels))