In [37]:
from pathlib import Path
import shutil
import pandas as pd
from lxml import etree
import pickle as pk

In [38]:
def copy_files():
    """Flatten the nested corpus into ``data/papers`` and ``data/summaries``.

    Recursively collects every ``Documents_xml/*.xml`` paper and every
    ``summary/*.gold.txt`` summary found below ``data/top1000_complete`` and
    copies them into two flat directories:

    * ``data/papers/<stem>.xml``
    * ``data/summaries/<stem without the trailing ".gold">.txt``

    Both target directories are created when missing. Files that already
    exist at the destination are overwritten by ``shutil.copyfile``.
    """
    source_root = Path("data/top1000_complete")
    papers_dir = Path("data/papers")
    summaries_dir = Path("data/summaries")

    # parents=True: do not fail just because an intermediate directory
    # has not been created yet.
    papers_dir.mkdir(parents=True, exist_ok=True)
    summaries_dir.mkdir(parents=True, exist_ok=True)

    for path in source_root.glob("**/Documents_xml/*.xml"):
        shutil.copyfile(path, papers_dir / f"{path.stem}.xml")

    for path in source_root.glob("**/summary/*.gold.txt"):
        # For "X.gold.txt" the stem is "X.gold". Strip only the trailing
        # ".gold" marker; str.replace would also eat any ".gold" that happens
        # to appear earlier in the name (e.g. "X.golden.gold" -> "Xen").
        uid = path.stem.removesuffix(".gold")
        shutil.copyfile(path, summaries_dir / f"{uid}.txt")

In [43]:
def aggregate() -> list[dict]:
    """Collect per-paper statistics from every ``data/papers/*.xml`` file.

    For each paper this function:

    * takes the file stem as the paper's ``uid``;
    * reads the title from the root-level ``<S sid="0">`` element;
    * treats every other ``<S>`` element in the document as a sentence;
    * writes the sentences that have text, one per line, to
      ``data/papers/<uid>.txt`` (UTF-8);
    * records the sentence and character counts.

    Returns:
        One dict per paper with the keys ``uid``, ``title``,
        ``count_sentences`` and ``count_characters``, sorted by ``uid``.
        ``title`` is ``""`` when the title element is missing or empty.
    """
    papers_dir = Path("data/papers")
    items = []

    for path in papers_dir.glob("*.xml"):
        uid = path.stem
        root = etree.parse(str(path)).getroot()

        # The title element may be absent, or present without text; both
        # cases map to an empty title instead of crashing or leaking None.
        # (Compare against None explicitly: element truthiness depends on
        # whether it has children.)
        title_node = root.find("./S[@sid='0']")
        title = title_node.text if title_node is not None else None

        sentences = root.xpath(".//S[not(@sid = '0')]")
        sentences_text = [s.text for s in sentences if s.text is not None]

        # NOTE: count_sentences counts every <S> element, including ones
        # without text, whereas the plain-text file and count_characters only
        # cover sentences that actually have text.
        number_of_sentences = len(sentences)
        number_of_characters = sum(len(s) for s in sentences_text)

        # Explicit encoding: the default depends on the platform locale and
        # can fail on non-ASCII characters in the papers.
        with open(papers_dir / f"{uid}.txt", "w", encoding="utf-8") as plain:
            plain.writelines(f"{s}\n" for s in sentences_text)

        items.append(
            {
                "uid": uid,
                "title": title if title is not None else "",
                # "paper": path,
                # "paper_plain": f"data/papers/{uid}.txt",
                # "summary": f"data/summaries/{uid}.txt",
                "count_sentences": number_of_sentences,
                "count_characters": number_of_characters,
            }
        )

    return sorted(items, key=lambda x: x["uid"])


In [44]:
# Build the per-paper statistics table and cache it on disk.
# NOTE(review): copy_files() is not called here; presumably it was run
# beforehand so that data/papers is already populated — confirm.
if __name__ == "__main__":
    files = aggregate()  # list of per-paper dicts, sorted by uid
    df = pd.DataFrame(files)  # one row per paper
    df.to_pickle("data.pkl")  # path is relative to the current working directory

# Bare last expression so the notebook renders the frame. `df` is only bound
# when the guard above ran, so this line fails if the code is imported as a module.
df

Unnamed: 0,uid,title,count_sentences,count_characters
0,A00-1031,TnT - A Statistical Part-Of-Speech Tagger,178,21804
1,A00-1043,Sentence Reduction For Automatic Text Summariz...,201,30647
2,A00-2004,Advances In Domain Independent Linear Text Seg...,147,15603
3,A00-2009,A Simple Approach To Building Ensembles Of Nai...,156,23629
4,A00-2018,A Maximum-Entropy-Inspired Parser,190,28077
...,...,...,...,...
1000,W99-0612,Language Independent Named Entity Recognition ...,200,36161
1001,W99-0613,Unsupervised Models For Named Entity Classific...,256,34713
1002,W99-0623,Exploiting Diversity In Natural Language Proce...,148,18589
1003,W99-0625,Detecting Text Similarity Over Short Passages:...,294,52706
