In [10]:
from pathlib import Path
import shutil
import pandas as pd
from lxml import etree

In [11]:
def copy_files(source_root="data/top1000_complete", target_root="data") -> None:
    """Flatten the nested corpus into two flat directories.

    Every file matching ``**/Documents_xml/*.xml`` below ``source_root`` is
    copied to ``<target_root>/original/<stem>.xml`` and every file matching
    ``**/summary/*.gold.txt`` is copied to ``<target_root>/summary/<uid>.txt``,
    where ``<uid>`` is the file stem without its trailing ``.gold`` marker.

    Args:
        source_root: Directory (str or Path) that is searched recursively.
        target_root: Directory (str or Path) that receives the ``original``
            and ``summary`` sub-directories; created if missing.

    Note:
        Destination names are derived from the file stem only, so two source
        files with the same stem in different sub-folders overwrite each other.
    """
    source_root = Path(source_root)
    original_dir = Path(target_root) / "original"
    summary_dir = Path(target_root) / "summary"

    papers = source_root.glob("**/Documents_xml/*.xml")
    summaries = source_root.glob("**/summary/*.gold.txt")

    # parents=True so the call also works when target_root does not exist yet.
    original_dir.mkdir(parents=True, exist_ok=True)
    summary_dir.mkdir(parents=True, exist_ok=True)

    for path in papers:
        shutil.copyfile(path, original_dir / f"{path.stem}.xml")

    for path in summaries:
        # Strip only the trailing ".gold"; str.replace would also remove any
        # ".gold" occurring earlier in the file name.
        uid = path.stem.removesuffix(".gold")
        shutil.copyfile(path, summary_dir / f"{uid}.txt")

In [21]:
def aggregate() -> list[dict]:
    """Collect per-paper statistics from every XML file in ``original/``.

    For each ``original/*.xml`` file the function reads the title from the
    top-level ``<S sid="0">`` element and treats every other ``<S>`` element
    in the document as a sentence.

    Returns:
        A list of dicts sorted by ``uid``, each with the keys ``uid``,
        ``title``, ``original``, ``summary``, ``number_of_sentences``,
        ``length_word``, ``avg_sentence_length_char`` and
        ``avg_sentence_length_word``. ``title`` is ``""`` when the title
        element or its text is missing; both averages are NaN when the paper
        contains no sentences.
    """
    items = []

    # NOTE(review): copy_files writes to "data/original" by default, while this
    # reads "original" relative to the working directory — confirm which
    # directory the notebook is meant to run from.
    for path in Path("original").glob("*.xml"):
        uid = path.stem
        root = etree.parse(path).getroot()

        # find() returns None when there is no sid='0' element, and .text is
        # None for an empty element; both cases map to an empty title.
        title_element = root.find("./S[@sid='0']")
        title = title_element.text if title_element is not None else None

        sentences = root.xpath(".//S[not(@sid = '0')]")
        number_of_sentences = len(sentences)

        # Elements without text still count as sentences but add no length.
        texts = [element.text for element in sentences if element.text is not None]
        length_char = sum(len(text) for text in texts)
        # Splitting on a single space (not on arbitrary whitespace) keeps the
        # counts identical to earlier runs; consecutive spaces therefore
        # contribute empty "words".
        length_word = sum(len(text.split(" ")) for text in texts)

        if number_of_sentences:
            avg_sentence_length_char = length_char / number_of_sentences
            avg_sentence_length_word = length_word / number_of_sentences
        else:
            # Without this branch the averages would be undefined on the first
            # file or silently reuse the previous paper's values.
            avg_sentence_length_char = float("nan")
            avg_sentence_length_word = float("nan")

        items.append(
            {
                "uid": uid,
                "title": title if title is not None else "",
                "original": path,
                "summary": f"summary/{uid}.txt",
                "number_of_sentences": number_of_sentences,
                "length_word": length_word,
                "avg_sentence_length_char": avg_sentence_length_char,
                "avg_sentence_length_word": avg_sentence_length_word,
            }
        )

    return sorted(items, key=lambda x: x["uid"])


# Build one record per paper, then load the sorted records into a DataFrame.
files = aggregate()
df = pd.DataFrame(files)

In [22]:
# Bare last expression: shows the aggregated table as the cell's output.
df

Unnamed: 0,uid,title,original,summary,number_of_sentences,length_word,avg_sentence_length_char,avg_sentence_length_word
0,A00-1031,TnT - A Statistical Part-Of-Speech Tagger,original/A00-1031.xml,summary/A00-1031.txt,178,3499,122.494382,19.657303
1,A00-1043,Sentence Reduction For Automatic Text Summariz...,original/A00-1043.xml,summary/A00-1043.txt,201,4948,152.472637,24.616915
2,A00-2004,Advances In Domain Independent Linear Text Seg...,original/A00-2004.xml,summary/A00-2004.txt,147,2487,106.142857,16.918367
3,A00-2009,A Simple Approach To Building Ensembles Of Nai...,original/A00-2009.xml,summary/A00-2009.txt,156,3776,151.467949,24.205128
4,A00-2018,A Maximum-Entropy-Inspired Parser,original/A00-2018.xml,summary/A00-2018.txt,190,4700,147.773684,24.736842
...,...,...,...,...,...,...,...,...
1000,W99-0612,Language Independent Named Entity Recognition ...,original/W99-0612.xml,summary/W99-0612.txt,200,5571,180.805000,27.855000
1001,W99-0613,Unsupervised Models For Named Entity Classific...,original/W99-0613.xml,summary/W99-0613.txt,256,5851,135.597656,22.855469
1002,W99-0623,Exploiting Diversity In Natural Language Proce...,original/W99-0623.xml,summary/W99-0623.txt,148,2991,125.601351,20.209459
1003,W99-0625,Detecting Text Similarity Over Short Passages:...,original/W99-0625.xml,summary/W99-0625.txt,294,8274,179.272109,28.142857
