In [1]:
from pathlib import Path
import shutil
import pandas as pd
from lxml import etree
import pickle as pk
import re
import codecs
import nltk
import json
from pathlib import Path
import pandas as pd
import codecs

In [22]:
def create_dirs():
    """Create the output directory tree rooted at ``../data``.

    Directories that already exist are left untouched, so the function can
    be called repeatedly.

    NOTE(review): the other cells in this notebook read and write under
    ``data/`` (no leading ``..``) -- confirm which root is intended.
    """
    root = Path("../data")
    # Parents are listed before their children because mkdir() is called
    # without parents=True.
    subdirs = ("", "papers", "papers/xml", "papers/txt", "summaries")
    for sub in subdirs:
        (root / sub).mkdir(exist_ok=True)

In [23]:
def copy_files(source_root="../original/data/", target_root="data"):
    """Copy human summaries and their reference papers into the working tree.

    Every ``*.human.txt`` file found anywhere below ``source_root`` is copied
    to ``<target_root>/summaries``. A file whose stem looks like
    ``<uid>_<annotator>.human`` goes to
    ``<target_root>/summaries/<annotator>/<uid>.txt``; any other file goes to
    ``<target_root>/summaries/<stem without '.human'>.txt``.

    Afterwards every ``Reference_XML/*.xml`` paper whose stem equals the uid
    of at least one copied summary is copied to ``<target_root>/papers/xml``.

    Destination directories are created on demand, so the function does not
    depend on any directory having been prepared beforehand.

    Args:
        source_root: Directory that is searched recursively for summaries
            and reference XML files.
        target_root: Directory that receives the ``summaries`` and
            ``papers/xml`` sub-trees.
    """
    source_root = Path(source_root)
    target_root = Path(target_root)
    summaries_dir = target_root / "summaries"
    papers_dir = target_root / "papers" / "xml"

    # Raw strings: "\d" and "\g" are not valid escapes in a normal literal.
    uid_pattern = r"([A-Z]\d{2}-\d{4})(.*)"
    # The dot before "human" is escaped so that it only matches a literal ".".
    annotator_pattern = r"(?:[A-Z]\d{2}-\d{4})_(.*)\.human"

    # A set gives constant-time membership tests in the paper loop below.
    uids = set()

    for summary in source_root.glob("**/*.human.txt"):
        # Path.stem only strips the final ".txt", so ".human" is still there.
        stem = summary.stem
        uids.add(re.sub(uid_pattern, r"\g<1>", stem))

        if re.search(annotator_pattern, stem):
            annotator = re.sub(annotator_pattern, r"\g<1>", stem)
            name = stem.replace(f"_{annotator}.human", "")
            destination = summaries_dir / annotator / f"{name}.txt"
        else:
            destination = summaries_dir / f"{stem.replace('.human', '')}.txt"

        # Create the directory the file is actually written to.
        destination.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(summary, destination)

    papers_dir.mkdir(parents=True, exist_ok=True)
    for paper in source_root.glob("**/Reference_XML/*.xml"):
        if paper.stem in uids:
            shutil.copyfile(paper, papers_dir / f"{paper.stem}.xml")

In [24]:
def parse_xml(path):
    """Parse one paper XML file into its title and sentences.

    The file is decoded as latin-1 and parsed with ``etree.fromstring``.
    The title is the text of the ``S`` element with ``sid='0'`` directly
    under the root; all other ``S`` elements anywhere in the document are
    treated as body sentences.

    Args:
        path: Location of the XML file (string or ``Path``).

    Returns:
        A 4-tuple ``(title, text, sentences, sids)``:
          * ``title``: title text, or ``""`` when there is no title element
            or it has no text.
          * ``text``: list of all ``S`` elements whose ``sid`` is not ``'0'``.
          * ``sentences``: ``.text`` of every element in ``text`` that has
            text.
          * ``sids``: the ``sid`` attribute of those same elements, aligned
            index-by-index with ``sentences``.

    Raises:
        OSError: If the file cannot be read.
        Parse errors raised by ``etree.fromstring`` propagate unchanged.
    """
    # Any failure while reading is allowed to propagate: there is nothing
    # sensible to parse without the file content.
    xml = Path(path).read_bytes().decode("latin-1")
    root = etree.fromstring(xml)

    title_node = root.find("./S[@sid='0']")
    # Both a missing title element and a title without text map to "".
    title = (title_node.text or "") if title_node is not None else ""

    # Every descendant <S> except the title sentence (sid == '0').
    text = [s for s in root.findall(".//S") if s.get("sid") != "0"]

    # Filter once so that sentences and sids cannot get out of step.
    with_text = [s for s in text if s.text is not None]
    sentences = [s.text for s in with_text]
    sids = [s.get("sid") for s in with_text]

    return title, text, sentences, sids

In [25]:
def extract_metadata():
    """Build one metadata record per paper in ``data/papers/xml``.

    Each XML file is parsed with ``parse_xml``; its sentences are written,
    one per line, to ``data/papers/txt/<uid>.txt``; and every summary file
    under ``data/summaries`` whose stem contains the paper's uid is attached
    to the record.

    Returns:
        A list of dicts sorted by ``"uid"``, each with the keys ``uid``,
        ``title``, ``raw_paper`` (sentence strings), ``ids_paper`` (matching
        sentence ids), ``path_xml`` (a ``Path``), ``path_txt`` (a string) and
        ``path_summaries`` (a list of strings).
    """
    papers = Path("data/papers/xml").glob("*.xml")
    # Sorted so that the order of "path_summaries" does not depend on the
    # order in which the filesystem happens to list the files.
    summaries = sorted(Path("data/summaries").glob("**/*.txt"))
    # Make sure the directory for the plain-text files exists.
    Path("data/papers/txt").mkdir(parents=True, exist_ok=True)
    data = []

    for path in papers:
        title, _, sentences, sids = parse_xml(path)
        uid = path.stem

        path_plain = f"data/papers/txt/{uid}.txt"
        # Explicit encoding: the platform default may be unable to represent
        # every character of the parsed text.
        with open(path_plain, "w", encoding="utf-8") as plain:
            plain.writelines(s + "\n" for s in sentences)

        path_summaries = [str(summary) for summary in summaries if uid in summary.stem]

        data.append(
            {
                "uid": uid,
                "title": title,
                "raw_paper": sentences,
                "ids_paper": sids,
                # NOTE(review): a Path object, while the other path fields
                # are plain strings.
                "path_xml": path,
                "path_txt": path_plain,
                "path_summaries": path_summaries,
            }
        )

    return sorted(data, key=lambda x: x["uid"])

In [26]:
def splitlines_summaries():
    """Rewrite each summary under ``data/summaries`` sentence by sentence.

    Every ``*.txt`` file found recursively is read as latin-1, split with
    ``nltk.tokenize.sent_tokenize`` and overwritten in place with each
    resulting sentence followed by a newline.
    """
    for summary_path in Path("data/summaries").glob("**/*.txt"):
        # Tokenize before the file is reopened for writing, so a tokenizer
        # failure leaves the file as it was.
        with codecs.open(summary_path, "r", encoding="latin-1") as reader:
            sentences = nltk.tokenize.sent_tokenize(reader.read())

        with codecs.open(summary_path, "w", encoding="latin-1") as writer:
            writer.write("".join(f"{sentence}\n" for sentence in sentences))

In [27]:
if __name__ == "__main__":
    # Run the preprocessing steps in order: create the directory tree, copy
    # the summaries and papers, then rewrite each summary sentence by
    # sentence.
    create_dirs()
    copy_files()
    splitlines_summaries()

    # Collect one record per paper and persist the table as a pickle.
    data = extract_metadata()
    df = pd.DataFrame(data)
    df.to_pickle("data.pkl")

    # `display` is provided by IPython; `df` is reused by a later cell.
    display(df)

Unnamed: 0,uid,title,raw_paper,ids_paper,path_xml,path_txt,path_summaries
0,A00-2018,A Maximum-Entropy-Inspired Parser *,[We present a new parser for parsing down to P...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/A00-2018.xml,data/papers/txt/A00-2018.txt,"[data/summaries/vardha/A00-2018.txt, data/summ..."
1,A00-2030,A Novel Use of Statistical Parsing to Extract ...,"[Since 1995, a few statistical parsing algorit...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/A00-2030.xml,data/papers/txt/A00-2030.txt,"[data/summaries/vardha/A00-2030.txt, data/summ..."
2,A97-1014,An Annotation Scheme for Free Word Order Langu...,[We describe an annotation scheme and a tool d...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/A97-1014.xml,data/papers/txt/A97-1014.txt,"[data/summaries/vardha/A97-1014.txt, data/summ..."
3,C00-2123,Word Re-ordering and DP-based Search in Statis...,"[In this paper, we describe a search procedure...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/C00-2123.xml,data/papers/txt/C00-2123.txt,[data/summaries/C00-2123.txt]
4,C02-1025,Named Entity Recognition: A Maximum Entropy Ap...,[This paper presents a maximum entropy-based n...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/C02-1025.xml,data/papers/txt/C02-1025.txt,[data/summaries/C02-1025.txt]
5,C08-1098,Estimation of Conditional ProbabilitiesWith De...,[We present a HMM part-of-speech tagging metho...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/C08-1098.xml,data/papers/txt/C08-1098.txt,[data/summaries/C08-1098.txt]
6,C10-1045,"Better Arabic Parsing: Baselines, Evaluations,...","[In this paper, we offer broad insight into th...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/C10-1045.xml,data/papers/txt/C10-1045.txt,[data/summaries/C10-1045.txt]
7,C90-2039,Strategic Lazy Incremental Copy Graph Unification,[The strategic lazy incremental copy graph uni...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/C90-2039.xml,data/papers/txt/C90-2039.txt,[data/summaries/C90-2039.txt]
8,C94-2154,THE CORRECT AND EFFICIENT IMPLEMENTATION OF AP...,"[in this pa,per, we argue tha, t type inferenc...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/C94-2154.xml,data/papers/txt/C94-2154.txt,[data/summaries/C94-2154.txt]
9,D09-1092,Polylingual Topic Models,[Topic models are a useful tool for analyzing ...,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",data/papers/xml/D09-1092.xml,data/papers/txt/D09-1092.txt,"[data/summaries/vardha/D09-1092.txt, data/summ..."


In [28]:
# Relax pandas' display limits so the long sentence lists are readable.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# Full option name: abbreviated names are resolved by pattern matching and can
# become ambiguous when pandas adds options with similar names.
pd.set_option('display.max_colwidth', 800)
df.raw_paper

0     [We present a new parser for parsing down to Penn tree-bank style parse trees that achieves 90.1% average precision/recall for sentences of 40 and less, and for of length 100 and less when trained and tested on the previously established [5,9,10,15,17] &quot;standard&quot; sections of the Wall Street Journal treebank., This represents a 13% decrease in error rate over the best single-parser results on this corpus [9]., The major technical innovation is the use of a &quot;maximum-entropy-inspired&quot; model for conditioning and smoothing that let us successfully to test and combine many different conditioning events., We also present some partial results showing the effects of different conditioning information, including a surprising 2% improvement due to guessing the lexical head's p...
1     [Since 1995, a few statistical parsing algorithms have demonstrated a breakthrough in parsing accuracy, as measured against the UPenn TREEBANK as a gold standard., In this paper we report 