In [9]:
from pathlib import Path
import shutil
import pandas as pd
from lxml import etree
import pickle as pk
import re
import codecs
import nltk
import json

nltk.download("punkt")

pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("max_colwidth", 800)

[nltk_data] Downloading package punkt to /home/lukel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def copy_original_data():
    uids = []
    data = Path("../original/data/")
    papers = data.glob("**/Reference_XML/*.xml")
    summaries = data.glob("**/**/*.human.txt")

    for summary in summaries:
        uid = re.sub("([A-Z]\d{2}-\d{4})(.*)", "\g<1>", summary.stem)
        uids.append(uid)
        path = f"../data/summaries/{summary.stem.replace('.human', '')}.txt"
        shutil.copyfile(summary, path)

    for paper in papers:
        if paper.stem in uids:
            shutil.copyfile(paper, f"../data/papers/{paper.stem}.xml")

In [11]:
def parse_xml(path):
    with codecs.open(path, "r", encoding="latin-1") as file:
        try:
            xml = file.read()
        except Exception as e:
            print("Could not parse: ", paper)
            print(e)

        root = etree.fromstring(xml)
        title = root.find("./S[@sid='0']").text if not None else ""
        s_elems = root.xpath(".//S")
        sentences = [s.text for s in s_elems if s.text is not None]
        sids = [s.attrib.get("sid") for s in s_elems if s.text is not None]

        return title, dict(zip(sids, sentences))

In [12]:
def get_papers():
    papers = Path("../data/papers").glob("*.xml")
    data = []

    for path in papers:
        # Parse paper XML
        title, text = parse_xml(path)
        paper_id = path.stem

        # Get related summaries based on paper id
        summaries = Path("../data/summaries").glob(f"*{paper_id}*.txt")
        summary_paths = [str(s) for s in summaries]
        summary_ids = [s.split("/")[-1].replace(".txt", "") for s in summary_paths]

        paper = {
            "paper_id": paper_id,
            "paper_title": title,
            "paper_path": path,
            "paper_text": text,
            "summary_ids": summary_ids,
            "summary_paths": summary_paths,
        }

        data.append(paper)

    return sorted(data, key=lambda p: p["paper_id"])

In [13]:
def splitlines_summaries(summaries):
    for summary in summaries:
        with codecs.open(summary, "r+", encoding="latin-1") as s:
            text = s.read().strip()
            text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', text)
            split = nltk.tokenize.sent_tokenize(text)
            s.seek(0)
            s.writelines([line + "\n" for line in split])
            s.truncate()

In [14]:
def prepare_for_annotation(row):
    summary_paths = row["summary_paths"]
    summary_ids = row["summary_ids"]
    paper_id = row["paper_id"]

    paths = []
    for summary, summary_id in zip(summary_paths, summary_ids):
        annotations = []

        with codecs.open(summary, "r", encoding="latin-1") as file:
            for index, sentence in enumerate(file, start=1):
                annotation = {
                    "summary_id": summary_id,
                    "paper_id": paper_id,
                    "source_sid": index,
                    "target_sid": None,
                    "source_text": sentence.strip(),
                    "strategy": None,
                }
                annotations.append(annotation)

        path = f"../data/tba/{summary_id}.json"
        paths.append(path)

        with open(path, "w") as f:
            json.dump(annotations, f, ensure_ascii=False, indent=4)

    return paths

In [15]:
if __name__ == "__main__":
    # Create necessary data folders
    Path("../data/papers").mkdir(exist_ok=True, parents=True)
    Path("../data/summaries").mkdir(exist_ok=True, parents=True)
    Path("../data/tba").mkdir(exist_ok=True, parents=True) # To-be-annotated...
    Path("../data/annotation").mkdir(exist_ok=True, parents=True)

    # Copy original data, only getting papers with summaries
    copy_original_data()

    # Extract metadata about papers
    data = get_papers()
    papers_df = pd.DataFrame(data)

    # Tokenize summaries (sentence-level)
    papers_df["summary_paths"].apply(lambda s: splitlines_summaries(s))

    # Prepare annotation files from summaries
    papers_df["annotation_paths"] = papers_df.apply(lambda row: prepare_for_annotation(row), axis=1)
    
    # Save to file
    papers_df.to_pickle("../data/papers.pkl")
    papers_df.to_csv("../data/papers.csv", index=False)
    
    display(papers_df)

Unnamed: 0,paper_id,paper_title,paper_path,paper_text,summary_ids,summary_paths,annotation_paths
0,A00-2018,A Maximum-Entropy-Inspired Parser *,../data/papers/A00-2018.xml,"{'0': 'A Maximum-Entropy-Inspired Parser *', '1': 'We present a new parser for parsing down to Penn tree-bank style parse trees that achieves 90.1% average precision/recall for sentences of 40 and less, and for of length 100 and less when trained and tested on the previously established [5,9,10,15,17] &quot;standard&quot; sections of the Wall Street Journal treebank.', '2': 'This represents a 13% decrease in error rate over the best single-parser results on this corpus [9].', '3': 'The major technical innovation is the use of a &quot;maximum-entropy-inspired&quot; model for conditioning and smoothing that let us successfully to test and combine many different conditioning events.', '4': 'We also present some partial results showing the effects of different conditioning information, inc...","[A00-2018_sweta, A00-2018_akanksha, A00-2018_vardha]","[../data/summaries/A00-2018_sweta.txt, ../data/summaries/A00-2018_akanksha.txt, ../data/summaries/A00-2018_vardha.txt]","[../data/tba/A00-2018_sweta.json, ../data/tba/A00-2018_akanksha.json, ../data/tba/A00-2018_vardha.json]"
1,A00-2030,A Novel Use of Statistical Parsing to Extract Information from Text,../data/papers/A00-2030.xml,"{'0': 'A Novel Use of Statistical Parsing to Extract Information from Text', '1': 'Since 1995, a few statistical parsing algorithms have demonstrated a breakthrough in parsing accuracy, as measured against the UPenn TREEBANK as a gold standard.', '2': 'In this paper we report adapting a lexic al ized, probabilistic context-free parser to information extraction and evaluate this new technique on MUC-7 template elements and template relations.', '3': 'Since 1995, a few statistical parsing algorithms (Magerman, 1995; Collins, 1996 and 1997; Charniak, 1997; Rathnaparki, 1997) demonstrated a breakthrough in parsing accuracy, as measured against the University of Pennsylvania TREEBANK as a gold standard.', '4': 'Yet, relatively few have embedded one of these algorithms in a task.', '5': 'Chi...","[A00-2030_sweta, A00-2030_aakansha, A00-2030_vardha]","[../data/summaries/A00-2030_sweta.txt, ../data/summaries/A00-2030_aakansha.txt, ../data/summaries/A00-2030_vardha.txt]","[../data/tba/A00-2030_sweta.json, ../data/tba/A00-2030_aakansha.json, ../data/tba/A00-2030_vardha.json]"
2,A97-1014,An Annotation Scheme for Free Word Order Languages,../data/papers/A97-1014.xml,"{'0': 'An Annotation Scheme for Free Word Order Languages', '1': 'We describe an annotation scheme and a tool developed for creating linguistically annotated corpora for non-configurational languages.', '2': 'Since the requirements for such a formalism differ from those posited for configurational languages, several features have been added, influencing the architecture of the scheme.', '3': 'The resulting scheme reflects a stratificational notion of language, and makes only minimal assumpabout the interrelation of the particu- •lar representational strata.', '4': 'The work reported in this paper aims at providing syntactically annotated corpora (treebanks') for stochastic grammar induction.', '5': 'In particular, we focus on several methodological issues concerning the annotation of n...","[A97-1014_vardha, A97-1014_sweta, A97-1014_swastika]","[../data/summaries/A97-1014_vardha.txt, ../data/summaries/A97-1014_sweta.txt, ../data/summaries/A97-1014_swastika.txt]","[../data/tba/A97-1014_vardha.json, ../data/tba/A97-1014_sweta.json, ../data/tba/A97-1014_swastika.json]"
3,C00-2123,Word Re-ordering and DP-based Search in Statistical Machine Translation,../data/papers/C00-2123.xml,"{'0': 'Word Re-ordering and DP-based Search in Statistical Machine Translation', '1': 'In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).', '2': 'Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.', '3': 'A search restriction especially useful for the translation direction from German to English is presented.', '4': 'The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task.', '5': 'The goal of machine translation is the translation of a text given in some so...",[C00-2123],[../data/summaries/C00-2123.txt],[../data/tba/C00-2123.json]
4,C02-1025,Named Entity Recognition: A Maximum Entropy Approach Using Global Information,../data/papers/C02-1025.xml,"{'0': 'Named Entity Recognition: A Maximum Entropy Approach Using Global Information', '1': 'This paper presents a maximum entropy-based named entity recognizer (NER).', '2': 'It differs from previous machine learning-based NERs in that it uses information from the whole document to classify each word, with just one classifier.', '3': 'Previous work that involves the gathering of information from the whole document often uses a secondary classifier, which corrects the mistakes of a primary sentence- based classifier.', '4': 'In this paper, we show that the maximum entropy framework is able to make use of global information directly, and achieves performance that is comparable to the best previous machine learning-based NERs on MUC6 and MUC7 test data.', '5': 'Considerable amount of wor...",[C02-1025],[../data/summaries/C02-1025.txt],[../data/tba/C02-1025.json]
5,C08-1098,Estimation of Conditional ProbabilitiesWith Decision Trees and an Application to Fine-Grained POS Tagging,../data/papers/C08-1098.xml,"{'0': 'Estimation of Conditional ProbabilitiesWith Decision Trees and an Application to Fine-Grained POS Tagging', '1': 'We present a HMM part-of-speech tagging method which is particularly suited for POS tagsets with a large number of fine-grained tags.', '2': 'It is based on three ideas: (1) splitting of the POS tags into attribute vectors and decomposition of the contextual POS probabilities of the HMM into a product of attribute probabilities, (2) estimation of the contextual probabilities with decision trees, and (3) use of high-order HMMs.', '3': 'In experiments on German and Czech data, our tagger outperformed state- of-the-art POS taggers.', '4': 'A Hidden-Markov-Model part-of-speech tagger (Brants, 2000, e.g.) computes the most probable POS tag sequence tËN = tË1, ..., tËN ...",[C08-1098],[../data/summaries/C08-1098.txt],[../data/tba/C08-1098.json]
6,C10-1045,"Better Arabic Parsing: Baselines, Evaluations, and Analysis",../data/papers/C10-1045.xml,"{'0': 'Better Arabic Parsing: Baselines, Evaluations, and Analysis', '1': 'In this paper, we offer broad insight into the underperformance of Arabic constituency parsing by analyzing the interplay of linguistic phenomena, annotation choices, and model design.', '2': 'First, we identify sources of syntactic ambiguity understudied in the existing parsing literature.', '3': 'Second, we show that although the Penn Arabic Treebank is similar to other tree- banks in gross statistical terms, annotation consistency remains problematic.', '4': 'Third, we develop a human interpretable grammar that is competitive with a latent variable PCFG.', '5': 'Fourth, we show how to build better models for three different parsers.', '6': 'Finally, we show that in application settings, the absence of gold se...",[C10-1045],[../data/summaries/C10-1045.txt],[../data/tba/C10-1045.json]
7,C90-2039,Strategic Lazy Incremental Copy Graph Unification,../data/papers/C90-2039.xml,"{'0': 'Strategic Lazy Incremental Copy Graph Unification', '1': 'The strategic lazy incremental copy graph unification method is a combination of two methods for unifying hmture structures.', '2': 'One, called the lazy incremental copy graph unification method, achieves structure sharing with constant order data access time which reduces the cequired memory.', '3': 'The other, called ti~e strategic incremental copy graph unification method, uses an early failure finding strategy which first tries to unify :;ubstructures tending to fail in unification; this method is; based on stochastic data on tim likelihood of failure and ,'educes unnecessary computation.', '4': 'The combined method .makes each feature structure unification efficient and also reduces garbage collection and page swapp...",[C90-2039],[../data/summaries/C90-2039.txt],[../data/tba/C90-2039.json]
8,C94-2154,THE CORRECT AND EFFICIENT IMPLEMENTATION OF APPROPRIATENESS SPECIFICATIONS FOR TYPED FEATURE STRUCTURES,../data/papers/C94-2154.xml,"{'0': 'THE CORRECT AND EFFICIENT IMPLEMENTATION OF APPROPRIATENESS SPECIFICATIONS FOR TYPED FEATURE STRUCTURES', '1': 'in this pa,per, we argue tha, t type inferencing incorrectly implements a.pl)rolwiateness specifica.tions for typed [ea.ture structures, promote a combina.tion of l;ype resolution and unfilling a,s a. correct a.nd ef'~ ticient Mternative, and consider the expressive limits of this a.lterna.tive approa.ch.', '2': '!['hroughout, we use feature cooccurence restrictions as illustration and linguistic motivation.', '3': 'Unification lbrmMisms ma.y be either un-typed (DCC~s, PATRII, 1,F(;) or typed (npsG).', '4': 'A m~L,ior reason for adding types to ~ forma,lism is to express restrictions on fea.ture cooccurences a.s in (;l's(:: [5] in order to rule out nonexista.nt tyl)es ...",[C94-2154],[../data/summaries/C94-2154.txt],[../data/tba/C94-2154.json]
9,D09-1092,Polylingual Topic Models,../data/papers/D09-1092.xml,"{'0': 'Polylingual Topic Models', '1': 'Topic models are a useful tool for analyzing large text collections, but have previously been applied in only monolingual, or at most bilingual, contexts.', '2': 'Meanwhile, massive collections of interlinked documents in dozens of languages, such as Wikipedia, are now widely available, calling for tools that can characterize content in many languages.', '3': 'We introduce a polylingual topic model that discovers topics aligned across multiple languages.', '4': 'We explore the model’s characteristics using two large corpora, each with over ten different languages, and demonstrate its usefulness in supporting machine translation and tracking topic trends across languages.', '5': 'Statistical topic models have emerged as an increasingly useful anal...","[D09-1092_swastika, D09-1092_vardha, D09-1092_sweta]","[../data/summaries/D09-1092_swastika.txt, ../data/summaries/D09-1092_vardha.txt, ../data/summaries/D09-1092_sweta.txt]","[../data/tba/D09-1092_swastika.json, ../data/tba/D09-1092_vardha.json, ../data/tba/D09-1092_sweta.json]"
