In [193]:
from pathlib import Path
import shutil
import pandas as pd
from lxml import etree
import pickle as pk
import re
import codecs
import nltk
import json

nltk.download("punkt")

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

[nltk_data] Downloading package punkt to /home/lukel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [194]:
def copy_original_data():
    uids = []
    data = Path("../original/data/")
    papers = data.glob("**/Reference_XML/*.xml")
    summaries = data.glob("**/**/*.human.txt")

    for summary in summaries:
        uid = re.sub("([A-Z]\d{2}-\d{4})(.*)", "\g<1>", summary.stem)
        uids.append(uid)
        path = f"../data/summaries/{summary.stem.replace('.human', '')}.txt"
        shutil.copyfile(summary, path)

    for paper in papers:
        if paper.stem in uids:
            shutil.copyfile(paper, f"../data/papers/{paper.stem}.xml")

In [195]:
def parse_xml(path):
    with codecs.open(path, "r", encoding="latin-1") as file:
        try:
            xml = file.read()
        except Exception as e:
            print("Could not parse: ", paper)
            print(e)

        root = etree.fromstring(xml)
        title = root.find("./S[@sid='0']").text if not None else ""
        text = root.xpath(".//S[not(@sid = '0')]")
        sentences = [s.text for s in text if s.text is not None]
        # sids = [s.attrib.get("sid") for s in text if s.text is not None]

        return title, text, sentences

In [196]:
def splitlines_summaries(summaries):
    sentences = []

    for summary in summaries:
        with codecs.open(summary, "r+", encoding="latin-1") as s:
            text = s.read().strip()
            text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', text)

            split = nltk.tokenize.sent_tokenize(text)
            sentences.append(split)

            s.seek(0)
            s.writelines([line + "\n" for line in split])
            s.truncate()
            
    return sentences

In [197]:
def get_papers():
    papers = Path("../data/papers").glob("*.xml")
    data = []

    for path in papers:
        title, text, paper_sentences = parse_xml(path)
        uid = path.stem.replace("", "")
        summaries = Path("../data/summaries").glob(f"*{uid}*.txt")
        path_summaries = [str(s) for s in summaries]

        paper = {
            "uid": uid,
            "title": title,
            "raw_paper": paper_sentences,
            "path_paper": path,
            "path_summaries": path_summaries,
        }

        data.append(paper)

    return sorted(data, key=lambda x: x["uid"])

In [198]:
def prepare_for_annotation(summaries):
    paths = []
    for summary in summaries:
        annotations = []

        with codecs.open(summary, "r", encoding="latin-1") as file:
            for index, sentence in enumerate(file, start=1):
                annotation = {
                    "source_id": index,
                    "target_id": None,
                    "summary_text": sentence.strip(),
                    "paper_text": "",
                    "strategy": "",
                }
                annotations.append(annotation)

        path = f"../data/annotation/{summary.split('/')[-1].replace('.txt', '')}.json"
        paths.append(path)

        with open(path, "w") as f:
            json.dump(annotations, f, ensure_ascii=False, indent=4)

    return paths

In [199]:
if __name__ == "__main__":
    Path("../data/papers").mkdir(exist_ok=True, parents=True)
    Path("../data/summaries").mkdir(exist_ok=True, parents=True)
    Path("../data/annotation").mkdir(exist_ok=True, parents=True)   

    copy_original_data()
    data = get_papers()

    df = pd.DataFrame(data)
    df["raw_summaries"] = df["path_summaries"].apply(lambda s: splitlines_summaries(s))
    df["path_annotations"] = df["path_summaries"].apply(lambda s: prepare_for_annotation(s))
    
    df.to_pickle("../data/data.pkl")
    df.to_csv("../data/data.csv")

    display(df)

Unnamed: 0,uid,title,raw_paper,path_paper,path_summaries,raw_summaries,path_annotations
0,A00-2018,A Maximum-Entropy-Inspired Parser *,"[We present a new parser for parsing down to Penn tree-bank style parse trees that achieves 90.1% average precision/recall for sentences of 40 and less, and for of length 100 and less when trained and tested on the previously established [5,9,10,15,17] &quot;standard&quot; sections of the Wall Street Journal treebank., This represents a 13% decrease in error rate over the best single-parser results on this corpus [9]., The major technical innovation is the use of a &quot;maximum-entropy-inspired&quot; model for conditioning and smoothing that let us successfully to test and combine many different conditioning events., We also present some partial results showing the effects of different conditioning information, including a surprising 2% improvement due to guessing the lexical head's p...",../data/papers/A00-2018.xml,"[../data/summaries/A00-2018_sweta.txt, ../data/summaries/A00-2018_vardha.txt, ../data/summaries/A00-2018_akanksha.txt]","[[In this paper the author aims at the major technical innovation that is the use of a ""maximum-entropy-inspired"" model for conditioning and smoothing that let successfully to test and combines many different conditioning events., They also present some partial results showing the effects of different conditioning information, including a surprising 2% improvement due to guessing the lexical head's pre-terminal before guessing the lexical head., They also talk about the new parser for parsing down to Penn tree-bank style parse trees that achieves 90.1% average precision or recall for sentences of length 40 and less, and 89.5% for sentences of length 100 and less when trained and tested on the previously established ""standard"" sections of the Wall Street Journal treebank., This represe...","[../data/annotation/A00-2018_sweta.json, ../data/annotation/A00-2018_vardha.json, ../data/annotation/A00-2018_akanksha.json]"
1,A00-2030,A Novel Use of Statistical Parsing to Extract Information from Text,"[Since 1995, a few statistical parsing algorithms have demonstrated a breakthrough in parsing accuracy, as measured against the UPenn TREEBANK as a gold standard., In this paper we report adapting a lexic al ized, probabilistic context-free parser to information extraction and evaluate this new technique on MUC-7 template elements and template relations., Since 1995, a few statistical parsing algorithms (Magerman, 1995; Collins, 1996 and 1997; Charniak, 1997; Rathnaparki, 1997) demonstrated a breakthrough in parsing accuracy, as measured against the University of Pennsylvania TREEBANK as a gold standard., Yet, relatively few have embedded one of these algorithms in a task., Chiba, (1999) was able to use such a parsing algorithm to reduce perplexity with the long term goal of improved s...",../data/papers/A00-2030.xml,"[../data/summaries/A00-2030_sweta.txt, ../data/summaries/A00-2030_vardha.txt, ../data/summaries/A00-2030_aakansha.txt]","[[In this paper the author aimed at reporting, adapting a lexicalized, probabilistic context-free parser to information extraction and evaluate the new technique on MUC-7 template elements and template relations., The author is able to integrate both syntactic and semantic information into the parsing process, thus avoiding potential errors of syntax first followed by semantics., Their parsing algorithm, trained on the UPenn TREEBANK, was run on the New York Times source to create unsupervised syntactic training which was constrained to be consistent with semantic annotation., They were able to specify relatively simple guidelines that students with no training in computational linguistics could annotate., The semantic training corpus was produced by students according to a simple set ...","[../data/annotation/A00-2030_sweta.json, ../data/annotation/A00-2030_vardha.json, ../data/annotation/A00-2030_aakansha.json]"
2,A97-1014,An Annotation Scheme for Free Word Order Languages,"[We describe an annotation scheme and a tool developed for creating linguistically annotated corpora for non-configurational languages., Since the requirements for such a formalism differ from those posited for configurational languages, several features have been added, influencing the architecture of the scheme., The resulting scheme reflects a stratificational notion of language, and makes only minimal assumpabout the interrelation of the particu- •lar representational strata., The work reported in this paper aims at providing syntactically annotated corpora (treebanks') for stochastic grammar induction., In particular, we focus on several methodological issues concerning the annotation of non-configurational languages., In section 2, we examine the appropriateness of existing annot...",../data/papers/A97-1014.xml,"[../data/summaries/A97-1014_vardha.txt, ../data/summaries/A97-1014_swastika.txt, ../data/summaries/A97-1014_sweta.txt]","[[This paper talks about an annotation scheme for free word order languages., The main key words annotated in this paper are tree bank, corpus, and free word order., It aims at providing syntactically annotated corpora ('tree banks') for stochastic grammar induction., The requirements for such formalism differ from those posited for configurational languages; several features have been added, influencing the architecture of the scheme., In order to avoid inconsistencies, the corpus is annotated in two stages: basic annotation and refinements., This paper focuses on annotating argument structure rather than constituent trees; it differs from existing tree banks in several aspects., These differences are illustrated by a comparison with the Penn Treebank annotation scheme., Our annotatio...","[../data/annotation/A97-1014_vardha.json, ../data/annotation/A97-1014_swastika.json, ../data/annotation/A97-1014_sweta.json]"
3,C00-2123,Word Re-ordering and DP-based Search in Statistical Machine Translation,"[In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP)., Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm., A search restriction especially useful for the translation direction from German to English is presented., The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task., The goal of machine translation is the translation of a text given in some source language into a target language., We are given a source string fJ 1 = f1:::fj :::fJ of length J, which is to ...",../data/papers/C00-2123.xml,[../data/summaries/C00-2123.txt],"[[The authors in this paper describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP)., From a DP-based solution to the traveling salesman problem, they present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm., A beam search concept is applied as in speech recognition., There is no global pruning., An extended lexicon model is defined, and its likelihood is compared to a baseline lexicon model, which takes only single-word dependencies into account., In order to handle the necessary word reordering as an optimization problem within the dynamic programming approach, they describe a solution to the traveling salesman problem (TSP) which is based on d...",[../data/annotation/C00-2123.json]
4,C02-1025,Named Entity Recognition: A Maximum Entropy Approach Using Global Information,"[This paper presents a maximum entropy-based named entity recognizer (NER)., It differs from previous machine learning-based NERs in that it uses information from the whole document to classify each word, with just one classifier., Previous work that involves the gathering of information from the whole document often uses a secondary classifier, which corrects the mistakes of a primary sentence- based classifier., In this paper, we show that the maximum entropy framework is able to make use of global information directly, and achieves performance that is comparable to the best previous machine learning-based NERs on MUC6 and MUC7 test data., Considerable amount of work has been done in recent years on the named entity recognition task, partly due to the Message Understanding Conference...",../data/papers/C02-1025.xml,[../data/summaries/C02-1025.txt],"[[This paper presents a maximum entropy-based named entity recognizer (NER)., NER is useful in many NLP applications such as information extraction, question answering, etc .Chieu and Ng have shown that the maximum entropy framework is able to use global information directly from various sources., They believe that global context is useful in most languages, as it is a natural tendency for authors to use abbreviations on entities already mentioned previously., They have made use of local and global features to deal with the instances of same token in a document., Their results show that their high performance NER use less training data than other systems., The use of global features has shown excellent result in the performance on MUC-6 and MUC-7 test data., Using less training data th...",[../data/annotation/C02-1025.json]
5,C08-1098,Estimation of Conditional ProbabilitiesWith Decision Trees and an Application to Fine-Grained POS Tagging,"[We present a HMM part-of-speech tagging method which is particularly suited for POS tagsets with a large number of fine-grained tags., It is based on three ideas: (1) splitting of the POS tags into attribute vectors and decomposition of the contextual POS probabilities of the HMM into a product of attribute probabilities, (2) estimation of the contextual probabilities with decision trees, and (3) use of high-order HMMs., In experiments on German and Czech data, our tagger outperformed state- of-the-art POS taggers., A Hidden-Markov-Model part-of-speech tagger (Brants, 2000, e.g.) computes the most probable POS tag sequence tËN = tË1, ..., tËN for a given word sequence wN . POS taggers are usually trained on corpora with between 50 and 150 different POS tags., Tagsets of this size c...",../data/papers/C08-1098.xml,[../data/summaries/C08-1098.txt],"[[In this paper, Schmid and Laws present a RFTagger or Hidden-Markov-Model (HMM) part-of-speech tagger using German and Czech corpora., HMM tagging decomposes the POS tags into a set of simple attributes, and uses decision tree to estimate the probability of each attribute., Decision tree assigns classes to objects which are represented as attribute vectors., Their tagger applies a beam-search strategy to increase the speed and uses dot to separate the attributes., It also applies pre-pruning citeria., Their tagger is fast and can be successfully applied to a wide range of languages and training corpora., Their tagger is highly accurate in comparison to TnTagger and SVMTool.]]",[../data/annotation/C08-1098.json]
6,C10-1045,"Better Arabic Parsing: Baselines, Evaluations, and Analysis","[In this paper, we offer broad insight into the underperformance of Arabic constituency parsing by analyzing the interplay of linguistic phenomena, annotation choices, and model design., First, we identify sources of syntactic ambiguity understudied in the existing parsing literature., Second, we show that although the Penn Arabic Treebank is similar to other tree- banks in gross statistical terms, annotation consistency remains problematic., Third, we develop a human interpretable grammar that is competitive with a latent variable PCFG., Fourth, we show how to build better models for three different parsers., Finally, we show that in application settings, the absence of gold segmentation lowers parsing performance by 2â5% F1., It is well-known that constituency parsing models design...",../data/papers/C10-1045.xml,[../data/summaries/C10-1045.txt],"[[This paper offers a broad insight into of Arabic constituency parsing by analyzing the interplay of linguistic phenomena, annotation choices, and model design., It is probably the first analysis of Arabic parsing of this kind., It is well-known that English constituency parsing models do not generalize to other languages and treebanks., Explanations for this phenomenon are relative informativeness of lexicalization, insensitivity to morphology and the effect of variable word order and these factors lead to syntactic disambiguation., The authors use linguistic and annotation insights to develop a manually annotated grammar and evaluate it and finally provide a realistic evaluation in which segmentation is performed in a pipeline jointly with parsing., The authors show that PATB is sim...",[../data/annotation/C10-1045.json]
7,C90-2039,Strategic Lazy Incremental Copy Graph Unification,"[The strategic lazy incremental copy graph unification method is a combination of two methods for unifying hmture structures., One, called the lazy incremental copy graph unification method, achieves structure sharing with constant order data access time which reduces the cequired memory., The other, called ti~e strategic incremental copy graph unification method, uses an early failure finding strategy which first tries to unify :;ubstructures tending to fail in unification; this method is; based on stochastic data on tim likelihood of failure and ,'educes unnecessary computation., The combined method .makes each feature structure unification efficient and also reduces garbage collection and page swapping occurrences, thus increasing the total efficiency of natural language processing ...",../data/papers/C90-2039.xml,[../data/summaries/C90-2039.txt],"[[The strategic lazy incremental copy graph (SLING) unification method combines two incremental copy graph unification method - the lazy incremental copy graph (LING) unification and the strategic incremental copy graph (SING) unification method., The LING unification method achieves structure sharing which avoids memory wastage and increases the portion of token identical substructures of FSs., The SING unification method introduces the feature unification strategy and lists the factors on which itâs efficiency depends., The combined method increases the total efficiency of FS unification-based natural language processing systems.]]",[../data/annotation/C90-2039.json]
8,C94-2154,THE CORRECT AND EFFICIENT IMPLEMENTATION OF APPROPRIATENESS SPECIFICATIONS FOR TYPED FEATURE STRUCTURES,"[in this pa,per, we argue tha, t type inferencing incorrectly implements a.pl)rolwiateness specifica.tions for typed [ea.ture structures, promote a combina.tion of l;ype resolution and unfilling a,s a. correct a.nd ef'~ ticient Mternative, and consider the expressive limits of this a.lterna.tive approa.ch., !['hroughout, we use feature cooccurence restrictions as illustration and linguistic motivation., Unification lbrmMisms ma.y be either un-typed (DCC~s, PATRII, 1,F(;) or typed (npsG)., A m~L,ior reason for adding types to ~ forma,lism is to express restrictions on fea.ture cooccurences a.s in (;l's(:: [5] in order to rule out nonexista.nt tyl)es of objects., For example, there a.re no verbs which have the [km.ture +R. The simplest way to express such restrictions is by mea.ns of a.n...",../data/papers/C94-2154.xml,[../data/summaries/C94-2154.txt],"[[In this paper, the authors try to show the kind of constraints expressible by appropriateness conditions cane be implemented in a practical system employing typed features structures and unification as the primary operation on feature structures., Being able to express the class of constraints by appropriateness conditions corresponding closely to the class of constraints that can be efficiently pre-compiled is taken as a justification for appropriateness formalisms.]]",[../data/annotation/C94-2154.json]
9,D09-1092,Polylingual Topic Models,"[Topic models are a useful tool for analyzing large text collections, but have previously been applied in only monolingual, or at most bilingual, contexts., Meanwhile, massive collections of interlinked documents in dozens of languages, such as Wikipedia, are now widely available, calling for tools that can characterize content in many languages., We introduce a polylingual topic model that discovers topics aligned across multiple languages., We explore the model’s characteristics using two large corpora, each with over ten different languages, and demonstrate its usefulness in supporting machine translation and tracking topic trends across languages., Statistical topic models have emerged as an increasingly useful analysis tool for large text collections., Topic models have been used ...",../data/papers/D09-1092.xml,"[../data/summaries/D09-1092_sweta.txt, ../data/summaries/D09-1092_swastika.txt, ../data/summaries/D09-1092_vardha.txt]","[[In this paper the author aims at introducing a polylingual topic model that discovers topics aligned across multiple languages., They explore the modelâs characteristics using two large corpora, each with over ten different languages, and demonstrate its usefulness in supporting machine translation and tracking topic trends across languages., Statistical topic models have emerged as an increasingly useful analysis tool for large text collections., Topic models have been used for analyzing topic trends in research literature, inferring captions for images, social network analysis in email, and expanding queries with topically related words in information retrieval., The author argues that topic modelling is both a useful and appropriate tool for leveraging correspondences between se...","[../data/annotation/D09-1092_sweta.json, ../data/annotation/D09-1092_swastika.json, ../data/annotation/D09-1092_vardha.json]"
