In [12]:
import pandas as pd
import glob

# Notebook-wide pandas display settings, applied in one pass:
#   - no cap on the number of columns shown,
#   - wide frames are not wrapped onto several lines,
#   - cell text is cut off only after 800 characters.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
    ("max_colwidth", 800),
):
    pd.set_option(option_name, option_value)

In [13]:
# Read annotation files from JSON.
# The paths are sorted because glob returns them in whatever order the file
# system yields; sorting keeps the concatenated row order (and every file
# written from it later) the same on every machine and every run.
annotation_files = sorted(glob.glob("../data/annotation/*.json"))
if not annotation_files:
    # Without this guard pd.concat([]) fails with "No objects to concatenate",
    # which does not tell the reader that the input directory is the problem.
    raise FileNotFoundError("No annotation files found matching ../data/annotation/*.json")

# Dropping rows where summary sentence could not be matched (missing target_sid)
summaries_df = pd.concat([pd.read_json(f) for f in annotation_files]).dropna(subset="target_sid")

# Sentence ids are cast to int32 first and then to string, so they end up as
# plain integer strings. NOTE(review): presumably this is so they can be used
# as keys into each paper's sentence mapping further down - confirm.
summaries_df["source_sid"] = summaries_df["source_sid"].astype("int32").astype("string")
summaries_df["target_sid"] = summaries_df["target_sid"].astype("int32").astype("string")
# The annotation strategy label is stored as a categorical column.
summaries_df["strategy"] = summaries_df["strategy"].astype("category")

# Merge dataframes and get target sentences by id.
# SECURITY NOTE: pd.read_pickle can execute arbitrary code while loading;
# only point this at a papers.pkl that you produced yourself or fully trust.
papers_df = pd.read_pickle("../data/papers.pkl")
annotations_df = summaries_df.merge(papers_df, on="paper_id", how="left")


def _lookup_target_text(row):
    """Return the paper sentence stored under the row's target_sid.

    Returns None when the row has no usable paper_text (the left merge leaves
    NaN there if the paper_id is absent from papers_df) or when the id is not
    a key of paper_text.
    """
    paper_text = row["paper_text"]
    # NaN / None have no .get; previously this raised AttributeError and
    # aborted the whole cell for a single unmatched paper.
    if not hasattr(paper_text, "get"):
        return None
    return paper_text.get(row["target_sid"])


annotations_df["target_text"] = annotations_df.apply(_lookup_target_text, axis=1)
# Keep the full paper_text value alongside each row under the name target_doc.
annotations_df["target_doc"] = annotations_df["paper_text"]

# Column ordering (this selection also drops every column not listed, e.g. paper_text)
annotations_df = annotations_df[["summary_id", "paper_id", "source_sid", "target_sid", "source_text", "target_text", "target_doc", "strategy"]]

# Persist the assembled annotations twice: as a pickle and as a CSV without
# the index column.
annotations_df.to_pickle("../data/annotations.pkl")
annotations_df.to_csv("../data/annotations.csv", index=False)

# Preview the first and the last five rows of the result.
display(annotations_df.head(n=5), annotations_df.tail(n=5))

Unnamed: 0,summary_id,paper_id,source_sid,target_sid,source_text,target_text,target_doc,strategy
0,C00-2123,C00-2123,1,1,The authors in this paper describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).,"In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).","{'0': 'Word Re-ordering and DP-based Search in Statistical Machine Translation', '1': 'In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).', '2': 'Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.', '3': 'A search restriction especially useful for the translation direction from German to English is presented.', '4': 'The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task.', '5': 'The goal of machine translation is the translation of a text given in some so...",abstractive
1,C00-2123,C00-2123,2,2,"From a DP-based solution to the traveling salesman problem, they present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.","Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.","{'0': 'Word Re-ordering and DP-based Search in Statistical Machine Translation', '1': 'In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).', '2': 'Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.', '3': 'A search restriction especially useful for the translation direction from German to English is presented.', '4': 'The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task.', '5': 'The goal of machine translation is the translation of a text given in some so...",abstractive
2,C00-2123,C00-2123,3,165,A beam search concept is applied as in speech recognition.,We apply a beam search concept as in speech recognition.,"{'0': 'Word Re-ordering and DP-based Search in Statistical Machine Translation', '1': 'In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).', '2': 'Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.', '3': 'A search restriction especially useful for the translation direction from German to English is presented.', '4': 'The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task.', '5': 'The goal of machine translation is the translation of a text given in some so...",abstractive
3,C00-2123,C00-2123,4,166,There is no global pruning.,However there is no global pruning.,"{'0': 'Word Re-ordering and DP-based Search in Statistical Machine Translation', '1': 'In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).', '2': 'Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.', '3': 'A search restriction especially useful for the translation direction from German to English is presented.', '4': 'The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task.', '5': 'The goal of machine translation is the translation of a text given in some so...",abstractive
4,C00-2123,C00-2123,5,35,"An extended lexicon model is defined, and its likelihood is compared to a baseline lexicon model, which takes only single-word dependencies into account.","An extended lexicon model is defined, and its likelihood is compared to a baseline lexicon model, which takes only single-word dependencies into account.","{'0': 'Word Re-ordering and DP-based Search in Statistical Machine Translation', '1': 'In this paper, we describe a search procedure for statistical machine translation (MT) based on dynamic programming (DP).', '2': 'Starting from a DP-based solution to the traveling salesman problem, we present a novel technique to restrict the possible word reordering between source and target language in order to achieve an eÃcient search algorithm.', '3': 'A search restriction especially useful for the translation direction from German to English is presented.', '4': 'The experimental tests are carried out on the Verbmobil task (GermanEnglish, 8000-word vocabulary), which is a limited-domain spoken-language task.', '5': 'The goal of machine translation is the translation of a text given in some so...",extractive


Unnamed: 0,summary_id,paper_id,source_sid,target_sid,source_text,target_text,target_doc,strategy
128,W99-0623_vardha,W99-0623,3,3,Here both parametric and non-parametric models are explored.,Both parametric and non-parametric models are explored.,"{'0': 'Exploiting Diversity in Natural Language Processing: Combining Parsers', '1': 'Three state-of-the-art statistical parsers are combined to produce more accurate parses, as well as new bounds on achievable Treebank parsing accuracy.', '2': 'Two general approaches are presented and two combination techniques are described for each approach.', '3': 'Both parametric and non-parametric models are explored.', '4': 'The resulting parsers surpass the best previously published performance results for the Penn Treebank.', '5': 'The natural language processing community is in the strong position of having many available approaches to solving some of its most fundamental problems.', '6': 'The machine learning community has been in a similar situation and has studied the combination of multip...",extractive
129,W99-0623_vardha,W99-0623,4,51,"One can trivially create situations in which strictly binary-branching trees are combined to create a tree with only the root node and the terminal nodes, a completely flat structure.","One can trivially create situations in which strictly binary-branching trees are combined to create a tree with only the root node and the terminal nodes, a completely flat structure.","{'0': 'Exploiting Diversity in Natural Language Processing: Combining Parsers', '1': 'Three state-of-the-art statistical parsers are combined to produce more accurate parses, as well as new bounds on achievable Treebank parsing accuracy.', '2': 'Two general approaches are presented and two combination techniques are described for each approach.', '3': 'Both parametric and non-parametric models are explored.', '4': 'The resulting parsers surpass the best previously published performance results for the Penn Treebank.', '5': 'The natural language processing community is in the strong position of having many available approaches to solving some of its most fundamental problems.', '6': 'The machine learning community has been in a similar situation and has studied the combination of multip...",extractive
130,W99-0623_vardha,W99-0623,5,72,The three parsers were trained and tuned by their creators on various sections of the WSJ portion of the Penn Treebank.,"The three parsers were trained and tuned by their creators on various sections of the WSJ portion of the Penn Treebank, leaving only sections 22 and 23 completely untouched during the development of any of the parsers.","{'0': 'Exploiting Diversity in Natural Language Processing: Combining Parsers', '1': 'Three state-of-the-art statistical parsers are combined to produce more accurate parses, as well as new bounds on achievable Treebank parsing accuracy.', '2': 'Two general approaches are presented and two combination techniques are described for each approach.', '3': 'Both parametric and non-parametric models are explored.', '4': 'The resulting parsers surpass the best previously published performance results for the Penn Treebank.', '5': 'The natural language processing community is in the strong position of having many available approaches to solving some of its most fundamental problems.', '6': 'The machine learning community has been in a similar situation and has studied the combination of multip...",extractive
131,W99-0623_vardha,W99-0623,6,143,Through parser combination we have reduced the precision error rate by 30% and the recall error rate by 6% compared to the best previously published result.,Through parser combination we have reduced the precision error rate by 30% and the recall error rate by 6% compared to the best previously published result.,"{'0': 'Exploiting Diversity in Natural Language Processing: Combining Parsers', '1': 'Three state-of-the-art statistical parsers are combined to produce more accurate parses, as well as new bounds on achievable Treebank parsing accuracy.', '2': 'Two general approaches are presented and two combination techniques are described for each approach.', '3': 'Both parametric and non-parametric models are explored.', '4': 'The resulting parsers surpass the best previously published performance results for the Penn Treebank.', '5': 'The natural language processing community is in the strong position of having many available approaches to solving some of its most fundamental problems.', '6': 'The machine learning community has been in a similar situation and has studied the combination of multip...",extractive
132,W99-0623_vardha,W99-0623,7,144,Combining multiple highly-accurate independent parsers yields promising results.,Combining multiple highly-accurate independent parsers yields promising results.,"{'0': 'Exploiting Diversity in Natural Language Processing: Combining Parsers', '1': 'Three state-of-the-art statistical parsers are combined to produce more accurate parses, as well as new bounds on achievable Treebank parsing accuracy.', '2': 'Two general approaches are presented and two combination techniques are described for each approach.', '3': 'Both parametric and non-parametric models are explored.', '4': 'The resulting parsers surpass the best previously published performance results for the Penn Treebank.', '5': 'The natural language processing community is in the strong position of having many available approaches to solving some of its most fundamental problems.', '6': 'The machine learning community has been in a similar situation and has studied the combination of multip...",extractive


In [14]:
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled 80/20 split is identical on every
# Restart & Run All; without random_state each run wrote a different
# train.csv / test.csv.
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    annotations_df,
    test_size=0.2,
    shuffle=True,
    random_state=SPLIT_SEED,
)
# NOTE(review): the split is per row, so rows sharing a paper_id can land in
# both files - confirm whether paper-level separation is required downstream.
train_df.to_csv("../data/train.csv", index=False)
test_df.to_csv("../data/test.csv", index=False)