# Example code for tagging the text of any related work section

In [1]:
import json
from tqdm import tqdm
import numpy as np

import os
import sys

#from util import *
from data_util import scientific_sent_tokenize
from joint_tagger import CorwaTagger
import logging

import torch
from transformers import (
    AutoTokenizer,
)

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
class Args:
    """Empty attribute container; ``ArgumentParser.parse_args`` sets one attribute per option."""
    
class ArgumentParser:
    """Minimal notebook stand-in for ``argparse.ArgumentParser``.

    Nothing is parsed from the command line: every registered option simply
    resolves to the default it was declared with, so code written against the
    ``argparse`` calling convention can run inside a notebook cell.
    """

    def __init__(self):
        # Maps option name (leading dashes removed) to its default value.
        self.args = {}

    def add_argument(self, *args, **kwargs):
        """Register an option and remember its default value.

        Args:
            *args: Option strings such as ``'--batch_size'`` or
                ``'-b', '--batch_size'``. The first long (``--``) option
                names the attribute; if there is none, the first string is
                used. Leading dashes are stripped.
            **kwargs: argparse-style keywords. Only ``default`` is used
                (``None`` when absent); the rest (``type``, ``help``, ...)
                are accepted and ignored.

        Raises:
            IndexError: If no option string is given.
        """
        # Prefer the long option so that '-b', '--batch_size' is stored as
        # 'batch_size' rather than under an empty or truncated name.
        flag = next((opt for opt in args if opt.startswith("--")), args[0])
        self.args[flag.lstrip("-")] = kwargs.get("default", None)

    def parse_args(self):
        """Return an ``Args`` object with one attribute per registered option."""
        args = Args()
        for k, v in self.args.items():
            setattr(args, k, v)
        return args

    def __str__(self):
        # __str__ must return a str; returning the dict itself makes
        # str(parser) / print(parser) raise TypeError.
        return str(self.args)

In [4]:
# Declare the tagger options in one table, register them in order on the
# notebook ArgumentParser, then resolve them to their default values.
argparser = ArgumentParser()
_option_specs = (
    ('--repfile', dict(type=str, default="allenai/scibert_scivocab_uncased", help="Word embedding file")),
    ('--dropout', dict(type=float, default=0, help="embedding_dropout rate")),
    ('--bert_dim', dict(type=int, default=768, help="bert_dimension")),
    ('--MAX_SENT_LEN', dict(type=int, default=512)),
    ('--checkpoint', dict(type=str, default="/data/XiangciLi/checkpoints/joint_tagger/joint_tagger_train_scibert_final.model")),
    ('--batch_size', dict(type=int, default=32)),  # roberta-large: 2; bert: 8
)
for _flag, _options in _option_specs:
    argparser.add_argument(_flag, **_options)
args = argparser.parse_args()

In [5]:
# Raw input for the demo: one related-work paragraph per line. The cell that
# consumes it splits on "\n" and skips empty lines, so the leading and trailing
# newlines inside the triple quotes are harmless.
related_work_text = """
Extractive Related Work Generation. Early related work generation systems employed the extractive summarization approach. Hoang and Kan (2010) pioneered the task, developing rules to select sentences following a topic hierarchy tree that was assumed to be given as input. Hu and Wan (2014) grouped sentences into topic-biased clusters with PLSA, modeled sentence importance with SVR, and applied a global optimization framework to select sentences. Chen and Zhuge (2019) se-lected sentences from papers that co-cited the same cited papers as the target paper in order to cover a minimum Steiner tree constructed from the paper's keywords. Wang et al. (2019) extracted Cited Text Spans (CTS), the matched text spans in the cited paper that are most related to a given citation. However, these extractive approaches aim to maximally cover the citation texts with the extracted sentences, thus mostly ignoring the reference type citations that are concise and abstractive ( §3.1.3). 
Abstractive Related Work Generation. Recently, Xing et al. (2020) extend the pointergenerator (See et al., 2017) to take two text inputs, allowing them to recover a masked citation sentence given its neighboring context sentences. Ge et al. (2021) encode the citation context, cited paper's abstract, and citation network and train their model with multiple objectives: sentence salience score regression of the cited paper's abstract, functional role classification of the citation sentence, and citation sentence generation. Chen et al. (2021) propose a relation-aware, multi-document encoder to generate a related work paragraph given a set of cited papers. Luu et al. (2021) fine-tune GPT2 (Radford et al., 2019) on scientific texts and explore several techniques for representing documents, such as using extracted named entities. All of the works described above focus on the generation aspect, while neglecting dataset collection; their datasets are mostly extracted automatically. Moreover, the datasets are not reused, though they are publicly available, because these works all use slightly different problem definitions, and thus the models are not directly comparable (Li and Ouyang, 2022) . In this work, we focus on collecting a dataset that is widely applicable to various related work generation settings, rather than proposing another incomparable approach. 
"""

In [6]:
# Treat every non-blank line of the raw text as one paragraph, keyed
# "generated_<index>" in order of appearance. Each paragraph is passed through
# scientific_sent_tokenize (presumably yielding sentence strings -- confirm in
# data_util) and the pieces are re-joined with single spaces.
# Lines holding only whitespace (e.g. a stray "\r" or indentation) are skipped
# so they cannot turn into empty paragraphs.
generated_paragraphs = {
    f"generated_{index}": " ".join(scientific_sent_tokenize(para))
    for index, para in enumerate(
        line for line in related_work_text.split("\n") if line.strip()
    )
}

In [7]:
# Wrap the paragraphs in a minimal paper-like record: one body_text entry per
# paragraph (its key doubles as the section name), with an empty bibliography
# and no citation spans.
pseudo_related_work_json = {
    "paper_id": "generated",
    "bib_entries": {},
    "body_text": [
        {"section": section_id, "text": section_text, "cite_spans": []}
        for section_id, section_text in generated_paragraphs.items()
    ],
}

In [8]:
# Load the tokenizer for the checkpoint named by args.repfile instead of
# repeating the model name, so the tokenizer cannot drift from the configured
# model (the default is still "allenai/scibert_scivocab_uncased").
joint_tagger_tokenizer = AutoTokenizer.from_pretrained(args.repfile)
# Register '[BOS]' as an extra special token; the tagger output shown further
# down carries a '[BOS]' marker in front of every sentence.
additional_special_tokens = {'additional_special_tokens': ['[BOS]']}
# Kept as the cell's last expression so the notebook shows the return value
# (the number of tokens added).
joint_tagger_tokenizer.add_special_tokens(additional_special_tokens)

1

In [9]:
# Build the tagger from the tokenizer, the target device and the option object.
# NOTE(review): the cell prints "Model loaded!", so the constructor presumably
# loads the weights at args.checkpoint -- confirm in joint_tagger.
tagger = CorwaTagger(joint_tagger_tokenizer, device, args)

Model loaded!


In [10]:
# Run the tagger over all paragraphs. The second argument maps the paper id
# "generated" to the pseudo paper record whose body_text holds those paragraphs.
# Judging by the displayed result, this returns a list of dicts, one per paragraph.
all_span_citation_mappings = tagger.run_prediction(generated_paragraphs, {"generated": pseudo_related_work_json})

100%|████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.66it/s]
2it [00:00, 584.21it/s]


In [11]:
# Display the tagger output; each entry has at least an 'id' and a
# '[BOS]'-annotated 'paragraph', as the cell output shows.
all_span_citation_mappings

[{'id': 'generated_1',
  'paragraph': "[BOS] Abstractive Related Work Generation. [BOS] Recently, Xing et al. (2020) extend the pointergenerator (See et al., 2017) to take two text inputs, allowing them to recover a masked citation sentence given its neighboring context sentences. [BOS] Ge et al. (2021) encode the citation context, cited paper's abstract, and citation network and train their model with multiple objectives: sentence salience score regression of the cited paper's abstract, functional role classification of the citation sentence, and citation sentence generation. [BOS] Chen et al. (2021) propose a relation-aware, multi-document encoder to generate a related work paragraph given a set of cited papers. [BOS] Luu et al. (2021) fine-tune GPT2 (Radford et al., 2019) on scientific texts and explore several techniques for representing documents, such as using extracted named entities. [BOS] All of the works described above focus on the generation aspect, while neglecting dataset