In [1]:
import argparse

import json
from tqdm import tqdm
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

import os
import re
import sys

# Silence every warning for the rest of the session, but only when the
# interpreter was started without explicit warning options (sys.warnoptions
# is empty), so a user-supplied -W setting still takes effect.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [2]:
def has_keyword(keywords, sentence):
    """Tell whether at least one keyword occurs in ``sentence``.

    Args:
        keywords: Iterable of candidate keywords.
        sentence: Container tested with the ``in`` operator (substring
            containment when it is a string).

    Returns:
        True as soon as one keyword is found, False when none is found or
        ``keywords`` is empty.
    """
    return any(candidate in sentence for candidate in keywords)

def contain_citation(sentence):
    """Count citation-like spans in ``sentence``.

    Three textual shapes are recognised, tried in this order at each position:

    * bracketed numeric references such as ``[3]`` or ``[1,2]``;
    * parenthesised author/year references such as ``(Smith et al., 2010)``;
    * a capitalised name followed by a parenthesised number, such as
      ``Smith (2010)``.

    Args:
        sentence: Text to scan.

    Returns:
        The number of non-overlapping matches (0 when there are none).
    """
    # Raw string: the pattern contains \[ \] \( \), which are invalid escape
    # sequences in a normal string literal. The regex itself is unchanged.
    matches = re.findall(
        r"\[[0-9,]*\]|\([a-zA-Z.& ]+,[^A-Za-z0-9_]?[0-9]*[a-zA-Z0-9.&,; ]*\)|[A-Z][a-zA-Z. ]* \([0-9]*\)",
        sentence,
    )
    return len(matches)

def contain_year(sentence):
    """Count four-digit sequences in ``sentence`` that start with 19 or 20.

    No word boundary is required, so a match may sit inside a longer run of
    digits. Matches do not overlap.

    Args:
        sentence: Text to scan.

    Returns:
        The number of matches (0 when there are none).
    """
    year_hits = re.finditer("19[0-9]{2}|20[0-9]{2}", sentence)
    return sum(1 for _hit in year_hits)

def patch_sent_tokenize(sentences):
    """Re-join sentences that were split right after an abbreviation.

    A sentence is appended (with a single space) to the last output entry
    when the sentence that precedes it in the *input* list:

    * ends with " et." while the current sentence starts with "al";
    * ends with " al." or is exactly "al.";
    * ends with "e.g." or "i.e.".

    Otherwise the sentence starts a new output entry.

    Args:
        sentences: List of sentence strings, in order.

    Returns:
        A new list of sentences with the fragments above merged.
    """
    merged = []
    for idx, current in enumerate(sentences):
        # The test looks at the raw previous input sentence, not at the
        # (possibly already merged) last output entry. For the first
        # sentence an empty string makes every test below fail.
        previous = sentences[idx - 1] if idx else ""
        broken_after_abbreviation = (
            (previous.endswith(" et.") and current.startswith("al"))
            or previous.endswith(" al.")
            or previous == "al."
            or previous.endswith("e.g.")
            or previous.endswith("i.e.")
        )
        if broken_after_abbreviation:
            merged[-1] += " " + current
        else:
            merged.append(current)
    return merged

In [5]:
# Input file; the loading cell opens it and parses each line with json.loads.
related_work_file = "related_work.jsonl"

In [6]:
# Collected output of this cell, kept index-aligned:
#   paragraphs[k] - list of ASCII-only sentences of one related-work paragraph
#   paper_ids[k]  - "<paper_id>_<1-based paragraph number>" for that paragraph
paragraphs = []
paper_ids = []

# Matches runs of characters outside the 7-bit ASCII range; they are removed
# from every sentence.
non_ascii_run = re.compile("([^\x00-\x7F])+")

# Read with an explicit encoding so decoding does not depend on the platform
# default (assumes the JSONL file is UTF-8 - TODO confirm for other dumps).
with open(related_work_file, "r", encoding="utf-8") as f_pdf:
    for line in tqdm(f_pdf):
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would reject.
        if not line.strip():
            continue
        related_work_dict = json.loads(line)
        for pi, para in enumerate(related_work_dict["related_work"]):
            # Sentence-split the paragraph, repair splits after
            # abbreviations, then strip non-ASCII characters.
            sentences = [
                non_ascii_run.sub("", sentence)
                for sentence in patch_sent_tokenize(sent_tokenize(para["text"]))
            ]
            paragraphs.append(sentences)
            paper_ids.append(related_work_dict["paper_id"] + "_" + str(pi + 1))

11020it [00:13, 818.04it/s]


In [7]:
# Group the paragraphs by paper: docs[paper id] is the list of that paper's
# paragraphs, each paragraph being a list of "[BOS] "-prefixed sentences.
docs = {}
for paper_id, str_seq in zip(paper_ids, paragraphs):
    # Keys have the form "<paper id>_<paragraph number>". Split on the LAST
    # underscore only, so a paper id that itself contains "_" is kept intact
    # instead of raising on unpacking.
    paperid = paper_id.rsplit("_", 1)[0]
    decorated_str_seq = ["[BOS] " + seq for seq in str_seq]
    docs.setdefault(paperid, []).append(decorated_str_seq)

In [8]:
# Directory that receives the generated .txt/.ann files (created on demand by
# the writing cell).
path = "unlabeled_related_work/"

In [9]:
# Create the output directory if needed; exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)
for doc_id, doc in docs.items():
    # One text file per paper: one sentence per line, with an empty line
    # after every paragraph.
    with open(os.path.join(path, doc_id + ".txt"), "w", encoding="utf-8") as f:
        for paragraph in doc:
            for line in paragraph:
                f.write(line + "\n")
            f.write("\n")
    # Create (or truncate to) an empty companion .ann file next to the text
    # file - presumably a placeholder for annotations; confirm with the
    # downstream tool.
    with open(os.path.join(path, doc_id + ".ann"), "w", encoding="utf-8"):
        pass