In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import openai
from time import sleep
from copy import deepcopy
import json
from tqdm import tqdm
import numpy as np

import os
import sys
from glob import glob
from util import *
from data_util import scientific_sent_tokenize
from joint_tagger import CorwaTagger

import torch
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from prompting_util import *

In [None]:
# Directory (with trailing slash) holding the parsed-paper JSON files; every "*.json" in it is loaded.
prefix = "example_pdfs/"
# Keyword(s) handed to is_related_work_section to recognise related-work section titles.
keyword = ['related work']
# File name, relative to `prefix`, of the target paper whose related work section is generated.
target_file_name = "file_name.json"
# Matching mode handed to is_related_work_section
# (presumably "in" means substring matching — confirm in util).
keyword_relation = "in"

In [None]:
def link_and_update_citations(this_paper):
    """Map this paper's bibliography entries onto the ids of known papers.

    Every bib entry with at least one author is keyed as
    ``<lowercased title>@<lowercased first-author last name, spaces removed>``
    and looked up in the module-level ``title_author_lookup``.  On a hit the
    known paper's id is recorded against the bib key, and an empty ``year`` in
    the lookup record is filled in from the bib entry.

    The matches are merged into ``this_paper["discovered_citations"]``
    (created when absent).  The paper dict is modified in place and returned.
    """
    matches = {}
    bib_entries = this_paper["pdf_parse"]["bib_entries"]
    for bib_key, bib in bib_entries.items():
        # Entries without authors cannot be keyed.
        if len(bib["authors"]) == 0:
            continue
        title_part = bib["title"].lower()
        author_part = bib["authors"][0]["last"].replace(" ", "").lower()
        lookup_key = title_part + "@" + author_part
        if lookup_key not in title_author_lookup:
            continue
        known_paper = title_author_lookup[lookup_key]
        if known_paper["year"] == "":
            known_paper["year"] = bib["year"]
        matches[known_paper["paper_id"]] = bib_key

    merged = this_paper.get("discovered_citations", {})
    merged.update(matches)
    this_paper["discovered_citations"] = merged
    return this_paper

In [None]:
def prepare_cited_papers_text(chronological_paper_ids, use_faceted_summary=True, include_usage=True, include_relationship=True, use_CTS=False):
    """Render the numbered list of cited papers that is pasted into a generation prompt.

    Reads the module-level ``global_metadata``, ``target_paper_id``,
    ``target_marker``, ``cited_facted_summaries``, ``cited_abstracts``,
    ``citation_intent_summary``, ``relationships_by_cited_paper`` and (only
    when ``use_CTS`` is true) ``cited_text_spans``.

    Args:
        chronological_paper_ids: paper ids in the order they should be listed.
            The target paper is skipped, but numbering follows the position in
            this list, so a gap appears where the target paper sits.
        use_faceted_summary: describe each paper by its faceted summary when
            true, otherwise by its abstract.
        include_usage: append the "[Usage]" line when one exists for the paper.
        include_relationship: append the "How other papers cite it" list when
            the paper has at least one recorded relationship.
        use_CTS: append the retrieved sentences recorded for the paper.

    Returns:
        The concatenated text, with every occurrence of ``target_marker``
        replaced by "our paper".
    """
    pieces = []
    for i, cited_paper_id in enumerate(chronological_paper_ids):
        cited_metadata = global_metadata[cited_paper_id]
        if cited_paper_id == target_paper_id:
            continue

        cited_year = "(" + str(cited_metadata["year"]) + ")" if cited_metadata["year"] else ""
        cited_marker = cited_metadata["author"] + " et al. " + cited_year
        cited_title = cited_metadata["title"] + " by " + cited_marker

        pieces.append(str(i+1)+". " + cited_title)
        if use_faceted_summary:
            pieces.append("\n" + cited_facted_summaries[cited_paper_id] + "\n\n")
        else:
            pieces.append("\nAbstract:" + cited_abstracts[cited_paper_id] + "\n\n")

        if include_usage and cited_paper_id in citation_intent_summary:
            pieces.append("[Usage] "+citation_intent_summary[cited_paper_id] + "\n")
            pieces.append("\n")

        if include_relationship:
            # Check this paper's own relation list (the original tested the length of
            # the whole dict), so an empty list no longer emits a bare header.
            relations = relationships_by_cited_paper.get(cited_paper_id, [])
            if len(relations) > 0:
                pieces.append("How other papers cite it: \n")
                for relation in relations:
                    pieces.append(relation["response"] + "\n")
                pieces.append("\n")

        if use_CTS:
            # .get: a paper without an entry simply has no sentences to show.
            retrieved = cited_text_spans.get(cited_paper_id, [])
            if len(retrieved) > 0:
                pieces.append("Potentially useful sentences from this paper: \n")
                for section, sentence in retrieved:
                    pieces.append("["+section+"] "+sentence + "\n")
                pieces.append("\n")

    cited_papers_text = "".join(pieces)
    return cited_papers_text.replace(target_marker, "our paper")

In [None]:
def retrieve_citation_spans(citing_paper_id, cited_paper_id, span_types = ("Dominant", "Reference")):
    """Collect the text spans in which one paper cites another.

    Reads the module-level ``cited_jsons``, ``span_citation_mappings_lookup``
    and ``span_citation_mappings_dict``.

    Args:
        citing_paper_id: id of the paper whose paragraphs are searched.
        cited_paper_id: id of the paper being cited.
        span_types: span categories to include, in output order.  The default
            is an immutable tuple so it cannot be modified between calls.

    Returns:
        A list of dicts with keys "paragraph_id", "citation_id", "type",
        "marker" and "text" (the span text with every "[BOS]" removed).
        The list is empty when the citing paper has no resolved citation of
        the cited paper, or no tagged span for it.
    """
    discovered = cited_jsons[citing_paper_id].get("discovered_citations", {})
    if cited_paper_id not in discovered:
        return []

    cited_bib_key = discovered[cited_paper_id]
    # A paper with no tagged paragraph has no lookup entry at all; treat that as "no mention".
    all_mentions = span_citation_mappings_lookup.get(citing_paper_id, {}).get(cited_bib_key)
    if all_mentions is None:
        return []

    spans = []
    for span_type in span_types:
        for mention, citation_mark in all_mentions.get(span_type, {}).items():
            # Mention keys have the form "<paragraph index>_<span index>".
            pid, cid = mention.split("_")
            this_paragraph = span_citation_mappings_dict[citing_paper_id+"_"+pid]
            span_info = this_paragraph["span_citation_mapping"][int(cid)]
            start = span_info["char_start"]
            end = span_info["char_end"]
            span_text = this_paragraph["paragraph"][start:end].replace("[BOS]","")
            spans.append({
                "paragraph_id": pid,
                "citation_id": cid,
                "type": span_type,
                "marker": citation_mark,
                "text": span_text,
            })
    return spans

In [None]:
def _citation_marker(metadata):
    """Return "<author> et al. (<year>)"; the parenthesised year is omitted when the year is empty."""
    year = "(" + str(metadata["year"]) + ")" if metadata["year"] else ""
    return metadata["author"] + " et al. " + year


def make_relationship_prompt(retrieved_spans, citing_paper_id, cited_paper_id):
    """Build the prompt asking how the citing paper relates to the cited paper.

    Reads the module-level ``global_metadata`` and ``cited_facted_summaries``.

    Args:
        retrieved_spans: span dicts with at least "text" and "marker" keys
            (the shape produced by ``retrieve_citation_spans``).
        citing_paper_id: id of the paper that contains the citation contexts.
        cited_paper_id: id of the paper being cited.

    Returns:
        A prompt made of four parts separated by blank lines: the citing
        paper's faceted summary, the cited paper's faceted summary, the
        numbered citation contexts, and the question.
    """
    concatenated_spans = "\n".join(str(i+1)+". "+span["text"] for i, span in enumerate(retrieved_spans))
    # De-duplicate markers in first-seen order. A set would order them differently
    # from run to run, making the prompt (and the temperature-0 answer) irreproducible.
    unique_markers = list(dict.fromkeys(span["marker"] for span in retrieved_spans))
    concatenated_markers = " or ".join(unique_markers)

    citing_metadata = global_metadata[citing_paper_id]
    citing_marker = _citation_marker(citing_metadata)
    citing_title = citing_metadata["title"] + " by " + citing_marker

    cited_metadata = global_metadata[cited_paper_id]
    cited_marker = _citation_marker(cited_metadata)
    cited_title = cited_metadata["title"] + " by " + cited_marker

    citing_paper = "Faceted summary of the citing paper, " + citing_title + ": \n" + cited_facted_summaries[citing_paper_id]
    cited_paper = "Faceted summary of the cited paper, " + cited_title + ": \n" + cited_facted_summaries[cited_paper_id]
    citation_span_texts = "Citation contexts that "+citing_marker+ " cites " + cited_marker+" (which is cited as "+concatenated_markers+"): \n" + concatenated_spans
    prompt_question = "Very briefly explain the relationship between " + cited_marker +" and " + citing_marker+". TLDR: "
    relationship_prompt = "\n\n".join([citing_paper, cited_paper, citation_span_texts, prompt_question])
    return relationship_prompt

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# GPT-2 tokenizer; later cells use it only to count prompt tokens before calling the chat API.
gpt_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
# Settings passed to CorwaTagger as `args`; every option has a default, so no command line is needed.
argparser = ArgumentParser()
argparser.add_argument('--repfile', type=str, default = "allenai/scibert_scivocab_uncased", help="Word embedding file")
argparser.add_argument('--dropout', type=float, default=0, help="embedding_dropout rate")
argparser.add_argument('--bert_dim', type=int, default=768, help="bert_dimension")
argparser.add_argument('--MAX_SENT_LEN', type=int, default=512)
argparser.add_argument('--checkpoint', type=str, default = "joint_tagger_train_scibert_final.model")
argparser.add_argument('--batch_size', type=int, default=1) # roberta-large: 2; bert: 8
# parse_known_args() rather than parse_args(): inside a Jupyter kernel sys.argv holds the
# kernel's own flags (e.g. "-f <connection file>"), which parse_args() rejects with SystemExit.
# Recognised options given on a real command line are still honoured.
args = argparser.parse_known_args()[0]

In [None]:
# Load the parsed JSON of the target paper.
target_paper_name = prefix+target_file_name
with open(target_paper_name) as f:
    target_json = json.load(f)
# Exclude target paper's gold related work section.
# Despite its name, excluded_body_text holds the paragraphs that are KEPT
# (those whose section title is not recognised as related work).
excluded_body_text = []
for paragraph in target_json["pdf_parse"]["body_text"]:
    if not is_related_work_section(keyword, paragraph["section"].lower(), keyword_relation):
        excluded_body_text.append(paragraph)
# Deep copy so that target_json keeps its full body text; a later cell reads the gold section from it.
target_json_no_related_work = deepcopy(target_json)
target_json_no_related_work["pdf_parse"]["body_text"] = excluded_body_text

In [None]:
# Load every parsed paper found under `prefix`, keyed by the id that create_paper_id assigns.
cited_jsons = {}
for json_name in glob(prefix+"*.json"):
    # File name without directory and extension; os.path copes with either path separator.
    ID = os.path.splitext(os.path.basename(json_name))[0]
    with open(json_name) as f:
        paper = json.load(f)
    try:
        paper_id, paper = create_paper_id(paper, ID)
        cited_jsons[paper_id] = paper
    except Exception as error:
        # Still best-effort (the file is skipped), but say so instead of dropping it silently.
        print("Skipping", json_name, "-", repr(error))
# Register the target paper without its related work section; this replaces any copy
# loaded above under the same id.
target_paper_id, target_paper = create_paper_id(target_json_no_related_work)
cited_jsons[target_paper_id] = target_paper
target_paper_json = cited_jsons[target_paper_id]

In [None]:
# Keep only the "pdf_parse" part of every paper, keyed by paper id.
pdf_parse_jsons = {paper_id: paper["pdf_parse"]  for paper_id, paper in cited_jsons.items()}
# Flatten all body paragraphs into {"<paper_id>_<paragraph index>": text}; the text is the
# output of scientific_sent_tokenize joined with single spaces.
paragraphs = {}
for paper_id, paper in pdf_parse_jsons.items():
    for pi, para in enumerate(paper["body_text"]):
        paragraph_id = paper_id + "_" + str(pi)
        paragraphs[paragraph_id] = " ".join(scientific_sent_tokenize(para["text"]))

In [None]:
# SciBERT tokenizer for the tagger, extended with '[BOS]' as an additional special token
# (presumably the sentence-start marker inserted by scientific_sent_tokenize — confirm in data_util).
joint_tagger_tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
additional_special_tokens = {'additional_special_tokens': ['[BOS]']}
joint_tagger_tokenizer.add_special_tokens(additional_special_tokens)

In [None]:
# Build the tagger from the tokenizer, the device and the parsed settings
# (presumably args.checkpoint names the weights it loads — confirm in joint_tagger).
tagger = CorwaTagger(joint_tagger_tokenizer, device, args)

In [None]:
# Tag every paragraph of every loaded paper. Later cells read the keys "id", "paragraph"
# and "span_citation_mapping" from each returned item.
all_span_citation_mappings = tagger.run_prediction(paragraphs, pdf_parse_jsons)

In [None]:
# Index the tagged paragraphs by their "id" for direct lookup.
span_citation_mappings_dict = {paragraph["id"]: paragraph for paragraph in all_span_citation_mappings} 

In [None]:
# Reverse index of the tagger output:
#   {paper_id: {bib_key: {"Dominant": {"<paragraph idx>_<span idx>": marker}, "Reference": {...}}}}
span_citation_mappings_lookup = {}
for paragraph in all_span_citation_mappings:
    # Split on the LAST underscore only. Ids are presumably "<paper_id>_<paragraph index>"
    # (as built for `paragraphs` — confirm against the tagger), and a paper id that itself
    # contains "_" would otherwise fail to unpack.
    paper_id, pid = paragraph["id"].rsplit("_", 1)
    this_paper = span_citation_mappings_lookup.setdefault(paper_id, {})
    for si, span in enumerate(paragraph["span_citation_mapping"]):
        for span_type, citations in span["span_citation_mapping"].items():
            for citation_mark, bib_key in citations.items():
                this_cited_paper = this_paper.setdefault(bib_key, {"Dominant":{}, "Reference":{}})
                this_cited_paper[span_type][pid+"_"+str(si)] = citation_mark

In [None]:
# Per-paper metadata used to build citation markers: id, title, first author and year.
global_metadata = {}
for paper_id, paper in cited_jsons.items():
    if len(paper["authors"]) > 0:
        # First author's last name with spaces removed.
        global_metadata[paper_id] = {
            "paper_id": paper_id,
            "title": paper["title"],
            "author": paper["authors"][0]["last"].replace(" ",""),
            "year": paper["year"],
        }
    else:
        # No author listed: use the placeholder "Unknown".
        global_metadata[paper_id] = {
            "paper_id": paper_id,
            "title": paper["title"],
            "author": "Unknown",
            "year": paper["year"],
        }

In [None]:
# Index the metadata by "<lowercased title>@<lowercased author>", the key format that
# link_and_update_citations builds from bibliography entries. Values are the same dict
# objects as in global_metadata, so an update through either name is visible in both.
title_author_lookup = {}
for paper_id, paper in global_metadata.items():
    key = paper["title"].lower() + "@" + paper["author"].lower().replace(" ","")
    title_author_lookup[key] = paper

In [None]:
# Resolve each paper's bibliography against the loaded papers. link_and_update_citations
# writes "discovered_citations" into the paper dict in place; the return value is not used further here.
for key, paper in cited_jsons.items():
    this_paper = link_and_update_citations(paper)
    # When no year is recorded, fall back to the text after "@" in the paper id
    # (assumes ids look like "<something>@<year>" — confirm in create_paper_id).
    if global_metadata[key]["year"] == "":
        global_metadata[key]["year"] = key.split("@")[1]

In [None]:
# Sanity check: number of papers with metadata (shown as the cell output).
len(global_metadata)

In [None]:
# Read credentials from the environment so they are never saved inside the notebook.
# Unset variables fall back to the empty string, as before.
api_key = os.environ.get("OPENAI_API_KEY", "")
openai.organization = os.environ.get("OPENAI_ORGANIZATION", "")
openai.api_key = api_key
# Listing the models doubles as a check that the credentials are accepted.
openai.Model.list()

In [None]:
# Faceted summary (objective / method / findings / contribution / keywords) of every loaded
# paper, generated by gpt-3.5-turbo from its title, abstract, introduction and conclusion.
# The outer loop repeats until every paper has a summary, so a failed request is retried
# on the next pass.
cited_facted_summaries = {}
while len(cited_jsons) > len(cited_facted_summaries):
    for key in tqdm(cited_jsons.keys()):
        if key in cited_facted_summaries:
            continue
        cited_json = cited_jsons[key]
        TAIC = ""
        TAIC += "[Title] " + cited_json["title"] + " \n"
        if cited_json["pdf_parse"]["abstract"]:
            for paragraph in cited_json["pdf_parse"]["abstract"]:
                TAIC += "[Abstract] " + paragraph["text"] + " \n"
        for paragraph in cited_json["pdf_parse"]["body_text"]:
            if "intro" in paragraph["section"].lower() or "conclusion" in paragraph["section"].lower():
                # Paragraphs flagged by has_tokenization_error are left out of the prompt.
                if not has_tokenization_error(paragraph["text"]):
                    TAIC += "["+paragraph["section"]+"] " + paragraph["text"] + " \n"

        facet_prompt = TAIC + "What are the objective, method, findings, contributions and keywords of the paper above? Answer in the format of \n Objective: XXX. \n Method: XXX. \n Findings: XXX. \n Contribution: XXX. \n Keywords: A; B; C."
        try:
            response = openai.ChatCompletion.create(
              model="gpt-3.5-turbo-0301",
              messages=[
                    {"role": "user", "content": facet_prompt}
                ],
              temperature = 0,
            )
            cited_facted_summaries[key] = response["choices"][0]["message"]["content"]
        except Exception as error:
            # `Exception` instead of a bare except, so that interrupting the kernel still
            # stops this otherwise unbounded retry loop. The error is reported, because a
            # permanent failure (e.g. a rejected request) would repeat on every pass.
            print("Failed", key, repr(error))
            # Short pause so repeated failures do not hammer the API.
            sleep(1)

In [None]:
# For every (cited, citing) pair with at least one citation span, ask gpt-3.5-turbo to
# describe the relationship. Results are keyed "<cited_paper_id>+<citing_paper_id>".
# Passes repeat until one adds no new entry, so failed pairs are retried only as long
# as some other pair still succeeds.
relationships = {}
new_entry = -1
while new_entry != 0:
    new_entry = 0
    for cited_paper_id in tqdm(cited_jsons.keys()):
        for citing_paper_id in cited_jsons.keys():
            if cited_paper_id + "+" + citing_paper_id in relationships:
                continue
            retrieved_spans = retrieve_citation_spans(citing_paper_id, cited_paper_id)
            if len(retrieved_spans) > 0:
                relationship_prompt = make_relationship_prompt(retrieved_spans, citing_paper_id, cited_paper_id)
                try:
                    response = openai.ChatCompletion.create(
                      model="gpt-3.5-turbo-0301",
                      messages=[
                            {"role": "user", "content": relationship_prompt}
                        ],
                      temperature = 0,
                    )
                    response_text = response["choices"][0]["message"]["content"]
                    this_pair = {
                        "citing_paper": citing_paper_id,
                        "cited_paper": cited_paper_id,
                        "prompt": relationship_prompt, 
                        "response": response_text
                    }
                    relationships[cited_paper_id + "+" + citing_paper_id] = this_pair
                    new_entry += 1
                except Exception as error:
                    # `Exception` instead of a bare except so the kernel can still be
                    # interrupted; the error itself is shown to make failures diagnosable.
                    print("#"*30)
                    print(citing_paper_id)
                    print(cited_paper_id)
                    print("Failed!", repr(error))

In [None]:
# Group the relationship records by the paper being cited: {cited_paper_id: [record, ...]}.
relationships_by_cited_paper = {}
for key, relation in relationships.items():
    this_paper = relationships_by_cited_paper.get(relation["cited_paper"],[])
    this_paper.append(relation)
    relationships_by_cited_paper[relation["cited_paper"]] = this_paper

In [None]:
# Citation spans grouped by cited paper, then by citing paper:
#   {cited_paper_id: {citing_paper_id: [span, ...]}}
# Only pairs with at least one retrieved span are stored.
citation_spans_by_cited_paper = {}
for citing_paper_id in cited_jsons.keys():
    for cited_paper_id in cited_jsons.keys():
        retrieved_spans = retrieve_citation_spans(citing_paper_id, cited_paper_id)
        if len(retrieved_spans) > 0:
            this_paper = citation_spans_by_cited_paper.get(cited_paper_id,{})
            this_paper[citing_paper_id] = retrieved_spans
            citation_spans_by_cited_paper[cited_paper_id] = this_paper

In [None]:
# For every cited paper with recorded relationships, ask gpt-3.5-turbo what the paper is
# known for and why it is cited. The outer loop repeats until every such paper has an
# answer, so a failed request is retried on the next pass.
citation_intent_summary = {}
while len(citation_intent_summary) < len(relationships_by_cited_paper):
    for cited_paper_id, relations in tqdm(relationships_by_cited_paper.items()):
        if cited_paper_id in citation_intent_summary:
            continue
        cited_metadata = global_metadata[cited_paper_id]
        cited_year = "(" + str(cited_metadata["year"]) + ")" if cited_metadata["year"] else ""
        cited_marker = cited_metadata["author"] + " et al. " + cited_year
        # cited_title and summary are only needed by the disabled prompt variant below.
        cited_title = cited_metadata["title"] + " by " + cited_marker

        summary = cited_facted_summaries[cited_paper_id]

        # One entry per citing paper: its relationship text followed by numbered example fragments.
        concatenated_relation = []
        for relation in relations:
            relation_text = relation["response"]
            citing_paper_id = relation["citing_paper"]
            if citing_paper_id in citation_spans_by_cited_paper[cited_paper_id]:
                examples = []
                for ci, citation in enumerate(citation_spans_by_cited_paper[cited_paper_id][citing_paper_id]):
                    examples.append(str(ci+1)+". " + citation["text"])
                concatenated_relation.append(relation_text+"\nExample citation fragments: \n" + "\n".join(examples))
        concatenated_relation = "\n\n".join(concatenated_relation)

        #citation_intent_prompt = "Faceted summary of " + cited_title +":\n"
        #citation_intent_prompt += summary + "\n\n"
        #citation_intent_prompt += "How other papers cite it: \n" + concatenated_relation + "\n\n"
        # Faceted summary is a distractor!!
        citation_intent_prompt = "How other papers cite "+cited_marker+": \n" + concatenated_relation + "\n\n"
        citation_intent_prompt += "Very briefly answer what "+cited_marker+\
        " is mostly known for, and the common citation intent. "
        citation_intent_prompt += "Hint: pay attention to how "+cited_marker+" is referred by the citing papers. "
        citation_intent_prompt += 'Answer in the format of "'+cited_marker+' is known for XXX and it is cited for YYY". TLDR: '

        try:
            response = openai.ChatCompletion.create(
                      model="gpt-3.5-turbo-0301",
                      messages=[
                            {"role": "user", "content": citation_intent_prompt}
                        ],
                      temperature = 0,
                    )
            response_text = response["choices"][0]["message"]["content"]
            citation_intent_summary[cited_paper_id] = response_text
        except Exception as error:
            # `Exception` instead of a bare except, so that interrupting the kernel still
            # stops this otherwise unbounded retry loop; the error is shown for diagnosis.
            print(cited_paper_id, "failed!", repr(error))
            # Short pause so repeated failures do not hammer the API.
            sleep(1)


In [None]:
# Concatenate the target paper's own related-work paragraphs, read from the unmodified
# target_json (the copy that still contains that section).
gold_related_work = ""
for paragraph in target_json["pdf_parse"]["body_text"]:
    if is_related_work_section(keyword, paragraph["section"].lower(), keyword_relation):
        gold_related_work += paragraph["text"] + " \n"

In [None]:
# Prompt asking for the main idea of the gold related work section, given the target
# paper's title and faceted summary as context.
faceted_summary = cited_facted_summaries[target_paper_id]
title = global_metadata[target_paper_id]["title"]
high_level_idea_prompt = "Our title: " + title + "\nFaceted summary of our paper: " + faceted_summary + "\n\nWrite a short summary of the main idea of the following related work section paragraphs. Ignore citations.\n\n" + gold_related_work

In [None]:
# Ask gpt-3.5-turbo (temperature 0) for the main idea; it feeds the "semi-automatic" prompts below.
response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo-0301",
  messages=[
        {"role": "user", "content": high_level_idea_prompt}
    ],
  temperature = 0,
)
main_idea = response["choices"][0]["message"]["content"]

In [None]:
# Show the extracted main idea for manual inspection.
print(main_idea)

In [None]:
# Title, abstract, introduction and conclusion of the target paper; each paragraph is
# prefixed with its section tag. Built from target_paper_json, the copy registered in
# cited_jsons without the related work section.
TAIC = ""
TAIC += "[Title] " + target_paper_json["title"] + " \n"
if target_paper_json["pdf_parse"]["abstract"]:
    for paragraph in target_paper_json["pdf_parse"]["abstract"]:
        TAIC += "[Abstract] " + paragraph["text"] + " \n"
for paragraph in target_paper_json["pdf_parse"]["body_text"]:
    # Section titles containing "intro" or "conclusion" are included.
    if "intro" in paragraph["section"].lower() or "conclusion" in paragraph["section"].lower():
        TAIC += "["+paragraph["section"]+"] " + paragraph["text"] + " \n"
target_paper_TAIC = TAIC

In [None]:
# Order the papers by the text after "@" in their id, compared as strings
# (presumably the publication year — confirm in create_paper_id).
chronological_paper_ids = sorted(list(global_metadata.keys()), key=lambda x: x.split("@")[1], reverse=False)
# NOTE: this rebinds target_paper, which until now held the paper JSON returned by
# create_paper_id, to the paper's metadata record.
target_paper = global_metadata[target_paper_id]
# Marker of the target paper; prepare_cited_papers_text replaces it with "our paper".
target_marker = target_paper["author"]+" et al. (" + str(target_paper["year"]) + ")"

In [None]:
# Full cited-paper list: faceted summaries plus [Usage] lines and relationship lists.
cited_papers_text = prepare_cited_papers_text(chronological_paper_ids, include_usage=True, include_relationship=True)

In [None]:
# "Semi-automatic" prompt: target paper TAIC + instructions + main idea + cited-paper list.
# It is semi-automatic in that main_idea was derived from the gold related work section.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
semi_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites all the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. a few paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.
Pay extra attention to [Usage] which indicates how each work is cited by other work. \n
"""
semi_automatic_generation_prompt = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction +\
"\n Main idea of our related work section: " + main_idea + "\n\nList of cited papers: "+ cited_papers_text + "\nOur related work section: " 

In [None]:
# Show the assembled prompt for manual inspection.
print(semi_automatic_generation_prompt)

In [None]:
# Guard on prompt length, counted with the GPT-2 tokenizer.
# NOTE(review): the count comes from a different tokenizer than the target model's, so it is
# presumably only an approximation of what gpt-4 sees — confirm the 8000 margin is sufficient.
assert len(gpt_tokenizer(semi_automatic_generation_prompt).input_ids) <= 8000
# Generate the related work section with gpt-4 (temperature 0).
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt}
    ],
  temperature = 0,
)
semi_automatic_related_work_gpt4 = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4)

In [None]:
# "Fully automatic" prompt: same layout as the semi-automatic one but without main_idea,
# so nothing from the gold related work section enters the prompt.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
fully_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites all the following papers in a natural way.
Keep it short, e.g. a few paragraphs at most. Stick the topic to the main topic of our paper.
Pay extra attention to [Usage] which indicates how each work is cited by other work. \n
"""
fully_automatic_generation_prompt = intro_instruction + target_paper_TAIC + fully_automatic_generation_instruction + "List of cited papers: "+ cited_papers_text + "\nOur related work section: "


In [None]:
# Guard on prompt length, counted with the GPT-2 tokenizer (an approximation — see NOTE(review)
# candidates: the target model presumably tokenizes differently; confirm the margin).
assert len(gpt_tokenizer(fully_automatic_generation_prompt).input_ids) <= 8000

# Generate the fully automatic related work section with gpt-4 (temperature 0).
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": fully_automatic_generation_prompt}
    ],
  temperature = 0,
)
fully_automatic_related_work_gpt4 = response["choices"][0]["message"]["content"]
print(fully_automatic_related_work_gpt4)

In [None]:
# Ablation: semi-automatic prompt whose cited-paper list omits the [Usage] lines.
# The instruction text is reused unchanged and therefore still mentions [Usage].
cited_papers_text_no_usage = prepare_cited_papers_text(chronological_paper_ids, include_usage=False, include_relationship=True)
semi_automatic_generation_prompt_no_usage = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction +\
"\n Main idea of our related work section: " + main_idea + "\n\nList of cited papers: "+ cited_papers_text_no_usage + "\nOur related work section: " 

In [None]:
# Generate the "no usage" ablation with gpt-4 (temperature 0).
# NOTE(review): no prompt-length assert precedes this call.
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt_no_usage}
    ],
  temperature = 0,
)
semi_automatic_related_work_gpt4_no_usage = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4_no_usage)

In [None]:
# Ablation: semi-automatic prompt whose cited-paper list omits the "How other papers cite it" lists.
cited_papers_text_no_relationship = prepare_cited_papers_text(chronological_paper_ids, include_usage=True, include_relationship=False)
semi_automatic_generation_prompt_no_relationship = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction +\
"\n Main idea of our related work section: " + main_idea + "\n\nList of cited papers: "+ cited_papers_text_no_relationship + "\nOur related work section: " 

In [None]:
# Generate the "no relationship" ablation with gpt-4 (temperature 0).
# NOTE(review): no prompt-length assert precedes this call.
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt_no_relationship}
    ],
  temperature = 0,
)
semi_automatic_related_work_gpt4_no_relationship = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4_no_relationship)

In [None]:
# Ablation: semi-automatic prompt WITHOUT the target paper's title/abstract/intro/conclusion.
# NOTE: this rebinds intro_instruction and semi_automatic_generation_instruction, which
# later cells reuse unless they assign them again.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper. "
semi_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites all the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. a few paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.
Pay extra attention to [Usage] which indicates how each work is cited by other work. \n
"""
semi_automatic_generation_prompt_no_TAIC = intro_instruction + semi_automatic_generation_instruction +\
"\n Main idea of our related work section: " + main_idea + "\n\nList of cited papers: "+ cited_papers_text + "\nOur related work section: " 

In [None]:
# Generate the "no TAIC" ablation with gpt-4 (temperature 0).
# NOTE(review): no prompt-length assert precedes this call.
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt_no_TAIC}
    ],
  temperature = 0,
)
semi_automatic_related_work_gpt4_no_TAIC = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4_no_TAIC)

In [None]:
# Split the generated section on newlines; every non-empty line becomes one paragraph,
# keyed "predicted_<n>" and sentence-tokenized like the input papers.
output_paragraphs = {}
pi = 0
for paragraph in semi_automatic_related_work_gpt4.split("\n"):
    if paragraph:
        output_paragraphs["predicted_"+str(pi)] = " ".join(scientific_sent_tokenize(paragraph))
        pi+=1

# Wrap the paragraphs in a minimal paper-like structure (empty bibliography, no cite spans)
# so they can be passed to the tagger under the paper id "predicted".
pseudo_related_work_json = {
    "paper_id": "predicted",
    "bib_entries": {},
    "body_text": []
}
for para_id, paragraph in output_paragraphs.items():
    pseudo_related_work_json["body_text"].append({
        "section": para_id,
        "text": paragraph,
        "cite_spans": [],
    })

In [None]:
# Tag the generated paragraphs with the same tagger that was run on the input papers.
generated_span_citation_mappings = tagger.run_prediction(output_paragraphs, {"predicted": pseudo_related_work_json})

In [None]:
# Resolve the citation markers found in the generated text to known paper ids.
# Only existing keys are reassigned, so the dict size does not change during iteration.
for paragraph in generated_span_citation_mappings:
    for span in paragraph["span_citation_mapping"]:
        for citation_type, citation in span["span_citation_mapping"].items():
            for marker in citation.keys():
                paper_id = match_paper_id(marker, global_metadata)
                # Markers without a match keep whatever value the tagger assigned.
                if paper_id:
                    citation[marker] = paper_id

In [None]:
# Sentence-split every paper body into (section title, sentence) pairs, per paper.
cited_paper_sentences = {}
for paper_id, cited_json in cited_jsons.items():
    sentences = []
    for paragraph in cited_json["pdf_parse"]["body_text"]:
        # add_bos_token=False is passed here, unlike the tokenization used for tagging.
        these_sentences = scientific_sent_tokenize(paragraph["text"],add_bos_token=False)
        for sentence in these_sentences:
            sentences.append((paragraph["section"], sentence))
    cited_paper_sentences[paper_id] = sentences

In [None]:
# Prompt variant that adds cited text spans (CTS): for every citation span of the generated
# draft, sentences retrieved from the cited paper with rouge_retrieval are listed under it.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
semi_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. 3 paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.
Pay extra attention to [Usage] which indicates how each work is cited by other work. \n
"""

# Start with the largest k passed to rouge_retrieval and step down until the prompt
# fits the token budget. If no k fits, the loop ends with the k=0 prompt.
for k in [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]:
    cited_text_spans = {paper_id: [] for paper_id in cited_paper_sentences.keys()}
    for paragraph in generated_span_citation_mappings:
        text = paragraph["paragraph"]
        for span in paragraph["span_citation_mapping"]:
            start = span["char_start"]
            end = span["char_end"]
            span_text = text[start:end]
            for citation_type, citation in span["span_citation_mapping"].items():
                for marker, paper_id in citation.items():
                    # .get: markers that match_paper_id could not resolve keep the tagger's
                    # value, which need not be a known paper id; skip those instead of
                    # raising KeyError. A None value is skipped as before.
                    candidate_sentences = cited_paper_sentences.get(paper_id)
                    if candidate_sentences is not None:
                        retrieved_sentences = rouge_retrieval(span_text, candidate_sentences, k=k)
                        cited_text_spans[paper_id].extend(retrieved_sentences)

    cited_papers_text_CTS = prepare_cited_papers_text(chronological_paper_ids, use_faceted_summary=True, include_usage=True, include_relationship=True, use_CTS=True)
    semi_automatic_generation_prompt_CTS = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction +\
    "\n Main idea of our related work section: " + main_idea + "\n\n List of cited papers: "+\
    cited_papers_text_CTS + "\nOur related work section that cites ALL of the listed papers above: " 
    if len(gpt_tokenizer(semi_automatic_generation_prompt_CTS).input_ids) <= 8000:
        print("k="+str(k))
        break

In [None]:
# Show the assembled CTS prompt for manual inspection.
print(semi_automatic_generation_prompt_CTS)

In [None]:
# Fails when the preceding search found no k whose prompt fits the budget
# (token count taken with the GPT-2 tokenizer).
assert len(gpt_tokenizer(semi_automatic_generation_prompt_CTS).input_ids) <= 8000

# Generate the CTS variant with gpt-4 (temperature 0).
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt_CTS}
    ],
  temperature = 0,
)
related_work_text_CTS = response["choices"][0]["message"]["content"]
print(related_work_text_CTS)

In [None]:
# Abstract text per paper, used by prepare_cited_papers_text(use_faceted_summary=False).
cited_abstracts = {}
for key in cited_jsons.keys():
    # Never true here, since the dict was created empty just above and keys are unique.
    if key in cited_abstracts:
        continue
    cited_json = cited_jsons[key]
    if cited_json["pdf_parse"]["abstract"]:
        # Only the first abstract paragraph is used.
        cited_abstracts[key] = cited_json["pdf_parse"]["abstract"][0]["text"]
    else:
        # No abstract: fall back to every paragraph whose section title contains "intro".
        TAIC = ""
        for paragraph in cited_json["pdf_parse"]["body_text"]:
            if "intro" in paragraph["section"].lower():
                TAIC += paragraph["text"] + " \n"
        cited_abstracts[key] = TAIC

In [None]:
# Ablation: semi-automatic prompt in which cited papers are described by their abstracts
# instead of faceted summaries. The instruction strings are assigned again here.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
semi_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites all the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. a few paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.
Pay extra attention to [Usage] which indicates how each work is cited by other work. \n
"""
cited_papers_text_abstract = prepare_cited_papers_text(chronological_paper_ids, use_faceted_summary = False, include_usage=True, include_relationship=True)
semi_automatic_generation_prompt_abstract = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction +\
"\n Main idea of our related work section: " + main_idea + "\n\nList of cited papers: "+ cited_papers_text_abstract + "\nOur related work section: " 

In [None]:
# Guard on prompt length (token count taken with the GPT-2 tokenizer).
assert len(gpt_tokenizer(semi_automatic_generation_prompt_abstract).input_ids) <= 8000

# Generate the abstract-based variant with gpt-4 (temperature 0).
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt_abstract}
    ],
  temperature = 0,
)
semi_automatic_related_work_gpt4_abstract = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4_abstract)

In [None]:
# Ablation: neither [Usage] lines nor relationship lists in the cited-paper list. This
# variant uses its own instruction text, which drops the sentence about [Usage].
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
semi_automatic_generation_instruction_no_usage_relationship = """
However, the related work section is still missing.
Write our related work section that concisely cites all the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. a few paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.\n
"""
cited_papers_text_no_usage_relationship = prepare_cited_papers_text(chronological_paper_ids, include_usage=False, include_relationship=False)
semi_automatic_generation_prompt_no_usage_relationship = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction_no_usage_relationship +\
"\n Main idea of our related work section: " + main_idea + "\n\nList of cited papers: "+ cited_papers_text_no_usage_relationship + "\nOur related work section: " 

In [None]:
# Generate the "no usage, no relationship" ablation with gpt-4 (temperature 0).
# NOTE(review): no prompt-length assert precedes this call.
response = openai.ChatCompletion.create(
  model="gpt-4-0314",
  messages=[
        #{"role": "system", "content": system_prompt},
        {"role": "user", "content": semi_automatic_generation_prompt_no_usage_relationship}
    ],
  temperature = 0,
)
semi_automatic_related_work_gpt4_no_usage_relationship = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4_no_usage_relationship)

In [None]:
# Same post-processing as for the first generated draft, now applied to the abstract-based
# draft. NOTE: this rebinds output_paragraphs, pseudo_related_work_json,
# generated_span_citation_mappings and cited_paper_sentences.

# 1) Split the draft on newlines; every non-empty line becomes a paragraph keyed "predicted_<n>".
output_paragraphs = {}
pi = 0
for paragraph in semi_automatic_related_work_gpt4_abstract.split("\n"):
    if paragraph:
        output_paragraphs["predicted_"+str(pi)] = " ".join(scientific_sent_tokenize(paragraph))
        pi+=1

# 2) Wrap the paragraphs in a minimal paper-like structure under the paper id "predicted".
pseudo_related_work_json = {
    "paper_id": "predicted",
    "bib_entries": {},
    "body_text": []
}
for para_id, paragraph in output_paragraphs.items():
    pseudo_related_work_json["body_text"].append({
        "section": para_id,
        "text": paragraph,
        "cite_spans": [],
    })

# 3) Tag the generated paragraphs.
generated_span_citation_mappings = tagger.run_prediction(output_paragraphs, {"predicted": pseudo_related_work_json})

# 4) Resolve citation markers to known paper ids; unmatched markers keep the tagger's value.
for paragraph in generated_span_citation_mappings:
    for span in paragraph["span_citation_mapping"]:
        for citation_type, citation in span["span_citation_mapping"].items():
            for marker in citation.keys():
                paper_id = match_paper_id(marker, global_metadata)
                if paper_id:
                    citation[marker] = paper_id

# 5) Sentence-split every paper body into (section title, sentence) pairs, per paper.
cited_paper_sentences = {}
for paper_id, cited_json in cited_jsons.items():
    sentences = []
    for paragraph in cited_json["pdf_parse"]["body_text"]:
        these_sentences = scientific_sent_tokenize(paragraph["text"],add_bos_token=False)
        for sentence in these_sentences:
            sentences.append((paragraph["section"], sentence))
    cited_paper_sentences[paper_id] = sentences

In [None]:
# Prompt variant that passes use_CTS=True (with use_faceted_summary=False) to
# prepare_cited_papers_text, after filling `cited_text_spans` with sentences
# retrieved from each cited paper for every tagged citation span.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
semi_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. 3 paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.
Pay extra attention to [Usage] which indicates how each work is cited by other work. \n
"""

# Back off on k (the value handed to rouge_retrieval) from 10 down to 0 and stop at
# the first k whose prompt fits the 8000-token budget. `k` is intentionally left
# behind as a notebook-level variable for later cells.
for k in [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]:
    # Rebuilt from scratch for every k.
    # NOTE(review): prepare_cited_papers_text(use_CTS=True) presumably reads this
    # notebook-level dict -- confirm against its definition.
    cited_text_spans = {paper_id: [] for paper_id in cited_paper_sentences.keys()}
    for paragraph in generated_span_citation_mappings:
        text = paragraph["paragraph"]
        for span in paragraph["span_citation_mapping"]:
            start = span["char_start"]
            end = span["char_end"]
            span_text = text[start:end]
            for citation_type, citation in span["span_citation_mapping"].items():
                for marker, paper_id in citation.items():
                    # Use .get(): a marker can carry an id that has no entry in
                    # cited_paper_sentences (unmatched marker, or a paper without a
                    # parsed full text). Plain indexing raised KeyError there, so the
                    # "is not None" check could never skip such markers.
                    if paper_id is not None and cited_paper_sentences.get(paper_id) is not None:
                        retrieved_sentences = rouge_retrieval(span_text, cited_paper_sentences[paper_id],k=k)
                        cited_text_spans[paper_id].extend(retrieved_sentences)

    cited_papers_text_abstract_CTS = prepare_cited_papers_text(chronological_paper_ids, use_faceted_summary=False, include_usage=True, include_relationship=True, use_CTS=True)
    semi_automatic_generation_prompt_abstract_CTS = intro_instruction + target_paper_TAIC + semi_automatic_generation_instruction +\
    "\n Main idea of our related work section: " + main_idea + "\n\n List of cited papers: "+\
    cited_papers_text_abstract_CTS + "\nOur related work section that cites ALL of the listed papers above: " 
    if len(gpt_tokenizer(semi_automatic_generation_prompt_abstract_CTS).input_ids) <= 8000:
        print("k="+str(k))
        break
else:
    # No k produced a short enough prompt; say so instead of ending silently.
    print("WARNING: prompt still exceeds 8000 tokens even with k=0")

In [None]:
# Refuse to call the API when the CTS prompt is over the 8000-token budget, as
# counted by gpt_tokenizer.
prompt_token_count = len(gpt_tokenizer(semi_automatic_generation_prompt_abstract_CTS).input_ids)
assert prompt_token_count <= 8000

# Single user turn; the system-prompt entry stays disabled:
#   {"role": "system", "content": system_prompt}
chat_messages = [
    {"role": "user", "content": semi_automatic_generation_prompt_abstract_CTS},
]
response = openai.ChatCompletion.create(
    model="gpt-4-0314",
    messages=chat_messages,
    temperature=0,
)

# Keep only the text of the first returned choice and show it.
related_work_text_abstract_CTS = response["choices"][0]["message"]["content"]
print(related_work_text_abstract_CTS)

In [None]:
# Prompt variant: use_faceted_summary=False with both the usage and the relationship
# annotations switched off. Note that this cell overwrites the notebook-level
# `semi_automatic_generation_instruction`.
intro_instruction = "We have finished writing the title, abstract, introduction and conclusion section of our NLP paper as follows: \n"
semi_automatic_generation_instruction = """
However, the related work section is still missing.
Write our related work section that concisely cites all the following papers in a natural way using all of the main ideas as the main story.
Keep it short, e.g. a few paragraphs at most. Make sure the related work section do not conflict with the sections already written.
You can freely reorder the cited papers to adapt to the main ideas.\n
"""
cited_papers_text_abstract_no_usage_relationship = prepare_cited_papers_text(
    chronological_paper_ids,
    use_faceted_summary=False,
    include_usage=False,
    include_relationship=False,
)

# Assemble the final prompt: paper sections, instruction, main idea, cited papers,
# and the trailing cue for the model to continue from.
semi_automatic_generation_prompt_abstract_no_usage_relationship = "".join([
    intro_instruction,
    target_paper_TAIC,
    semi_automatic_generation_instruction,
    "\n Main idea of our related work section: ",
    main_idea,
    "\n\nList of cited papers: ",
    cited_papers_text_abstract_no_usage_relationship,
    "\nOur related work section: ",
])

In [None]:
# Refuse to call the API when the prompt is over the 8000-token budget, as counted
# by gpt_tokenizer.
prompt_token_count = len(
    gpt_tokenizer(semi_automatic_generation_prompt_abstract_no_usage_relationship).input_ids
)
assert prompt_token_count <= 8000

# Single user turn; the system-prompt entry stays disabled:
#   {"role": "system", "content": system_prompt}
chat_messages = [
    {"role": "user", "content": semi_automatic_generation_prompt_abstract_no_usage_relationship},
]
response = openai.ChatCompletion.create(
    model="gpt-4-0314",
    messages=chat_messages,
    temperature=0,
)

# Keep only the text of the first returned choice and show it.
semi_automatic_related_work_gpt4_abstract_no_usage_relationship = response["choices"][0]["message"]["content"]
print(semi_automatic_related_work_gpt4_abstract_no_usage_relationship)

In [None]:
# Bundle every input, prompt and generated text produced by the notebook into one
# dict; each key is spelled exactly like the notebook variable it stores.
# `k` is whatever value the most recently executed k back-off loop left behind.
everything_collected = dict(
    # run configuration and collected inputs
    prefix=prefix,
    keyword=keyword,
    target_paper_name=target_paper_name,
    cited_jsons=cited_jsons,
    global_metadata=global_metadata,
    cited_facted_summaries=cited_facted_summaries,
    relationships_by_cited_paper=relationships_by_cited_paper,
    citation_spans_by_cited_paper=citation_spans_by_cited_paper,
    citation_intent_summary=citation_intent_summary,
    high_level_idea_prompt=high_level_idea_prompt,
    main_idea=main_idea,
    target_marker=target_marker,
    # prompt / generation pairs
    fully_automatic_generation_prompt=fully_automatic_generation_prompt,
    fully_automatic_related_work_gpt4=fully_automatic_related_work_gpt4,
    semi_automatic_generation_prompt=semi_automatic_generation_prompt,
    semi_automatic_related_work_gpt4=semi_automatic_related_work_gpt4,
    semi_automatic_generation_prompt_no_usage=semi_automatic_generation_prompt_no_usage,
    semi_automatic_related_work_gpt4_no_usage=semi_automatic_related_work_gpt4_no_usage,
    semi_automatic_generation_prompt_no_relationship=semi_automatic_generation_prompt_no_relationship,
    semi_automatic_related_work_gpt4_no_relationship=semi_automatic_related_work_gpt4_no_relationship,
    semi_automatic_generation_prompt_no_TAIC=semi_automatic_generation_prompt_no_TAIC,
    semi_automatic_related_work_gpt4_no_TAIC=semi_automatic_related_work_gpt4_no_TAIC,
    k=k,
    semi_automatic_generation_prompt_CTS=semi_automatic_generation_prompt_CTS,
    related_work_text_CTS=related_work_text_CTS,
    semi_automatic_generation_prompt_abstract=semi_automatic_generation_prompt_abstract,
    semi_automatic_related_work_gpt4_abstract=semi_automatic_related_work_gpt4_abstract,
    semi_automatic_generation_prompt_no_usage_relationship=semi_automatic_generation_prompt_no_usage_relationship,
    semi_automatic_related_work_gpt4_no_usage_relationship=semi_automatic_related_work_gpt4_no_usage_relationship,
    semi_automatic_generation_prompt_abstract_CTS=semi_automatic_generation_prompt_abstract_CTS,
    related_work_text_abstract_CTS=related_work_text_abstract_CTS,
    semi_automatic_generation_prompt_abstract_no_usage_relationship=semi_automatic_generation_prompt_abstract_no_usage_relationship,
    semi_automatic_related_work_gpt4_abstract_no_usage_relationship=semi_automatic_related_work_gpt4_abstract_no_usage_relationship,
    span_citation_mappings_lookup=span_citation_mappings_lookup,
)

# The output path is derived from `prefix` by swapping "pdfs/" for "dump_full.json".
dump_path = prefix.replace("pdfs/", "dump_full.json")
with open(dump_path, "w") as f:
    json.dump(everything_collected, f)