In [1]:
import json
from openai import OpenAI
from jinja2 import Template
# import instructor
# from pydantic import BaseModel
# client = instructor.from_openai(OpenAI())
# Shared API client; run_openai sends every chat completion request through it.
# NOTE(review): presumably picks up credentials from the environment (e.g. OPENAI_API_KEY) — confirm against the openai SDK docs.
client = OpenAI()


def run_openai(rendered_prompt, system_prompt, model="gpt-3.5-turbo", max_tokens=512, temperature=0.5):
    """
    Send one system + user prompt pair to the chat completions API in JSON mode and parse the reply.

    Args:
        rendered_prompt: Fully rendered user prompt.
        system_prompt: System message sent ahead of the user prompt.
        model: Name of the chat model to query.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        dict: The parsed JSON object on success. If the reply is missing or is not valid
        JSON, a dict of the form
        {"relation": None, "error": <message>, "response": <raw content>} is returned,
        so callers can always index the result like a normal response and detect the
        failure through the "error" key.
    """
    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},  # note json object constrains to only generating one item per request
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": rendered_prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    content = response.choices[0].message.content
    try:
        return json.loads(content)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a reply without text content (content is None).
        print(f"Failed to parse JSON response: {content}")
        # Same container type as the success path: a dict, not a list.
        return {"relation": None, "error": str(e), "response": content}


# def run_openai_with_instructor(text, response_model, model="gpt-3.5-turbo"):
#     return client.chat.completions.create(
#         model=model,
#         response_model=response_model,
#         messages=[{"role": "user", "content": text}],
#     )

### Prompts for Identifying Open-Form Relations

In [2]:
# System message passed to run_openai with every relation-labelling request in this notebook.
SYSTEM_PROMPT = "You are an expert in identifying and characterizing the relationships between entities in text."

# Zero-shot template for labelling one ordered (head, tail) entity pair.
# Placeholders: {{text}}, {{head}}, {{tail}}.
# The example answer quotes the entity values and closes with a single brace, so the
# format shown to the model is valid JSON and entities such as "15" come back as strings.
PAIRWISE_PROMPT_1 = """
Here is a text:
{{text}}

Please identify the relationship between the head/subject entity {{head}} and the tail/object entity {{tail}}.
If the relationship is not meaningful, please instantiate "relation" with a null value in the JSON response.

Answer directly in a JSON format as follows: {"head": "{{head}}", "tail": "{{tail}}", "relation": ...}
Return only JSON, do not use backticks or other markdown syntax.
"""


# Template for labelling one ordered (head, tail) entity pair, with example labels that
# steer how specific the generated relation should be.
# Placeholders: {{text}}, {{head}}, {{tail}}, {{relation_examples}}.
# The example answer quotes the entity values and closes with a single brace, so the
# format shown to the model is valid JSON and entities such as "15" come back as strings.
PAIRWISE_PROMPT_2 = """
Here is a text:
{{text}}

Please identify the relationship between the head/subject entity {{head}} and the tail/object entity {{tail}}.
If the relationship is not meaningful, please instantiate "relation" with a null value in the JSON response.

Pick a relation with a similar level of specificity to the following examples:
{{relation_examples}}

IMPORTANT: Depending on which entity is the head and which is the tail, we expect a differently worded relation!

Answer directly in a JSON format as follows: {"head": "{{head}}", "tail": "{{tail}}", "relation": ...}
Return only JSON, do not use backticks or other markdown syntax.
"""


# Example relation labels keyed by verbosity level: level 0 holds the tersest labels,
# level 3 the most descriptive phrases. One of these lists is rendered into
# PAIRWISE_PROMPT_2 as `relation_examples` to steer how specific the generated label is.
LABEL_EXAMPLES_BY_VERBOSITY = {
    0: ["contains", "spouse", "employer", "causes"],
    1: ["is a", "is a type of", "is a part of", "is a member", "married to"],
    2: ["is the father of", "had a dinner with"],
    3: ["place where person had an accident", "date when company was founded", "tried to commit crime in"]
}

In [3]:
# Sample passage used to try out the pairwise prompt in both head/tail directions.
text = """
Historic shipwreck identified in Lake Michigan MUSKEGON – The bottom of Lake Michigan is literally a graveyard of shipwrecks .
Local maritime historians say 1,200 of the 2,000 sunken vessels in Lake Michigan no longer exist because they hit shore and broke apart .
Experts add that about 360 wrecks have been found in the lake ' s deeper water , but there are still many wrecks out there that remain undiscovered .
A group of explorers recently found a historic steamship off the coast of Muskegon more than a century after it sank . Members of the Michigan Shipwreck Research Association say this is one of the deepest wrecks ever discovered in Lake Michigan . The story begins around midnight on Feb . 9 , 1899 . Lake Michigan was ice-caked and the 214-foot John V . Moran bucked the ice floes on its run from Milwaukee to Muskegon to deliver a cargo of barreled flour and package goods . The ship was only 11 years old and had an iron-reinforced hull for winter transit , but the conditions on this particular day were too much for the steamer . Ice struck a hole in the hull and water began pouring in . Capt . John McLeod dumped as much of the cargo as he could to lighten the load and try to keep the ship afloat , but it began slipping underneath the ice .
"""
# Render the prompt for head="John McLeod" / tail="Lake Michigan" using the level-2 example labels.
# NOTE(review): relation_examples is a Python list, so it is presumably rendered as its list repr — confirm in the printed prompt.
rendered_prompt = Template(PAIRWISE_PROMPT_2).render(
    text=text,
    head="John McLeod",
    tail="Lake Michigan",
    relation_examples=LABEL_EXAMPLES_BY_VERBOSITY[2]
)
# Show the exact prompt that is sent to the model.
print(rendered_prompt)
output = run_openai(rendered_prompt, system_prompt=SYSTEM_PROMPT, model="gpt-3.5-turbo", max_tokens=512, temperature=0.5)
# Last expression of the cell: display the parsed model reply.
output



Here is a text:

Historic shipwreck identified in Lake Michigan MUSKEGON – The bottom of Lake Michigan is literally a graveyard of shipwrecks .
Local maritime historians say 1,200 of the 2,000 sunken vessels in Lake Michigan no longer exist because they hit shore and broke apart .
Experts add that about 360 wrecks have been found in the lake ' s deeper water , but there are still many wrecks out there that remain undiscovered .
A group of explorers recently found a historic steamship off the coast of Muskegon more than a century after it sank . Members of the Michigan Shipwreck Research Association say this is one of the deepest wrecks ever discovered in Lake Michigan . The story begins around midnight on Feb . 9 , 1899 . Lake Michigan was ice-caked and the 214-foot John V . Moran bucked the ice floes on its run from Milwaukee to Muskegon to deliver a cargo of barreled flour and package goods . The ship was only 11 years old and had an iron-reinforced hull for winter transit , but the

{'head': 'John McLeod',
 'tail': 'Lake Michigan',
 'relation': 'was the captain of'}

In [4]:
# Same passage and example labels as the previous cell, but with head and tail swapped:
# the prompt asks for a differently worded relation depending on the direction.
rendered_prompt = Template(PAIRWISE_PROMPT_2).render(
    text=text,
    head="Lake Michigan",
    tail="John McLeod",
    relation_examples=LABEL_EXAMPLES_BY_VERBOSITY[2]
)
output = run_openai(rendered_prompt, system_prompt=SYSTEM_PROMPT, model="gpt-3.5-turbo", max_tokens=512, temperature=0.5)
# Last expression of the cell: display the parsed model reply.
output

{'head': 'Lake Michigan',
 'tail': 'John McLeod',
 'relation': 'is the location where the shipwreck of John V. Moran occurred'}

In [5]:
# download spacy model
# !pip install spacy
# !python -m spacy download en_core_web_trf

### Creating Pairwise Inputs for LLM

In [6]:
import spacy

# spaCy pipeline used below for sentence splitting (doc.sents) and entity recognition (sent.ents).
nlp = spacy.load("en_core_web_trf")

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# !wget https://huggingface.co/datasets/urchade/pile-mistral-v0.1/resolve/main/data.json?download=true -O pile-mistral-v0.1.json

# Load the downloaded dataset and add a plain-text field to every example.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying on the
# platform default (which is not UTF-8 on every system).
with open('pile-mistral-v0.1.json', encoding='utf-8') as f:
    example_data = json.load(f)
for x in example_data:
    # Tokens are joined with single spaces, so punctuation stays space-separated in "text".
    x["text"] = " ".join(x["tokenized_text"])
len(example_data)

19783

In [8]:
import random
import tqdm

# Seed the global RNG that drives sentence sampling, shuffling and verbosity choice below.
random.seed(24)


def create_data_items(nlp, dataset, n_sents=1):
    """
    Create items that contain sentence + entity pairs for relation extraction.

    Every ordered pair of distinct entity strings in a sampled sentence becomes one item,
    so both (a, b) and (b, a) are produced. An entity string that is mentioned more than
    once in a sentence is considered only once, so no pair is emitted twice.

    Note we also add sentence IDs because we need to group all pairs of a sentence together later.
    We don't need all sentences of a doc but we do need all pairs of a sentence!

    Args:
        nlp: Callable that turns a text into a parsed document exposing `.sents`, where each
            sentence exposes `.text` and `.ents` (e.g. a loaded spaCy pipeline).
        dataset: Iterable of dicts that each have a "text" key.
        n_sents: Maximum number of sentences sampled (without replacement) per document.

    Returns:
        list[dict]: Items of the form
        {"relation": {"head": ..., "tail": ...}, "text": <sentence text>, "sentence_id": <int>}.
    """
    pair_items = []
    sent_id = 0
    for x in tqdm.tqdm(dataset):
        doc = nlp(x['text'])
        sents = list(doc.sents)
        sampled_sents = random.sample(sents, min(n_sents, len(sents)))
        for sent in sampled_sents:
            # dict.fromkeys drops repeated mentions while keeping first-seen order, so a
            # sentence mentioning the same entity twice does not yield duplicate pairs.
            entities = list(dict.fromkeys(ent.text for ent in sent.ents))
            pair_items.extend(
                {"relation": {"head": head, "tail": tail}, "text": sent.text, "sentence_id": sent_id}
                for head in entities
                for tail in entities
                if head != tail
            )
            # The ID advances for every sampled sentence, including ones that yield no pairs.
            sent_id += 1
    return pair_items

In [9]:
# Build pair items from one sampled sentence of each of the first 10 examples.
pair_items = create_data_items(nlp, example_data[:10], n_sents=1)
len(pair_items)

100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


34

In [10]:
# Shuffle in place, presumably so the subset labelled below mixes pairs from different sentences.
# Note: this mutates pair_items, so re-running the cell reorders the list again.
random.shuffle(pair_items)

### Generate Labelled Data

In [11]:
# Available verbosity levels: the keys of LABEL_EXAMPLES_BY_VERBOSITY in ascending order.
verbosity_levels = sorted(LABEL_EXAMPLES_BY_VERBOSITY)
verbosity_levels

[0, 1, 2, 3]

In [12]:
# Label the first 25 pair items with the LLM, drawing a random verbosity level for each.
data_with_llm_outputs = []
for x in tqdm.tqdm(pair_items[:25]):
    print(x)
    verbosity = random.choice(verbosity_levels)
    rendered_prompt = Template(PAIRWISE_PROMPT_2).render(
        text=x['text'],
        head=x['relation']['head'],
        tail=x['relation']['tail'],
        relation_examples=LABEL_EXAMPLES_BY_VERBOSITY[verbosity]
    )
    output = run_openai(rendered_prompt, system_prompt=SYSTEM_PROMPT, model="gpt-3.5-turbo", max_tokens=512, temperature=0.5)
    # run_openai may hand back an error payload (or a reply without a "relation" key);
    # store a null label in that case instead of aborting the whole run.
    relation_label = output.get('relation') if isinstance(output, dict) else None
    print()
    print('verbosity', verbosity)
    print('generated relation:', output)
    print()
    # Build fresh outer and inner dicts. A shallow x.copy() shares the nested 'relation'
    # dict, so writing the label into it would also modify the entry held by pair_items.
    new_item = {**x, 'relation': {**x['relation'], 'label': relation_label}}
    data_with_llm_outputs.append(new_item)

  0%|          | 0/25 [00:00<?, ?it/s]

{'relation': {'head': 'Americans', 'tail': 'Kenworthy'}, 'text': 'As Kenworthy explains it , social democracy is really just about setting up governmental insurance to cover Americans in the event of unexpected expenses and losses of income .', 'sentence_id': 2}


  4%|▍         | 1/25 [00:01<00:24,  1.01s/it]


verbosity 1
generated relation: {'head': 'Americans', 'tail': 'Kenworthy', 'relation': 'is explained by'}

{'relation': {'head': 'Kenworthy', 'tail': 'Americans'}, 'text': 'As Kenworthy explains it , social democracy is really just about setting up governmental insurance to cover Americans in the event of unexpected expenses and losses of income .', 'sentence_id': 2}


  8%|▊         | 2/25 [00:02<00:23,  1.03s/it]


verbosity 3
generated relation: {'head': 'Kenworthy', 'tail': 'Americans', 'relation': 'proposes governmental insurance to cover'}

{'relation': {'head': 'Ashes', 'tail': 'Friday'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 12%|█▏        | 3/25 [00:02<00:19,  1.10it/s]


verbosity 2
generated relation: {'head': 'Ashes', 'tail': 'Friday', 'relation': 'is the date of'}

{'relation': {'head': '15', 'tail': 'one'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 16%|█▌        | 4/25 [00:04<00:22,  1.07s/it]


verbosity 2
generated relation: {'head': 15, 'tail': 'one', 'relation': 'belongs to'}

{'relation': {'head': 'Cricket Australia', 'tail': 'Friday'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 20%|██        | 5/25 [00:05<00:22,  1.11s/it]


verbosity 2
generated relation: {'head': 'Cricket Australia', 'tail': 'Friday', 'relation': 'confirmed the news on'}

{'relation': {'head': '15', 'tail': '60'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 24%|██▍       | 6/25 [00:06<00:19,  1.00s/it]


verbosity 1
generated relation: {'head': 15, 'tail': 60, 'relation': 'belongs to'}

{'relation': {'head': 'Friday', 'tail': 'Cricket Australia'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 28%|██▊       | 7/25 [00:07<00:19,  1.07s/it]


verbosity 0
generated relation: {'head': 'Friday', 'tail': 'Cricket Australia', 'relation': 'confirms'}

{'relation': {'head': 'six', 'tail': '15'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 32%|███▏      | 8/25 [00:08<00:16,  1.02it/s]


verbosity 3
generated relation: {'head': 'six', 'tail': 15, 'relation': 'number of reference strains'}

{'relation': {'head': 'Cricket Australia', 'tail': 'Cummins'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 36%|███▌      | 9/25 [00:09<00:16,  1.00s/it]


verbosity 1
generated relation: {'head': 'Cricket Australia', 'tail': 'Cummins', 'relation': 'confirmed by'}

{'relation': {'head': 'later this year', 'tail': 'Friday'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 40%|████      | 10/25 [00:10<00:14,  1.02it/s]


verbosity 1
generated relation: {'head': 'later this year', 'tail': 'Friday', 'relation': 'is a'}

{'relation': {'head': 'later this year', 'tail': 'Cricket Australia'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 44%|████▍     | 11/25 [00:11<00:13,  1.04it/s]


verbosity 2
generated relation: {'head': 'later this year', 'tail': 'Cricket Australia', 'relation': 'confirmed the departure of'}

{'relation': {'head': 'Ashes', 'tail': 'later this year'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 48%|████▊     | 12/25 [00:11<00:11,  1.10it/s]


verbosity 3
generated relation: {'head': 'Ashes', 'tail': 'later this year', 'relation': 'scheduled event'}

{'relation': {'head': 'Cummins', 'tail': 'Friday'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 52%|█████▏    | 13/25 [00:12<00:11,  1.01it/s]


verbosity 1
generated relation: {'head': 'Cummins', 'tail': 'Friday', 'relation': 'is confirmed on'}

{'relation': {'head': 'one', 'tail': '15'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 56%|█████▌    | 14/25 [00:14<00:12,  1.12s/it]


verbosity 1
generated relation: {'head': 'one', 'tail': 15, 'relation': 'belongs to'}

{'relation': {'head': '60', 'tail': '15'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 60%|██████    | 15/25 [00:15<00:10,  1.04s/it]


verbosity 3
generated relation: {'head': 60, 'tail': 15, 'relation': 'number of strains compared to'}

{'relation': {'head': 'six', 'tail': '60'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 64%|██████▍   | 16/25 [00:16<00:08,  1.04it/s]


verbosity 2
generated relation: {'head': 'six', 'tail': 60, 'relation': 'analyzed the variation of'}

{'relation': {'head': 'Friday', 'tail': 'Ashes'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 68%|██████▊   | 17/25 [00:16<00:07,  1.05it/s]


verbosity 1
generated relation: {'head': 'Friday', 'tail': 'Ashes', 'relation': 'is a time period for'}

{'relation': {'head': '15', 'tail': 'six'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 72%|███████▏  | 18/25 [00:17<00:06,  1.03it/s]


verbosity 1
generated relation: {'head': 15, 'tail': 6, 'relation': 'belongs to'}

{'relation': {'head': 'one', 'tail': 'six'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 76%|███████▌  | 19/25 [00:18<00:05,  1.10it/s]


verbosity 0
generated relation: {'head': 'one', 'tail': 'six', 'relation': 'belongs_to'}

{'relation': {'head': 'later this year', 'tail': 'Cummins'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 80%|████████  | 20/25 [00:19<00:04,  1.13it/s]


verbosity 1
generated relation: {'head': 'later this year', 'tail': 'Cummins', 'relation': 'is preparing for'}

{'relation': {'head': 'six', 'tail': 'one'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 84%|████████▍ | 21/25 [00:20<00:03,  1.03it/s]


verbosity 2
generated relation: {'head': 'six', 'tail': 'one', 'relation': 'belonging to'}

{'relation': {'head': 'Friday', 'tail': 'Cummins'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 88%|████████▊ | 22/25 [00:21<00:03,  1.04s/it]


verbosity 2
generated relation: {'head': 'Friday', 'tail': 'Cummins', 'relation': 'confirmed on'}

{'relation': {'head': 'Cummins', 'tail': 'later this year'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


 92%|█████████▏| 23/25 [00:23<00:02,  1.06s/it]


verbosity 1
generated relation: {'head': 'Cummins', 'tail': 'later this year', 'relation': 'will be preparing for'}

{'relation': {'head': '60', 'tail': 'six'}, 'text': 'We analyzed the variation of one nuclear ( Gpi ) and one mitochondrial ( Nd1 ) gene among 60 TcI strains and 15 reference strains belonging to the six DTUs .', 'sentence_id': 1}


 96%|█████████▌| 24/25 [00:23<00:01,  1.02s/it]


verbosity 3
generated relation: {'head': 60, 'tail': 'six', 'relation': 'number of TcI strains compared to number of DTUs'}

{'relation': {'head': 'Cummins', 'tail': 'Cricket Australia'}, 'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .', 'sentence_id': 6}


100%|██████████| 25/25 [00:24<00:00,  1.01it/s]


verbosity 1
generated relation: {'head': 'Cummins', 'tail': 'Cricket Australia', 'relation': 'is leaving'}






In [13]:
from collections import defaultdict

def group_items_by_sentence(data):
    """
    Merge pair-level items into one record per sentence.

    Items are bucketed by their 'sentence_id'. Each bucket becomes a dict holding the
    sentence 'text' (taken from the bucket's first item) and a 'relations' list with the
    'relation' dict of every item in the bucket. Buckets appear in the order in which
    their sentence ID is first seen in `data`.
    """
    grouped = {}
    for item in data:
        grouped.setdefault(item['sentence_id'], []).append(item)
    return [
        {
            'text': members[0]['text'],
            'relations': [member['relation'] for member in members],
        }
        for members in grouped.values()
    ]

In [14]:
# Inspect the labelled pair items; each 'relation' now carries the generated 'label'.
data_with_llm_outputs

[{'relation': {'head': 'Americans',
   'tail': 'Kenworthy',
   'label': 'is explained by'},
  'text': 'As Kenworthy explains it , social democracy is really just about setting up governmental insurance to cover Americans in the event of unexpected expenses and losses of income .',
  'sentence_id': 2},
 {'relation': {'head': 'Kenworthy',
   'tail': 'Americans',
   'label': 'proposes governmental insurance to cover'},
  'text': 'As Kenworthy explains it , social democracy is really just about setting up governmental insurance to cover Americans in the event of unexpected expenses and losses of income .',
  'sentence_id': 2},
 {'relation': {'head': 'Ashes', 'tail': 'Friday', 'label': 'is the date of'},
  'text': 'Cummins will be leaving this series and return home to prepare for the Ashes later this year , Cricket Australia confirmed on Friday .',
  'sentence_id': 6},
 {'relation': {'head': '15', 'tail': 'one', 'label': 'belongs to'},
  'text': 'We analyzed the variation of one nuclear ( 

In [15]:
# Collapse the labelled pairs into one record per sentence and count the records.
annotated_dataset = group_items_by_sentence(data_with_llm_outputs)
len(annotated_dataset)

3

In [16]:
# Peek at the first sentence-level record.
annotated_dataset[0]

{'text': 'As Kenworthy explains it , social democracy is really just about setting up governmental insurance to cover Americans in the event of unexpected expenses and losses of income .',
 'relations': [{'head': 'Americans',
   'tail': 'Kenworthy',
   'label': 'is explained by'},
  {'head': 'Kenworthy',
   'tail': 'Americans',
   'label': 'proposes governmental insurance to cover'}]}