# 🚞 Zero-shot RE Training

In [1]:
# Load IPython's autoreload extension and reload modules before each cell runs,
# so edits to local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# if you're running this in a colab notebook, you can run this cell to install the necessary dependencies
!git clone https://github.com/jackboyla/GLiREL.git
!cd GLiREL && pip install -e .
!python -m spacy download en_core_web_sm

In [2]:
import os 
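# Uncomment if the repo was cloned by the cell above (e.g. on Colab), so that the
# relative paths used below (train.py, logs/, data/) resolve inside the checkout:
# os.chdir('./GLiREL')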

In [None]:
# Launch training in relation-extraction mode using config_small_rel.yaml.
# `--log_dir logs` is the directory the following cell loads from (logs/model_4000);
# presumably checkpoints are written there during training.
!python train.py --config config_small_rel.yaml --log_dir logs --relation_extraction

In [None]:
from glirel import GLiREL

# Path of the saved model to load; it sits under the `logs` directory passed to train.py.
# NOTE(review): "model_4000" looks like a training-step checkpoint name; adjust to whatever your run produced.
save_path = 'logs/model_4000'
model = GLiREL.from_pretrained(save_path)
# Bare last expression so the notebook displays the model's repr.
model

# Inference

To run inference, the model needs three inputs: `tokens` (the tokenized text), `ner` (the entity spans), and `labels` (the candidate relation labels).

### Eval data

In [23]:
import json

# Read the evaluation set: one JSON object per line (blank lines are skipped).
# The encoding is explicit so decoding does not depend on the platform default.
with open('./data/few_rel_eval.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

i = 0  # index of the evaluation example to inspect

tokens = data[i]['tokenized_text']
ner = data[i]['ner']
# De-duplicate the gold relation labels while keeping first-seen order.
# list(set(...)) would give an order that can change between kernel sessions,
# because string hashing is randomized per process.
labels = list(dict.fromkeys(r['relation_text'] for r in data[i]['relations']))
print(tokens)
print()
print(ner)
print(labels)

['The', 'race', 'took', 'place', 'between', 'Godstow', 'and', 'Binsey', 'along', 'the', 'Upper', 'River', 'Thames', '.']

[[7, 8, 'Q4914513', 'Binsey'], [11, 13, 'Q19686', 'River Thames']]
['located in or next to body of water']


In [24]:
# Put additional candidate labels in front of the label(s) taken from the eval example.
# Labels already present among the additions are filtered out, so re-running this
# cell leaves `labels` unchanged instead of prepending the same entries again.
extra_labels = ['country of origin', 'licensed to broadcast to', 'father', 'followed by']
labels = extra_labels + [label for label in labels if label not in extra_labels]
print(labels)

['country of origin', 'licensed to broadcast to', 'father', 'followed by', 'located in or next to body of water']


In [25]:
# Score the candidate labels for the entity pairs of the eval example.
relations = model.predict_relations(tokens, labels, threshold=0.01, ner=ner)

print('Number of relations:', len(relations))  # num entity pairs (both directions) * num classes.... provided they're over the threshold

# List the predictions from highest to lowest score.
sorted_data_desc = sorted(relations, key=lambda x: x['score'], reverse=True)
print("\nDescending Order by Score:")
for item in sorted_data_desc:
    print(item)

Number of relations: 4

Descending Order by Score:
{'head_pos': [7, 8], 'tail_pos': [11, 13], 'head_text': ['Binsey'], 'tail_text': ['River', 'Thames'], 'label': 'located in or next to body of water', 'score': 0.9235768914222717}
{'head_pos': [11, 13], 'tail_pos': [7, 8], 'head_text': ['River', 'Thames'], 'tail_text': ['Binsey'], 'label': 'located in or next to body of water', 'score': 0.12615662813186646}
{'head_pos': [11, 13], 'tail_pos': [7, 8], 'head_text': ['River', 'Thames'], 'tail_text': ['Binsey'], 'label': 'followed by', 'score': 0.024778323248028755}
{'head_pos': [7, 8], 'tail_pos': [11, 13], 'head_text': ['Binsey'], 'tail_text': ['River', 'Thames'], 'label': 'followed by', 'score': 0.01304696500301361}


In [26]:
# How many did the model get right?

# Flatten ground truth and predictions into the same 5-slot layout:
#   [head mention, head position, tail mention, tail position, relation label]
# NOTE: in the outputs shown in this notebook the ground-truth mentions are plain
# strings while the predicted head_text/tail_text are token lists, so slots 0 and 2
# are not directly comparable between the two lists.
gt = [[r['head']['mention'], r['head']['position'], r['tail']['mention'], r['tail']['position'], r['relation_text']] for r in data[i]['relations']]
pred = [[r['head_text'], r['head_pos'], r['tail_text'], r['tail_pos'], r['label']] for r in relations]


# Equality test between two flattened relation entries
def compare_entries(entry_a, entry_b):
    """Return True when both entries describe the same relation.

    Only three slots are compared: index 1 (head position), index 3 (tail
    position) and the last slot (label). The mention slots (0 and 2) are
    ignored.
    """
    compared_slots = (1, 3, -1)
    return all(entry_a[slot] == entry_b[slot] for slot in compared_slots)

# Find matching entries: a ground-truth relation counts as matched when at least
# one prediction agrees with it. any() stops at the first hit, so a ground truth
# is counted at most once even if several (e.g. duplicated) predictions match it.
matches = [
    entry_gt
    for entry_gt in gt
    if any(compare_entries(entry_pred, entry_gt) for entry_pred in pred)
]

# Print the matched ground-truth entries together with the prediction / ground-truth totals
print(len(matches), 'out of', len(relations), 'predictions and', len(gt), 'ground truths')
print(matches)

1 out of 4 predictions and 1 ground truths
[['Binsey', [7, 8], 'River Thames', [11, 13], 'located in or next to body of water']]


### Real-world example

In [29]:
# Real-world example

text = "Jack Dorsey's father, Tim Dorsey, is a licensed pilot. Jack met his wife Sarah Paulson in New York in 2003. They have one son, Edward."
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Candidate relation labels the model scores for every entity pair.
labels = ['country of origin', 'licensed to broadcast to', 'parent', 'followed by', 'located in or next to body of water', 'spouse', 'child']

tokens = [token.text for token in doc]

# Entity spans from spaCy's NER as [start token, end token, entity type, entity text].
ner = [[ent.start, ent.end, ent.label_, ent.text] for ent in doc.ents]
print(f"Entities detected: {ner}")

relations = model.predict_relations(tokens, labels, threshold=0.01, ner=ner)

print('Number of relations:', len(relations))

# List the predictions from highest to lowest score.
sorted_data_desc = sorted(relations, key=lambda x: x['score'], reverse=True)
print("\nDescending Order by Score:")
for item in sorted_data_desc:
    # Fixed the "socre" typo in the printed field name.
    print(f"{item['head_text']} --> {item['label']} --> {item['tail_text']} | score: {item['score']}")

Entities detected: [[0, 2, 'PERSON', 'Jack Dorsey'], [5, 7, 'PERSON', 'Tim Dorsey'], [13, 14, 'PERSON', 'Jack'], [17, 19, 'PERSON', 'Sarah Paulson'], [20, 22, 'GPE', 'New York'], [23, 24, 'DATE', '2003'], [27, 28, 'CARDINAL', 'one'], [30, 31, 'PERSON', 'Edward']]
Number of relations: 90

Descending Order by Score:
['Sarah', 'Paulson'] --> spouse --> ['New', 'York'] | socre: 0.6608812212944031
['Sarah', 'Paulson'] --> spouse --> ['Jack', 'Dorsey'] | socre: 0.6601175665855408
['Edward'] --> spouse --> ['New', 'York'] | socre: 0.6493653655052185
['one'] --> spouse --> ['New', 'York'] | socre: 0.6480509042739868
['Edward'] --> spouse --> ['Jack', 'Dorsey'] | socre: 0.6474933624267578
['one'] --> spouse --> ['Jack', 'Dorsey'] | socre: 0.645997166633606
['Sarah', 'Paulson'] --> parent --> ['Jack', 'Dorsey'] | socre: 0.6457919478416443
['Sarah', 'Paulson'] --> parent --> ['New', 'York'] | socre: 0.6436636447906494
['Tim', 'Dorsey'] --> spouse --> ['New', 'York'] | socre: 0.6415780186653137
['

In [6]:
import spacy
from spacy.tokens import Span
from spacy import displacy

def visualize_relation(text, relations):
    """Draw relations over ``text`` as arcs in displaCy's dependency view.

    Parameters
    ----------
    text : str
        Text to visualize. It is tokenized with a blank English pipeline.
    relations : list of dict
        Each dict must provide:
        - 'head_pos' / 'tail_pos': ``[start, end]`` *token* indices into the
          tokenized ``text`` (passed straight to ``Span``), not character offsets.
        - 'head_text' / 'tail_text': the mention, either a string or a list of
          token strings (the latter is joined with spaces); used as span label.

    Notes
    -----
    The arc is encoded by making the tail's root token the syntactic head of
    the head span's root token. A token has exactly one head, so if the same
    head span occurs in several relations only the last one is kept.
    Output is rendered inline (``jupyter=True``); nothing is returned.
    """
    nlp = spacy.blank("en")
    doc = nlp(text)

    # Manually set dependency relations to visualize relations
    for token in doc:
        token.dep_ = "dep"  # default to 'dep'

    def _as_label(mention):
        # Span labels must be strings; model output gives mentions as token lists.
        return mention if isinstance(mention, str) else " ".join(mention)

    spans = []
    for rel in relations:
        head = Span(doc, rel['head_pos'][0], rel['head_pos'][1], label=_as_label(rel['head_text']))
        tail = Span(doc, rel['tail_pos'][0], rel['tail_pos'][1], label=_as_label(rel['tail_text']))
        spans.extend((head, tail))

        # Mock dependencies
        head_root = head.root
        tail_root = tail.root

        head_root.dep_ = "rel"  # Relation type can be customized
        head_root.head = tail_root  # Point head to tail

    # Assign entities once, after dropping duplicate/overlapping spans: doc.ents
    # rejects a token that belongs to more than one entity, which happens as soon
    # as the same entity takes part in more than one relation.
    doc.ents = spacy.util.filter_spans(spans)

    options = {"fine_grained": True}
    displacy.render(doc, style="dep", options=options, jupyter=True)

# Example data
# The positions below are token indices into `text` as tokenized inside
# visualize_relation: 'Binsey' is token 0 and 'River Thames' covers tokens 9-10
# (the end index is exclusive).
text = "Binsey located in or next to body of water River Thames"
relations = [
    {'head_pos': [0, 1], 'tail_pos': [9, 11], 'head_text': 'Binsey', 'tail_text': 'River Thames', 'label': 'located in or next to body of water', 'score': 0.9235768914222717},
    # {'head_pos': [9, 11], 'tail_pos': [0, 1], 'head_text': 'River Thames', 'tail_text': 'Binsey', 'label': 'located in or next to body of water', 'score': 0.12615662813186646}
]

# Renders the dependency-style diagram inline in the notebook.
visualize_relation(text, relations)
