## Get Started

Dataset 1: Conll04 set
---
The entity and relation extraction dataset including four entity types in the dataset (Location,Organization,Person, and Other) and five relation types(Kill, Livein, Locatedin, OrgBasedin and Workfor).
http://lavis.cs.hs-rm.de/storage/spert/public/datasets/conll04/

Entity Type:

- Location: Loc (gt); GPE & LOC (spacy)
- Organization: Org (gt); ORG (spacy)
- Person: Peop (gt); PERSON(spacy)
- Other: Other (gt); All other tags (spacy)


### Get an overview of the dataset

In [1]:
import json
import random

# use pip install json / random if needed

with open("conll04.json", "r") as file:
    data = json.load(file)
print(data[0])

example = data[0]
entity = [(line['type'], " ".join([example['tokens'][i] for i in range(line['start'], line['end'])])) for line in example['entities']]
print("Entity: ", entity)

relation = [f"{entity[rline['head']]} -> {rline['type']} -> {entity[rline['tail']]}" for rline in example['relations']]
print("Relation: ", relation)


{'tokens': ['John', 'Wilkes', 'Booth', ',', 'who', 'assassinated', 'President', 'Lincoln', ',', 'was', 'an', 'actor', '.'], 'entities': [{'type': 'Peop', 'start': 0, 'end': 3}, {'type': 'Peop', 'start': 6, 'end': 8}], 'relations': [{'type': 'Kill', 'head': 0, 'tail': 1}], 'orig_id': 5178}
Entity:  [('Peop', 'John Wilkes Booth'), ('Peop', 'President Lincoln')]
Relation:  ["('Peop', 'John Wilkes Booth') -> Kill -> ('Peop', 'President Lincoln')"]


In [2]:
def get_IE_gt(example):
    entity = [(line['type'], " ".join([example['tokens'][i] for i in range(line['start'], line['end'])])) for line in example['entities']]
    relation = [f"{entity[rline['head']]} -> {rline['type']} -> {entity[rline['tail']]}" for rline in example['relations']]
    return entity, relation

In [3]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
def get_entity_spacy(example):
    entityList = []
    doc = " ".join(example['tokens'])
    doc = nlp(doc)
    for entity in doc.ents:
        entityList.append((entity.label_, entity.text))
    return entityList

In [5]:
import textacy
import re

In [6]:
def get_relation_spacy(example):
    relationList = []
    doc = " ".join(example['tokens'])
    doc = nlp(doc)
    tris = textacy.extract.triples.subject_verb_object_triples(doc)
    for item in tris:
        if item:
            relationList.append(" ".join([" ".join([str(x) for x in item[0]]), "->", " ".join([str(x) for x in item[1]]), "->", " ".join([str(x) for x in item[2]])]))
    return relationList

### How to use Langchain in information Extraction tasks?

### Langchain

In [None]:
!pip install langchain
# or
# !conda install langchain -c conda-forge

**Environment Setup:**

In [7]:
!pip install openai
import os
os.environ["OPENAI_API_KEY"] = "YOUR_OPEN_AI_KEY"



In [8]:
from langchain.indexes import GraphIndexCreator
from langchain.llms import OpenAI
sentence = " ".join(example['tokens'])
print(sentence)
index_creator = GraphIndexCreator(llm=OpenAI(temperature=0))
graph = index_creator.from_text(sentence)
triples = graph.get_triples()
print(triples)

John Wilkes Booth , who assassinated President Lincoln , was an actor .
[('John Wilkes Booth', 'President Lincoln', 'assassinated'), ('John Wilkes Booth', 'an actor)\nEND OF EXAMPL', 'was')]


In [9]:
from langchain.chains import GraphQAChain
chain = GraphQAChain.from_llm(OpenAI(temperature=0), graph=graph, verbose=True)
entityLC = []
for tri in triples:
    ent1, ent2 = tri[0], tri[1]
    res1 = chain.run(f"Is the entity type of {ent1} an organization, person, location or other?").strip()
    res2 = chain.run(f"Is the entity type of {ent2} an organization, person, location or other?").strip()
    entityLC.extend([(res1, ent1), (res2, ent2)])
print(entityLC)



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Person[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m None[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Person[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Person', 'John Wilkes Booth'), ('Person', 'President Lincoln'), ('Person', 'John Wilkes Booth'), ('Person', 'an actor)\nEND OF EXAMPL')]


In [10]:
index_creator = GraphIndexCreator(llm=OpenAI(temperature=0))
def get_IE_LC(example):
    sentence = " ".join(example['tokens'])
    graph = index_creator.from_text(sentence)
    triples = graph.get_triples()
    chain = GraphQAChain.from_llm(OpenAI(temperature=0), graph=graph, verbose=True)
    entity_LC = []
    relation_LC = []
    for tri in triples:
        ent1, ent2 = tri[0], tri[1]
        res1 = chain.run(f"Is the entity type of {ent1} an organization, person, location or other?").strip()
        res2 = chain.run(f"Is the entity type of {ent2} an organization, person, location or other?").strip()
        entity_LC.extend([(res1, ent1), (res2, ent2)])
        relation_LC.append(" ".join([ent1, "->", tri[2], "->", ent2]))
        
    entity_LC = list(set(entity_LC))
    return entity_LC, relation_LC

In [11]:
Entity_GT = []
Relation_GT = []

Entity_Spacy = []
Relation_Spacy = []

# Entity_LC = []
# Relation_LC = []

for line in data[:50]:
    entity_gt, relation_gt = get_IE_gt(line)
    Entity_GT.append(entity_gt)
    Relation_GT.append(relation_gt)
    
    entity_spacy = get_entity_spacy(line)
    relation_spacy = get_relation_spacy(line)
    Entity_Spacy.append(entity_spacy)
    Relation_Spacy.append(relation_spacy)
    
#     entity_LC, relation_LC = get_IE_LC(line)
#     Entity_LC.append(entity_LC)
#     Relation_LC.append(relation_LC)
    
    print(entity_gt)
    print(entity_spacy)
#     print(entity_LC)
    
    print(relation_gt)
    print(relation_spacy)
#     print(relation_LC)
    print('--------')



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Person[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m None[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Person[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Peop', 'John Wilkes Booth'), ('Peop', 'President Lincoln')]
[('PERSON', 'John Wilkes Booth'), ('PERSON', 'Lincoln')]
[('Other', 'an actor)\nEND OF EXAMPL'), ('Person', 'President Lincoln'), ('Person', 'John Wilkes Booth')]
["('Peop', 'John Wilkes Booth') -> Kill -> ('Peop', 'President Lincoln')"]
['who -> assassinated -> President Lincoln']
['John Wilkes Booth -> assassinated -> President Linco



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Loc', 'Morgan City'), ('Loc', 'La.')]
[('CARDINAL', 'Ten'), ('GPE', 'Morgan City'), ('GPE', 'La.'), ('DATE', 'Monday')]
[('Other', 'while heading inland Monday to escape the storm'), ('Person', 'Ten oil workers')]
["('Loc', 'Morgan City') -> Located_In -> ('Loc', 'La.')"]
['oil workers -> were missing -> Morgan City']
['Ten oil workers -> rig capsized -> while heading inland Monday to escape the storm']
--------


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Fin


[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Loc', 'Illinois'), ('Peop', 'James Thompson'), ('Loc', 'Chicago')]
[('GPE', 'Illinois'), ('PERSON', 'James Thompson'), ('DATE', 'last month'), ('GPE', 'Chicago'), ('DATE', 'next July')]
[('Person.', 'James Thompson'), ('Other', 'legislation'), ('Other', 'budgets'), ('Location', 'Illinois'), ('Other', 'parent-run councils'), ('Organization', "Chicago's central school boar



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Peop', 'Martin Luther King III'), ('Peop', 'James Earl Ray')]
[('PERSON', 'Martin Luther King III'



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m .38-caliber Colt Cobra[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Person[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Person[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Other', 'Photo'), ('Other', 'Dueling'), ('Other', 'GunIt'), ('Other', 'November'), ('Peop', 'Jack Ruby'), ('Other', 'Colt Cobra'), ('Peop', 'Lee Harvey Oswald'), ('Peop', 'President John F. Kennedy')]
[('NORP', 'Americans'), ('DATE', 'November day'), ('DATE', '1963'), ('PERSON', 'Jack Ruby'), ('PERSON', 'Colt Cobra'), ('PERSON', 'Lee Harvey Oswald'), ('PERSON', 'John F. Kenned



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Highway 61[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Con


[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Loc', 'DAYTON'), ('Loc', 'Ohio'), ('Org', 'AP')]
[('ORG', 'DAYTON'), ('GPE', 'Ohio'), ('ORG', 'AP')]
[('City is a location.', 'city'), ('Location', 'Ohio'), ('Dayton is a location.', 'Dayton')]
["('Loc', 'DAYTON') -> Located_In -> ('Loc', 'Ohio')", "('Org', 'AP') -> OrgBased_In -> ('Loc', 'DAYTON')", "('Org', 'AP') -> OrgBased_In -> ('Loc', 'Ohio')"]
[]
['Dayton -> is a -> city', 'Dayton -> is in -> Ohio']
--------


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Conte



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m None[0m
Full Contex

Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m NONE[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Peop', 'Cheek'), ('Loc', 'Addis Ababa'), ('Loc', 'Ethiopia.')]
[('DATE', '53'), ('GPE', 'Addis Ababa'), ('GPE', 'Ethiopia')]
[('Other', 'charge'), 



[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m


[1m> Entering new GraphQAChain chain...[0m
Entities Extracted:
[32;1m[1;3m Other[0m
Full Context:
[32;1m[1;3m[0m

[1m> Finished chain.[0m
[('Other', 'Wednesday'), ('Loc', 'Winterset'), ('Peop', 'John Wayne')]
[('CARDINAL', 'Hundreds'), ('DATE', 'Wednesday'), ('ORG', 'Winterset'), ('PERSON', 'John Wayne')]
[('Person', 'John Wayne'), ('Other', 'named for John Wayne'), ("I don't know.", 'Winterset')]
["('Peop', 'John Wayne') -> Live_In -> ('Loc', 'Winterset')"]
[]
['Winterset -> busiest street -> named for John Wayne', "John Wayne -> c

# Evaluation
---

In [None]:
- Location: Loc (gt); GPE & LOC (spacy)
- Organization: Org (gt); ORG (spacy)
- Person: Peop (gt); PERSON(spacy)
- Other: Other (gt); All other tags (spacy)

In [20]:
TagDictSpacy = {
    'GPE': 'Loc',
    'LOC': 'Loc',
    'ORG': 'Org',
    'PERSON': 'Peop'
}

- Precision is the percentage of named entities found by the learning system that are correct. 
- Recall is the percentage of named entities in the corpus found by the system. A named entity is correct only if it is an exact match of the corresponding entity in the data file.

## Entity-level evaluation for the person entity

In [21]:
NERResults = []
for i in range(len(Entity_GT)):
    predict = {r[1]: TagDictSpacy[r[0]] for r in Entity_Spacy[i] if r[0] in TagDictSpacy}
    for ent in Entity_GT[i]:
        if ent[1] in predict:
            if 'Peop' in [ent[0], predict[ent[1]]]:
                NERResults.append([ent[1], ent[0], predict[ent[1]]])
        else:
            if ent[0] in 'Peop':
                NERResults.append([ent[1], ent[0], 'Other'])
print(NERResults)

[['John Wilkes Booth', 'Peop', 'Peop'], ['President Lincoln', 'Peop', 'Other'], ["Kevin O 'Brien", 'Peop', 'Other'], ['Wang Ziqiang', 'Peop', 'Peop'], ['Marie Magdefrau Ferraro', 'Peop', 'Peop'], ['Avihu Bin-Nun', 'Peop', 'Peop'], ['Thomas', 'Peop', 'Other'], ['James Thompson', 'Peop', 'Peop'], ['Brian Michaud', 'Peop', 'Peop'], ['Oswald', 'Peop', 'Peop'], ['Kennedy', 'Peop', 'Peop'], ['Yang Jianbai', 'Peop', 'Peop'], ['Daniel Urquhart', 'Peop', 'Peop'], ['Kovacevic', 'Peop', 'Peop'], ['Martin Luther King III', 'Peop', 'Peop'], ['James Earl Ray', 'Peop', 'Other'], ['Benjamin Harrison', 'Peop', 'Peop'], ['Khrushchev', 'Peop', 'Peop'], ['Roy Medvedev', 'Peop', 'Peop'], ['Walter R. Mears', 'Peop', 'Peop'], ['Gerald Przenislawski', 'Peop', 'Peop'], ['Jesse Jackson', 'Peop', 'Peop'], ['Jack Ruby', 'Peop', 'Peop'], ['Colt Cobra', 'Other', 'Peop'], ['Lee Harvey Oswald', 'Peop', 'Peop'], ['President John F. Kennedy', 'Peop', 'Other'], ['Cetin', 'Peop', 'Peop'], ['Croats', 'Other', 'Peop'], ['V

In [None]:
!pip install -U scikit-learn

In [22]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
people_gt = [1 if line[1] == 'Peop' else 0 for line in NERResults ]
people_spacy = [1 if line[2] == 'Peop' else 0 for line in NERResults ]
pre = precision_score(people_gt, people_spacy)
recall = recall_score(people_gt, people_spacy)
f1 = f1_score(people_gt, people_spacy)
print(pre, recall, f1)
tn, fp, fn, tp = confusion_matrix(people_gt, people_spacy).ravel()
print("True Positives:", tp)
print("False Positives:", fp) # 	 was incorrectly predicted as person while it should have been something else.
print("True Negatives:", tn)
print("False Negatives:", fn) # 	incorrectly predicted as somthing else while it should have been person.

0.9210526315789473 0.7446808510638298 0.8235294117647057
True Positives: 35
False Positives: 3
True Negatives: 0
False Negatives: 12


## Q1: Try to calculate the precision , recall, f1-score of Spacy on *Location* entity extraction results compared to the groundtruth?

In [25]:
TagDictLC = {
    'Location': 'Loc',
    'Organization': 'Org',
    'Person': 'Peop'
}

**Performance of LangChain**

In [26]:
NERResults = []
for i in range(len(Entity_GT)):
    predict = {r[1]: TagDictLC[r[0]] for r in Entity_LC[i] if r[0] in TagDictLC}
    for ent in Entity_GT[i]:
        if ent[1] in predict:
            if 'Peop' in [ent[0], predict[ent[1]]]:
                NERResults.append([ent[1], ent[0], predict[ent[1]]])
        else:
            if ent[0] in 'Peop':
                NERResults.append([ent[1], ent[0], 'Other'])
print(NERResults)
people_gt = [1 if line[1] == 'Peop' else 0 for line in NERResults ]
people_LC = [1 if line[2] == 'Peop' else 0 for line in NERResults ]

pre = precision_score(people_gt, people_LC)
recall = recall_score(people_gt, people_LC)
f1 = f1_score(people_gt, people_LC)
print(pre, recall, f1)
tn, fp, fn, tp = confusion_matrix(people_gt, people_LC).ravel()
print("True Positives:", tp)
print("False Positives:", fp) # 	 was incorrectly predicted as person while it should have been something else.
print("True Negatives:", tn)
print("False Negatives:", fn) # 	incorrectly predicted as somthing else while it should have been person.

[['John Wilkes Booth', 'Peop', 'Peop'], ['President Lincoln', 'Peop', 'Peop'], ["Kevin O 'Brien", 'Peop', 'Other'], ['Wang Ziqiang', 'Peop', 'Peop'], ['Marie Magdefrau Ferraro', 'Peop', 'Peop'], ['Avihu Bin-Nun', 'Peop', 'Other'], ['Thomas', 'Peop', 'Other'], ['James Thompson', 'Peop', 'Other'], ['Brian Michaud', 'Peop', 'Peop'], ['Oswald', 'Peop', 'Peop'], ['Kennedy', 'Peop', 'Peop'], ['Yang Jianbai', 'Peop', 'Peop'], ['Daniel Urquhart', 'Peop', 'Peop'], ['Kovacevic', 'Peop', 'Other'], ['Serb', 'Other', 'Peop'], ['Martin Luther King III', 'Peop', 'Peop'], ['James Earl Ray', 'Peop', 'Peop'], ['Benjamin Harrison', 'Peop', 'Peop'], ['Khrushchev', 'Peop', 'Peop'], ['Roy Medvedev', 'Peop', 'Peop'], ['Walter R. Mears', 'Peop', 'Peop'], ['Germans', 'Other', 'Peop'], ['Gerald Przenislawski', 'Peop', 'Peop'], ['Jesse Jackson', 'Peop', 'Peop'], ['Jack Ruby', 'Peop', 'Peop'], ['Lee Harvey Oswald', 'Peop', 'Peop'], ['President John F. Kennedy', 'Peop', 'Peop'], ['Cetin', 'Peop', 'Other'], ['Verno

# Model-level evaluation for the collective model

In [None]:
!pip install seqeval

In [17]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

NERModelResults = []
for i in range(len(Entity_GT)):
    predict = {r[1]: TagDictSpacy[r[0]] for r in Entity_Spacy[i] if r[0] in TagDictSpacy}
    for ent in Entity_GT[i]:
        if ent[1] in predict:
            NERModelResults.append([ent[1], ent[0], predict[ent[1]]])
        else:
            NERModelResults.append([ent[1], ent[0], 'Other'])
print(NERModelResults)
NERGT = [[line[1] for line in NERModelResults]]
NERSpacy = [[line[2] for line in NERModelResults]]

print("----------------------")
print(f"precision: {precision_score(NERGT, NERSpacy)}, recall: {recall_score(NERGT, NERSpacy)}, f1: {f1_score(NERGT, NERSpacy)}")

[['John Wilkes Booth', 'Peop', 'Peop'], ['President Lincoln', 'Peop', 'Other'], ['Palace of Fine Arts', 'Loc', 'Other'], ['San Francisco', 'Loc', 'Loc'], ['June 30', 'Other', 'Other'], ['July 1-2 ,', 'Other', 'Other'], ["Kevin O 'Brien", 'Peop', 'Other'], ['Wang Ziqiang', 'Peop', 'Peop'], ['Institute of Mechanics', 'Org', 'Other'], ['Sun Hung Kai Properties', 'Org', 'Org'], ['Hong Kong', 'Loc', 'Loc'], ['27 percent', 'Other', 'Other'], ['Marie Magdefrau Ferraro', 'Peop', 'Peop'], ['Bethany', 'Loc', 'Loc'], ['Conn.', 'Loc', 'Loc'], ['Connecticut Audubon Society wildlife', 'Org', 'Other'], ['Morgan City', 'Loc', 'Loc'], ['La.', 'Loc', 'Loc'], ['10 , 000 acres', 'Other', 'Other'], ['Idaho', 'Loc', 'Loc'], ['Lowman', 'Loc', 'Org'], ['70 miles', 'Other', 'Other'], ['Boise.', 'Loc', 'Other'], ['Air Force', 'Org', 'Org'], ['Avihu Bin-Nun', 'Peop', 'Peop'], ['Savannah River Plant', 'Loc', 'Other'], ['Aiken', 'Loc', 'Loc'], ['S.C.', 'Loc', 'Loc'], ['Thomas', 'Peop', 'Other'], ['Chisholm', 'Loc'



## Relation Extraction Evaluation

In [18]:
"""Compare LC to gt"""
RGT = {}
RLC = {}
for i in range(len(Relation_GT)):
    gt = [l.split(' -> ') for l in Relation_GT[i]]
    gt = [(l[0][1:-1].split(", ")[1][1:-1], l[2][1:-1].split(", ")[1][1:-1]) for l in gt]
    for r in gt:
        RGT[r] = 1
    LC = [l.split(' -> ') for l in Relation_LC[i]]
    LC = [(l[0], l[2]) for l in LC]
    for r in LC:
        RLC[r] = 1
relations = list({**RGT,**RLC}.keys())
for r in relations:
    if r not in RGT:
        RGT[r] = 0
    if r not in RLC:
        RLC[r] = 0
relationTrue = [RGT[r] for r in relations]
relationPred = [RLC[r] for r in relations]
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
print(precision_score(relationTrue, relationPred))
print(recall_score(relationTrue, relationPred))
print(f1_score(relationTrue, relationPred))
tn, fp, fn, tp = confusion_matrix(relationTrue, relationPred).ravel()
print("True Positives:", tp)
print("False Positives:", fp) # 	 was incorrectly predicted as relation while it should not
print("True Negatives:", tn)
print("False Negatives:", fn) # 	did not extract as a relation while it should have been a relation.

0.10714285714285714
0.2571428571428571
0.15126050420168066
True Positives: 18
False Positives: 150
True Negatives: 0
False Negatives: 52


In [19]:
"""Compare Spacy to gt"""
RGT = {}
RSpacy = {}
for i in range(len(Relation_GT)):
    gt = [l.split(' -> ') for l in Relation_GT[i]]
    gt = [(l[0][1:-1].split(", ")[1][1:-1], l[2][1:-1].split(", ")[1][1:-1]) for l in gt]
#     print(gt)
    for r in gt:
        RGT[r] = 1
    LC = [l.split(' -> ') for l in Relation_Spacy[i]]
    LC = [(l[0], l[2]) for l in LC]
#     print(LC)
    for r in LC:
        RSpacy[r] = 1
relations = list({**RGT,**RSpacy}.keys())
for r in relations:
    if r not in RGT:
        RGT[r] = 0
    if r not in RSpacy:
        RSpacy[r] = 0
relationTrue = [RGT[r] for r in relations]
relationPred = [RSpacy[r] for r in relations]
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
print(precision_score(relationTrue, relationPred))
print(recall_score(relationTrue, relationPred))
print(f1_score(relationTrue, relationPred))
tn, fp, fn, tp = confusion_matrix(relationTrue, relationPred).ravel()
print("True Positives:", tp)
print("False Positives:", fp) # 	 was incorrectly predicted as relation while it should not
print("True Negatives:", tn)
print("False Negatives:", fn) # 	did not extract as a relation while it should have been a relation.

0.0
0.0
0.0
True Positives: 0
False Positives: 37
True Negatives: 0
False Negatives: 70
