# Named Entity Recognition based on Language Models, PoS tagging and Subwords

In [None]:
%%capture
!pip install --upgrade pip
!pip install flair

In [None]:
# PoS tagger: load the pretrained "flair/pos-english" sequence tagger once at
# notebook level. get_pos_entities() further down reads this global `tagger`.

from flair.data import Sentence
from flair.models import SequenceTagger
tagger = SequenceTagger.load("flair/pos-english")


Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

2022-04-19 10:43:34,467 loading file /root/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-04-19 10:43:35,010 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [None]:
# NER model: build a Hugging Face "ner" (token-classification) pipeline from the
# "dslim/bert-base-NER-uncased" checkpoint. get_entities() further down calls
# the resulting global `nlp` on each question.

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER-uncased")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER-uncased")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [None]:
def get_pos_entities(text,category):
    """Collect runs of consecutive tokens whose predicted tag equals `category`.

    The text is wrapped in a flair ``Sentence`` and tagged with the
    notebook-level ``tagger``. Adjacent tokens carrying the requested tag are
    joined with single spaces into one string; a token with any other tag
    closes the run in progress.

    Args:
        text: raw text to analyse.
        category: tag value to match exactly (e.g. "NN" or "CD").

    Returns:
        List of space-joined token runs, in order of appearance.
    """
    sentence = Sentence(text)

    # predict() attaches the tags to the sentence's tokens; they are read back below.
    tagger.predict(sentence)

    found = []
    run = ""
    for token in sentence.tokens:
        # Every annotation layer present on the token is inspected, and only
        # the first label of each layer is compared against `category`.
        for layer in token.annotation_layers.keys():
            tag = token.get_labels(layer)[0].value
            if tag != category:
                # A non-matching tag ends the run that was being built, if any.
                if run:
                    found.append(run)
                    run = ""
                continue
            run = token.text if run == "" else run + " " + token.text

    # Flush a run that extends to the end of the sentence.
    if run:
        found.append(run)
    return found

# Quick check: consecutive "NN" tokens are merged into a single string.
r = get_pos_entities("who's a kung fu star from hong kong","NN")
print(r)

['kung fu star']


In [None]:
def get_entities(text):
    """Extract entity mentions from `text`, falling back to PoS-based spans.

    Tokens returned by the notebook-level NER pipeline ``nlp`` are stitched
    into mention strings:

    * a token whose 'start' equals the previous token's 'end' is glued on
      without a separator;
    * otherwise, a token whose 'index' is less than two positions after the
      previous token's is added after a single space;
    * any other token closes the current mention and opens a new one.

    When the pipeline yields no mention at all, the "CD" spans found by
    ``get_pos_entities`` are returned if there are any, otherwise its "NN"
    spans.

    Args:
        text: raw question text.

    Returns:
        List of mention strings; empty when neither the pipeline nor the PoS
        fallbacks find anything.
    """
    mentions = []
    current = ""
    last_index = -1
    last_end = -1
    for token in nlp(text):
        if last_index == -1:
            # First token: seed the trackers so it is glued onto the empty mention.
            last_index, last_end = token['index'], token['start']

        piece = token['word']
        if piece[0] == '#':
            # Presumably a subword continuation marker (TODO confirm);
            # every '#' in the piece is removed, not only the leading ones.
            piece = piece.replace("#","")

        if token['start'] == last_end:
            current = current + piece
        elif token['index'] - last_index < 2:
            current = current + " " + piece
        else:
            mentions.append(current)
            current = piece
        last_index, last_end = token['index'], token['end']

    if current:
        mentions.append(current)
    if mentions:
        return mentions

    # Nothing from the NER model: try cardinal numbers first, then nouns.
    for tag in ("CD", "NN"):
        spans = get_pos_entities(text, tag)
        if spans:
            return spans
    return mentions

# Quick check on a hyphenated name: the pieces come back as a single entity.
r = get_entities("which city did carl-alfred schumacher die")
print(r)

['carl-alfred schumacher']


# Wikidata Entity Linking based on MediaWiki API

The MediaWiki Action API is a web service that allows access to some wiki-features like authentication, page operations, and search. It can provide meta information about the wiki and the logged-in user.

action=wbsearchentities

Searches for entities using labels and aliases.

Returns a label and description for the entity in the user language if possible. Returns details of the matched term. The matched term text is also present in the aliases key if different from the display label.

In [64]:
import requests
def get_wikidata_candidates(label, timeout=30):
    """Look up Wikidata items whose label or alias matches `label`.

    Calls the MediaWiki ``wbsearchentities`` action on www.wikidata.org
    (English, at most 10 results, items only).

    Args:
        label: surface form to search for. An empty value returns an empty
            list without touching the network.
        timeout: seconds to wait for the service before giving up, so that a
            stalled request cannot block a long batch run forever.

    Returns:
        List of ``{'label': <display label>, 'id': <entity id>}`` dicts in
        the order returned by the service.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status.
    """
    candidates = []
    if not label:
        return candidates
    # The query goes through `params` so that requests URL-encodes it;
    # substituting the label straight into the URL breaks on characters such
    # as '&', '#', '+' or '='.
    params = {
        'action': 'wbsearchentities',
        'search': label,
        'language': 'en',
        'limit': 10,
        # Restricts the search to one entity type; "item" selects Q-entities.
        'type': 'item',
        'format': 'json',
    }
    r = requests.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    r.raise_for_status()

    for answer in r.json()['search']:
        # NOTE(review): a hit may come back without a display label (e.g. no
        # label in the requested language); fall back to the plain 'label'
        # field and finally to the id instead of raising KeyError.
        display_label = answer.get('display', {}).get('label', {}).get('value')
        if display_label is None:
            display_label = answer.get('label', answer['id'])
        candidates.append({
            'label': display_label,
            'id': answer['id'],
        })
    return candidates

# Quick check: the accented query "Berlín" still retrieves Berlin-related items.
r = get_wikidata_candidates("Berlín")
print(r)

[{'label': 'Berlin', 'id': 'Q64'}, {'label': 'Berlin', 'id': 'Q821244'}, {'label': 'Berlin', 'id': 'Q614184'}, {'label': 'Berlín', 'id': 'Q582242'}, {'label': '1936 Summer Olympics', 'id': 'Q8150'}, {'label': 'Berlín', 'id': 'Q821971'}, {'label': 'Berlín', 'id': 'Q62270519'}, {'label': 'Technical University of Berlin', 'id': 'Q51985'}, {'label': 'Mitte', 'id': 'Q163966'}, {'label': 'Berlín', 'id': 'Q20274855'}]


### DBpedia Entity Linking based on DBpedia Lookup service

The DBpedia Lookup is an entity retrieval service for Linked Data. It provides a straightforward solution for the frequent use case of resolving keywords and natural language to related resource identifiers in the DBpedia knowledge graph. Related means that either the label or abstract of a resource matches, or an anchor text that was frequently used in Wikipedia to refer to a specific resource matches (e.g. the resource http://dbpedia.org/resource/United_States can be looked up by the string “USA”). 

So whether you need an auto-complete service for your RDF application, Linked Data enhancements for your CSV tables or simply a way to retrieve specific DBpedia identifiers – the DBpedia Lookup is for you!

As a part of the DBpedia Technology Stack the DBpedia Lookup can be deployed conveniently via Docker and works well with DBpedia Databus Collections. The DBpedia Lookup uses an Apache Lucene Index for resource indexing and retrieval and provides a web interface for querying.

In [72]:
import requests
def get_dbpedia_candidates(label, timeout=30):
    """Look up DBpedia resources matching `label` via the DBpedia Lookup service.

    Args:
        label: surface form to search for. An empty value returns an empty
            list without touching the network.
        timeout: seconds to wait for the service before giving up, so that a
            stalled request cannot block a long batch run forever.

    Returns:
        List of ``{'label': <label without highlight markup>,
        'id': <resource name>}`` dicts in the order returned by the service.
        Results lacking a label or a resource are skipped.

    Raises:
        requests.RequestException: on timeout, connection failure or an
            HTTP error status.
    """
    candidates = []
    if not label:
        return candidates
    # The query goes through `params` so that requests URL-encodes it;
    # substituting the label straight into the URL breaks on characters such
    # as '&', '#', '+' or '='.
    params = {
        'query': label,
        'maxResults': 10,
        # NOTE(review): kept exactly as in the original request; confirm that
        # the Lookup service actually honours this parameter.
        'type': 'class',
        'format': 'json',
    }
    r = requests.get("http://lookup.dbpedia.org/api/search", params=params, timeout=timeout)
    r.raise_for_status()

    resource_prefix = "http://dbpedia.org/resource/"
    for answer in r.json()['docs']:
        labels = answer.get('label')
        resources = answer.get('resource')
        # Both fields are lists; skip hits where either is missing or empty.
        if not labels or not resources:
            continue
        resource = resources[0]
        # Strip the namespace to keep only the resource name; a URI outside
        # the expected namespace is kept whole rather than raising IndexError.
        if resource.startswith(resource_prefix):
            resource = resource[len(resource_prefix):]
        candidates.append({
            # The service wraps the matched text in <B>…</B> highlight tags.
            'label': labels[0].replace("<B>","").replace("</B>",""),
            'id': resource,
        })
    return candidates

# Quick check with the same accented query used for the Wikidata lookup.
r = get_dbpedia_candidates("Berlín")
print(r)

[{'label': 'Berlin', 'id': 'Berlin'}, {'label': 'Bergen', 'id': 'Bergen'}, {'label': 'Tennis Borussia Berlin', 'id': 'Tennis_Borussia_Berlin'}, {'label': '1. FC Union Berlin', 'id': '1._FC_Union_Berlin'}, {'label': 'West Berlin', 'id': 'West_Berlin'}, {'label': 'East Berlin', 'id': 'East_Berlin'}, {'label': 'Berlín, Usulután', 'id': 'Berlín,_Usulután'}, {'label': 'Humboldt University of Berlin', 'id': 'Humboldt_University_of_Berlin'}, {'label': 'Hertha BSC', 'id': 'Hertha_BSC'}, {'label': 'Carlos Berlín Montero', 'id': 'Carlos_Berlín_Montero'}]


# Wikidata Simple-Question with Answers

We only consider questions with a single answer, i.e. those whose predicate identifier contains 'P' (for example `P20`).

In [None]:
import pandas as pd
# Load the labelled questions; the first CSV column becomes the row index, so
# it is not part of the positional columns used by the loops below.
df = pd.read_csv('wsq-labels.csv', index_col=0)
df.head()

Unnamed: 0,subject,predicate,object,question,subject_label,predicate_label,object_label
0,Q7358590,P20,Q1637790,Where did roger marquis die,Roger Marquis,place of death,Holyoke
1,Q154335,P509,Q12152,what was the cause of death of yves klein,Yves Klein,cause of death,myocardial infarction
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,Carlos Gómez,position played on team / speciality,center fielder
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,Engelbert Zaschka,sex or gender,male
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,Pee Wee Reese,position played on team / speciality,shortstop


## NER Task

In [None]:
# initialize entities column
# The loop in the next cell appends one list of entity strings per question.
entities = []

In [None]:
# Run entity extraction on every question and collect the results in `entities`.
# `rows[index:]` starts at `index`; with index = 0 every row is processed.
index = 0
rows = df.to_numpy().tolist()
for row in rows[index:]:
    # Positional access: the CSV's first column is the index, so position 3 is 'question'.
    question = row[3] #question
    # Progress log: one line per question.
    print(index,":",question)
    question_entities = get_entities(question)
    entities.append(question_entities)
    index += 1
print(len(entities),"questions analyzed!")

In [None]:
# Attach the per-question entity lists as a new column and persist the frame.
# The assignment needs exactly one list entry per row of df.
df['entity']=entities
df.to_csv('wsq-entities.csv')
df.head()

Unnamed: 0,subject,predicate,object,question,subject_label,predicate_label,object_label,entity
0,Q7358590,P20,Q1637790,Where did roger marquis die,Roger Marquis,place of death,Holyoke,[roger marquis]
1,Q154335,P509,Q12152,what was the cause of death of yves klein,Yves Klein,cause of death,myocardial infarction,[yves klein]
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,Carlos Gómez,position played on team / speciality,center fielder,[carlos gomez]
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,Engelbert Zaschka,sex or gender,male,[engelbert zaschka]
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,Pee Wee Reese,position played on team / speciality,shortstop,[pee wee reese]


## Entity Linking Task

### Wikidata Entity Linking

In [None]:
# initialize wikidata entities column
# The loop in the next cell appends one list of candidate ids per question.
wikidata_items = []

In [None]:
# For every question, gather the ids of all Wikidata candidates of all its entities.
# `rows[index:]` starts at `index`; with index = 0 every row is processed.
index = 0
rows = df.to_numpy().tolist()
for row in rows[index:]:
    items = []
    # Positional access: position 7 is the 'entity' column added in the NER step.
    q_entities = row[7] #entity
    # Progress log: one line per question.
    print(index,":",q_entities)
    for entity in q_entities:
      # One web request per entity; only the candidate ids are kept.
      for item in get_wikidata_candidates(entity):
          items.append(item['id'])
    wikidata_items.append(items)    
    index += 1
print(len(wikidata_items),"questions analyzed!")

In [None]:
# Attach the per-question candidate ids as a new column and persist the frame.
# The assignment needs exactly one list entry per row of df.
df['wiki_items']=wikidata_items
df.to_csv('wsq-wikidata.csv')
df.head()

Unnamed: 0,subject,predicate,object,question,subject_label,predicate_label,object_label,entity,wiki_items
0,Q7358590,P20,Q1637790,Where did roger marquis die,Roger Marquis,place of death,Holyoke,[roger marquis],"[Q7358592, Q7358590]"
1,Q154335,P509,Q12152,what was the cause of death of yves klein,Yves Klein,cause of death,myocardial infarction,[yves klein],"[Q154335, Q98233558, Q8062325, Q92397997, Q806..."
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,Carlos Gómez,position played on team / speciality,center fielder,[carlos gomez],"[Q949506, Q203210, Q51944192, Q2747238, Q50421..."
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,Engelbert Zaschka,sex or gender,male,[engelbert zaschka],[Q62498]
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,Pee Wee Reese,position played on team / speciality,shortstop,[pee wee reese],[Q182485]


### DBpedia Entity Linking

In [73]:
# initialize dbpedia entities column
# The loop in the next cell appends one list of candidate ids per question.
dbpedia_items = []

In [74]:
# For every question, gather the ids of all DBpedia candidates of all its entities.
# `rows[index:]` starts at `index`; with index = 0 every row is processed.
index = 0
# Build the row list in this cell instead of relying on the `rows` variable
# left behind by the Wikidata linking cell, so the section runs on its own.
rows = df.to_numpy().tolist()
for row in rows[index:]:
    items = []
    # Positional access: position 7 is the 'entity' column added in the NER step.
    q_entities = row[7] #entity
    # Progress log: one line per question.
    print(index,":",q_entities)
    for entity in q_entities:
        # One web request per entity; only the candidate ids are kept.
        for item in get_dbpedia_candidates(entity):
            items.append(item['id'])
    dbpedia_items.append(items)
    index += 1
print(len(dbpedia_items),"questions analyzed!")

0 : ['roger marquis']
1 : ['yves klein']
2 : ['carlos gomez']
3 : ['engelbert zaschka']
4 : ['pee wee reese']
5 : ['swiss']
6 : ['padraic mcguinness']
7 : ['sam edwards']
8 : ['italianate']
9 : ['neo contra']
10 : ['angie estes']
11 : ['jose francisco torres']
12 : ['warsaw']
13 : ['jakarta']
14 : ['prague']
15 : ['guy pnini']
16 : ['scott grimes']
17 : ['sidewalks entertainment']
18 : ['tour perret']
19 : ['guy clark']
20 : ['australian', 'roll']
21 : ['dysona']
22 : ['robo v']
23 : ['example', 'romance film']
24 : ['shigeyasu suzuki']
25 : ['yonatan rozen']
26 : ['indian', 'fazil']
27 : ['paolo de la haza']
28 : ['daniela cristofori']
29 : ['country', 'delivery']
30 : ['crixas do tocantins']
31 : ['ian iqbal rashid']
32 : ['monango']
33 : ['who', 'rca records']
34 : ['artist', 'love']
35 : ['gampelay', 'allegiance']
36 : ['raghuvinte swantham raziya']
37 : ['which films', 'leonid gaidai']
38 : ['j. a. folger']
39 : ['shinji mori']
40 : ['msui', 'john pizzarelli']
41 : ['pandarus']
42

KeyboardInterrupt: ignored

In [None]:
# Attach the per-question DBpedia candidates as a new column and persist the frame.
# The assignment needs exactly one list entry per row of df, so the linking
# loop must have run to completion first (the recorded run was interrupted).
df['dbpedia_items']=dbpedia_items
df.to_csv('wsq-dbpedia.csv')
df.head()

# Evaluation

In [None]:
%%capture
!pip install unidecode

In [None]:
def evaluate_labels(true_list,pred_list):
  """Print precision, recall and F1 of predicted labels against gold labels.

  Counting scheme, for each position i:
    * pred_list[i] is empty             -> counted only under "Empty Values";
    * true_list[i][0] was not predicted -> one false negative;
    * otherwise every predicted value found in true_list[i] is a true
      positive and every other predicted value is a false positive.
  True negatives are never counted and are always reported as 0.

  A metric whose denominator is zero is reported as 0.0 instead of raising
  ZeroDivisionError (e.g. when every prediction list is empty).

  Args:
    true_list: sequence of non-empty gold-label lists.
    pred_list: sequence of predicted-label lists, indexable by position and
      at least as long as true_list.

  Returns:
    None. All results are printed.
  """
  tp, tn, fp, fn = 0, 0, 0, 0
  empty_values = 0
  index = 0  # stays 0 when true_list is empty
  for index, gold in enumerate(true_list, start=1):
    predicted_values = pred_list[index - 1]
    if (len(predicted_values) == 0):
      empty_values += 1
    elif (gold[0] not in predicted_values):
      fn += 1
    else:
      for predicted in predicted_values:
        if (predicted in gold):
          tp += 1
        else:
          fp += 1
    # Progress marker every 1000 evaluated items.
    if (index % 1000 == 0):
      print(index)
  print("Total:",index)
  print("Empty Values:",empty_values)
  print("TruePositives:",tp,"TrueNegatives:",tn,"FalsePositives:",fp,"FalseNegatives:",fn)
  precision = tp / (fp + tp) if (fp + tp) else 0.0
  print("Precission:",precision)
  recall = tp / (fn + tp) if (fn + tp) else 0.0
  print("Recall:",recall)
  f1 = 2 * ((precision*recall)/(precision+recall)) if (precision+recall) else 0.0
  print("F1:",f1)

In [60]:
# Evaluation subset: keep only the rows whose predicate contains the letter "P"
# (the match is case-insensitive).
my_df = df[df.predicate.str.contains('P',case=False)]
my_df.head()

Unnamed: 0,subject,predicate,object,question,subject_label,predicate_label,object_label,entity,wiki_items
0,Q7358590,P20,Q1637790,Where did roger marquis die,Roger Marquis,place of death,Holyoke,[roger marquis],"[Q7358592, Q7358590]"
1,Q154335,P509,Q12152,what was the cause of death of yves klein,Yves Klein,cause of death,myocardial infarction,[yves klein],"[Q154335, Q98233558, Q8062325, Q92397997, Q806..."
2,Q2747238,P413,Q5059480,What position does carlos gomez play?,Carlos Gómez,position played on team / speciality,center fielder,[carlos gomez],"[Q949506, Q203210, Q51944192, Q2747238, Q50421..."
3,Q62498,P21,Q6581097,how does engelbert zaschka identify,Engelbert Zaschka,sex or gender,male,[engelbert zaschka],[Q62498]
4,Q182485,P413,Q1143358,what position does pee wee reese play in baseball,Pee Wee Reese,position played on team / speciality,shortstop,[pee wee reese],[Q182485]


In [61]:
# NER Performance
import unidecode
# Gold and predicted labels are normalised the same way before comparison:
# transliterated to ASCII, lower-cased, spaces replaced by underscores.
y_true = [[unidecode.unidecode(i).lower().replace(" ","_")] for i in my_df['subject_label']]
# The loop variables are deliberately not called `entities`: that name holds
# the notebook-level list of NER results, and a `for entities in ...` loop
# would silently rebind it to the last row's entity list.
y_pred = [
  [unidecode.unidecode(mention).lower().replace(" ","_") for mention in row_entities]
  for row_entities in my_df['entity']
]
evaluate_labels(y_true,y_pred)

1000
2000
3000
4000
Total: 4296
Empty Values: 6
TruePositives: 3043 TrueNegatives: 0 FalsePositives: 266 FalseNegatives: 1248
Precission: 0.919613176186159
Recall: 0.7091587042647401
F1: 0.8007894736842105


In [62]:
# Wikidata Entity Linking Performance
y_true = [[i] for i in my_df['subject']]
y_pred = my_df['wiki_items'].tolist()
evaluate_labels(y_true,y_pred)

1000
2000
3000
4000
Total: 4296
Empty Values: 110
TruePositives: 3382 TrueNegatives: 0 FalsePositives: 12089 FalseNegatives: 815
Precission: 0.21860254670027793
Recall: 0.8058136764355492
F1: 0.3439088875330486
