# Approach

Named Entity Recognition based on Uncased Language Models combined with PoS tagging and using subwords tokenization

In [None]:
%%capture
!pip install --upgrade pip
!pip install flair
!pip install unidecode

In [1]:
import unidecode
import hashlib

# PoS tagger
from flair.data import Sentence
from flair.models import SequenceTagger

# uncased NER model
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

def hash_text(text):
    hash_object = hashlib.md5(text.encode())
    md5_hash = hash_object.hexdigest()
    return str(md5_hash) 

pos_language_model = "flair/pos-english"
ner_language_model = "dslim/bert-base-NER-uncased"

ner_tagger = SequenceTagger.load(pos_language_model)
ner_tokenizer = AutoTokenizer.from_pretrained(ner_language_model)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_language_model)
ner_nlp = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

entities_cache = {}

def unicase(label):
  return unidecode.unidecode(label.strip()).lower()

def get_pos_entities(text,main_categories=['CD','NN','NNS','NNP','NNPS'],additional_categories=['JJ','NNS','CC'],drop_categories=['IN']):
    # make sentence
    sentence = Sentence(text)

    # predict NER tags
    ner_tagger.predict(sentence)

    # iterate over entities 
    entities = []
    current_entity = ""
    partial_entity = ""
    for t in sentence.tokens:
        for label in t.annotation_layers.keys():
            text = t.text
            label = t.get_labels(label)[0].value   
            if (label in main_categories):
                if (len(partial_entity)>0):
                  current_entity = partial_entity + " " + text
                  partial_entity = ""
                elif (current_entity == ""):
                  current_entity += text
                else:
                    current_entity += " " + text
            elif(label in additional_categories):
              if (len(current_entity)>0):
                current_entity += " " + text
              elif (len(partial_entity)>0):
                partial_entity += " " + text
              else:
                partial_entity += text
            elif(label in drop_categories):
               current_entity = ""
               partial_entity = ""
            elif len(current_entity) > 0:
                entities.append(current_entity)
                current_entity = ""
                partial_entity = ""
          
    if (len(current_entity)>0):
        entities.append(current_entity)
    return entities

def get_entities(text,additional=True):
    key = hash_text(text)
    if (key in entities_cache):
        #print("use of entities_cache!")
        return entities_cache[key]
    entities = []
    entity = ""
    index = -1
    offset = -1
    for token in ner_nlp(text):
        if (index == -1):
            index = token['index']
            offset = token['start']
        word = token['word']
        if (word[0] == '#'):
            word = token['word'].replace("#","")
            
        if (token['start']== offset):
            entity += word
        elif (token['index']-index < 2):
            entity += " " + word
        else:
            entities.append(entity)
            entity = word
        index = token['index']
        offset = token['end']
        
    if (len(entity) > 0):    
        entities.append(entity)
    if additional:      
      pos_entities = [unicase(e) for e in get_pos_entities(text)]      
      if (len(entities) == 0):
        return pos_entities
      final_entities = [] 
      for e in entities:
          final_entity = e
          for pe in pos_entities:
            if (e in pe):
              final_entity = pe
          final_entities.append(final_entity)
      if (len(final_entities)>0):
        return list(set(final_entities))
    entities_cache[key] = entities
    return entities

2022-04-29 13:48:56,930 loading file /Users/cbadenes/.flair/models/pos-english/a9a73f6cd878edce8a0fa518db76f441f1cc49c2525b2b4557af278ec2f0659e.121306ea62993d04cd1978398b68396931a39eb47754c8a06a87f325ea70ac63
2022-04-29 13:48:57,398 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD


In [2]:
text = "what male actor was born in  warsaw"
print("PoS entities:",get_pos_entities(text))
print("Entities:", get_entities(text,False))

PoS entities: ['male actor', 'warsaw']
Entities: ['warsaw']


In [3]:
def get_entities_by_muheqa(text,additional=True):
    entities = get_entities_by_flert(text)
    if additional:      
      pos_entities = [unicase(e) for e in get_pos_entities(text)]      
      if (len(entities) == 0):
        return pos_entities
      final_entities = [] 
      for e in entities:
          final_entity = e
          for pe in pos_entities:
            if (e in pe):
              final_entity = pe
          final_entities.append(final_entity)
      if (len(final_entities)>0):
        return list(set(final_entities))                              
    return entities

# Evaluation

## Datasets

### SimpleQuestions Dataset

In [4]:
import pandas as pd
df = pd.read_csv('data/wsq-labels.csv', index_col=0)
# inverse predicates contain no entity. In all other cases, the entity corresponds to the subject.
#my_df = df[df.predicate.str.contains('P',case=False)]
#my_df.head()
entities = []
for index, row in df.iterrows():
  entity = row['subject_label']  
  entities.append([entity])
df['entities'] = entities
wsq_df = df.drop(['subject','predicate','object','subject_label','predicate_label','object_label'], axis=1)
wsq_df.head()

Unnamed: 0,question,entities
0,Where did roger marquis die,[Roger Marquis]
1,what was the cause of death of yves klein,[Yves Klein]
2,What position does carlos gomez play?,[Carlos Gómez]
3,how does engelbert zaschka identify,[Engelbert Zaschka]
4,what position does pee wee reese play in baseball,[Pee Wee Reese]


### Wikidata QA

In [5]:
import pandas as pd
df = pd.read_csv('data/wqa-labels.csv', index_col=0)
entities = []
for index, row in df.iterrows():
  entities.append(row['subject_labels'].replace("[","").replace("]","").replace("\'","").split(","))
df['entities'] = entities
wqa_df = df.drop(['subjects','predicates','objects','subject_labels','predicate_labels','object_labels'], axis=1)
wqa_df.head()

Unnamed: 0,question,entities
0,Who is the president of Poland?,[president of Poland]
1,How many Turing awards have people from Austri...,"[Turing awards, Austria]"
2,Give me all countries that have won a FIFA Wor...,[FIFA World Cup]
3,What is the population of Chile?,[Chile]
4,Who is the author of One Piece?,[One Piece]


## Metrics

In [6]:
%%capture
!pip install unidecode
import unidecode

def normalize(label):
  return unidecode.unidecode(label.strip()).lower().replace(" ","_")

def precision(tp,fp):
  if (fp+tp == 0):
    return 0.0
  return tp / (fp + tp)

def recall(tp,fn):
  if (fn+tp == 0):
    return 0.0
  return tp / (fn + tp)

def f1(tp,fp,fn):
  p = precision(tp,fp)
  r = recall(tp,fn)
  if (p+r == 0):
    return 0.0
  return 2 * ((p*r)/(p+r))

def average(values):
  return sum(values) / len(values) 

# lists of entity lists
def evaluate_labels(true_list,pred_list):
  tp, tn, fp, fn = 0, 0, 0, 0
  precision_list, recall_list, f1_list = [], [], []
  empty_values = 0
  for index in range(len(true_list)):
    # normalize entities
    valid_entities = [normalize(e) for e in true_list[index] if e != '']
    predicted_entities = [normalize(e) for e in pred_list[index]]
    ptp, ptn, pfp, pfn = 0, 0, 0, 0
    if (len(valid_entities)==0):
      empty_values += 1
    for entity in valid_entities:
      if (entity not in predicted_entities):
        pfn += 1
    for entity in predicted_entities:
      if (entity in valid_entities):
        ptp += 1
      else:
        pfp += 1    
    precision_list.append(precision(ptp,pfp))
    recall_list.append(recall(ptp,pfn))
    f1_list.append(f1(ptp,pfp,pfn))
    tp += ptp
    tn += ptn
    fp += pfp
    fn += pfn  
  return  {
      'total': index,
      'empty': empty_values,
      'tp': tp,
      'tn': tn, 
      'fp': fp,
      'fn':fn,
      'micro-precision': precision(tp,fp),
      'micro-recall': recall(tp,fn),
      'micro-f1': f1(tp,fp,fn),
      'macro-precision': average(precision_list),
      'macro-recall': average(recall_list),
      'macro-f1': average(f1_list)
  }  

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## SOTA Methods

### FLERT

From paper: Schweter, Stefan and A. Akbik. “FLERT: Document-Level Features for Named Entity Recognition.” ArXiv abs/2011.06993 (2020): n. pag.

In [7]:
from flair.data import Sentence
from flair.models import SequenceTagger

flert_tagger = SequenceTagger.load("flair/ner-english-large")

def get_entities_by_flert(text):
  sentence = Sentence(text)
  flert_tagger.predict(sentence)
  entities = []
  for entity in sentence.get_spans('ner'):
    entities.append(entity.text)
  return entities

get_entities_by_flert("George Washington went to Washington")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2022-04-29 13:57:28,977 loading file /Users/cbadenes/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29


Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

2022-04-29 13:57:59,996 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


['George Washington', 'Washington']

### BERT-base NER


In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

bert_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
bert_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

bert_nlp = pipeline("ner", model=bert_model, tokenizer=bert_tokenizer)

def get_entities_by_bert_base(text):
  entities = []
  entity = ""
  index = -1
  offset = -1
  for token in bert_nlp(text):
    if (index == -1):
        index = token['index']
        offset = token['start']
    word = token['word']
    if (word[0] == '#'):
        word = token['word'].replace("#","")        
    if (token['start']== offset):
        entity += word
    elif (token['index']-index < 2):
        entity += " " + word
    else:
        entities.append(entity)
        entity = word
    index = token['index']
    offset = token['end']
  if (len(entity) > 0):    
        entities.append(entity)
  return entities

#get_entities_by_bert_base("George Washington went to Washington")
get_entities_by_bert_base("Washington is here ")

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

['Washington']

### ROBERTA-base NER

In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

roberta_tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
roberta_model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

roberta_nlp = pipeline("ner", model=roberta_model, tokenizer=roberta_tokenizer)

def get_entities_by_roberta_base(text):
  entities = []
  entity = ""
  index = -1
  offset = -1
  for token in roberta_nlp(text):
    if (index == -1):
        index = token['index']
        offset = token['start']
    word = token['word']
    if (word[0] == '#'):
        word = token['word'].replace("#","")        
    if (token['start']== offset):
        entity += word
    elif (token['index']-index < 2):
        entity += " " + word
    else:
        entities.append(entity.replace("Ġ",""))
        entity = word
    index = token['index']
    offset = token['end']
  if (len(entity) > 0):    
        entities.append(entity.replace("Ġ",""))
  return entities

get_entities_by_roberta_base("George Washington went to Washington")

Downloading:   0%|          | 0.00/255 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

['George Washington', 'Washington']

# Results

In [10]:
from IPython.display import clear_output
import json
import pandas as pd

def json_file(name):
  return name+"-entities.json"

def csv_file(name):
  return name+"-entities.csv"

def evaluate_data(name,dataframe):
  l1, l2, l3, l4, l5, l6, l7 = [], [], [], [], [], [], []
  total = 0
  for index, row in dataframe.iterrows():
      question = row['question']
      print(index,":",question)
      #l1.append(get_entities(question,False))
      #l2.append(get_pos_entities(question))
      #l3.append(get_entities(question,True))
      #l4.append(get_entities_by_flert(question))
      #l5.append(get_entities_by_bert_base(question))
      #l6.append(get_entities_by_roberta_base(question))
      l7.append(get_entities_by_muheqa(question))
      total += 1
  #dataframe['MuHeQA_NER']=l1
  #dataframe['MuHeQA_PoS']=l2
  #dataframe['MuHeQA_NER+PoS']=l3
  #dataframe['FLERT']=l4
  #dataframe['BERT']=l5
  #dataframe['RoBERTA']=l6
  dataframe['MuHeQA_FLERT']=l7
  clear_output(wait=True)
  print(total,"questions analyzed!")
  dataframe.to_json(json_file(name), orient='split')
  dataframe.to_csv(csv_file(name))
  return dataframe

def make_report(name,additional=[]):
  
  df = pd.read_json(json_file(name), orient='split')
  y_true =df['entities'].tolist()
  results = []
  for col in df.columns:
    if (col == 'question') or (col == 'entities'):
      continue
    y_pred = df[col].tolist()
    result = evaluate_labels(y_true,y_pred)
    result['model']=col
    results.append(result)

  for row in additional:
    results.append(row)

  df_results = pd.DataFrame(results)
  return df_results


### SimpleQuestions Dataset

In [11]:
evaluate_data('wsq',wsq_df)

5622 questions analyzed!


Unnamed: 0,question,entities,MuHeQA_FLERT
0,Where did roger marquis die,[Roger Marquis],[roger marquis]
1,what was the cause of death of yves klein,[Yves Klein],[yves klein]
2,What position does carlos gomez play?,[Carlos Gómez],[carlos gomez]
3,how does engelbert zaschka identify,[Engelbert Zaschka],[engelbert zaschka]
4,what position does pee wee reese play in baseball,[Pee Wee Reese],[pee wee reese]
...,...,...,...
5617,Who is a notable figure that was born in barce...,[Barcelona],[barcelona]
5618,what films have been produced by jun lana?,[Jun Lana],[jun lana]
5619,Where was gunnar johansen born in Denmark?,[Gunnar Johansen],"[Denmark, gunnar johansen]"
5620,what celestial object is 2974 holden,[2974 Holden],"[celestial object, 2974 holden]"


In [12]:
wsq_df.head(20)

Unnamed: 0,question,entities,MuHeQA_FLERT
0,Where did roger marquis die,[Roger Marquis],[roger marquis]
1,what was the cause of death of yves klein,[Yves Klein],[yves klein]
2,What position does carlos gomez play?,[Carlos Gómez],[carlos gomez]
3,how does engelbert zaschka identify,[Engelbert Zaschka],[engelbert zaschka]
4,what position does pee wee reese play in baseball,[Pee Wee Reese],[pee wee reese]
5,Which Swiss conductor's cause of death is myoc...,[myocardial infarction],[Swiss]
6,where was padraic mcguinness's place of death,[Padraic McGuinness],[padraic mcguinness]
7,what is the place of birth of sam edwards?,[Sam Edwards],[sam edwards]
8,Which home is an example of italianate archite...,[Italianate architecture],[italianate architecture]
9,who published neo contra,[Neo Contra],[neo contra]


In [80]:
make_report('wsq')

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,5621,0,3738,0,1725,1885,0.684239,0.66477,0.674364,0.63757,0.66471,0.646395,MuHeQA_NER
1,5621,0,2955,0,6120,2668,0.32562,0.52552,0.402096,0.348734,0.525436,0.406415,MuHeQA_PoS
2,5621,0,3915,0,2635,1707,0.59771,0.696371,0.64328,0.657402,0.696371,0.670206,MuHeQA_NER+PoS
3,5621,0,2823,0,1026,2799,0.733437,0.502134,0.596136,0.486867,0.502134,0.491877,FLERT
4,5621,0,140,0,946,5482,0.128913,0.024902,0.041741,0.02428,0.024902,0.024487,BERT
5,5621,0,3250,0,1592,2373,0.67121,0.577983,0.621118,0.558016,0.577908,0.564508,RoBERTA
6,5621,0,3562,0,3460,2060,0.507263,0.633582,0.563429,0.576782,0.633582,0.595488,MuHeQA_FLERT


### Wikidata QA

In [13]:
evaluate_data('wqa',wqa_df)

101 questions analyzed!


Unnamed: 0,question,entities,MuHeQA_FLERT
0,Who is the president of Poland?,[president of Poland],[Poland]
1,How many Turing awards have people from Austri...,"[Turing awards, Austria]","[Turing, Austria]"
2,Give me all countries that have won a FIFA Wor...,[FIFA World Cup],[FIFA World Cup]
3,What is the population of Chile?,[Chile],[Chile]
4,Who is the author of One Piece?,[One Piece],[One Piece]
...,...,...,...
96,Who wrote The Old Man and the Sea?,[The Old Man and the Sea],[The Old Man and the Sea]
97,Which YouTube channels talk about maths?,"[YouTube channels, maths]",[YouTube]
98,List Italian sauces.,"[sauce, Italy]",[Italian]
99,What diseases are associated with the gene FGF14?,[FGF14],"[diseases, gene fgf14]"


In [78]:
# add results from Table 6 in paper Diomedi, Daniel and Aidan Hogan. “Question Answering over Knowledge Graphs with Neural Machine Translation and Entity Linking.” ArXiv abs/2107.02865 (2021)
value = {'total':100,'empty':0, 'tp':0, 'tn':0, 'fp':0, 'fn':0, 'micro-precision':0.180, 'micro-recall':0.170, 'micro-f1':0.175, 'macro-precision':0.183, 'macro-recall':0.215, 'macro-f1':0.188, 'model':'ElNeuQA'}
make_report('wqa',[value])

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,100,4,56,0,12,94,0.823529,0.373333,0.513761,0.470297,0.367162,0.401556,MuHeQA_NER
1,100,4,106,0,66,44,0.616279,0.706667,0.658385,0.638614,0.70462,0.649552,MuHeQA_PoS
2,100,4,95,0,40,55,0.703704,0.633333,0.666667,0.717822,0.65264,0.660726,MuHeQA_NER+PoS
3,100,4,56,0,12,94,0.823529,0.373333,0.513761,0.465347,0.375413,0.403866,FLERT
4,100,4,51,0,18,99,0.73913,0.34,0.465753,0.425743,0.340759,0.369213,BERT
5,100,4,59,0,11,91,0.842857,0.393333,0.536364,0.490099,0.395215,0.425318,RoBERTA
6,100,4,102,0,39,48,0.723404,0.68,0.701031,0.737624,0.672442,0.679444,MuHeQA_FLERT
7,100,0,0,0,0,0,0.18,0.17,0.175,0.183,0.215,0.188,ElNeuQA
