# Experiment Setup 

## Install dependencies and prepare modules

In [1]:
!pip install --upgrade pip
!pip install flair
!pip install coloredlogs
!pip install unidecode
!pip install fcache

import coloredlogs, logging
import application.logformatter as lf
from fcache.cache import FileCache

log_level = logging.DEBUG

fh = logging.StreamHandler()
fh.setFormatter(lf.CustomFormatter())
fh.setLevel(log_level)

logger = logging.getLogger('muheqa')
logger.addHandler(fh)
logger.setLevel(log_level)


logging.getLogger().setLevel(logging.WARNING)

import application.summary.entity as ent
import application.summary.concept as cp
import application.summary.keyword as key
import application.cache as cache

entity_discovery  = ent.Entity()
concept_discovery = cp.Concept()
keyword_discovery = key.Keyword()





[38;20m2022-08-29 17:04:12,745 - muheqa - DEBUG - initializing Entity class instance... (entity.py:11)[0m
[38;20m2022-08-29 17:04:18,780 - muheqa - DEBUG - initializing Concept class instance... (concept.py:12)[0m
[38;20m2022-08-29 17:04:19,462 - muheqa - DEBUG - initializing Keyword class instance... (keyword.py:11)[0m
[38;20m2022-08-29 17:04:19,463 - muheqa - DEBUG - initializing Entity class instance... (entity.py:11)[0m
[38;20m2022-08-29 17:04:25,125 - muheqa - DEBUG - initializing Concept class instance... (concept.py:12)[0m


## Define Metrics

In [2]:
import unidecode

def normalize(label):
  return unidecode.unidecode(label.strip()).lower().replace(" ","_")

def precision(tp,fp):
  if (fp+tp == 0):
    return 0.0
  return tp / (fp + tp)

def recall(tp,fn):
  if (fn+tp == 0):
    return 0.0
  return tp / (fn + tp)

def f1(tp,fp,fn):
  p = precision(tp,fp)
  r = recall(tp,fn)
  if (p+r == 0):
    return 0.0
  return 2 * ((p*r)/(p+r))

def average(values):
  return sum(values) / len(values) 

# lists of entity lists
def evaluate_labels(true_list,pred_list):
  tp, tn, fp, fn = 0, 0, 0, 0
  precision_list, recall_list, f1_list = [], [], []
  empty_values = 0
  for index in range(len(true_list)):
    # normalize entities
    valid_entities = [normalize(e) for e in true_list[index] if e != '']
    predicted_entities = [normalize(e) for e in pred_list[index]]
    ptp, ptn, pfp, pfn = 0, 0, 0, 0
    if (len(valid_entities)==0):
      empty_values += 1
    for entity in valid_entities:
      if (entity not in predicted_entities):
        pfn += 1
    for entity in predicted_entities:
      if (entity in valid_entities):
        ptp += 1
      else:
        pfp += 1    
    precision_list.append(precision(ptp,pfp))
    recall_list.append(recall(ptp,pfn))
    f1_list.append(f1(ptp,pfp,pfn))
    tp += ptp
    tn += ptn
    fp += pfp
    fn += pfn  
  return  {
      'total': index,
      'empty': empty_values,
      'tp': tp,
      'tn': tn, 
      'fp': fp,
      'fn':fn,
      'micro-precision': precision(tp,fp),
      'micro-recall': recall(tp,fn),
      'micro-f1': f1(tp,fp,fn),
      'macro-precision': average(precision_list),
      'macro-recall': average(recall_list),
      'macro-f1': average(f1_list)
  }

print("metrics are ready")

metrics are ready


## Load SOTA Methods

### FLERT

From paper: Schweter, Stefan and A. Akbik. “FLERT: Document-Level Features for Named Entity Recognition.” ArXiv abs/2011.06993 (2020)

In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger

flert_tagger = SequenceTagger.load("flair/ner-english-large")
flert_cache = cache.Cache("muheqa_flert")

def get_entities_by_flert(text):
    if (flert_cache.exists(text)):
        return flert_cache.get(text)
    sentence = Sentence(text)
    flert_tagger.predict(sentence)
    entities = []
    for entity in sentence.get_spans('ner'):
        entities.append(entity.text)
    flert_cache.set(text,entities)
    return entities


[38;20m2022-08-29 17:04:46,702 - muheqa - DEBUG - initializing muheqa_flert cache ... (cache.py:9)[0m


Loading 110 elements from cache: /Users/cbadenes/Library/Caches/muheqa_flert/cache ...


In [4]:
get_entities_by_flert("George Washington went to Washington")

[38;20m2022-08-29 17:04:46,710 - muheqa - DEBUG - reading value of 7bd3189e6a6c8993bc12244a66e09c65 from cache (cache.py:29)[0m


['George Washington', 'Washington']

### BERT-based NER

In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

bert_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
bert_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
bert_cache = cache.Cache("muheqa_bert")


bert_nlp = pipeline("ner", model=bert_model, tokenizer=bert_tokenizer)

def get_entities_by_bert_base(text):
    if (bert_cache.exists(text)):
        return bert_cache.get(text)
    entities = []
    entity = ""
    index = -1
    offset = -1
    for token in bert_nlp(text):
        if (index == -1):
            index = token['index']
            offset = token['start']
        word = token['word']
        if (word[0] == '#'):
            word = token['word'].replace("#","")        
        if (token['start']== offset):
            entity += word
        elif (token['index']-index < 2):
            entity += " " + word
        else:
            entities.append(entity)
            entity = word
        index = token['index']
        offset = token['end']
    if (len(entity) > 0):    
        entities.append(entity)
    bert_cache.set(text,entities)
    return entities

[38;20m2022-08-29 17:04:52,448 - muheqa - DEBUG - initializing muheqa_bert cache ... (cache.py:9)[0m


Loading 110 elements from cache: /Users/cbadenes/Library/Caches/muheqa_bert/cache ...


In [6]:
# unit test
get_entities_by_bert_base("George Washington went to Washington")

[38;20m2022-08-29 17:04:52,454 - muheqa - DEBUG - reading value of 7bd3189e6a6c8993bc12244a66e09c65 from cache (cache.py:29)[0m


['George Washington', 'Washington']

### RoBERTA-based NER

In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

roberta_tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
roberta_model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english")
roberta_cache = cache.Cache("muheqa_roberta")

roberta_nlp = pipeline("ner", model=roberta_model, tokenizer=roberta_tokenizer)

def get_entities_by_roberta_base(text):
    if (roberta_cache.exists(text)):
        return roberta_cache.get(text)
    entities = []
    entity = ""
    index = -1
    offset = -1
    for token in roberta_nlp(text):
        if (index == -1):
            index = token['index']
            offset = token['start']
        word = token['word']
        if (word[0] == '#'):
            word = token['word'].replace("#","")        
        if (token['start']== offset):
            entity += word
        elif (token['index']-index < 2):
            entity += " " + word
        else:
            entities.append(entity.replace("Ġ",""))
            entity = word
        index = token['index']
        offset = token['end']
    if (len(entity) > 0):    
        entities.append(entity.replace("Ġ",""))
    roberta_cache.set(text,entities)
    return entities

[38;20m2022-08-29 17:05:00,790 - muheqa - DEBUG - initializing muheqa_roberta cache ... (cache.py:9)[0m


Loading 110 elements from cache: /Users/cbadenes/Library/Caches/muheqa_roberta/cache ...


In [8]:
# unit test
get_entities_by_roberta_base("George Washington went to Washington")

[38;20m2022-08-29 17:05:00,797 - muheqa - DEBUG - reading value of 7bd3189e6a6c8993bc12244a66e09c65 from cache (cache.py:29)[0m


['George Washington', 'Washington']

# Evaluation Results

In [19]:
from IPython.display import clear_output
import json
import pandas as pd

def json_file(name):
  return name+"-keywords.json"

def csv_file(name):
  return name+"-keywords.csv"

def evaluate_data(name,dataframe):
    logging.getLogger('muheqa').setLevel(logging.WARNING)
    l1, l2, l3, l4 = [], [], [], []
    total = 0
    for index, row in dataframe.iterrows():
        question = row['question']
        print(index,":",question)
        l1.append(keyword_discovery.get(question))
        l2.append(get_entities_by_flert(question))
        l3.append(get_entities_by_bert_base(question))
        l4.append(get_entities_by_roberta_base(question))
        total += 1
    dataframe['MuHeQA_Keywords']=l1
    dataframe['FLERT_NER']=l2
    dataframe['BERT_NER']=l3
    dataframe['RoBERTA_NER']=l4
    clear_output(wait=True)
    print(total,"questions analyzed!")
    dataframe.to_json(json_file(name), orient='split')
    dataframe.to_csv(csv_file(name))
    logging.getLogger('muheqa').setLevel(logging.DEBUG)
    return dataframe

def make_report(name,additional=[]):
  
  df = pd.read_json(json_file(name), orient='split')
  y_true =df['entities'].tolist()
  results = []
  for col in df.columns:
    if (col == 'question') or (col == 'entities'):
      continue
    y_pred = df[col].tolist()
    result = evaluate_labels(y_true,y_pred)
    result['model']=col
    results.append(result)

  for row in additional:
    results.append(row)

  df_results = pd.DataFrame(results)
  return df_results

print("evaluation methods are ready")

evaluation methods are ready


# Basic Test

In [10]:
#query = "what does 2674 pandarus orbit?"
query = "in which country was overnight delivery filmed in?"
logger.info("Query: '" + query + "'")
logger.info("Entities:"+ str(entity_discovery.get(query)))
logger.info("Concepts:"+ str(concept_discovery.get(query)))
logger.info("Keywords:"+ str(keyword_discovery.get(query)))
logger.info("FLERT:"+ str(get_entities_by_flert(query)))
logger.info("BERT:"+ str(get_entities_by_bert_base(query)))
logger.info("RoBERTA:"+ str(get_entities_by_roberta_base(query)))

[33;94m2022-08-29 17:05:00,808 - muheqa - INFO - Query: 'in which country was overnight delivery filmed in?' (2087347042.py:3)[0m
[38;20m2022-08-29 17:05:00,809 - muheqa - DEBUG - getting entities... (entity.py:19)[0m
[33;94m2022-08-29 17:05:00,985 - muheqa - INFO - Entities:[] (2087347042.py:4)[0m
[38;20m2022-08-29 17:05:00,986 - muheqa - DEBUG - getting concepts ... (concept.py:18)[0m
[38;20m2022-08-29 17:05:01,261 - muheqa - DEBUG - Token:in [ IN]  (concept.py:37)[0m
[38;20m2022-08-29 17:05:01,262 - muheqa - DEBUG - Token:which [ WDT]  (concept.py:37)[0m
[38;20m2022-08-29 17:05:01,262 - muheqa - DEBUG - Token:country [ NN]  (concept.py:37)[0m
[38;20m2022-08-29 17:05:01,262 - muheqa - DEBUG - Token:was [ VBD]  (concept.py:37)[0m
[38;20m2022-08-29 17:05:01,262 - muheqa - DEBUG - Token:overnight [ JJ]  (concept.py:37)[0m
[38;20m2022-08-29 17:05:01,262 - muheqa - DEBUG - Token:delivery [ NN]  (concept.py:37)[0m
[38;20m2022-08-29 17:05:01,263 - muheqa - DEBUG - Token

# SimpleQuestions Dataset

In [16]:
import pandas as pd
df = pd.read_csv('datasets/simple_questions/wsq-labels.csv', index_col=0)
# inverse predicates contain no entity. In all other cases, the entity corresponds to the subject.
entities = []
questions = []
for index, row in df.iterrows():
    entity = row['subject_label']  
    question = row['question']
    if (entity.lower() in question.lower()):
        entities.append([entity])
        questions.append(question)
sq_df = pd.DataFrame(list(zip(questions, entities)),columns =['question', 'entities'])
sq_df.head()

Unnamed: 0,question,entities
0,Where did roger marquis die,[Roger Marquis]
1,what was the cause of death of yves klein,[Yves Klein]
2,how does engelbert zaschka identify,[Engelbert Zaschka]
3,what position does pee wee reese play in baseball,[Pee Wee Reese]
4,Which Swiss conductor's cause of death is myoc...,[myocardial infarction]


In [17]:
sq_df.describe()

Unnamed: 0,question,entities
count,5187,5187
unique,5172,4806
top,Name an actor.,[defender]
freq,6,21


In [22]:
evaluate_data('sq_results',sq_df.head(100))

100 questions analyzed!


Unnamed: 0,question,entities,MuHeQA_Keywords,FLERT_NER,BERT_NER,RoBERTA_NER
0,Where did roger marquis die,[Roger Marquis],[roger marquis],[roger marquis],[],[roger marquis]
1,what was the cause of death of yves klein,[Yves Klein],[yves klein],[yves klein],[],[yves klein]
2,how does engelbert zaschka identify,[Engelbert Zaschka],[engelbert zaschka],[engelbert zaschka],"[engel, zasch]",[engelbert zaschka]
3,what position does pee wee reese play in baseball,[Pee Wee Reese],[pee wee reese],[pee wee reese],[],[pee wee reese]
4,Which Swiss conductor's cause of death is myoc...,[myocardial infarction],[swiss conductor],[Swiss],[Swiss],[Swiss]
...,...,...,...,...,...,...
95,what language was the act of killing in?,[The Act of Killing],[language],[],[],[]
96,which type of film is the bitter tea of genera...,[The Bitter Tea of General Yen],[general yen],[],[],[]
97,Who is an artist signed to warner bros. records,[Warner Bros. Records],[warner bros. records],[],[],[warner bros. records]
98,who is a person with politician as a profession,[politician],[profession],[],[],[]


In [23]:
make_report('sq_results')

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,99,0,78,0,35,22,0.690265,0.78,0.732394,0.74,0.78,0.753333,MuHeQA_Keywords
1,99,0,55,0,10,45,0.846154,0.55,0.666667,0.54,0.55,0.543333,FLERT_NER
2,99,0,2,0,17,98,0.105263,0.02,0.033613,0.02,0.02,0.02,BERT_NER
3,99,0,64,0,22,36,0.744186,0.64,0.688172,0.625,0.64,0.63,RoBERTA_NER


## WikidataQA Dataset

In [31]:
import pandas as pd
df = pd.read_csv('datasets/wikidataQA/wqa-labels.csv', index_col=0)
entities = []
questions = []
for index, row in df.iterrows():
    entity = row['subject_labels'].replace("[","").replace("]","").replace("\'","").split(",")
    question = row['question']
    if (entity[0].lower() in question.lower()):
        entities.append(entity)
        questions.append(question)
wqa_df = pd.DataFrame(list(zip(questions, entities)),columns =['question', 'entities'])
wqa_df.head()

Unnamed: 0,question,entities
0,Who is the president of Poland?,[president of Poland]
1,How many Turing awards have people from Austri...,"[Turing awards, Austria]"
2,Give me all countries that have won a FIFA Wor...,[FIFA World Cup]
3,What is the population of Chile?,[Chile]
4,Who is the author of One Piece?,[One Piece]


In [32]:
wqa_df.describe()

Unnamed: 0,question,entities
count,100,100
unique,100,94
top,Who is the president of Poland?,[]
freq,1,4


In [34]:
evaluate_data('wqa_results',wqa_df)

100 questions analyzed!


Unnamed: 0,question,entities,MuHeQA_Keywords,FLERT_NER,BERT_NER,RoBERTA_NER
0,Who is the president of Poland?,[president of Poland],[poland],[Poland],[Poland],[Poland]
1,How many Turing awards have people from Austri...,"[Turing awards, Austria]","[austria, many turing awards]","[Turing, Austria]","[Turing, Austria]","[Turing awards, Austria]"
2,Give me all countries that have won a FIFA Wor...,[FIFA World Cup],[fifa world cup],[FIFA World Cup],[FIFA World Cup],[FIFA World Cup]
3,What is the population of Chile?,[Chile],[chile],[Chile],[Chile],[Chile]
4,Who is the author of One Piece?,[One Piece],[one piece],[One Piece],[One Piece],[One Piece]
...,...,...,...,...,...,...
95,Who wrote The Old Man and the Sea?,[The Old Man and the Sea],[the old man and the sea],[The Old Man and the Sea],[The Old Man and the Sea],[The Old Man and the Sea]
96,Which YouTube channels talk about maths?,"[YouTube channels, maths]","[youtube channels, maths]",[YouTube],[],[YouTube]
97,List Italian sauces.,"[sauce, Italy]",[italian sauces],[Italian],[Italian],[Italian]
98,What diseases are associated with the gene FGF14?,[FGF14],[gene fgf14],[],[F],[FGF14]


In [35]:
make_report('wqa_results')

Unnamed: 0,total,empty,tp,tn,fp,fn,micro-precision,micro-recall,micro-f1,macro-precision,macro-recall,macro-f1,model
0,99,4,95,0,39,54,0.708955,0.637584,0.671378,0.725,0.659167,0.667333,MuHeQA_Keywords
1,99,4,56,0,11,93,0.835821,0.375839,0.518519,0.47,0.379167,0.407905,FLERT_NER
2,99,4,51,0,17,98,0.75,0.342282,0.470046,0.43,0.344167,0.372905,BERT_NER
3,99,4,59,0,10,90,0.855072,0.395973,0.541284,0.495,0.399167,0.429571,RoBERTA_NER
