# **Testing the performance of NER tools on the Crossref Funder Registry**

## 1. Connecting to the Google Drive

In [1]:
# Run this cell only on Google Colab: it mounts Google Drive and puts the
# project folder on the import path.
from google.colab import drive
drive.mount('/content/drive')
import sys
# The folder name now matches the 'Independent_Study' directory used by every
# other path in this notebook (it was previously misspelled here).
sys.path.insert(0,'/content/drive/MyDrive/Independent_Study/')

Mounted at /content/drive


## 2. Installation and Setup

### 1. Installing Required Libraries

In [2]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.12.2-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.1/373.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated>=1.2.4
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting transformer-smaller-training-vocab>=0.2.1
  Downloading transformer_smaller_training_vocab-0.2.3-py3-none-any.whl (12 kB)
Collecting transformers[sentencepiece]>=4.18.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting conllu>=4.0
  Downloading conllu-4.5.2-py2.py3-none-any.whl (16 kB)
Collecting huggingface-hub>=0.10.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200

### 2. Importing Required Libraries

In [3]:
import spacy
from flair.data import Sentence
from flair.models import SequenceTagger

### 3. Loading Pretrained Language Models

In [4]:
# Load the three spaCy English pipelines (small, medium, large).
# The small pipeline is loaded without a download step, so it is presumably
# already installed in the runtime -- TODO confirm for non-Colab environments.
nlp_sm = spacy.load('en_core_web_sm')
# The medium and large pipelines are downloaded first and then loaded.
spacy.cli.download("en_core_web_md")
nlp_md = spacy.load('en_core_web_md')
spacy.cli.download("en_core_web_lg")
nlp_lg = spacy.load('en_core_web_lg')
# Flair's large English NER tagger; the recorded run downloaded a 2.24 GB
# model file for it, so this line can take a while on first use.
tagger = SequenceTagger.load("flair/ner-english-large")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2023-04-22 16:09:45,064 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


## 4. Crossref Funder Registry

### 1. Storing Crossref results in a dictionary

In [5]:
# storing the funding organization data in a dictionary
import csv
import xml
import xml.etree.ElementTree as ET
from string import ascii_lowercase as alph

## Create a dictionary of organization names bucketed by their first character
def funder_dictionary_creation(filename, start=2, stop=31552):
    """Parse a Crossref Funder Registry RDF file into first-character buckets.

    Every top-level element of the RDF root in the slice ``[start:stop]`` is
    scanned for SKOS-XL ``prefLabel`` and ``altLabel`` children. The text of
    each label's first grandchild (``entry[0][0]``) is taken as an
    organization name and appended to the bucket of its first character.

    Buckets exist for the 26 lowercase ASCII letters plus ``'other'``. Because
    the comparison is case-sensitive, names starting with an uppercase letter,
    a digit, or any non-ASCII character all land in ``'other'``.

    As a side effect the buckets are written to
    ``crossref_organizations_sorted_dict.csv`` in the current working
    directory, one ``letter|list`` row per bucket.

    Args:
        filename: Path to the registry RDF/XML file.
        start: Index of the first top-level element to scan. Defaults to 2,
            the value that was previously hard-coded.
        stop: Index one past the last top-level element to scan. Defaults to
            31552, the value that was previously hard-coded; pass ``None`` to
            scan through the end of the file.

    Returns:
        dict mapping each bucket key (``'a'`` .. ``'z'``, ``'other'``) to the
        list of organization names in document order.
    """
    root = ET.parse(filename).getroot()

    # One bucket per lowercase letter, followed by the catch-all bucket.
    orga_dict = {c: [] for c in alph}
    orga_dict['other'] = []

    label_tags = (
        '{http://www.w3.org/2008/05/skos-xl#}prefLabel',
        '{http://www.w3.org/2008/05/skos-xl#}altLabel',
    )

    # Slicing instead of indexing a fixed range: a file with fewer top-level
    # elements than `stop` is processed to its end rather than raising
    # IndexError.
    for orga in root[start:stop]:
        for entry in orga:
            if entry.tag not in label_tags:
                continue

            orga_name = entry[0][0].text
            # An empty literal has text None (or ''); there is no first
            # character to bucket it by, so skip it.
            if not orga_name:
                continue

            first_char = orga_name[0]
            bucket = first_char if first_char in alph else 'other'
            orga_dict[bucket].append(orga_name)

    # The context manager guarantees the CSV is flushed and closed.
    with open("crossref_organizations_sorted_dict.csv", "w", encoding='utf-8', newline='') as csv_file:
        w = csv.writer(csv_file, delimiter='|')
        for letter, orgs in orga_dict.items():
            w.writerow([letter, list(orgs)])

    return orga_dict
# Build the name buckets from the registry file on the mounted Drive; the call
# also writes crossref_organizations_sorted_dict.csv to the working directory.
orga_dict = funder_dictionary_creation("/content/drive/MyDrive/Independent_Study/registry.rdf")

### 2. Flattening the organization list from dictionary

In [6]:
# Concatenate the buckets, in dictionary order, into one flat list of names.
orga_list = [name for bucket in orga_dict.values() for name in bucket]
orga_list[:30]

['amfAR, The Foundation for AIDS Research',
 'amfAR',
 'amfAR The Foundation for AIDS Research',
 'amphilsoc',
 'atchildrensproject',
 'amerchemsociety',
 'acmtmedtox',
 'againstbc',
 'an Oifig Eorpach Frith-Chalaoise',
 'aTyr',
 'aTyr Pharma',
 'artsnb',
 'auDA Foundation',
 'auDA',
 'artsACT',
 'asbl Scientific Research Luxembourg',
 'asbl Recherches Scientifiques Luxembourg',
 'asbl RSL',
 'as',
 'bell-Hanger Foundation, Inc.',
 'blakemorefoundation',
 'badania na rzecz MŚP',
 'bluebird bio',
 'bluebird bio, Inc.',
 'bccn Tübingen',
 'ci-FRC of Strasbourg',
 'ci-FRC',
 'children - crohns & colitis',
 'ccalliance',
 'coherente de políticas de investigación']

Assumptions - 


1.   Period (“.”) is not part of the organization entity span. Thus, mismatch due to “.” should be ignored.
2.   If the mismatch is caused only by “The” at the beginning of the entity span, then ignore it. (This could be extended to other articles such as “a” and “an”; however, most organization names that begin with an article begin with “The”.)





In [7]:
def _normalize_orga_name(name):
  """Return `name` without one trailing period and without a leading 'The'."""
  # endswith() instead of name[-1]: an empty string must not raise IndexError.
  if name.endswith("."):
    name = name[:-1]
  words = name.split(" ")
  # A name that consists solely of the word 'the' is left untouched.
  if words[0].lower() == 'the' and name.lower() != 'the':
    name = " ".join(words[1:])
  return name

# Apply both assumptions stated above to every organization name.
orga_list = [_normalize_orga_name(name) for name in orga_list]

In [8]:
# Sanity check: print every name that still ends with a period or still starts
# with the word "The" (a name that is exactly "the" is allowed).
for name in orga_list:
  # endswith() instead of name[-1]: an empty string must not raise IndexError.
  has_trailing_period = name.endswith(".")
  has_leading_the = name.split(" ")[0].lower() == 'the' and name.lower() != 'the'
  if has_trailing_period or has_leading_the:
    print(name)

In [9]:
import pandas as pd
# Persist the normalised names as a single-column CSV in the working directory.
df = pd.DataFrame({'orgalist': orga_list})
df.to_csv("orgalist.csv", encoding='UTF-8')

## 5. Applying NER on the Crossref List

We use the pre-trained language models as NER tools for extracting organization names from the Crossref Funder Registry.



### 1. en_core_web_sm

In [43]:
# Apply the "en_core_web_sm" pipeline to every Crossref funder name and keep,
# per name, the text of each entity labelled ORG.
# nlp.pipe processes the names in batches, which avoids the overhead of one
# full pipeline call per name.
sm_crossref = [
  [ent.text for ent in doc.ents if ent.label_ == 'ORG']
  for doc in nlp_sm.pipe(orga_list)
]

sm_crossref[:10]

[['The Foundation for AIDS Research'],
 [],
 ['The Foundation for AIDS Research'],
 [],
 [],
 [],
 [],
 [],
 [],
 ['aTyr']]

In [34]:
# Number of ORG entities en_core_web_sm returned for each funder name.
l_sm = list(map(len, sm_crossref))

### 2. en_core_web_md

In [11]:
# Apply the "en_core_web_md" pipeline to every Crossref funder name and keep,
# per name, the text of each entity labelled ORG.
# nlp.pipe processes the names in batches, which avoids the overhead of one
# full pipeline call per name.
md_crossref = [
  [ent.text for ent in doc.ents if ent.label_ == 'ORG']
  for doc in nlp_md.pipe(orga_list)
]

md_crossref[:10]

[['The Foundation for AIDS Research'],
 [],
 ['The Foundation for AIDS Research'],
 [],
 [],
 ['amerchemsociety'],
 ['acmtmedtox'],
 ['againstbc'],
 [],
 []]

In [32]:
# Number of ORG entities en_core_web_md returned for each funder name.
l_md = list(map(len, md_crossref))

### 3. en_core_web_lg

In [12]:
# Apply the "en_core_web_lg" pipeline to every Crossref funder name and keep,
# per name, the text of each entity labelled ORG.
# nlp.pipe processes the names in batches, which avoids the overhead of one
# full pipeline call per name.
lg_crossref = [
  [ent.text for ent in doc.ents if ent.label_ == 'ORG']
  for doc in nlp_lg.pipe(orga_list)
]

lg_crossref[:10]

[['The Foundation for AIDS Research'],
 [],
 ['The Foundation for AIDS Research'],
 [],
 [],
 ['amerchemsociety'],
 [],
 [],
 [],
 []]

In [33]:
# Number of ORG entities en_core_web_lg returned for each funder name.
l_lg = list(map(len, lg_crossref))

### 4. flair large

In [13]:
# Run the Flair tagger over every Crossref funder name and keep, per name,
# the text of each span whose NER label is ORG.
flair_crossref = []
for funder_name in orga_list:
  flair_sentence = Sentence(funder_name)
  tagger.predict(flair_sentence)
  org_texts = [
    span.text
    for span in flair_sentence.get_spans('ner')
    if span.get_label("ner").value == 'ORG'
  ]
  flair_crossref.append(org_texts)

flair_crossref[:10]

[['The Foundation for AIDS Research'],
 [],
 ['The Foundation for AIDS Research'],
 [],
 [],
 ['amerchemsociety'],
 [],
 [],
 ['Oifig Eorpach Frith-Chalaoise'],
 []]

In [31]:
# Number of ORG spans Flair returned for each funder name.
l_flair = list(map(len, flair_crossref))

## 6. Saving the results in DataFrame

In [14]:
import pandas as pd
# One row per funder name: the name itself plus the ORG entities each tool
# extracted from it. The sequences are stacked as rows and then transposed so
# that each becomes a column.
column_sources = {
    'Corssref_Organization_List': orga_list,
    'NER_sm': sm_crossref,
    'NER_md': md_crossref,
    'NER_lg': lg_crossref,
    'NER_Flair': flair_crossref,
}
data = pd.DataFrame(list(column_sources.values())).transpose()
data.columns = list(column_sources)
data.head()

Unnamed: 0,Corssref_Organization_List,NER_sm,NER_md,NER_lg,NER_Flair
0,"amfAR, The Foundation for AIDS Research",[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research]
1,amfAR,[],[],[],[]
2,amfAR The Foundation for AIDS Research,[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research]
3,amphilsoc,[],[],[],[]
4,atchildrensproject,[],[],[],[]


In [15]:
# data = pd.read_csv("/content/drive/MyDrive/Independent_Study/corssref_ner.csv")
# Keep only the name and entity columns; the explicit copy makes the column
# assignment below write to an independent frame.
data = data[['Corssref_Organization_List', 'NER_sm', 'NER_md', 'NER_lg', 'NER_Flair']].copy()
# Drop a single trailing period from each name. endswith() replaces the
# previous [-1] lookup, which raised IndexError on an empty string.
data['Corssref_Organization_List'] = [
    name[:-1] if name.endswith('.') else name
    for name in map(str, data['Corssref_Organization_List'])
]
data.head(10)

Unnamed: 0,Corssref_Organization_List,NER_sm,NER_md,NER_lg,NER_Flair
0,"amfAR, The Foundation for AIDS Research",[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research]
1,amfAR,[],[],[],[]
2,amfAR The Foundation for AIDS Research,[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research],[The Foundation for AIDS Research]
3,amphilsoc,[],[],[],[]
4,atchildrensproject,[],[],[],[]
5,amerchemsociety,[],[amerchemsociety],[amerchemsociety],[amerchemsociety]
6,acmtmedtox,[],[acmtmedtox],[],[]
7,againstbc,[],[againstbc],[],[]
8,an Oifig Eorpach Frith-Chalaoise,[],[],[],[Oifig Eorpach Frith-Chalaoise]
9,aTyr,[aTyr],[],[],[]


## 7. Accuracy Score (Match Percentage) for the NER tools

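Accuracy of each NER tool when applied to the list of organizations curated by the Crossref Funder Registry. A registry name counts as a match only when the first ORG entity returned by the tool is identical to the full name.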


In [16]:
def _first_entity_match_flags(names, entity_lists):
  """Return one 1/0 flag per row: 1 when the first extracted entity equals the name exactly."""
  return [
    1 if len(entities) > 0 and name == entities[0] else 0
    for name, entities in zip(names, entity_lists)
  ]

# NER_sm
ner_sm_match = _first_entity_match_flags(data['Corssref_Organization_List'], data['NER_sm'])
data['NER_sm_match'] = ner_sm_match

# NER_md
ner_md_match = _first_entity_match_flags(data['Corssref_Organization_List'], data['NER_md'])
data['NER_md_match'] = ner_md_match

# NER_lg
ner_lg_match = _first_entity_match_flags(data['Corssref_Organization_List'], data['NER_lg'])
data['NER_lg_match'] = ner_lg_match

# NER_Flair
ner_flair_match = _first_entity_match_flags(data['Corssref_Organization_List'], data['NER_Flair'])
data['NER_flair_match'] = ner_flair_match

In [35]:
# Attach, per tool, the number of ORG entities found for each funder name.
for count_column, entity_counts in (
    ('NER_sm_orga_count', l_sm),
    ('NER_md_orga_count', l_md),
    ('NER_lg_orga_count', l_lg),
    ('NER_flair_orga_count', l_flair),
):
  data[count_column] = entity_counts

In [36]:
# Preview the first rows with the match flags and entity counts attached.
data.head()

Unnamed: 0,Corssref_Organization_List,NER_sm,NER_md,NER_lg,NER_Flair,NER_sm_match,NER_md_match,NER_lg_match,NER_flair_match,NER_sm_orga_count,NER_md_orga_count,NER_lg_orga_count,NER_flair_orga_count
0,"amfAR, The Foundation for AIDS Research",['The Foundation for AIDS Research'],['The Foundation for AIDS Research'],['The Foundation for AIDS Research'],['The Foundation for AIDS Research'],0,0,0,0,1,1,1,1
1,amfAR,[],[],[],[],0,0,0,0,0,0,0,0
2,amfAR The Foundation for AIDS Research,['The Foundation for AIDS Research'],['The Foundation for AIDS Research'],['The Foundation for AIDS Research'],['The Foundation for AIDS Research'],0,0,0,0,1,1,1,1
3,amphilsoc,[],[],[],[],0,0,0,0,0,0,0,0
4,atchildrensproject,[],[],[],[],0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96760,Illawarra Shoalhaven LHD,[],[],['Illawarra Shoalhaven'],['Illawarra Shoalhaven LHD'],0,0,0,1,0,0,1,1
96761,ISLHD,[],[],[],['ISLHD'],0,0,0,1,0,0,0,1
96762,Centre for Health Research Illawarra Shoalhave...,['Centre for Health Research Illawarra Shoalha...,['Centre for Health Research Illawarra Shoalha...,['Centre for Health Research Illawarra Shoalha...,['Centre for Health Research Illawarra Shoalha...,1,0,1,0,1,2,1,2
96763,Centre for Health Research Illawarra Shoalhave...,['Centre for Health Research Illawarra Shoalha...,['Centre for Health Research Illawarra Shoalha...,['Centre for Health Research Illawarra Shoalha...,['Centre for Health Research'],1,0,0,0,1,1,1,1


In [38]:
# Accuracy = share of rows whose match flag is 1.
total_rows = len(data)
NER_sm_acc = int((data['NER_sm_match'] == 1).sum()) / total_rows
NER_md_acc = int((data['NER_md_match'] == 1).sum()) / total_rows
NER_lg_acc = int((data['NER_lg_match'] == 1).sum()) / total_rows
NER_flair_acc = int((data['NER_flair_match'] == 1).sum()) / total_rows

In [44]:
# Share of registry names whose first ORG entity from en_core_web_sm equals the name.
print("The accuracy score for en_core_web_sm: ", NER_sm_acc)

The accuracy score for en_core_web_sm:  0.4804009714256188


In [45]:
# Share of registry names whose first ORG entity from en_core_web_md equals the name.
print("The accuracy score for en_core_web_md: ", NER_md_acc)

The accuracy score for en_core_web_md:  0.5165194026765876


In [46]:
# Share of registry names whose first ORG entity from en_core_web_lg equals the name.
print("The accuracy score for en_core_web_lg: ", NER_lg_acc)

The accuracy score for en_core_web_lg:  0.5180488813103912


In [47]:
# Share of registry names whose first ORG span from Flair equals the name.
print("The accuracy score for flair large: ", NER_flair_acc)

The accuracy score for flair large:  0.7306567457241772


## 8. Saving results in a CSV

We save the results, listing the complete matches, partial matches, and mismatches produced by comparing the Crossref Funder Registry with each NER tool's output. We use this list to conduct a manual error analysis and to identify partial-mismatch cases where a tool fails to detect the expected organization span.

In [37]:
# Write the full comparison table (names, extracted entities, match flags and
# entity counts) to Drive, without the row index.
# NOTE(review): the entity columns hold Python lists, which presumably come
# back as plain strings when this CSV is read again -- confirm before reusing
# the reloaded file for the match computation.
data.to_csv("/content/drive/MyDrive/Independent_Study/corssref_ner.csv", index = False)