# Application of Named Entity Recognition tools on Funding Information

This notebook extracts named entities from text. Named Entity Recognition (NER) tools are used to extract organization names from funding information obtained from PubMed research papers.

# 1. Setting up the environment


In [None]:
# Mount Google Drive into the Colab runtime so that files kept on Drive
# can be reached from this notebook.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
# Record the start time. The last cell of the notebook takes a second reading
# and prints the difference to t_0 as the elapsed run time.
import timeit
t_0 = timeit.default_timer()

In [None]:
# importing required libraries
import pandas as pd
import numpy as np
import spacy
import pickle

In [None]:
# Load the acknowledgement dataset and give its four columns fixed names.
ACK_COLUMN_NAMES = ['Article_Title', 'PMC_ID', 'DOI', 'acknowledgement']

ack_df = pd.read_csv("cumulative_ack_data.csv")
ack_df.columns = ACK_COLUMN_NAMES

# Preview the first rows as the cell output.
ack_df.head()

In [None]:
# Keep only the rows that carry funding information. A row is dropped when its
# acknowledgement is the placeholder string 'na' or is missing altogether
# (NaN, e.g. an empty CSV field), because the NER models applied below can
# only process strings.
# .copy() makes the result an independent DataFrame, so the columns added in
# later cells do not trigger pandas' SettingWithCopyWarning.
has_funding_info = ack_df['acknowledgement'].notna() & (ack_df['acknowledgement'] != 'na')
ack_df = ack_df[has_funding_info].copy()

# 2. Applying NER on funding information

## 1. Spacy EntityRecognizer (en_core_web_sm)

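ner categories - CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART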


In [None]:
# Load the small English spaCy pipeline; `nlp` is reused by the next cell.
nlp = spacy.load('en_core_web_sm')

# Sanity check: run the pipeline on a sample sentence and list each entity
# together with its label.
sample_text = 'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sample_text)
for entity in doc.ents:
    print(entity, entity.label_)

In [None]:
# Run the loaded spaCy pipeline over every acknowledgement and group the
# recognised entity texts by their label, producing one dict per row
# (label -> list of entity texts, in order of appearance).
ack_ner = []
for ack_text in ack_df['acknowledgement']:
    entities_by_label = {}
    for entity in nlp(ack_text).ents:
        # setdefault creates the list the first time a label is seen
        entities_by_label.setdefault(entity.label_, []).append(entity.text)
    ack_ner.append(entities_by_label)

# Store the per-row dicts as a new column, then move the current index into
# a regular column and renumber the rows from 0.
ack_df['NER_Spacy (en_core_web_sm)'] = ack_ner
ack_df.reset_index(inplace=True)

In [None]:
# Preview the first rows, now including the en_core_web_sm NER column.
ack_df.head()

In [None]:
# Inspect one example row: the raw acknowledgement text followed by the
# entities en_core_web_sm extracted from it.
example_row = ack_df.loc[4, :]
print(example_row['acknowledgement'])
print(example_row['NER_Spacy (en_core_web_sm)'])

## 2. Spacy EntityRecognizer (en_core_web_md)

ner categories - CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART

In [None]:
# https://stackoverflow.com/questions/56927602/unable-to-load-the-spacy-model-en-core-web-lg-on-google-colab
import spacy.cli

# Download the medium English pipeline, then load it; `nlp` now refers to
# this model for the cells that follow.
SPACY_MD_MODEL = "en_core_web_md"
spacy.cli.download(SPACY_MD_MODEL)
nlp = spacy.load(SPACY_MD_MODEL)

# Sanity check: run the pipeline on a sample sentence and list each entity
# together with its label.
sample_text = 'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sample_text)
for entity in doc.ents:
    print(entity, entity.label_)

In [None]:
# Run the currently loaded spaCy pipeline (en_core_web_md) over every
# acknowledgement and group the recognised entity texts by their label,
# producing one dict per row (label -> list of entity texts).
ack_ner = []
for ack_text in ack_df['acknowledgement']:
    entities_by_label = {}
    for entity in nlp(ack_text).ents:
        # setdefault creates the list the first time a label is seen
        entities_by_label.setdefault(entity.label_, []).append(entity.text)
    ack_ner.append(entities_by_label)

ack_df['NER_Spacy (en_core_web_md)'] = ack_ner

In [None]:
# Preview the first rows, now including the en_core_web_md NER column.
ack_df.head()

In [None]:
# Inspect one example row: the raw acknowledgement text followed by the
# entities en_core_web_md extracted from it.
example_row = ack_df.loc[1, :]
print(example_row['acknowledgement'])
print(example_row['NER_Spacy (en_core_web_md)'])

## 3. Spacy EntityRecognizer (en_core_web_lg)

ner categories  - CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART

In [None]:
# https://stackoverflow.com/questions/56927602/unable-to-load-the-spacy-model-en-core-web-lg-on-google-colab
import spacy.cli

# Download the large English pipeline, then load it; `nlp` now refers to
# this model for the cells that follow.
SPACY_LG_MODEL = "en_core_web_lg"
spacy.cli.download(SPACY_LG_MODEL)
nlp = spacy.load(SPACY_LG_MODEL)

# Sanity check: run the pipeline on a sample sentence and list each entity
# together with its label.
sample_text = 'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sample_text)
for entity in doc.ents:
    print(entity, entity.label_)

In [None]:
# Run the currently loaded spaCy pipeline (en_core_web_lg) over every
# acknowledgement and group the recognised entity texts by their label,
# producing one dict per row (label -> list of entity texts).
ack_ner = []
for ack_text in ack_df['acknowledgement']:
    entities_by_label = {}
    for entity in nlp(ack_text).ents:
        # setdefault creates the list the first time a label is seen
        entities_by_label.setdefault(entity.label_, []).append(entity.text)
    ack_ner.append(entities_by_label)

ack_df['NER_Spacy (en_core_web_lg)'] = ack_ner

In [None]:
# Preview the first rows, now including the en_core_web_lg NER column.
ack_df.head()

In [None]:
# Inspect one example row: the raw acknowledgement text followed by the
# entities en_core_web_lg extracted from it.
example_row = ack_df.loc[1, :]
print(example_row['acknowledgement'])
print(example_row['NER_Spacy (en_core_web_lg)'])

4. Flair - English NER in Flair (large model) - 4 class model

## 4. Flair - English NER in Flair (large model)

4-class model (ner categories - LOC, MISC, ORG, PER)

In [None]:
# Install the Flair library into the notebook runtime (notebook shell command).
!pip install flair

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# Load the NER tagger
# This file is around 1.5 GB so will take a little while to load.
tagger = SequenceTagger.load("flair/ner-english-large")

# Wrap a sample text in a Flair Sentence and let the tagger annotate it in
# place; the predicted spans are read back from `sentence` in the next cell.
sample_text = "Apple is looking at buying U.K. startup for $1 billion"
sentence = Sentence(sample_text)
tagger.predict(sentence)

In [None]:
# Show every NER span the tagger attached to the sample sentence.
print('The following NER tags are found:')
ner_spans = sentence.get_spans('ner')
for span in ner_spans:
    print(span)

In [None]:
# Run the Flair tagger over every acknowledgement and group the predicted
# span texts by their NER label, producing one dict per row
# (label -> list of entity texts).
ack_ner = []
for ack_text in ack_df['acknowledgement']:
    sentence = Sentence(ack_text)
    tagger.predict(sentence)  # annotates `sentence` in place

    entities_by_label = {}
    for span in sentence.get_spans('ner'):
        label = span.get_label("ner").value
        # setdefault creates the list the first time a label is seen
        entities_by_label.setdefault(label, []).append(span.text)
    ack_ner.append(entities_by_label)

ack_df['NER_Flair'] = ack_ner

In [None]:
# Preview the first rows, now including the Flair NER column.
ack_df.head()

In [None]:
# Inspect one example row: the raw acknowledgement text followed by the
# entities Flair extracted from it.
example_row = ack_df.loc[1, :]
print(example_row['acknowledgement'])
print(example_row['NER_Flair'])

# 3. Output data generation

In [None]:
# For each NER result column, derive a companion column that holds only the
# organization names: the list stored under the 'ORG' label, or the string
# "NA" when the model found no organization in that acknowledgement.
# Keys are the source columns, values the organization columns to create;
# the columns are added in the order listed here.
ORG_COLUMN_FOR_NER_COLUMN = {
    'NER_Spacy (en_core_web_sm)': 'NER_spacy_sm_org',
    'NER_Spacy (en_core_web_md)': 'NER_spacy_md_org',
    'NER_Spacy (en_core_web_lg)': 'NER_spacy_lg_org',
    'NER_Flair': 'NER_Flair_org',
}

for ner_column, org_column in ORG_COLUMN_FOR_NER_COLUMN.items():
    # dict.get covers both an empty dict and a dict without an 'ORG' key
    ner_org = [entities.get('ORG', "NA") for entities in ack_df[ner_column]]
    ack_df[org_column] = ner_org

In [None]:
# Display the DataFrame with all NER and organization columns.
ack_df

In [None]:
# Save the DataFrame, including all NER result columns, as a CSV file in the
# Colab runtime's /content directory.
ack_df.to_csv("/content/ack_ner.csv")

In [None]:
# Mount Google Drive at /content/drive; the next cell writes the pickle
# output below this mount point.
from google.colab import drive

OUTPUT_MOUNT_POINT = '/content/drive'
drive.mount(OUTPUT_MOUNT_POINT)


In [None]:
# Persist the full DataFrame as a pickle so the dict/list cells keep their
# Python types when loaded again.
# NOTE(review): the path is relative, so this presumably relies on the
# working directory being /content - confirm.
PICKLE_PATH = 'drive/MyDrive/ack_ner.pickle'
with open(PICKLE_PATH, 'wb') as pickle_file:
    pickle.dump(ack_df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Take the end reading and report the time elapsed since t_0 was recorded
# at the top of the notebook.
t_1 = timeit.default_timer()
elapsed = t_1 - t_0
print("The time elapsed: ", elapsed)