# Perform NER on Book 4 (BIO format)

This notebook contains the code to run Flair NER and spaCy NER on Book 4 and to structure the output in BIO format.

The resulting .csv file lists all the tokens in Book 4 (18,664), including punctuation and special characters. Each token is associated with its reference position (book, chapter, paragraph), the position of the token in the paragraph ('Index' column), the start position of the token in the paragraph at the character level ('Start_pos' column), the BIO annotation produced by Flair, Flair-large and Flair+Splitter, each with the score the model assigned to the entity label ('Precision' columns), and the BIO annotation produced by spaCy (md and trf models).

In [None]:
##import Flair NER
import pandas as pd
from bs4 import BeautifulSoup

from flair.data import Sentence
from flair.nn import Classifier
from flair.models import SequenceTagger
from flair.splitter import SegtokSentenceSplitter

import spacy

In [None]:
## parse the source HTML page with BeautifulSoup; the file is opened in a
## `with` block so the handle is closed as soon as the markup has been read
source_path = "/Users/u0154817/OneDrive - KU Leuven/Documents/KU Leuven/PhD project 'Greek Spaces in Roman Times'/Data_Extraction/Sources/NH_Eng_ToposText/NH_Eng_1-11.html"
with open(source_path, encoding='utf-8') as source_file:
    soup = BeautifulSoup(source_file, features="lxml")

## get all the paragraphs in Book 4: the <p> tags whose id starts with
## urn:cts:latinLit:phi0978.phi001:4. (tags without an id are skipped by the `x and ...` guard)
book_4 = soup.find_all("p", id=lambda x: x and x.startswith("urn:cts:latinLit:phi0978.phi001:4."))

In [None]:
## accumulators for the whole book: one entry per token, filled paragraph by paragraph
reference_column, index_column, token_column = [], [], []
start_pos_column, BIO_column, BIO_precision = [], [], []

# Flair ner

In [None]:
## load the standard Flair NER tagger ('ner') once, before the loop: reloading
## the model for every paragraph repeats an expensive load for no benefit
tagger = Classifier.load('ner')

for paragraph in book_4: ## for each paragraph in Book 4

    p_tag_id = paragraph['id'] ## get the id of the paragraph
    print(p_tag_id) ## print the id of the paragraph (progress trace)
    text = paragraph.get_text() ## get the text of the paragraph

    ## the whole paragraph is handed to Flair as one Sentence object
    sentence = Sentence(text)

    ## run NER over the sentence
    tagger.predict(sentence)

    ## per-paragraph columns, one entry per token
    reference_column_temp = []
    index_column_temp = []
    token_column_temp = []
    start_pos_column_temp = []
    BIO_column_temp = []
    BIO_precision_temp = []

    ## start position -> row indices in the temp columns, so that entity
    ## tokens are located with a lookup instead of a scan over every token
    rows_by_start_pos = {}

    for index, token in enumerate(sentence):

        token_text = token.text ## get the text of the token
        start_pos = token.start_position ## get the start position of the token

        reference_column_temp.append(p_tag_id)
        index_column_temp.append(index)
        token_column_temp.append(token_text)
        start_pos_column_temp.append(start_pos)
        BIO_column_temp.append('O') ## default: outside any entity
        BIO_precision_temp.append('-') ## default: no score
        rows_by_start_pos.setdefault(int(start_pos), []).append(index)

    for entity in sentence.get_spans('ner'): ## obtain span objects of the Named Entities

        entity_label = entity.labels[0].value ## extract the label assigned to the entity
        entity_score = entity.labels[0].score ## get the score attached to that label

        for index, token in enumerate(entity): ## for each token in the entity

            ## the first token of an entity is B-(eginning), the others are I-(nside)
            prefix = 'B-' if index == 0 else 'I-'
            entity_label_token = prefix + str(entity_label)

            ## every token of the entity receives the score of the whole span
            for i in rows_by_start_pos.get(int(token.start_position), []):
                BIO_column_temp[i] = entity_label_token
                BIO_precision_temp[i] = entity_score

    reference_column.extend(reference_column_temp)
    index_column.extend(index_column_temp)
    token_column.extend(token_column_temp)
    start_pos_column.extend(start_pos_column_temp)
    BIO_column.extend(BIO_column_temp)
    BIO_precision.extend(BIO_precision_temp)

In [None]:
# Assemble the token-level columns into a pandas dataframe (one row per token)

column_names = ('Reference', 'Index', 'Token', 'Start_pos', 'BIO_Flair', 'Precision_Flair')
column_values = (
    reference_column,
    index_column,
    token_column,
    start_pos_column,
    BIO_column,
    BIO_precision,
)

## dict preserves insertion order, so the dataframe columns follow column_names
data = dict(zip(column_names, column_values))

df = pd.DataFrame(data)

# Flair ner-large

In [None]:
## book-level accumulators for the ner-large run (label and score per token)
BIO_Flair_large_column, Precision_Flair_large_column = [], []

In [None]:
## load the Flair 'ner-large' tagger once, before the loop: reloading the
## model for every paragraph repeats an expensive load for no benefit
tagger = Classifier.load('ner-large')

for paragraph in book_4: ## for each paragraph in Book 4

    p_tag_id = paragraph['id'] ## get the id of the paragraph
    print(p_tag_id) ## print the id of the paragraph (progress trace)
    text = paragraph.get_text() ## get the text of the paragraph

    ## the whole paragraph is handed to Flair as one Sentence object
    sentence = Sentence(text)

    ## run NER over the sentence
    tagger.predict(sentence)

    ## per-paragraph columns, one entry per token
    start_pos_column_temp = []
    BIO_Flair_large_column_temp = []
    Precision_Flair_large_column_temp = []

    ## start position -> row indices in the temp columns, so that entity
    ## tokens are located with a lookup instead of a scan over every token
    rows_by_start_pos = {}

    for index, token in enumerate(sentence):

        start_pos = token.start_position ## get the start position of the token

        start_pos_column_temp.append(start_pos)
        BIO_Flair_large_column_temp.append('O') ## default: outside any entity
        Precision_Flair_large_column_temp.append('-') ## default: no score
        rows_by_start_pos.setdefault(int(start_pos), []).append(index)

    for entity in sentence.get_spans('ner'): ## obtain span objects

        entity_label = entity.labels[0].value ## extract the label assigned to the entity
        entity_score = entity.labels[0].score ## get the score attached to that label

        for index, token in enumerate(entity): ## entities can span one or more tokens

            ## the first token of an entity is B-(eginning), the others are I-(nside)
            prefix = 'B-' if index == 0 else 'I-'
            entity_label_token = prefix + str(entity_label)

            ## every token of the entity receives the score of the whole span
            for i in rows_by_start_pos.get(int(token.start_position), []):
                BIO_Flair_large_column_temp[i] = entity_label_token
                Precision_Flair_large_column_temp[i] = entity_score

    BIO_Flair_large_column.extend(BIO_Flair_large_column_temp)
    Precision_Flair_large_column.extend(Precision_Flair_large_column_temp)

In [None]:
## attach the ner-large labels and scores to the dataframe as two new columns
for column_name, column_values in (
    ('BIO_Flair-large', BIO_Flair_large_column),
    ('Precision_Flair-large', Precision_Flair_large_column),
):
    df[column_name] = column_values

# Flair ner-large + SegtokSentenceSplitter

In [None]:
## initialize the Flair sentence splitter (SegtokSentenceSplitter); it is created
## once here and reused to split every paragraph into sentences
splitter = SegtokSentenceSplitter()

In [None]:
## book-level accumulators for the ner-large + splitter run (label and score per token)
BIO_Flair_splitter_column, Precision_Flair_splitter_column = [], []

In [None]:
## load the Flair 'ner-large' tagger once, before the loop: reloading the
## model for every paragraph repeats an expensive load for no benefit
tagger = Classifier.load('ner-large')

## NOTE(review): the columns built here are later assigned to df, which needs
## exactly one entry per df row; that holds only if the split sentences yield
## the same tokens as the unsplit paragraphs - confirm with the length check
for paragraph in book_4: ## for each paragraph in Book 4

    p_tag_id = paragraph['id'] ## get the id of the paragraph
    print(p_tag_id) ## print the id of the paragraph (progress trace)
    text = paragraph.get_text() ## get the text of the paragraph

    ## use splitter to split the text into a list of sentences
    sentences = splitter.split(text)

    ## run NER over all the sentences of the paragraph
    tagger.predict(sentences)

    for sentence in sentences:

        if len(sentence) == 0: ## sentences without tokens contribute no rows
            continue

        ## per-sentence columns, one entry per token
        start_pos_splitter_column = []
        BIO_Flair_splitter_column_temp = []
        Precision_Flair_splitter_temp = []

        ## start position -> row indices in the temp columns, so that entity
        ## tokens are located with a lookup instead of a scan over every token
        rows_by_start_pos = {}

        for row, token in enumerate(sentence): ## for each token in the sentence

            start_pos_splitter = str(token.start_position) ## start position of the token, as text

            start_pos_splitter_column.append(start_pos_splitter)
            BIO_Flair_splitter_column_temp.append('O') ## default: outside any entity
            Precision_Flair_splitter_temp.append('-') ## default: no score
            rows_by_start_pos.setdefault(start_pos_splitter, []).append(row)

        for entity in sentence.get_spans('ner'):

            entity_label = entity.labels[0].value ## extract the label assigned to the entity
            entity_score = entity.labels[0].score ## get the score attached to that label

            for index1, token in enumerate(entity): ## entities can span one or more tokens

                ## the first token of an entity is B-(eginning), the others are I-(nside)
                prefix = 'B-' if index1 == 0 else 'I-'
                entity_label_token = prefix + str(entity_label)

                ## every token of the entity receives the score of the whole span
                for index2 in rows_by_start_pos.get(str(token.start_position), []):
                    BIO_Flair_splitter_column_temp[index2] = entity_label_token
                    Precision_Flair_splitter_temp[index2] = entity_score

        BIO_Flair_splitter_column.extend(BIO_Flair_splitter_column_temp)
        Precision_Flair_splitter_column.extend(Precision_Flair_splitter_temp)

In [None]:
## display the number of entries collected by the splitter run
## NOTE(review): presumably compared by eye with len(df) before the column assignment - confirm
len(Precision_Flair_splitter_column)

In [None]:
## attach the splitter-run labels and scores to the dataframe as two new columns
for column_name, column_values in (
    ('BIO_Flair_Splitter', BIO_Flair_splitter_column),
    ('Precision_Flair_Splitter', Precision_Flair_splitter_column),
):
    df[column_name] = column_values

# spaCy-md

In [None]:
## create the spaCy-md column with every row set to 'O' (outside any entity);
## rows matched to a spaCy entity token are overwritten afterwards
df['BIO_spaCy-md'] = 'O'

In [None]:
## load the spaCy pipeline "en_core_web_md" used for the BIO_spaCy-md column
nlp_spaCy = spacy.load("en_core_web_md")

In [None]:
## index the dataframe rows once by "<paragraph id>.<start position>", so each
## spaCy token is matched with a lookup instead of a scan over every row
rows_by_position = {}
for row_label, reference, start_pos in zip(df.index, df['Reference'], df['Start_pos']):
    rows_by_position.setdefault(str(reference)+'.'+str(start_pos), []).append(row_label)

for paragraph in book_4: ## for each paragraph in Book 4

    p_tag_id = paragraph['id'] ## get the id of the paragraph
    print(p_tag_id) ## print the id of the paragraph (progress trace)
    text = paragraph.get_text() ## get the text of the paragraph

    processed_text = nlp_spaCy(text)

    for token in processed_text: ## iterate over the spaCy tokens of the paragraph

        if not token.ent_type_: ## token is not part of any named entity
            continue

        entity_label_token = str(token.ent_iob_)+'-'+token.ent_type_
        start_pos_token = token.idx ## character offset of the token in the paragraph
        paragraph_start_pos = str(p_tag_id)+'.'+str(start_pos_token)

        ## only rows whose start position equals the spaCy token's offset are
        ## labelled; spaCy tokens with no row at that offset are skipped
        matching_rows = rows_by_position.get(paragraph_start_pos)
        if matching_rows:
            ## df.loc writes into df itself; the chained form df[col][i] = ...
            ## may write to a temporary copy and leave df unchanged
            df.loc[matching_rows, 'BIO_spaCy-md'] = entity_label_token

# spaCy-trf

In [None]:
## create the spaCy-trf column with every row set to 'O' (outside any entity);
## rows matched to a spaCy entity token are overwritten afterwards
df['BIO_spaCy-trf'] = 'O'

In [None]:
## load the spaCy pipeline "en_core_web_trf"; this rebinds nlp_spaCy, so the
## previously loaded pipeline is no longer reachable through that name
nlp_spaCy = spacy.load("en_core_web_trf")

In [None]:
## index the dataframe rows once by "<paragraph id>.<start position>", so each
## spaCy token is matched with a lookup instead of a scan over every row
rows_by_position = {}
for row_label, reference, start_pos in zip(df.index, df['Reference'], df['Start_pos']):
    rows_by_position.setdefault(str(reference)+'.'+str(start_pos), []).append(row_label)

for paragraph in book_4: ## for each paragraph in Book 4

    p_tag_id = paragraph['id'] ## get the id of the paragraph
    print(p_tag_id) ## print the id of the paragraph (progress trace)
    text = paragraph.get_text() ## get the text of the paragraph

    processed_text = nlp_spaCy(text)

    for token in processed_text: ## iterate over the spaCy tokens of the paragraph

        if not token.ent_type_: ## token is not part of any named entity
            continue

        entity_label_token = str(token.ent_iob_)+'-'+token.ent_type_
        start_pos_token = token.idx ## character offset of the token in the paragraph
        paragraph_start_pos = str(p_tag_id)+'.'+str(start_pos_token)

        ## only rows whose start position equals the spaCy token's offset are
        ## labelled; spaCy tokens with no row at that offset are skipped
        matching_rows = rows_by_position.get(paragraph_start_pos)
        if matching_rows:
            ## df.loc writes into df itself; the chained form df[col][i] = ...
            ## may write to a temporary copy and leave df unchanged
            df.loc[matching_rows, 'BIO_spaCy-trf'] = entity_label_token

In [None]:
## write the full token table to a CSV file in the current working directory
## (no index argument is given, so pandas' default index handling applies)
df.to_csv("BIO_NER_Flair_spaCy.csv")