Task 4: Named Entity Recognition with bert-base-NER

The PDF file must be located in the notebook's current working directory (the directory the notebook is run from).

In [45]:
import os
from PyPDF2 import PdfReader
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [46]:
cwd = os.getcwd()
# Build the path with os.path.join so the separator is handled by the standard
# library (no doubled separator when the working directory already ends in one).
pdf_path = os.path.join(cwd, 'Successful Algorithmic Trading.pdf')

Read pages 100 through 109 from the PDF file (page indices are zero-based and the end index, 110, is exclusive)

In [47]:
def get_pdf_pages_data(pdf_file: str, starting_page: int, ending_page: int) -> dict:
    """
    Extract the text of a range of pages from a PDF file.

    The page numbers are used directly as indices into ``reader.pages`` and the
    range is half-open: ``starting_page`` is read, ``ending_page`` is not.

    :param pdf_file: Path of the PDF file to read from
    :param starting_page: Index of the first page to read (inclusive)
    :param ending_page: Index at which reading stops (exclusive)
    :return: Dictionary with page indices as keys and the text extracted from
        each page as values
    """

    with open(pdf_file, 'rb') as pdf_stream:
        pdf_reader = PdfReader(pdf_stream)

        # The text is extracted while the file handle is still open.
        return {
            page_index: pdf_reader.pages[page_index].extract_text()
            for page_index in range(starting_page, ending_page)
        }

In [48]:
# Page text keyed by page index; the end index is exclusive.
pdf_pages_extracted_data = get_pdf_pages_data(
    pdf_file=pdf_path,
    starting_page=100,
    ending_page=110,
)

Load the bert-base-NER model and run it over the text of the extracted pages

In [49]:
# Identifier of the pretrained checkpoint loaded for both tokenizer and model.
model_name = 'dslim/bert-base-NER'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# NER pipeline object that the page-processing function calls on each page's text.
nlp = pipeline(task='ner', tokenizer=tokenizer, model=model)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [50]:
def process_page_data(page_data: str) -> list[dict]:
    """
    Runs the bert-base-NER pipeline over the text read from one PDF page
    :param page_data: text read from a PDF page
    :return: the entity predictions returned by the ``nlp`` pipeline, or an
        empty list when the page has no text to analyse
    """

    # A page may yield no text at all (empty, whitespace-only or None); there is
    # nothing to tag in that case, so skip the model call entirely.
    if not page_data or not page_data.strip():
        return []

    return nlp(page_data)

In [51]:
# NER predictions for every extracted page, keyed by the same page index.
pdf_pages_processed_data = {
    page_index: process_page_data(page_text)
    for page_index, page_text in pdf_pages_extracted_data.items()
}

Filter the results to keep only matches with a confidence score of at least 90%

In [52]:
def filter_by_confidence_score(pages_processed_data: list[dict], score_threshold: float) -> list[dict]:
    """
    Filters the entities of one page by a confidence score threshold
    :param pages_processed_data: entity dictionaries produced by bert-base-NER
        for a page; each one is expected to carry a 'score' entry
    :param score_threshold: the minimum confidence score, given as a percentage
        (e.g. 90 keeps entities whose score is >= 0.9)
    :return: new list with the entities whose score reaches the threshold,
        in their original order
    """

    # Divide instead of multiplying by 0.01: the division is correctly rounded
    # (57 / 100 == 0.57) whereas 57 * 0.01 == 0.5700000000000001, which would
    # wrongly drop an entity whose score sits exactly on the threshold.
    min_score = score_threshold / 100

    return [
        entity
        for entity in pages_processed_data
        if entity.get('score') >= min_score
    ]

In [53]:
# Per page, keep only the entities that reach the 90% confidence threshold.
filtered_pages_processed_data = {
    page_index: filter_by_confidence_score(page_entities, 90)
    for page_index, page_entities in pdf_pages_processed_data.items()
}

Print the entity type, entity text, page number and confidence score

In [54]:
def print_data(processed_data: dict[int, list[dict]]) -> None:
    """
    Prints out the filtered data, one line per entity
    :param processed_data: dictionary mapping a page number to the list of
        entity dictionaries for that page; each entity is read through its
        "entity", "word" and "score" keys
    :return: None
    """

    # The loop variable gets its own name so it no longer shadows the parameter.
    for page_num, page_entities in processed_data.items():
        for data_entity in page_entities:
            entity_type = data_entity["entity"]
            entity_text = data_entity["word"]
            confidence_score = data_entity["score"]
            print(f'Entity type: {entity_type}, Entity text: {entity_text}, Page number: {page_num}, Confidence score: {confidence_score}')
   

In [55]:
print_data(processed_data=filtered_pages_processed_data)

Entity type: B-MISC, Entity text: Python, Page number: 100, Confidence score: 0.9742887020111084
Entity type: B-ORG, Entity text: Yahoo, Page number: 100, Confidence score: 0.9954401254653931
Entity type: I-ORG, Entity text: Finance, Page number: 100, Confidence score: 0.9972894191741943
Entity type: B-MISC, Entity text: AD, Page number: 100, Confidence score: 0.9362190365791321
Entity type: I-MISC, Entity text: ##das, Page number: 100, Confidence score: 0.9351540803909302
Entity type: B-ORG, Entity text: Yahoo, Page number: 101, Confidence score: 0.997416615486145
Entity type: I-ORG, Entity text: Finance, Page number: 101, Confidence score: 0.9985200762748718
Entity type: I-MISC, Entity text: Square, Page number: 101, Confidence score: 0.9349412322044373
Entity type: I-MISC, Entity text: ##s, Page number: 101, Confidence score: 0.9037943482398987
Entity type: B-MISC, Entity text: AD, Page number: 103, Confidence score: 0.949324369430542
Entity type: B-MISC, Entity text: AD, Page numbe