# Named Entity Recognition

In [1]:
import os
import statistics
import json
import re
from pathlib import Path
import pandas as pd
import tiktoken
from dotenv import load_dotenv
load_dotenv('../.env') 

True

In [2]:
# Relative path to the folder the input CSV is read from.
directory = Path("../data")
# Loader class used further down to turn the dataframe into documents.
from langchain.document_loaders import DataFrameLoader

### Read files into dataframes

In [3]:
# The input file is pipe-delimited, hence sep="|".
filename = 'km_ganguli_translation_1.csv'
df = pd.read_csv(directory/filename, sep="|")
# Show the last few rows as a quick sanity check of the load.
df.tail()

Unnamed: 0,book_number,section,section_name,text,para_number,book_name,num_tokens
737,1,SECTION CCXXXV,Khandava-daha Parva continued,"\n\n""Vaisampayana said, 'O thou of Kuru's race...",1,Adi Parva,511
738,1,SECTION CCXXXV,Khandava-daha Parva continued,"""Hearing these words, Mandapala replied, 'I do...",2,Adi Parva,726
739,1,SECTION CCXXXV,Khandava-daha Parva continued,"""Vaisampayana continued, 'After this, all his ...",3,Adi Parva,41
740,1,SECTION CCXXXVI,Khandava-daha Parva continued,"\n\n""Vaisampayana said, 'Mandapala then addres...",1,Adi Parva,665
741,1,SECTION CCXXXVI,Khandava-daha Parva continued,END OF ADI PARVA\n\nFOOTNOTES\n\n1. These are ...,2,Adi Parva,12


### Load the dataframe into a loader

In [4]:
# Use the "text" column of the dataframe as each document's page content.
loader = DataFrameLoader(df, page_content_column="text")

In [5]:
# Load the documents; later cells index `docs` and read `.page_content`.
docs = loader.load()

## Named Entity Recognition

## Helper functions

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [85]:

## Helper function to combine tokens into names

stop_words = ["", "the", "The", "THE", "Sir", "Dr", "Mr", "of", "and"]
# stop_words = []
def combine_tokens(ner_results):
    """Merge token-level NER predictions into whole names.

    A token starts a new name when its ``word`` begins with the "▁" marker
    or when its ``start`` offset lies beyond the previous token's ``end``.
    Any other token is treated as a continuation and appended to the name
    being built. Every character other than ASCII letters, digits and '-'
    is removed from each token before it is added.

    Parameters
    ----------
    ner_results : iterable of dict
        Token predictions; each item must provide the keys ``'word'``,
        ``'start'``, ``'end'`` and ``'entity'``.

    Returns
    -------
    list of dict
        One ``{'name': ..., 'entity': ...}`` record per kept name, in input
        order. Names that appear in ``stop_words`` or are two characters or
        shorter are dropped, and trailing hyphens are stripped. The entity
        is the label of the first token of the name. An empty input yields
        an empty list.
    """
    entities_list = []
    name = ""
    entity = "NA"
    prev_word_end = 0

    def save_current_name():
        # One filter shared by every name, including the very last one, so
        # stop words and short fragments never leak into the result.
        if name in stop_words or len(name) <= 2:
            return
        # Remove trailing hyphens from the name before saving.
        entities_list.append({'name': name.rstrip("-"), 'entity': entity})

    for res in ner_results:
        word = res['word']
        # Keep only letters, digits and '-' from the token text.
        cleaned = re.sub(r'[^a-zA-Z0-9\-]', '', word)
        # startswith() instead of word[0] so an empty token cannot raise.
        starts_new_word = word.startswith("▁") or res['start'] > prev_word_end

        if starts_new_word:
            save_current_name()
            name = cleaned
            entity = res['entity']
        else:
            # A name that opens with a continuation token (no marker and no
            # gap) has no label yet; take it from this token rather than
            # leaving the "NA" placeholder.
            if entity == "NA":
                entity = res['entity']
            name = name + cleaned

        prev_word_end = res['end']

    # Flush the name still being built when the input ends.
    save_current_name()
    return entities_list


In [86]:

## Get named entities
def recognise_named_entities(text, pipeline_model):
    """Apply ``pipeline_model`` to ``text`` and return its output unchanged."""
    return pipeline_model(text)


## Roberta Named Entity

In [24]:
## Roberta based NER

# Single source of truth for the checkpoint shared by tokenizer and model.
ROBERTA_NER_CHECKPOINT = "2rtl3/mn-xlm-roberta-base-named-entity"

roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_NER_CHECKPOINT)
roberta_model = AutoModelForTokenClassification.from_pretrained(ROBERTA_NER_CHECKPOINT)
nlp_roberta = pipeline(
    "ner",
    model=roberta_model,
    tokenizer=roberta_tokenizer,
)
# Display the parameter count as the cell output.
roberta_model.num_parameters()



277456901

In [88]:
# Run NER over documents 10..29, collecting both the raw token predictions
# and the names merged from them.
results = []
entities = []
for doc in docs[10:30]:
    ner_results = recognise_named_entities(doc.page_content, nlp_roberta)
    results.extend(ner_results)
    entities.extend(combine_tokens(ner_results))



In [89]:


# person_entities = filter(lambda x:  True if (x['entity'] == 'PER') else False, entities)
# location_entities = filter(lambda x:  True if (x['entity'] == 'LOC') else False, entities)
# misc_entities = filter(lambda x:  True if (x['entity'] == 'MISC') else False, entities)


# Tabulate the merged names and the raw token-level predictions for inspection.
df_entities= pd.DataFrame(entities)
df_results = pd.DataFrame(results)


# print(text)
# for entity in entities:
#     print(entity)

# for res in ner_results:
#     print(res)

In [87]:
# Run NER on a single document (index 29).
# NOTE(review): this rebinds `entities`, `df_entities` and `df_results`,
# replacing whatever the multi-document cell stored under those names.
doc = docs[29]
ner_results = recognise_named_entities(doc.page_content, nlp_roberta)
entities = combine_tokens(ner_results)
df_entities= pd.DataFrame(entities)
df_results = pd.DataFrame(ner_results)


## IndicBert Model

In [None]:
# from transformers import AutoModel, AutoTokenizer
# import torch

# tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
# model = AutoModel.from_pretrained('ai4bharat/indic-bert')

# inputs = tokenizer("After Abhimanyu's marriage, there was royal festival and everyone was pleased", return_tensors="pt")


# outputs = model(**inputs)

# outputs.pooler_output.squeeze()


In [None]:
# print(type(outputs))
# out = outputs.last_hidden_state
# print(out.shape)
# out


In [None]:
# predicted_label_classes = out.argmax(-1)
# classes = predicted_label_classes.squeeze().tolist()
# print(classes)

# tokenizer.decode(token_ids=classes)

In [None]:
# model.config.id2label

In [None]:
# labels = torch.tensor([1, 1, 1]).unsqueeze(0)  # Batch size 1
# labels

In [93]:
# Display the raw text of one document (index 103).
docs[103].page_content

'\n\n"Sauti said. \'Then when the night had passed away and the sun had risen\nin the morning, O thou whose wealth is asceticism, the two sisters Kadru\nand Vinata, having laid a wager about slavery, went with haste and\nimpatience to view the steed Uchchaishravas from a near point. On their\nway they saw the Ocean, that receptacle of waters, vast and deep, rolling\nand tremendously roaring, full of fishes large enough to swallow the\nwhale, and abounding with huge makaras and creatures of various forms by\nthousands, and rendered inaccessible by the presence of other terrible,\nmonster-shaped, dark, and fierce aquatic animals, abounding with\ntortoises and crocodiles, the mine of all kinds of gems, the home of\nVaruna (the water-God), the excellent and beautiful residence of the\nNagas, the lord of all rivers, the abode of the subterranean fire, the\nfriend (or asylum) of the Asuras, the terror of all creatures, the grand\nreservoir of water, and ever immutable. It is holy, beneficial