### Extracting Entities From Hindi Text

In [None]:
! pip install transformers



In [None]:
import os
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [None]:
## Loader for the Hindi NER model

def get_hiner_model():
    """Build a 'ner' pipeline for the HiNER MuRIL checkpoint.

    The tokenizer and the token-classification model are both loaded with
    ``from_pretrained`` from 'cfilt/HiNER-original-muril-base-cased'.

    Returns:
        The object returned by ``pipeline('ner', model=..., tokenizer=...)``.
    """
    checkpoint='cfilt/HiNER-original-muril-base-cased'
    hiner_tokenizer=AutoTokenizer.from_pretrained(checkpoint)
    hiner_model=AutoModelForTokenClassification.from_pretrained(checkpoint)
    # Uncomment to list the entity tags recognised by the model:
    # print("hiner_entity_tags",list(hiner_model.config.label2id.keys()))
    return pipeline('ner', model=hiner_model, tokenizer=hiner_tokenizer)

In [None]:
## Run the NER model on a text and keep only the confident entity predictions

def get_ner_results(txt,ner):
    """Return ``[start, entity]`` pairs for the predictions worth keeping.

    ``ner`` is called as ``ner(txt)``; each item it yields is read through the
    keys 'score', 'entity' and 'start'. Per the original notes, the model
    reports sub-word pieces marked with '#', so the start offset is returned
    to let the caller map each piece back to a word of ``txt``.

    A prediction is kept when its score is greater than 0.5 and its entity
    label contains one of 'LOC', 'PER', 'ORG' or 'MISC'.

    Args:
        txt: text handed to the model.
        ner: callable returning an iterable of prediction dicts.

    Returns:
        List of ``[start, entity]`` lists, in the order the model produced them.
    """
    wanted_tags=('LOC','PER','ORG','MISC')
    return [
        [pred['start'],pred['entity']]
        for pred in ner(txt)
        if pred['score']>0.5 and any(tag in pred['entity'] for tag in wanted_tags)
    ]

In [None]:
## Split the text on single spaces and record where every word sits

def get_word_ranges(txt):
    """Return ``[start, end, index, word]`` for every space-separated word.

    ``start`` is the offset of the word's first character and ``end`` is the
    offset just past its last character, so ``txt[start:end] == word``.
    ``index`` is the 0-based position of the word in the text.

    Example: for "India is democratic country" the result is
        [[0,5,0,'India'], [6,8,1,'is'], [9,19,2,'democratic'], [20,27,3,'country']]

    Only ' ' is treated as a separator. Consecutive spaces produce empty words
    that still consume an index, and an empty text yields ``[[0,0,0,'']]``.

    Args:
        txt: the text to split.

    Returns:
        List of ``[start, end, index, word]`` lists in text order.
    """
    words_range=[]
    start=0
    idx=0

    for pos,ch in enumerate(txt):
        if ch==' ':
            words_range.append([start,pos,idx,txt[start:pos]])
            start=pos+1
            idx+=1

    ## The last word runs to the end of the text; closing it at len(txt)
    ## keeps its final character (closing it at the last index would drop it)
    words_range.append([start,len(txt),idx,txt[start:]])
    return words_range

In [None]:
## Map the entities reported by the model onto the words of the text

def get_entities(txt,ner):
    """Return ``[index, word, entity]`` for every word that carries an entity.

    The model predictions come from ``get_ner_results`` as ``[start, entity]``
    pairs and the words come from ``get_word_ranges`` as
    ``[start, end, index, word]``. A prediction belongs to a word when its
    start offset lies in ``[start, end)`` of that word. When several
    predictions fall inside the same word, the entity of the last one is used.

    Both lists are walked once with a pointer that only moves forward, so the
    cost is O(len(predictions) + len(words)).
    NOTE(review): this assumes the predictions are ordered by increasing start
    offset - confirm against the pipeline output.

    Args:
        txt: the text to analyse.
        ner: the NER callable forwarded to ``get_ner_results``.

    Returns:
        List of ``[index, word, entity]`` lists in text order.
    """

    ## predictions from the model: [[start, entity], ...]
    res=get_ner_results(txt,ner)

    ## word ranges from the text: [[start, end, index, word], ...]
    word_range=get_word_ranges(txt)

    idx=0
    n=len(res)
    ans=[]

    for w_start,w_end,w_idx,word in word_range:
        ## drop predictions that start before this word (i.e. inside no word
        ## range); otherwise the pointer would stall and every later entity
        ## would be lost
        while idx<n and res[idx][0]<w_start:
            idx+=1
        if idx>=n:
            break
        if w_start<=res[idx][0]<w_end:
            ## consume every prediction inside this word, keep the last entity
            while idx<n and w_start<=res[idx][0]<w_end:
                idx+=1
            ans.append([w_idx,word,res[idx-1][1]])

    return ans

In [None]:
## Drop noisy words and group the remaining entities by type

def get_cleaned_entities(txt,ner):
    """Return the entities of ``txt`` grouped by type, without noisy words.

    The ``[index, word, entity]`` items from ``get_entities`` are filtered:
    a word is discarded when it contains any of the ``garbage`` substrings
    (punctuation, the danda, newline, space and the word 'है'). Each remaining
    word is stored under the first of 'LOC', 'PER', 'ORG', 'MISC' that occurs
    in its entity label.

    The garbage collection has a fixed size, so the cost is linear in the
    number of entities.

    Args:
        txt: the text to analyse.
        ner: the NER callable forwarded to ``get_entities``.

    Returns:
        Dict with keys 'LOC', 'PER', 'ORG', 'MISC'; every value is a list of
        ``[word, index]`` lists in text order.
    """

    ## extracting entities for cleaning
    results=get_entities(txt,ner)

    ## '{' was previously written as '{,' and therefore never matched
    garbage=('`','~','!','*','(',')','-','_','+','=','[',']','{','}',';',':',"'",'"',',','.','/','?','\\','|','।','\n','है','@','#','$','%','^','&',' ')
    ans2={
        'LOC':[],
        'PER':[],
        'ORG':[],
        'MISC':[]
    }

    for word_idx,word,entity in results:
        if any(g in word for g in garbage):
            continue

        ## dict order gives the same priority as before: LOC, PER, ORG, MISC
        for key in ans2:
            if key in entity:
                ans2[key].append([word,word_idx])
                break

    return ans2

In [None]:
## Join words that belong to the same multi-word entity

## For eg: text='Narendra Modi went to Madhya Pradesh'
## The model recognises 'Narendra' as a person, 'Modi' as a different person, 'Madhya' as a location and 'Pradesh' as a different location
## The word index travels from get_entities() through get_cleaned_entities() to this function
## Index of 'Narendra' is 0 and 'Modi' is 1; index of 'Madhya' is 4 and 'Pradesh' is 5
## Words of the same entity type whose indices differ by exactly 1 are merged into one string
## This way 'Narendra' and 'Modi' become 'Narendra Modi', and likewise 'Madhya Pradesh'.


def _join_adjacent_words(items):
    """Merge runs of ``[word, index]`` items with consecutive indices into space-joined strings."""
    joined=[]
    run=[]
    prev_idx=None

    for word,word_idx in items:
        ## a gap in the indices closes the current run
        if run and word_idx!=prev_idx+1:
            joined.append(' '.join(run))
            run=[]
        run.append(word)
        prev_idx=word_idx

    if run:
        joined.append(' '.join(run))
    return joined


def get_joined_res(txt,ner):
    """Return the entities of ``txt`` with adjacent words merged, grouped by type.

    The cleaned entities from ``get_cleaned_entities`` are processed one type
    at a time; each list is scanned once, so the cost is linear in the number
    of entities.

    Args:
        txt: the text to analyse.
        ner: the NER callable forwarded to ``get_cleaned_entities``.

    Returns:
        Dict with keys 'LOC', 'PER', 'ORG', 'MISC'; every value is a list of
        strings in text order.
    """

    ## extracting cleaned entities
    res=get_cleaned_entities(txt,ner)

    res2={
        'LOC':_join_adjacent_words(res['LOC']),
        'PER':_join_adjacent_words(res['PER']),
        'ORG':_join_adjacent_words(res['ORG']),
        'MISC':_join_adjacent_words(res['MISC'])
    }

    return res2


In [None]:
## Entry point: returns the final, de-duplicated entities for a text

## The text is not processed sentence by sentence; it is handed to the model in
## chunks of step_size characters (1500 by default), which means fewer model
## calls (the original notes report roughly 50 % less processing time).
## Only a single model is used, which reduces the processing time further.


def get_results(txt,ner,step_size=1500):
    """Extract the unique LOC / PER / ORG / MISC entities of ``txt``.

    The text is cut into consecutive slices of ``step_size`` characters and
    each slice goes through ``get_joined_res``; the per-slice results are
    merged and de-duplicated.
    NOTE(review): slices are cut by character count, so a word or entity that
    straddles a slice boundary is split in two.

    Any exception raised while processing is reported with ``print`` and an
    empty result is returned, so one bad input does not abort a batch run.

    Args:
        txt: the text to analyse.
        ner: the NER callable forwarded to ``get_joined_res``.
        step_size: number of characters per slice.

    Returns:
        Dict with keys 'LOC', 'PER', 'ORG', 'MISC'; every value is a set of
        strings (empty sets when an error occurred).
    """
    keys=('LOC','PER','ORG','MISC')
    res={key:[] for key in keys}

    try:
        ## iterating through the text one slice at a time
        for i in range(0,len(txt),step_size):

            ## passing the slice to the hiner model
            res2=get_joined_res(txt[i:i+step_size],ner)

            ## appending results to the final result; MISC is included here
            ## (it was previously skipped and therefore always came back empty)
            for key in keys:
                res[key].extend(res2[key])

    except Exception as exc:
        ## best effort: report the cause and return an empty result
        print('Error:',exc)
        return {key:set() for key in keys}

    ## converting to set to remove duplicates
    return {key:set(res[key]) for key in keys}

In [None]:
## Sample data: the Hindi content/text is expected in df under the column "Hindi_Text"

hindiText=[
    "रियान हर रविवार को अपने दादा जी से मिलने के लिए दिल्ली से आगरा की यात्रा करता है।",
    "गर्मियों की छुट्टियों में, शिखा अपनी माँ के साथ हिमाचल प्रदेश के खूबसूरत हिल स्टेशन, मनाली घूमने जा रही हैं।",
    "आज सुबह, शाहरुख खान को मुंबई एयरपोर्ट पर देखा गया।",
    "अगले हफ्ते, मोहन अपने दोस्तों के साथ गोवा के बीच पर आराम करने जा रहा है।",
    "पिछले साल, मीरा ने ताजमहल देखने के लिए आगरा का भ्रमण किया था।",
]
df=pd.DataFrame({"Hindi_Text":hindiText})
df.head()

Unnamed: 0,Hindi_Text
0,रियान हर रविवार को अपने दादा जी से मिलने के लि...
1,"गर्मियों की छुट्टियों में, शिखा अपनी माँ के सा..."
2,"आज सुबह, शाहरुख खान को मुंबई एयरपोर्ट पर देखा ..."
3,"अगले हफ्ते, मोहन अपने दोस्तों के साथ गोवा के ब..."
4,"पिछले साल, मीरा ने ताजमहल देखने के लिए आगरा का..."


In [None]:
## Extracting entities using parallel processing

## Worker for the thread pool: runs get_results on one text and tags the result with its row index
def parallel_extraction_entities(idx,combined_text,ner):
    """Return ``(idx, entities)`` where ``entities`` is ``get_results(combined_text, ner)``.

    ``idx`` is passed through untouched so the caller can match each result
    with the row it came from.
    """
    entities=get_results(combined_text,ner)
    return idx,entities

## Get the number of CPU cores (os.cpu_count() can return None, so fall back to 1)
num_cores = os.cpu_count() or 1
# print(num_cores)

## One pipeline per concurrent task, so no pipeline object is used by two
## threads at the same time; never load more pipelines than there are rows
hiner_list=[get_hiner_model() for _ in range(min(num_cores,len(df)))]
batch_size=max(1,len(hiner_list))
idx_lst=list(range(len(df)))

res=[]
startTime=time.time()

## Use one ThreadPoolExecutor for the whole run and feed it one batch at a time
with ThreadPoolExecutor(max_workers=batch_size) as executor:
    for i in range(0,len(df),batch_size):
        ## executor.map pairs the k-th row of the batch with the k-th pipeline;
        ## list() waits for the whole batch before the pipelines are reused
        results = list(executor.map(parallel_extraction_entities, idx_lst[i:i+batch_size], df['Hindi_Text'].iloc[i:i+batch_size], hiner_list))
        res.append(results)

endTime=time.time()
## the elapsed time is printed in minutes
print('elapsed time:',(endTime-startTime)/60)

## Collect the entities by row position and write the column in one assignment
## (chained indexing such as df['Entities'][idx] = ... raises SettingWithCopyWarning)
entities_by_row=[None]*len(df)
for result in res:
    for idx, entities in result:
        entities_by_row[idx]=entities
df['Entities']=pd.Series(entities_by_row,index=df.index,dtype=object)

hiner_entity_tags ['B-FESTIVAL', 'B-GAME', 'B-LANGUAGE', 'B-LITERATURE', 'B-LOCATION', 'B-MISC', 'B-NUMEX', 'B-ORGANIZATION', 'B-PERSON', 'B-RELIGION', 'B-TIMEX', 'I-FESTIVAL', 'I-GAME', 'I-LANGUAGE', 'I-LITERATURE', 'I-LOCATION', 'I-MISC', 'I-NUMEX', 'I-ORGANIZATION', 'I-PERSON', 'I-RELIGION', 'I-TIMEX', 'O']


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


hiner_entity_tags ['B-FESTIVAL', 'B-GAME', 'B-LANGUAGE', 'B-LITERATURE', 'B-LOCATION', 'B-MISC', 'B-NUMEX', 'B-ORGANIZATION', 'B-PERSON', 'B-RELIGION', 'B-TIMEX', 'I-FESTIVAL', 'I-GAME', 'I-LANGUAGE', 'I-LITERATURE', 'I-LOCATION', 'I-MISC', 'I-NUMEX', 'I-ORGANIZATION', 'I-PERSON', 'I-RELIGION', 'I-TIMEX', 'O']
elapsed time: 0.04465800126393636


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Entities'][idx] = entities


In [None]:
## Preview the first rows of df, including the 'Entities' column
df.head()

Unnamed: 0,Hindi_Text,Entities
0,रियान हर रविवार को अपने दादा जी से मिलने के लि...,"{'LOC': {'दिल्ली', 'आगरा'}, 'PER': {'जी', 'रिय..."
1,"गर्मियों की छुट्टियों में, शिखा अपनी माँ के सा...","{'LOC': {'मनाली', 'हिमाचल प्रदेश'}, 'PER': {'श..."
2,"आज सुबह, शाहरुख खान को मुंबई एयरपोर्ट पर देखा ...","{'LOC': {'मुंबई'}, 'PER': {'शाहरुख खान'}, 'ORG..."
3,"अगले हफ्ते, मोहन अपने दोस्तों के साथ गोवा के ब...","{'LOC': {'गोवा'}, 'PER': {'मोहन'}, 'ORG': {}, ..."
4,"पिछले साल, मीरा ने ताजमहल देखने के लिए आगरा का...","{'LOC': {'ताजमहल', 'आगरा'}, 'PER': {'मीरा'}, '..."


In [None]:
## Expanding the per-row 'Entities' dicts into separate columns for LOC, PER, ORG and MISC
df[['LOC', 'PER', 'ORG','MISC']] = pd.DataFrame(list(df['Entities']), index=df.index)

In [None]:
## Preview the first rows of df, including the LOC, PER, ORG and MISC columns
df.head()

Unnamed: 0,Hindi_Text,Entities,LOC,PER,ORG,MISC
0,रियान हर रविवार को अपने दादा जी से मिलने के लि...,"{'LOC': {'दिल्ली', 'आगरा'}, 'PER': {'जी', 'रिय...","{दिल्ली, आगरा}","{जी, रियान}",{},{}
1,"गर्मियों की छुट्टियों में, शिखा अपनी माँ के सा...","{'LOC': {'मनाली', 'हिमाचल प्रदेश'}, 'PER': {'श...","{मनाली, हिमाचल प्रदेश}",{शिखा},{},{}
2,"आज सुबह, शाहरुख खान को मुंबई एयरपोर्ट पर देखा ...","{'LOC': {'मुंबई'}, 'PER': {'शाहरुख खान'}, 'ORG...",{मुंबई},{शाहरुख खान},{},{}
3,"अगले हफ्ते, मोहन अपने दोस्तों के साथ गोवा के ब...","{'LOC': {'गोवा'}, 'PER': {'मोहन'}, 'ORG': {}, ...",{गोवा},{मोहन},{},{}
4,"पिछले साल, मीरा ने ताजमहल देखने के लिए आगरा का...","{'LOC': {'ताजमहल', 'आगरा'}, 'PER': {'मीरा'}, '...","{ताजमहल, आगरा}",{मीरा},{},{}
