# Comprehend Entity Recognition

In [1]:
# %pip install awswrangler

In [2]:
import os 
import sys
import json
import boto3 
import random
import numpy as np
import pandas as pd
import awswrangler as wr
import matplotlib.pyplot as plt

from sagemaker import get_execution_role
# Execution role reported by the SageMaker SDK for this notebook instance.
# NOTE(review): `role` is not referenced by any of the cells visible here.
role = get_execution_role()

# Make the sibling `src` directory of the current working directory's parent
# importable, so that `preprocess` can be imported right after this.
src_dir = os.path.dirname(os.getcwd()) + '/src'
sys.path.append(src_dir)
print('added ', src_dir, ' to sys.')
from preprocess import preprocess_data

added  /home/ec2-user/SageMaker/ons-poc-manobras-catalog/src  to sys.


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Selecting one region for testing

In [3]:
# Region whose samples are processed by this notebook.
# Available options: 'cnos', 'cosr-nco', 'cosr-ne', 'cosr-se', 'cosr-s'
selected_region = 'cosr-nco'

In [4]:
bucket = 'ons-dl-dev-landing'
prefix = 'ons/mpo_ds_nlp/regioes-newjob/'+selected_region+'/samples.csv'
input_uri = 's3://{}/{}'.format(bucket, prefix)

# Reading samples from S3.
# The CSV file has no header row, so say so explicitly: with header=None the
# first line is kept as a regular data row instead of being consumed as the
# column name (which previously had to be recovered and re-concatenated by hand).
sample = wr.s3.read_csv(input_uri, header=None)

# Exactly one column is expected; assigning a single name raises if the file
# unexpectedly contains more than one column.
sample.columns = ['samples']
sample = sample.reset_index(drop=True)
sample

Unnamed: 0,samples
0,3.1.1. Desenergização da LT 230 kV Tucur...
1,3.2.1. Desenergização da LT 230 kV Altam...
2,3.3.1. Desenergização da LT 230 kV Altam...
3,3.4.1. Desenergização da LT 230 kV Trans...
4,3.5.1. Desenergização da LT 230 kV Tapaj...
...,...
397,PassoCoordenaçãoControleComando /ExecuçãoProce...
398,3.8.3.1. A partir da SE Paranatinga (sent...
399,PassoCoordenaçãoControleComando /ExecuçãoProc...
400,PassoCoordenaçãoControleComando /ExecuçãoProce...


## Entity Recognition with custom model 

In [5]:
#comprehend = boto3.client('comprehend', region_name='us-east-1')

In [6]:
# https://docs.aws.amazon.com/comprehend/latest/dg/API_DetectEntities.html
# https://docs.aws.amazon.com/comprehend/latest/dg/API_BatchDetectEntities.html

endpoint_name = 'ons-nlp-cosr-nco-2-endpoint-0'
endpoint_arn = 'arn:aws:comprehend:us-east-1:478704051461:entity-recognizer-endpoint/ons-nlp-cosr-nco-2-endpoint-0'

# The endpoint lives in the region encoded in its ARN
# (arn:partition:service:region:account:resource), so create the client in that
# same region rather than relying on whatever default region is configured.
endpoint_region = endpoint_arn.split(':')[3]
client = boto3.Session().client('comprehend', region_name=endpoint_region)

# Smoke test: run the custom entity recognizer on one example sentence.
text = "Texto de exemplo bem europeu, encontrado na SE Pampulha, loucura loucura superior a 320 kV."
response = client.detect_entities(
   EndpointArn=endpoint_arn,
   Text=text
)
response

{'Entities': [{'Score': 0.20722101628780365,
   'Type': 'ESTADO_OPERATIVO',
   'Text': 'europeu',
   'BeginOffset': 21,
   'EndOffset': 28},
  {'Score': 0.9960344433784485,
   'Type': 'SUBESTACAO',
   'Text': 'SE Pampulha,',
   'BeginOffset': 44,
   'EndOffset': 56},
  {'Score': 0.9978105425834656,
   'Type': 'ACAO_A_EXECUTAR',
   'Text': 'loucura loucura',
   'BeginOffset': 57,
   'EndOffset': 72},
  {'Score': 0.9917671084403992,
   'Type': 'OPERADOR_MATEMATICO',
   'Text': 'superior a',
   'BeginOffset': 73,
   'EndOffset': 83},
  {'Score': 0.9994897246360779,
   'Type': 'VALOR_COM_UNID.MEDIDA',
   'Text': '320 kV.',
   'BeginOffset': 84,
   'EndOffset': 91}],
 'ResponseMetadata': {'RequestId': 'de46b3f3-9ce8-421c-bf0f-8ee247f6edf2',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'de46b3f3-9ce8-421c-bf0f-8ee247f6edf2',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '552',
   'date': 'Wed, 03 Nov 2021 15:17:26 GMT'},
  'RetryAttempts': 0}}

In [7]:
# Text span of the first entity returned for the example sentence.
first_entity = response['Entities'][0]
first_entity['Text']

'europeu'

In [9]:
# Instantiate the project's preprocessing helper (imported from src/preprocess).
preproc = preprocess_data()

# Break the entries of the 'samples' column into sentences.
output_dataset = preproc.get_split_entity_sequence(dataset=sample, column='samples')

Preprocessing module
started at  2021-11-03 15:19:02.340693


In [19]:
# Running entity recognition on one batch of the sentence dataset.
# Each list below gets one item per processed sentence; 'entity', 'type' and
# 'score' hold parallel lists (one element per entity found in that sentence).
recognition_dict = {
    'text':[],
    'entity':[],
    'type':[],
    'score':[]
}

# Only rows 351:401 are processed by this run.
# NOTE(review): presumably the notebook is re-run with a different slice for
# each numbered output file - confirm before changing the bounds.
for index, samp in output_dataset.iloc[351:401].iterrows():
    sentence = samp['samples']

    # The previous `if len(...) >= 100: pass` guard was a no-op (long sentences
    # were always sent to the endpoint), so it has been removed. What does need
    # guarding is missing/blank text: a non-string value crashed on len() and
    # DetectEntities does not accept an empty Text. Such rows could never yield
    # entities, so skipping them does not change the filtered result below.
    if not isinstance(sentence, str) or not sentence.strip():
        continue

    response = client.detect_entities(
       EndpointArn=endpoint_arn,
       Text=sentence
    )
    entities = response['Entities']

    # Values are stored as strings (scores included) to keep the output schema
    # identical to the batch files that were already written.
    recognition_dict['text'].append(sentence)
    recognition_dict['entity'].append([str(ent['Text']) for ent in entities])
    recognition_dict['type'].append([str(ent['Type']) for ent in entities])
    recognition_dict['score'].append([str(ent['Score']) for ent in entities])

recognition_df = pd.DataFrame(recognition_dict)
# Keep only the sentences for which at least one entity was detected.
recognition_df = (
    recognition_df[recognition_df['entity'].map(len) > 0]
    .reset_index(drop=True)
)

# Destination of this batch's recognized entities in the landing bucket.
bucket = 'ons-dl-dev-landing'
prefix = (
    'ons/mpo_ds_nlp/data/postprocessed_output/comprehend/sentences/entities-'
    + selected_region
    + '-comprehend-7.parquet'
)
output_uri = f's3://{bucket}/{prefix}'

# Persist the batch and report where it was written.
recognition_df.to_parquet(output_uri)
print('Saved output file in ', output_uri)

# Preview of the first rows that were saved.
recognition_df.head()

Saved output file in  s3://ons-dl-dev-landing/ons/mpo_ds_nlp/data/postprocessed_output/comprehend/sentences/entities-cosr-nco-comprehend-7.parquet


Unnamed: 0,text,entity,type,score
0,Com os reatores de linha do terminal da SE Lec...,"[reatores de linha do terminal da SE Lechuga, ...","[EQUIPAMENTO, SUBESTACAO, VALOR_COM_UNID.MEDID...","[0.9873157143592834, 0.9970600008964539, 0.999..."
1,- Tensão na SE Lechuga superior a 223 kV.,"[SE Lechuga, superior a, 223 kV.]","[SUBESTACAO, OPERADOR_MATEMATICO, VALOR_COM_UN...","[0.9999998211860657, 0.999821662902832, 0.9817..."
2,Com um reator de linha do terminal da SE Lechu...,"[reator de linha do terminal da SE Lechuga, em...","[EQUIPAMENTO, ESTADO_OPERATIVO, SUBESTACAO, VA...","[0.9590084552764893, 0.9999791979789734, 1.0, ..."
3,Com dois reatores de linha do terminal da SE L...,"[reatores de linha do terminal da SE Lechuga, ...","[EQUIPAMENTO, ESTADO_OPERATIVO, SUBESTACAO, VA...","[0.9364916086196899, 0.9998477697372437, 0.999..."
4,3COSR-NCOCOSR-NCOCOS-Am GT (Amazonas GT Desene...,"[Amazonas, GT, Desenergizar, LT 230 kV Balbina...","[SUBESTACAO, ACAO_A_EXECUTAR, ACAO_A_EXECUTAR,...","[0.6348209381103516, 0.564788818359375, 0.9928..."


In [20]:
bucket = 'ons-dl-dev-landing'
prefix = (
    'ons/mpo_ds_nlp/data/postprocessed_output/comprehend/sentences/entities-'
    + selected_region
    + '-comprehend'
)
# S3 client handle; it is not used by the reads below, which go through pandas.
cl = boto3.client('s3')

# Load the eight numbered batch files (suffixes -0 through -7) ...
df_list = [
    pd.read_parquet('s3://{}/{}'.format(bucket, prefix + '-' + str(i) + '.parquet'))
    for i in range(8)
]

# ... and stack them into a single frame with a fresh 0..n-1 index.
df_all = pd.concat(df_list, axis=0).reset_index(drop=True)
df_all

Unnamed: 0,text,entity,type,score
0,Desenergização da LT 230 kV Tucuruí / Altamira...,"[Desenergização, LT 230 kV Tucuruí / Altamira,...","[ACAO_A_EXECUTAR, EQUIPAMENTO, ACAO_A_EXECUTAR]","[0.9999902248382568, 1.0, 0.9997515678405762]"
1,ProcedimentosObjetivo / Item de Controle1---CO...,"[Verificar, indisponibilidade, LT 230 kV Tucur...","[ACAO_A_EXECUTAR, ESTADO_OPERATIVO, EQUIPAMENTO]","[0.9999995231628418, 0.9999918937683105, 0.999..."
2,2COSR-NCOCOSR-NCOEquatorialDesligar o religame...,"[religamento automático, LT 230 kV Altamira / ...","[ACAO_A_EXECUTAR, EQUIPAMENTO]","[0.9972758293151855, 0.9999115467071533]"
3,3COSR-NCOCOSR-NCOEletronorteAbrir o terminal d...,"[o terminal, LT 230 kV Tucuruí / Altamira,, SE...","[ACAO_A_EXECUTAR, EQUIPAMENTO, SUBESTACAO]","[0.9999996423721313, 1.0, 0.9858415722846985]"
4,- Reator da LT disponível:SE Tucuruí ≤ 232 kV-...,"[Reator da LT, SE Tucuruí, 232 kV-, Reator da ...","[EQUIPAMENTO, SUBESTACAO, VALOR_COM_UNID.MEDID...","[0.9989086389541626, 0.9999986290931702, 0.780..."
...,...,...,...,...
380,V ≤ 46 kV  ≤ 2003COSR-NCOCOSR-NCOCOS-Am GT (A...,"[46 kV, Amazonas, GT Normalizar o atendimento ...","[VALOR_COM_UNID.MEDIDA, SUBESTACAO, ACAO_A_EXE...","[0.9999377131462097, 0.7745931148529053, 0.999..."
381,4COSR-NCOCOSR-NCOCOS-Am GT (Amazonas GT Ligar ...,"[Ligar o religamento automático, LT 230 kV Bal...","[ACAO_A_EXECUTAR, EQUIPAMENTO]","[0.8638736009597778, 0.9999288320541382]"
382,Desenergização da LT 230 kV Cristiano Rocha / ...,"[Desenergização, LT 230 kV Cristiano Rocha / L...","[ACAO_A_EXECUTAR, EQUIPAMENTO, ACAO_A_EXECUTAR...","[0.9993795156478882, 0.9999955296516418, 0.816..."
383,- Atendida as limitações impostas pela indispo...,"[Atendida, indisponibilidade, LT 230 kV Cristi...","[ACAO_A_EXECUTAR, ESTADO_OPERATIVO, EQUIPAMENT...","[0.9999533891677856, 0.9999958276748657, 0.999..."


In [21]:
# Write the merged entity table for the selected region as the "baseline" file.
bucket = 'ons-dl-dev-landing'
prefix = (
    'ons/mpo_ds_nlp/data/postprocessed_output/comprehend/sentences/entities-'
    + selected_region
    + '-comprehend-baseline.parquet'
)
output_uri = f's3://{bucket}/{prefix}'
df_all.to_parquet(output_uri)