# Ground Truth Entity Recognition

In [2]:
# pip install awswrangler

In [3]:
# pip install awswrangler

import json
import boto3 
import numpy as np
import pandas as pd
import awswrangler as wr

from sagemaker import get_execution_role
role = get_execution_role()

In [4]:
# reading file contents from s3 bucket
bucket = 'demo-labeling-job-v1'
prefix = 'demo-labeling-job-v2/annotations/worker-response'
obj_uri = 's3://{}/{}'.format(bucket, prefix)

cl = boto3.client('s3')
contents = cl.list_objects(Bucket=bucket, Prefix=prefix)['Contents']

file_list = []
for file in contents:
    file_list.append(file['Key'])

In [5]:
file_list

['demo-labeling-job-v2/annotations/worker-response/iteration-1/0/2021-09-01_13:07:57.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/1/2021-09-01_13:10:11.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/2/2021-09-01_13:09:06.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/3/2021-09-01_13:05:42.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/4/2021-09-01_13:10:10.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/5/2021-09-01_13:06:50.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/6/2021-09-01_13:09:06.json',
 'demo-labeling-job-v2/annotations/worker-response/iteration-1/7/2021-09-01_13:06:51.json']

In [115]:
# reading output from ground truth entity recognition job
ref_dict = {
    'sample':[],
    'start':[],
    'end':[],
    'label':[]
}

for file in file_list: 
    obj_uri = 's3://{}/{}'.format(bucket,file)
    json_obj = pd.read_json(obj_uri)
    for entity in json_obj['answers'][0]['answerContent']['crowd-entity-annotation']['entities']:
        ref_dict['start'].append(entity['startOffset'])
        ref_dict['end'].append(entity['endOffset'])
        ref_dict['label'].append(entity['label'])
        ref_dict['sample'].append(file.split('/')[4])
        
ref_df = pd.DataFrame(ref_dict)
ref_df[['sample','start','end']] = ref_df[['sample','start','end']].astype(int)

ref_df

Unnamed: 0,sample,start,end,label
0,0,9,27,EQUIPAMENTO
1,0,9,36,COMPARACAO-DE-VALOR
2,1,0,13,LOCAL
3,1,52,70,EQUIPAMENTO
4,2,3,11,EQUIPAMENTO
5,2,12,23,LOCAL
6,2,3,32,COMPARACAO-DE-VALOR
7,3,0,13,LOCAL
8,3,14,29,LOCAL
9,3,56,69,LOCAL


In [116]:
# reading samples dataset
bucket = 'demo-labeling-job-v1'
file = 'labeling-job-test1.csv'
obj_uri = 's3://{}/{}'.format(bucket, file)

samples_df = wr.s3.read_csv(obj_uri)

# because the file contained no header, the first sample was imposed as the column name
# to correct this issue, use the following algorithm

extra_sample = pd.DataFrame(columns=['samples'])
extra_sample['samples'] = [samples_df.columns[0]]
samples_df.columns = ['samples']
samples_df = pd.concat([extra_sample, samples_df], axis=0)
samples_df.reset_index(inplace=True, drop=True)
samples_df

Unnamed: 0,samples
0,Linha do Transformador TF13 < 350 kV
1,Santo Antônio: abrir ou manter aberto o termin...
2,SE Coletora Porto Velho < 550 kV. Desenergizar...
3,Santo Antônio Margem Esquerda. Desenergizar a ...
4,Redução no carregamento da LT 345 kV Embu-Guaç...
5,Remanejar cargas alimentadas pela SE 345/88 kV...
6,Linha do Transformador TF13 Santo Antônio < 35...
7,"Coletora Porto Velho C3 ou C4, pelo terminal d..."


In [120]:
sample_text = []
for index, row in ref_df.iterrows():
    sample = samples_df['samples'].to_list()[row['sample']]
    charac = list(sample)
    entity = ''.join(charac[row['start']:row['end']])
    sample_text.append(entity)
    
ref_df['entity'] = sample_text

# correction from demo recognition job
ref_df['entity'].iloc[11] = 'Transformador'
ref_df['entity'].iloc[15] = 'Coletora'
ref_df['entity'].iloc[16] = 'Porto Velho'

ref_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,sample,start,end,label,entity
0,0,9,27,EQUIPAMENTO,Transformador TF13
1,0,9,36,COMPARACAO-DE-VALOR,Transformador TF13 < 350 kV
2,1,0,13,LOCAL,Santo Antônio
3,1,52,70,EQUIPAMENTO,Transformador TF13
4,2,3,11,EQUIPAMENTO,Coletora


In [121]:
comprehend_training_df = ref_df[['entity','label']]
comprehend_training_df

Unnamed: 0,entity,label
0,Transformador TF13,EQUIPAMENTO
1,Transformador TF13 < 350 kV,COMPARACAO-DE-VALOR
2,Santo Antônio,LOCAL
3,Transformador TF13,EQUIPAMENTO
4,Coletora,EQUIPAMENTO
5,Porto Velho,LOCAL
6,Coletora Porto Velho < 550 kV,COMPARACAO-DE-VALOR
7,Santo Antônio,LOCAL
8,Margem Esquerda,LOCAL
9,Santo Antônio,LOCAL


This last dataset, containing information about the entities and the respective lables, can be used in training jobs on Amazon Comprehend.