# Ground Truth Entity Recognition

In [1]:
# %pip install awswrangler

In [2]:
import json
import boto3 
import numpy as np
import pandas as pd
import awswrangler as wr

from sagemaker import get_execution_role
role = get_execution_role()

In [3]:
# reading file contents from s3 bucket
bucket = 'ons-dl-dev-landing'
all_contents = []

for region in ['cnos-equipnull','cnos','cosr-nco-debug-1','cosr-nco','cosr-ne','cosr-s','cosr-se']:
    prefix = 'ons/mpo_ds_nlp/regioes-newjob/'+region+'/'
    obj_uri = 's3://{}/{}'.format(bucket, prefix)
    cl = boto3.client('s3')
    contents = cl.list_objects(Bucket=bucket, Prefix=prefix)['Contents']
    all_contents = all_contents + contents

file_list = []
for file in all_contents:
    file_list.append(file['Key'])

In [4]:
len(file_list)

1855

In [5]:
# selecting just consolidated-annotations
file_list = [i for i in file_list if 'consolidated-annotation' in i]
# file_list

In [6]:
# reading output from ground truth entity recognition job
ref_dict = {
    'file': [],
    'sample_id':[],
    'start':[],
    'end':[],
    'label':[]
}

for file in file_list: 
    obj_uri = 's3://{}/{}'.format(bucket,file)
    json_obj = pd.read_json(obj_uri)
    try:
        metadata, region = list(json_obj['consolidatedAnnotation'][0]['content'].keys())
        for entity in json_obj['consolidatedAnnotation'][0]['content'][region]['annotations']['entities']:
            ref_dict['file'].append(file)
            ref_dict['start'].append(entity['startOffset'])
            ref_dict['end'].append(entity['endOffset'])
            ref_dict['label'].append(entity['label'])
            ref_dict['sample_id'].append(json_obj['datasetObjectId'][0])
    except:
        for entity in json.loads(json_obj['annotations'][0][0]['annotationData']['content'])['crowd-entity-annotation']['entities']:
            ref_dict['file'].append(file)
            ref_dict['start'].append(entity['startOffset'])
            ref_dict['end'].append(entity['endOffset'])
            ref_dict['label'].append(entity['label'])
            ref_dict['sample_id'].append(json_obj['datasetObjectId'][0])
        
ref_df = pd.DataFrame(ref_dict)
ref_df[['sample_id','start','end']] = ref_df[['sample_id','start','end']].astype(int)
ref_df

Unnamed: 0,file,sample_id,start,end,label
0,ons/mpo_ds_nlp/regioes-newjob/cnos-equipnull/C...,106,38,44,VALOR_COM_UNID.MEDIDA
1,ons/mpo_ds_nlp/regioes-newjob/cnos-equipnull/C...,106,54,122,ACAO_A_EXECUTAR
2,ons/mpo_ds_nlp/regioes-newjob/cnos-equipnull/C...,106,106,122,USINA
3,ons/mpo_ds_nlp/regioes-newjob/cnos-equipnull/C...,106,84,102,EQUIPAMENTO
4,ons/mpo_ds_nlp/regioes-newjob/cnos-equipnull/C...,106,180,246,ACAO_A_EXECUTAR
...,...,...,...,...,...
28059,ons/mpo_ds_nlp/regioes-newjob/cosr-se/COSR-SE-...,120,209,221,ACAO_A_EXECUTAR
28060,ons/mpo_ds_nlp/regioes-newjob/cosr-se/COSR-SE-...,120,123,161,EQUIPAMENTO
28061,ons/mpo_ds_nlp/regioes-newjob/cosr-se/COSR-SE-...,120,224,262,EQUIPAMENTO
28062,ons/mpo_ds_nlp/regioes-newjob/cosr-se/COSR-SE-...,120,166,187,SUBESTACAO


## Output for Spacy

In [7]:
sample_text = []
mapped_files = set(ref_df['file'].to_list())

spacy_dict = {
    'region':[],
    'text': [],
    'start':[],
    'end':[],
    'label':[]
}

for file in mapped_files:
      
    # mapping files
    sample_dirname_fragments = file.split('/')[0:4]
    region=sample_dirname_fragments[3]
    sample_filename = ''.join(fragment+'/' for fragment in sample_dirname_fragments)+'samples.csv'
    obj_uri = 's3://{}/{}'.format(bucket, sample_filename)
    sample = wr.s3.read_csv(obj_uri)
    
    # the first row has to be extracted from the column name 
    # as the csv file has no header
    extra_sample = pd.DataFrame(columns=['samples'])
    extra_sample['samples'] = [sample.columns[0]]
    sample.columns = ['samples']
    sample = pd.concat([extra_sample, sample], axis=0)
    sample.reset_index(inplace=True,drop=True)
    
    # mapping samples with same file reference
    same_file_df = ref_df[ref_df['file']==file]
    
    for index, row in same_file_df.iterrows():
        spacy_dict['region'].append(region)
        spacy_dict['text'].append(sample['samples'].iloc[row['sample_id']])
        spacy_dict['start'].append(row['start'])
        spacy_dict['end'].append(row['end'])
        spacy_dict['label'].append(row['label'])

spacy_df = pd.DataFrame(spacy_dict, columns=spacy_dict.keys())
spacy_df.head(5)

Unnamed: 0,region,text,start,end,label
0,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,97,106,ACAO_A_EXECUTAR
1,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,270,279,ACAO_A_EXECUTAR
2,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,373,381,ACAO_A_EXECUTAR
3,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,460,474,ACAO_A_EXECUTAR
4,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,493,502,ACAO_A_EXECUTAR


In [8]:
set(spacy_df['region'].to_list())

{'cnos',
 'cnos-equipnull',
 'cosr-nco',
 'cosr-nco-debug-1',
 'cosr-ne',
 'cosr-s',
 'cosr-se'}

In [9]:
bucket = 'ons-dl-dev-landing'
prefix = 'ons/mpo_ds_nlp/data/preprocessed_output/spacy.parquet'
output_uri = 's3://{}/{}'.format(bucket, prefix)
spacy_df.to_parquet(output_uri)

In [10]:
# Reading file
# df_read = pd.read_parquet(output_uri)
# df_read

## Output for Amazon Comprehend

In [11]:
comp_df = spacy_df.copy()
entity_str_list = []

for index, row in comp_df.iterrows():
    str_list_aux = list(row['text'])[row['start']-1:row['end']-1]
    str_ent = ''.join(str_list_aux)
    entity_str_list.append(str_ent)

comp_df['entity_str'] = entity_str_list
comp_df.columns = ['region','full_text','start','end','type','text']
comp_df

Unnamed: 0,region,full_text,start,end,type,text
0,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,97,106,ACAO_A_EXECUTAR,Verificar
1,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,270,279,ACAO_A_EXECUTAR,desligada
2,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,373,381,ACAO_A_EXECUTAR,Desligar
3,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,460,474,ACAO_A_EXECUTAR,no terminal da
4,cosr-nco,PassoCoordenaçãoControleComando / ExecuçãoProc...,493,502,ACAO_A_EXECUTAR,Atendidas
...,...,...,...,...,...,...
28059,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,369,390,INEQUACAO,509 kV ≤ V(N. Iguaçu)
28060,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,369,375,VALOR_COM_UNID.MEDIDA,509 kV
28061,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,376,377,OPERADOR_MATEMATICO,≤
28062,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,378,379,VALOR_COM_UNID.MEDIDA,V


In [12]:
comprehend_df = comp_df[['region','text','type']]
comprehend_df

Unnamed: 0,region,text,type
0,cosr-nco,Verificar,ACAO_A_EXECUTAR
1,cosr-nco,desligada,ACAO_A_EXECUTAR
2,cosr-nco,Desligar,ACAO_A_EXECUTAR
3,cosr-nco,no terminal da,ACAO_A_EXECUTAR
4,cosr-nco,Atendidas,ACAO_A_EXECUTAR
...,...,...,...
28059,cosr-se,509 kV ≤ V(N. Iguaçu),INEQUACAO
28060,cosr-se,509 kV,VALOR_COM_UNID.MEDIDA
28061,cosr-se,≤,OPERADOR_MATEMATICO
28062,cosr-se,V,VALOR_COM_UNID.MEDIDA


In [13]:
bucket = 'ons-dl-dev-landing'
prefix = 'ons/mpo_ds_nlp/data/preprocessed_output/comprehend_regions.csv'
output_uri = 's3://{}/{}'.format(bucket, prefix)
comp_df.to_csv(output_uri)

In [14]:
comprehend_df = comp_df[['text','type']]
comprehend_df

Unnamed: 0,text,type
0,Verificar,ACAO_A_EXECUTAR
1,desligada,ACAO_A_EXECUTAR
2,Desligar,ACAO_A_EXECUTAR
3,no terminal da,ACAO_A_EXECUTAR
4,Atendidas,ACAO_A_EXECUTAR
...,...,...
28059,509 kV ≤ V(N. Iguaçu),INEQUACAO
28060,509 kV,VALOR_COM_UNID.MEDIDA
28061,≤,OPERADOR_MATEMATICO
28062,V,VALOR_COM_UNID.MEDIDA


In [15]:
bucket = 'ons-dl-dev-landing'
prefix = 'ons/mpo_ds_nlp/data/preprocessed_output/comprehend.csv'
output_uri = 's3://{}/{}'.format(bucket, prefix)
comp_df.to_csv(output_uri)

## Comprehend with annotations

{'cnos',
 'cnos-equipnull',
 'cosr-nco',
 'cosr-nco-debug-1',
 'cosr-ne',
 'cosr-s',
 'cosr-se'}

In [16]:
selected_regions = ['cosr-se']
filtered_spacy_df = pd.DataFrame(columns=spacy_df.columns)
for region in selected_regions:
    aux_spacy_df = spacy_df[spacy_df['region']==region]
    filtered_spacy_df = pd.concat([filtered_spacy_df,aux_spacy_df], axis=0)
    
filtered_spacy_df.reset_index(drop=True, inplace=True)
filtered_spacy_df

Unnamed: 0,region,text,start,end,label
0,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,103,115,ACAO_A_EXECUTAR
1,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,156,202,ACAO_A_EXECUTAR
2,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,118,155,EQUIPAMENTO
3,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,203,219,SUBESTACAO
4,cosr-se,3.27.2.1. Energização da LT 345 kV Mogi das C...,12,23,ACAO_A_EXECUTAR
...,...,...,...,...,...
13223,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,369,390,INEQUACAO
13224,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,369,375,VALOR_COM_UNID.MEDIDA
13225,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,376,377,OPERADOR_MATEMATICO
13226,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,378,379,VALOR_COM_UNID.MEDIDA


In [17]:
document_list = list(set(filtered_spacy_df['text']))
documents_df = pd.DataFrame(columns=['text'])
documents_df['text'] = document_list

###########################################
# # DATA AUGMENTATION FOR COMPREHEND TRAINING
# n_aug = len(documents_df)
# n_rep = 4
# documents_df = pd.concat([documents_df]*n_rep, axis=0)
###########################################

documents_df.head()

Unnamed: 0,text
0,3.41.2.1. Energização da LT 345 kV São Gotard...
1,3.16.2.1. Energização da LT 440 kV Bauru / O...
2,3.20.1.1. LT 345 kV Adrianópolis / Itutinga C...
3,3.41.1.1. A partir da SE GV do Brasil (sentid...
4,PassoCoordenaçãoControleComandoExecuçãoProcedi...


In [18]:
len(documents_df)

174

In [19]:
comprehend_annotations_df = filtered_spacy_df.copy()
comprehend_annotations_df['document'] = None
comprehend_annotations_df

Unnamed: 0,region,text,start,end,label,document
0,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,103,115,ACAO_A_EXECUTAR,
1,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,156,202,ACAO_A_EXECUTAR,
2,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,118,155,EQUIPAMENTO,
3,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,203,219,SUBESTACAO,
4,cosr-se,3.27.2.1. Energização da LT 345 kV Mogi das C...,12,23,ACAO_A_EXECUTAR,
...,...,...,...,...,...,...
13223,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,369,390,INEQUACAO,
13224,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,369,375,VALOR_COM_UNID.MEDIDA,
13225,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,376,377,OPERADOR_MATEMATICO,
13226,cosr-se,PassoCoordenaçãoControleComando / ExecuçãoProc...,378,379,VALOR_COM_UNID.MEDIDA,


In [20]:
unique_text = set(comprehend_annotations_df['text'])
index_list = []
for samp in unique_text:
    df_loop = comprehend_annotations_df[comprehend_annotations_df['text']==samp]
    df_loop.drop_duplicates(inplace=True)
    df_loop = df_loop[df_loop['label']!='INEQUACAO']
    df_loop.sort_values(by=['start','end'], axis=0, ascending=True, inplace=True)
    sliding_ref = -1
    for index, row in df_loop.iterrows():
        if row['start'] > sliding_ref:
            index_list.append(index)
            sliding_ref = row['end']

comprehend_annotations_df = comprehend_annotations_df.iloc[index_list]
comprehend_annotations_df.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [21]:
for n_frag in range(len(document_list)):
    aux_df = comprehend_annotations_df[comprehend_annotations_df['text'] == document_list[n_frag]]
    comprehend_annotations_df['document'].iloc[aux_df.index.to_list()] = int(n_frag)
    
comprehend_annotations_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


Unnamed: 0,index,region,text,start,end,label,document
0,1254,cosr-se,3.41.2.1. Energização da LT 345 kV São Gotard...,12,23,ACAO_A_EXECUTAR,0
1,1270,cosr-se,3.41.2.1. Energização da LT 345 kV São Gotard...,27,64,EQUIPAMENTO,0
2,1255,cosr-se,3.41.2.1. Energização da LT 345 kV São Gotard...,68,82,ACAO_A_EXECUTAR,0
3,1256,cosr-se,3.41.2.1. Energização da LT 345 kV São Gotard...,83,94,ACAO_A_EXECUTAR,0
4,1278,cosr-se,3.41.2.1. Energização da LT 345 kV São Gotard...,95,111,SUBESTACAO,0
...,...,...,...,...,...,...,...
5624,9668,cosr-se,3.52.2.1. Energização da LT 138 kV zona oeste...,1652,1671,OPERADOR_MATEMATICO,173
5625,9666,cosr-se,3.52.2.1. Energização da LT 138 kV zona oeste...,1672,1678,VALOR_COM_UNID.MEDIDA,173
5626,9647,cosr-se,3.52.2.1. Energização da LT 138 kV zona oeste...,1709,1724,ACAO_A_EXECUTAR,173
5627,9654,cosr-se,3.52.2.1. Energização da LT 138 kV zona oeste...,1728,1775,EQUIPAMENTO,173


In [22]:
comprehend_annotations_df['document'].isnull().value_counts()

False    5629
Name: document, dtype: int64

In [23]:
# save file as txt in s3 - requires replacement of quote marks after file gen (download, modify and upload)

bucket = 'ons-dl-dev-landing'
prefix = 'ons/mpo_ds_nlp/data/comprehend/'+selected_regions[0]+'/documents.txt'
output_uri = 's3://{}/{}'.format(bucket, prefix)
documents_df['text'].to_csv(output_uri, sep=' ', index=False, header=False)

In [24]:
comprehend_annotations_fulldf = comprehend_annotations_df.copy()
comprehend_annotations_fulldf['File'] = ['documents.txt']*len(comprehend_annotations_df)
comprehend_annotations_fulldf = comprehend_annotations_fulldf[['File','document','start','end','label']]
comprehend_annotations_fulldf.columns = ['File','Line','Begin Offset','End Offset','Type']
comprehend_annotations_fulldf

Unnamed: 0,File,Line,Begin Offset,End Offset,Type
0,documents.txt,0,12,23,ACAO_A_EXECUTAR
1,documents.txt,0,27,64,EQUIPAMENTO
2,documents.txt,0,68,82,ACAO_A_EXECUTAR
3,documents.txt,0,83,94,ACAO_A_EXECUTAR
4,documents.txt,0,95,111,SUBESTACAO
...,...,...,...,...,...
5624,documents.txt,173,1652,1671,OPERADOR_MATEMATICO
5625,documents.txt,173,1672,1678,VALOR_COM_UNID.MEDIDA
5626,documents.txt,173,1709,1724,ACAO_A_EXECUTAR
5627,documents.txt,173,1728,1775,EQUIPAMENTO


In [25]:
###########################################
# # DATA AUGMENTATION FOR COMPREHEND TRAINING

# comprehend_annotations_fulldf_baseline = comprehend_annotations_fulldf.copy()

# for i in range(n_rep-1):
#     comprehend_annotations_fulldf_aux = comprehend_annotations_fulldf_baseline.copy()
#     comprehend_annotations_fulldf_aux['Line'] = comprehend_annotations_fulldf_baseline['Line']+(i+1)*n_aug
#     comprehend_annotations_fulldf = pd.concat([comprehend_annotations_fulldf, comprehend_annotations_fulldf_aux], axis=0)

# comprehend_annotations_fulldf
###########################################

In [26]:
set(comprehend_annotations_fulldf['Type'])

{'ACAO_A_EXECUTAR',
 'EQUIPAMENTO',
 'ESTADO_OPERATIVO',
 'OPERADOR_MATEMATICO',
 'SUBESTACAO',
 'USINA',
 'VALOR_COM_UNID.MEDIDA'}

In [27]:
bucket = 'ons-dl-dev-landing'
prefix = 'ons/mpo_ds_nlp/data/comprehend/'+selected_regions[0]+'/annotations.csv'
output_uri = 's3://{}/{}'.format(bucket, prefix)
comprehend_annotations_fulldf.to_csv(output_uri, sep=',', index=False, header=True)