# Ground Truth Multiclass Labeling

In [47]:
# pip install awswrangler

In [48]:
# pip install awswrangler

import json
import boto3 
import numpy as np
import pandas as pd
import awswrangler as wr

# Resolve the execution role through the SageMaker SDK.
# NOTE(review): `role` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
from sagemaker import get_execution_role
role = get_execution_role()

In [49]:
# Listing the worker-response object keys stored under the labeling job prefix.
bucket = 'demo-labeling-job-v1'
prefix = 'labeling-job-2/demo-labeling-job-v3/annotations/worker-response'
obj_uri = 's3://{}/{}'.format(bucket, prefix)

cl = boto3.client('s3')

# A single list call returns at most 1000 keys and leaves out the 'Contents'
# entry altogether when nothing matches the prefix, so page through the
# listing and treat a page without 'Contents' as empty instead of indexing
# ['Contents'] directly.
paginator = cl.get_paginator('list_objects_v2')
contents = [
    entry
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix)
    for entry in page.get('Contents', [])
]

# Object keys only; the bucket name is prepended again when each file is read.
file_list = [entry['Key'] for entry in contents]

In [50]:
# Reading the output of the ground truth multi-select classification job.
# For every worker-response file, keep the first label of the first answer
# and the sample id found in the sixth '/'-separated segment of the object key.
ref_dict = {'sample': [], 'label': []}

for key in file_list:
    response = pd.read_json('s3://{}/{}'.format(bucket, key))
    first_answer = response['answers'][0]
    selected = first_answer['answerContent']['crowd-classifier-multi-select']['labels']
    ref_dict['label'].append(selected[0])
    ref_dict['sample'].append(key.split('/')[5])

# Sample ids come out of the key as strings; cast them so they can be joined
# against the integer ids of the samples table.
ref_df = pd.DataFrame(ref_dict).astype({'sample': int})

ref_df

Unnamed: 0,sample,label
0,0,COMPARACAO-DE-VALOR
1,1,ACAO
2,2,COMPARACAO-DE-VALOR
3,3,ACAO
4,4,ACAO
5,5,ACAO
6,6,COMPARACAO-DE-VALOR
7,7,OUTRO


In [51]:
# reading samples dataset
bucket = 'demo-labeling-job-v1'
file = 'labeling-job-2/labeling-job-test-2.csv'
obj_uri = 's3://{}/{}'.format(bucket, file)

# The file contains no header row. Passing header=None makes the parser treat
# the first line as data, so the first sample no longer has to be recovered
# from the column name and concatenated back onto the frame.
samples_df = wr.s3.read_csv(obj_uri, header=None)

# Assigning the names explicitly (rather than via `names=`) keeps the original
# failure mode: a file with more than one column raises a length mismatch.
samples_df.columns = ['entity']

# The row position in the file is the sample id used by the merge on 'sample'.
samples_df['sample'] = range(len(samples_df))
samples_df

Unnamed: 0,entity,sample
0,Linha do Transformador TF13 < 350 kV.,0
1,Abrir ou manter aberto o terminal do Transform...,1
2,SE Coletora Porto Velho < 550 kV.,2
3,Desenergizar a LT 350 kV Santo Antônio.,3
4,Redução no carregamento da LT 345 kV Embu-Guaç...,4
5,Remanejar cargas alimentadas pela SE 345/88 kV...,5
6,Transformador TF13 Santo Antônio < 350 kV,6
7,Coletora Porto Velho C3 ou C4.,7


In [52]:
# Attach the sample text to each label by joining on the sample id.
# Only the 'sample' and 'label' columns of ref_df take part in the merge, so
# re-running this cell does not merge an already merged frame (which would
# yield suffixed 'entity_x' / 'entity_y' columns and break the next cell).
ref_df = pd.merge(left=ref_df[['sample', 'label']], right=samples_df, on='sample')

ref_df

Unnamed: 0,sample,label,entity
0,0,COMPARACAO-DE-VALOR,Linha do Transformador TF13 < 350 kV.
1,1,ACAO,Abrir ou manter aberto o terminal do Transform...
2,2,COMPARACAO-DE-VALOR,SE Coletora Porto Velho < 550 kV.
3,3,ACAO,Desenergizar a LT 350 kV Santo Antônio.
4,4,ACAO,Redução no carregamento da LT 345 kV Embu-Guaç...
5,5,ACAO,Remanejar cargas alimentadas pela SE 345/88 kV...
6,6,COMPARACAO-DE-VALOR,Transformador TF13 Santo Antônio < 350 kV
7,7,OUTRO,Coletora Porto Velho C3 ou C4.


In [53]:
# Keep only the sample text and its label, in that column order.
comprehend_training_df = ref_df.loc[:, ['entity', 'label']]
comprehend_training_df

Unnamed: 0,entity,label
0,Linha do Transformador TF13 < 350 kV.,COMPARACAO-DE-VALOR
1,Abrir ou manter aberto o terminal do Transform...,ACAO
2,SE Coletora Porto Velho < 550 kV.,COMPARACAO-DE-VALOR
3,Desenergizar a LT 350 kV Santo Antônio.,ACAO
4,Redução no carregamento da LT 345 kV Embu-Guaç...,ACAO
5,Remanejar cargas alimentadas pela SE 345/88 kV...,ACAO
6,Transformador TF13 Santo Antônio < 350 kV,COMPARACAO-DE-VALOR
7,Coletora Porto Velho C3 ou C4.,OUTRO


This last dataset, which contains the entities and their respective labels, can be used in training jobs on Amazon Comprehend.