In [1]:
import pickle
import pandas as pd
import csv
import json
import os
from stanza.utils.conll import CoNLL
from stanza.models.common.doc import Document


# Merge data from review back to original stanza data.

In [2]:
#Load data used for dataset creation
df_shuffled = pd.read_pickle('intermediate_store/preprocessed_ads_V1.0.pickle')

#Load reviewed annotation data Appen
with open('reviewed_data/initial_1300.json') as json_file:
    rewiewed_json = json.load(json_file)
    

In [3]:
#Quick check if the token length is still the same
print('Example 0 how many tokens are in this sentence initially after stanza ', sum(len(row) for row in df_shuffled.loc[9119].doc.to_dict()))
print('Example 0 how many tokens are in this sentence after review', len(rewiewed_json[0]['data']['clean_description'].split(' ')))


Example 0 how many tokens are in this sentence initially after stanza  30
Example 0 how many tokens are in this sentence after review 30


In [4]:
#Detailed comparison


df_shuffled.loc[9119].doc.to_dict()
rewiewed_json[0]['data']['clean_description'].split(' ')

['ALPRAZOLAM',
 'POWDER',
 'RATES.',
 'HIT',
 'WICKR',
 'ID',
 'drcoke',
 '10',
 'g',
 '497',
 '25g',
 '997',
 '50',
 'g',
 '1697',
 '100',
 'g',
 '2797',
 '250',
 'g',
 '5997',
 'ALPRAZOLAM',
 'TABLETS',
 '100tabs300',
 '200tabs',
 '580',
 '400tabs',
 '850',
 '1000tabs',
 '1860.']

In [12]:
rewiewed_json[0]['data']

{'clean_description': 'ALPRAZOLAM POWDER RATES. HIT WICKR ID drcoke 10 g 497 25g 997 50 g 1697 100 g 2797 250 g 5997 ALPRAZOLAM TABLETS 100tabs300 200tabs 580 400tabs 850 1000tabs 1860.',
 'dream_id': 416407,
 'golden': True,
 'name': 'ALPRAZOLAM POWDER,ALPRAZOLAM TABLETS',
 'p_id': 9119}

In [5]:
# In detail comparison between the spans --> Do this for all of them
flat_list_orig = [item for sublist in df_shuffled.loc[9119].doc.to_dict() for item in sublist]
flat_list_rev = rewiewed_json[0]['data']['clean_description'].split(' ')

for i, rev_span in enumerate(flat_list_rev):
    print ("Orig: {} Review: {} --> is the same: {} ".format(flat_list_orig[i]['text'], rev_span, flat_list_orig[i]['text']==rev_span))

Orig: ALPRAZOLAM Review: ALPRAZOLAM --> is the same: True 
Orig: POWDER Review: POWDER --> is the same: True 
Orig: RATES Review: RATES. --> is the same: False 
Orig: HIT Review: HIT --> is the same: True 
Orig: WICKR Review: WICKR --> is the same: True 
Orig: ID Review: ID --> is the same: True 
Orig: drcoke Review: drcoke --> is the same: True 
Orig: 10 Review: 10 --> is the same: True 
Orig: g Review: g --> is the same: True 
Orig: 497 Review: 497 --> is the same: True 
Orig: 25g Review: 25g --> is the same: True 
Orig: 997 Review: 997 --> is the same: True 
Orig: 50 Review: 50 --> is the same: True 
Orig: g Review: g --> is the same: True 
Orig: 1697 Review: 1697 --> is the same: True 
Orig: 100 Review: 100 --> is the same: True 
Orig: g Review: g --> is the same: True 
Orig: 2797 Review: 2797 --> is the same: True 
Orig: 250 Review: 250 --> is the same: True 
Orig: g Review: g --> is the same: True 
Orig: 5997 Review: 5997 --> is the same: True 
Orig: ALPRAZOLAM Review: ALPRAZOLAM

In [6]:
rewiewed_json[0]

{'completions': [{'created_at': 1613048258,
   'id': 471001,
   'lead_time': 10.0,
   'result': [{'from_name': 'label',
     'id': 'c965a1fe-129b-49e0-869b-0e7b2ac37e09',
     'to_name': 'text',
     'type': 'labels',
     'value': {'end': 10,
      'labels': ['Drug'],
      'start': 0,
      'text': 'ALPRAZOLAM'}},
    {'from_name': 'label',
     'id': '2f7a3b5c-8f5a-4d2a-979f-ea0fedd78346',
     'to_name': 'text',
     'type': 'labels',
     'value': {'end': 104,
      'labels': ['Drug'],
      'start': 94,
      'text': 'ALPRAZOLAM'}}]}],
 'data': {'clean_description': 'ALPRAZOLAM POWDER RATES. HIT WICKR ID drcoke 10 g 497 25g 997 50 g 1697 100 g 2797 250 g 5997 ALPRAZOLAM TABLETS 100tabs300 200tabs 580 400tabs 850 1000tabs 1860.',
  'dream_id': 416407,
  'golden': True,
  'name': 'ALPRAZOLAM POWDER,ALPRAZOLAM TABLETS',
  'p_id': 9119},
 'id': 471}

### Evaluate how many rows cause a problem and subsequently merge all rows without conflicts in a new df.



In [7]:
def split_with_indexes(string):
    token_list = string.split(' ')
    tuple_list = []
    curr_start = 0
    for token in token_list:
        tuple_list.append( (token, curr_start, curr_start+len(token)-1) )#real end char, not start of next one
        curr_start += len(token) + 1
    return tuple_list
def annotate_doc(review_json, doc):
    
    #create splitted review token list
    #create a tag list with same length
    reviewed_tuples = split_with_indexes(review_json['data']['clean_description'])
    tag_list_rev = ['O' for i in range(len(reviewed_tuples))]
    
    #For each annotation check which tokens shall be labelled
    for tag_dict in review_json['completions'][0]['result']:
        first=True
        
        for j, (token, r_start, r_end) in enumerate(reviewed_tuples):
            tag_start = tag_dict['value']['start']
            tag_end = tag_dict['value']['end'] - 1 #real end char, not start of next one
            #print('r_start {} r_end {} tag_start {} tag_end {} token: {}'.format(r_start, r_end, tag_start, tag_end, token))
            if tag_end >= r_start and tag_start <= r_end and tag_dict['value']['labels'] == ['Drug']:
                if first:
                    tag_list_rev[j] = 'B-Drug'
                    first=False
                else:
                    tag_list_rev[j] = 'I-Drug'
    
    #loop over doc and annotate the tokens in misc.
    token_id=0
    for sentence in doc:
        for token_dict in sentence:
            token_dict['misc'] = token_dict['misc'] + '|label=' + tag_list_rev[token_id]
            token_id+=1
    return doc
    
#split_with_indexes(rewiewed_json[300]['data']['clean_description'])   
#annotate_doc(rewiewed_json[115], df_shuffled.loc[631].doc.to_dict())
annotate_doc(rewiewed_json[79], df_shuffled.loc[115].doc.to_dict())

[[{'id': 1, 'text': '17g', 'misc': 'start_char=0|end_char=3|label=O'},
  {'id': 2, 'text': 'Super', 'misc': 'start_char=4|end_char=9|label=O'},
  {'id': 3, 'text': 'Hash', 'misc': 'start_char=10|end_char=14|label=B-Drug'},
  {'id': 4, 'text': 'Pollen', 'misc': 'start_char=15|end_char=21|label=O'},
  {'id': 5, 'text': '/', 'misc': 'start_char=22|end_char=23|label=O'},
  {'id': 6, 'text': 'Rif', 'misc': 'start_char=24|end_char=27|label=O'},
  {'id': 7, 'text': 'Al', 'misc': 'start_char=28|end_char=30|label=O'},
  {'id': 8, 'text': 'Hoceima', 'misc': 'start_char=31|end_char=38|label=O'}]]

In [8]:
#Check each review item if the annotations stayed the same

amt_same_len = 0
amt_not_same_len = 0
amt_exactly_same = 0
amt_not_exactly_same = 0

for i, review in enumerate(rewiewed_json):
    p_id = review['data']['p_id']
    flat_list_orig = [item for sublist in df_shuffled.loc[p_id].doc.to_dict() for item in sublist]
    flat_list_rev =review['data']['clean_description'].split(' ')
    length_problem=False
    
    #Check if length matches
    if len(flat_list_orig) == len(flat_list_rev):
        amt_same_len+=1
    else:
        amt_not_same_len +=1
        print('Length Problem at pandas_id: {} and json item nr: {}'.format(p_id, i))
        length_problem=True
    
    #Check each token if they are the equal:
    same = True
    for j, rev_span in enumerate(flat_list_rev):
        if not flat_list_orig[j]['text']==rev_span:
            if not ( flat_list_orig[j]['text'] == rev_span[:-1] and rev_span[-1:] == '.' ):
                same = False
                print('Detail Problem at pandas_id: {} and json item nr: {}'.format(p_id, i))
                break
    if same:
        amt_exactly_same+=1
        #If length and exact tokens are ok --> add label!
        if not length_problem:
            new_doc = annotate_doc(review, df_shuffled.loc[p_id].doc.to_dict())
            new_doc = Document(new_doc)
            df_shuffled.at[p_id, 'doc'] = new_doc
    else:
        amt_not_exactly_same+=1
    
print('{} rows had still the same amount of tokens. {} not. this is {} %'.format(amt_same_len, amt_not_same_len, amt_not_same_len/amt_same_len ))
print('{} rows had exactly the same tokens. {} not. this is {} %'.format(amt_exactly_same, amt_not_exactly_same, amt_not_exactly_same/amt_exactly_same))

Length Problem at pandas_id: 742 and json item nr: 344
Detail Problem at pandas_id: 742 and json item nr: 344
Length Problem at pandas_id: 802 and json item nr: 1105
1307 rows had still the same amount of tokens. 2 not. this is 0.001530221882172915 %
1308 rows had exactly the same tokens. 1 not. this is 0.0007645259938837921 %


In [9]:
#Investigate on errors
#rewiewed_json[344]['data']['clean_description'].split(' ')
#df_shuffled.loc[742].doc.to_dict()

# Length Problem at pandas_id: 742 and json item nr: 344 --> I have no idea what happened here. in the "shortTextDrugsV1.0.tsv" 
#which I uploaded to appen it was still correct. In the Appen Export it was only a single " in clean description --> guess that stuff
# was lost in the cloud.
# Length Problem at pandas_id: 802 and json item nr: 1105 
#--> That's a simple problem (a additional space in the end which got removed at trim()) 


In [10]:
test_dict = df_shuffled.loc[115].doc.to_dict()
test_dict
#conll = CoNLL.convert_dict(test_dict)
#conll

[[{'id': 1, 'text': '17g', 'misc': 'start_char=0|end_char=3|label=O'},
  {'id': 2, 'text': 'Super', 'misc': 'start_char=4|end_char=9|label=O'},
  {'id': 3, 'text': 'Hash', 'misc': 'start_char=10|end_char=14|label=B-Drug'},
  {'id': 4, 'text': 'Pollen', 'misc': 'start_char=15|end_char=21|label=O'},
  {'id': 5, 'text': '/', 'misc': 'start_char=22|end_char=23|label=O'},
  {'id': 6, 'text': 'Rif', 'misc': 'start_char=24|end_char=27|label=O'},
  {'id': 7, 'text': 'Al', 'misc': 'start_char=28|end_char=30|label=O'},
  {'id': 8, 'text': 'Hoceima', 'misc': 'start_char=31|end_char=38|label=O'}]]

In [11]:
rewiewed_json[79]

{'completions': [{'created_at': 1612790712,
   'id': 99001,
   'lead_time': 1385.539,
   'result': [{'from_name': 'label',
     'id': 'Vb3CqYB7cF',
     'to_name': 'text',
     'type': 'labels',
     'value': {'end': 15,
      'labels': ['Drug'],
      'start': 10,
      'text': 'Hash '}}]}],
 'data': {'clean_description': '17g Super Hash Pollen / Rif Al Hoceima.',
  'dream_id': 367562,
  'golden': False,
  'name': '17g Super Hash Pollen / Rif Al Hoceima',
  'p_id': 115},
 'id': 99}