In [3]:

import os
import pandas as pd
import numpy as np
import json

In [4]:
# Set directories (from sam_baseline.ipynb)
# The working directory is wherever the notebook kernel was started; the
# recorded output below shows a run from the sandbox/ folder.
currentdir = os.getcwd()

# The interim data folder sits one level above the working directory.
# The trailing slash is part of the value.
interimpath = os.path.join(currentdir, "../data/interim/")
print(f"interimpath:  {interimpath}")


interimpath:  /Users/michelle/MIDS/266 NLP/MeasEval/sandbox/../data/interim/


In [5]:
def _load_json(filename):
    """
    Read one JSON file from the interim data directory.

    Input: file name relative to `interimpath`
    Output: the decoded JSON content
    """
    # JSON text is UTF-8; without an explicit encoding, open() falls back to
    # the platform locale and can mis-decode non-ASCII characters.
    with open(interimpath + filename, 'r', encoding='utf-8') as f:
        return json.load(f)


# Annotation tables, one CSV per split.
train_annot = pd.read_csv(interimpath + 'train_annot.csv')
dev_annot = pd.read_csv(interimpath + 'dev_annot.csv')
test_annot = pd.read_csv(interimpath + 'test_annot.csv')

# Matching JSON files, one per split.
train_txt = _load_json('train_txt.json')
dev_txt = _load_json('dev_txt.json')
test_txt = _load_json('test_txt.json')

### Run Functions

In [6]:
def preprocess(base_annot_df):
    """
    Flatten a raw annotation table: pull the unit and the relation link out of
    the 'other' column and compute, for every linked annotation, the character
    span that covers both the annotation and the annotation it links to.

    Input: DataFrame with columns docId, annotSet, annotType, startOffset,
           endOffset, annotId, text and other. Each 'other' cell is a JSON
           object string, a dict, or NaN (treated as an empty object). Any
           extra columns (e.g. the 'Unnamed: 0' index column written by
           to_csv) are dropped.
    Output: a NEW DataFrame (the input frame is not modified) with columns
            docId, annotSet, uniqueSet, annotType, startOffset, endOffset,
            annotId, text, unit, subspan_type, subspan_link, subspanStart,
            subspanEnd.
              unit          -> other['unit'], or "" when absent
              subspan_type  -> first of HasQuantity / HasProperty / Qualifies
                               found in 'other', or 0 when none is present
              subspan_link  -> the annotId stored under that key, or 0
              subspanStart  -> min start of this row and the linked row, or ""
              subspanEnd    -> max end of this row and the linked row, or ""
            Rows without a link, or whose link target does not exist in the
            same docId/annotSet, get "" for both sub-span columns.
    Raises: TypeError if an 'other' cell is neither a str nor a dict.
    """
    # Relation keys in priority order; the first one present wins.
    link_keys = ('HasQuantity', 'HasProperty', 'Qualifies')

    def str_to_dict(x):
        """
        Input: a string or dict that looks like dict format
        Output: dict
        """
        if isinstance(x, dict):
            return x
        if isinstance(x, str):
            return json.loads(x)
        raise TypeError(
            "'other' must hold a JSON string or a dict, got {!r} ({})".format(
                x, type(x).__name__))

    def get_link(other):
        """
        Return (relation key, linked annotId) for one 'other' dict,
        or (0, 0) when the annotation links to nothing.
        """
        for key in link_keys:
            if key in other:
                return key, other[key]
        return 0, 0

    # Work on a copy so the caller's frame keeps its raw columns and the
    # function can be re-run on the same input.
    df = base_annot_df.copy()

    # convert datatypes
    for col in ('docId', 'annotType', 'annotSet', 'text', 'annotId'):
        df[col] = df[col].astype('string')
    others = [str_to_dict(cell) for cell in df['other'].fillna("{}")]

    # generate 'unit', 'subspan_type' and 'subspan_link' columns
    links = [get_link(other) for other in others]
    df['unit'] = [other.get('unit', "") for other in others]
    df['subspan_type'] = [kind for kind, _ in links]
    df['subspan_link'] = [target for _, target in links]

    # annotIds repeat across documents and sets, so a link has to be resolved
    # within its own docId/annotSet.
    df['uniqueSet'] = df["docId"] + "-" + df["annotSet"]

    # (uniqueSet, annotId) -> (start, end). One pass instead of scanning the
    # frame once per row; setdefault keeps the first row if an id repeats.
    span_of = {}
    for key, span in zip(zip(df['uniqueSet'], df['annotId']),
                         zip(df['startOffset'], df['endOffset'])):
        span_of.setdefault(key, span)

    # Resolve every row against ITS OWN link. Looking rows up by
    # docId+annotSet+annotType is not enough: one set may hold several
    # annotations of the same type, each pointing at a different target.
    starts, ends = [], []
    for set_id, link, start, end in zip(df['uniqueSet'], df['subspan_link'],
                                        df['startOffset'], df['endOffset']):
        # Non-string links (including the 0 placeholder) cannot match an annotId.
        target = span_of.get((set_id, link)) if isinstance(link, str) else None
        if target is None:
            starts.append("")
            ends.append("")
        else:
            starts.append(min(start, target[0]))
            ends.append(max(end, target[1]))

    # Lists are assigned by position, so any index on the input frame works.
    df['subspanStart'] = starts
    df['subspanEnd'] = ends

    return df[["docId", "annotSet", "uniqueSet", "annotType", "startOffset", "endOffset", "annotId",
               "text", "unit", "subspan_type", "subspan_link", "subspanStart", "subspanEnd"]]
    
    

### Testing

In [7]:
# Run preprocess on the training annotations; the result is inspected in the next cell.
new = preprocess(train_annot)


In [8]:
# Display the processed training annotations (the recorded output elides the middle rows).
new

Unnamed: 0,docId,annotSet,uniqueSet,annotType,startOffset,endOffset,annotId,text,unit,subspan_type,subspan_link,subspanStart,subspanEnd
0,S0012821X12004384-1610,1,S0012821X12004384-1610-1,Quantity,90,98,T1-1,2617.4 m,m,0,0,,
1,S0012821X12004384-1610,1,S0012821X12004384-1610-1,MeasuredEntity,4,14,T3-1,brief peak,,HasQuantity,T1-1,4,98
2,S0012821X12004384-1610,1,S0012821X12004384-1610-1,Qualifier,15,30,T4-1,in Apectodinium,,Qualifies,T3-1,4,30
3,S0012821X12004384-1610,2,S0012821X12004384-1610-2,Quantity,669,688,T1-2,2619.6 and 2614.7 m,m,0,0,,
4,S0012821X12004384-1610,2,S0012821X12004384-1610-2,MeasuredEntity,638,649,T3-2,other peaks,,HasQuantity,T1-2,638,688
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3506,S2213671113001306-910,3,S2213671113001306-910-3,MeasuredEntity,983,993,T3-3,scale bars,,HasProperty,T4-3,983,1030
3507,S2213671113001306-910,4,S2213671113001306-910-4,Quantity,1015,1021,T1-4,0.2 μm,μm,0,0,,
3508,S2213671113001306-910,4,S2213671113001306-910-4,MeasuredProperty,1022,1030,T2-4,distance,,HasQuantity,T1-4,1015,1030
3509,S2213671113001306-910,4,S2213671113001306-910-4,MeasuredEntity,983,993,T3-4,scale bars,,HasProperty,T2-4,983,1030
