In [101]:
import random
import os
import pickle
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoProcessor
from transformers import DataCollatorWithPadding
from transformers import AutoModel, TrainingArguments, Trainer
import pandas as pd
import json
# from transformers import RobertaTokenizer, RobertaModel

# Seed Python's built-in `random` module so draws from it are repeatable.
# NOTE(review): only the stdlib RNG is seeded in this cell; numpy is imported
# above but is not seeded here -- confirm whether that is intentional.
random.seed(42)
# Presumably the fraction of examples assigned to the training split -- TODO confirm at the point of use.
percent_to_train = .8

# Hugging Face model identifier; presumably consumed by the Auto* loaders imported above -- verify.
model_name = 'allenai/biomed_roberta_base'
# Presumably the dropout probability applied to the model -- TODO confirm where it is consumed.
dropout = .03



In [36]:
# Set directories (from sam_baseline.ipynb)
# All paths are built relative to the current working directory, so this cell
# only yields the intended locations when the kernel is started from the
# baselines folder.
currentdir = os.getcwd() # ~/MeasEval/baselines
# NOTE(review): combopath_txt and combopath_annot are built from the same
# "tsv/" sub-path and therefore point at the same directory -- confirm whether
# the text path was meant to reference a different folder.
combopath_txt = os.path.join(currentdir, "../data/raw/combo/tsv/")
print("combopath_txt: ", combopath_txt)
combopath_annot = os.path.join(currentdir, "../data/raw/combo/tsv/")
print("combopath_annot :", combopath_annot)
# Directory holding the interim train/dev/test files that later cells load.
interimpath = os.path.join(currentdir, "../data/interim/")
print("interimpath: ", interimpath)


combopath_txt:  /home/michelle/MeasEval/baselines/../data/raw/combo/tsv/
combopath_annot : /home/michelle/MeasEval/baselines/../data/raw/combo/tsv/
interimpath:  /home/michelle/MeasEval/baselines/../data/interim/


### Run Functions

In [210]:
def generate_text_dfs(train_text_path, dev_text_path, test_text_path):
    """
    Load the train, dev and test TEXT json files into pandas DataFrames.

    Each file is expected to hold a JSON object mapping a docId to its text
    (assumption based on how the loaded data is reshaped below -- verify
    against the code that writes these files).

    Input: train, dev and test TEXT json file paths
    Output: tuple (train_text, dev_text, test_text) of pandas DataFrames, each
        with the columns 'docId' and 'text' and a fresh 0..n-1 index
    Raises: whatever `open` / `json.load` raise for a missing or malformed file
    """

    def load_text_df(json_path):
        """Read one text json file and return it as a (docId, text) DataFrame."""
        # Read explicitly as UTF-8: the documents contain non-ASCII characters,
        # and the platform default encoding is not UTF-8 everywhere.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # The JSON keys become the frame's index; move them into a regular
        # column named 'docId' without mutating anything in place.
        return (
            pd.DataFrame({'text': data})
            .reset_index()
            .rename(columns={'index': 'docId'})
        )

    return (
        load_text_df(train_text_path),
        load_text_df(dev_text_path),
        load_text_df(test_text_path),
    )


# train_text_test = generate_text_dfs(train_text_path = os.path.join(currentdir, "../data/interim/train_txt.json"),
#                                                     dev_text_path = os.path.join(currentdir, "../data/interim/dev_txt.json"),
#                                                     test_text_path = os.path.join(currentdir, "../data/interim/test_txt.json")
#                                                     )

# train_text_test
# train_text, dev_text, test_text = generate_text_dfs(train_text_path = os.path.join(currentdir, "../data/interim/train_txt.json"),
#                                                     dev_text_path = os.path.join(currentdir, "../data/interim/dev_txt.json"),
#                                                     test_text_path = os.path.join(currentdir, "../data/interim/test_txt.json")
#                                                     )


In [204]:
def annotation_csv_to_pandas(path):
    """
    Read an annotation csv into a DataFrame and cast its columns.

    Input: file path of csv
    Output: DataFrame with types cast to the following (other is all dictionaries)
        annot_index     int64  **This is different than the dataframe index**
        docId          string
        annotSet        int64
        annotType      string
        startOffset     int64
        endOffset       int64
        annotId        string
        text           string
        other          object
    Raises: TypeError if an 'other' cell is not a string once missing values
        have been filled (previously this surfaced as an UnboundLocalError)
    """
    csv_temp = pd.read_csv(path, header=0)
    # The csv's first column has no header, so pandas labels it 'Unnamed: 0'.
    csv_temp = csv_temp.rename(columns={'Unnamed: 0': 'annot_index'})
    # Missing 'other' cells become "{}" so they parse to an empty dict below.
    csv_temp['other'] = csv_temp['other'].fillna("{}")

    def str_to_dict(x):
        """
        Input: a string that looks like dict format (JSON)
        Output: the value parsed by json.loads (a dict for JSON objects)
        """
        if not isinstance(x, str):
            # Fail loudly with the offending value instead of falling through
            # to a return of an unassigned local.
            raise TypeError(
                "'other' values must be JSON strings, got "
                f"{type(x).__name__}: {x!r}"
            )
        return json.loads(x)

    for column in ('docId', 'annotType', 'text', 'annotId'):
        csv_temp[column] = csv_temp[column].astype('string')
    # `convert_dtype` is deprecated in pandas and True was its default, so it
    # is simply omitted here.
    csv_temp['other'] = csv_temp['other'].apply(str_to_dict)

    # json.loads can still return a float for a cell such as "1.5" or "NaN";
    # warn if any slipped through, because downstream code expects dicts.
    if any(isinstance(value, float) for value in csv_temp['other']):
        print("Warning: 'other' column contains floats")

    return csv_temp
    
# testing
# annotation_csv_to_pandas(os.path.join(currentdir, "../data/interim/train_annot.csv"))

In [205]:
def generate_annotation_dfs(train_annotation_path, dev_annotation_path, test_annotation_path):
    """
    Load the three annotation splits with annotation_csv_to_pandas.

    Input: train, dev and test ANNOTATION csv file paths
    Output: (train, dev, test) tuple of pandas dfs, in that order
    """
    # Splits are loaded in train -> dev -> test order, one csv at a time.
    split_paths = (train_annotation_path, dev_annotation_path, test_annotation_path)
    return tuple(annotation_csv_to_pandas(csv_path) for csv_path in split_paths)

### Generate Dataframes

In [211]:
# Generate annotation dataframes (one per split) from the interim csv files.
# Paths are resolved relative to `currentdir`, i.e. the kernel's working directory.
train_annot, dev_annot, test_annot = generate_annotation_dfs(train_annotation_path = os.path.join(currentdir, "../data/interim/train_annot.csv"),
                                            dev_annotation_path = os.path.join(currentdir, "../data/interim/dev_annot.csv"),
                                            test_annotation_path = os.path.join(currentdir, "../data/interim/test_annot.csv")
                                            )

# Generate text dataframes (one per split) from the interim json files.
train_text, dev_text, test_text = generate_text_dfs(train_text_path = os.path.join(currentdir, "../data/interim/train_txt.json"),
                                                    dev_text_path = os.path.join(currentdir, "../data/interim/dev_txt.json"),
                                                    test_text_path = os.path.join(currentdir, "../data/interim/test_txt.json")
                                                    )

### Testing (ignore)

In [215]:
# Sanity check: the loader should hand back a pandas DataFrame.
type(dev_text)

pandas.core.frame.DataFrame

In [212]:
# Preview the first rows of the dev text frame (columns: docId, text).
dev_text.head()

Unnamed: 0,docId,text
0,S0012821X12004384-1284,All analyses were carried out at the NERC Isot...
1,S0012821X12004384-1640,A negative carbon isotope excursion of 5‰ has ...
2,S0012821X13002185-1217,Cleaned sponge and diatom opal was dissolved v...
3,S0016236113008041-3012,The fate of trace elements was investigated in...
4,S0016236113008041-3127,Thermodynamic modelling using MTDATA software ...


In [185]:
# Preview the first rows of the test annotations; 'other' should display as parsed dicts.
test_annot.head()

Unnamed: 0,annot_index,docId,annotSet,annotType,startOffset,endOffset,annotId,text,other
0,216,S0019103512003995-2579,1,Quantity,562,571,T1-1,below 3Rp,"{'mods': ['IsRange'], 'unit': '3Rp'}"
1,217,S0019103512003995-2579,1,MeasuredProperty,538,561,T4-1,hydrostatic equilibrium,{'HasQuantity': 'T1-1'}
2,218,S0019103512003995-2579,1,MeasuredEntity,501,516,T2-1,density profile,{'HasProperty': 'T4-1'}
3,219,S0019103512003995-2579,1,Qualifier,490,495,T3-1,model,{'Qualifies': 'T2-1'}
4,220,S0019103512003995-2579,2,Quantity,673,679,T1-2,6300 K,"{'mods': ['IsMean'], 'unit': 'K'}"


In [188]:
# Check for NaNs in 'other' -- there shouldn't be any.
# annotation_csv_to_pandas replaces every NaN with "{}", which json.loads()
# turns into an empty dict. NaN is a float, so this selects rows whose 'other'
# value is a float; True below means no such rows exist.
test_annot[test_annot['other'].apply(lambda x: isinstance(x, float))].empty

True

In [187]:
# Confirm the column dtypes match the ones listed in annotation_csv_to_pandas' docstring.
print(test_annot.dtypes)


annot_index     int64
docId          string
annotSet        int64
annotType      string
startOffset     int64
endOffset       int64
annotId        string
text           string
other          object
dtype: object
