# Loading data

In [63]:
from transformers import pipeline
# English->Spanish and Spanish->English translation pipelines built from the
# Helsinki-NLP OPUS-MT checkpoints named below.
# NOTE(review): neither pipeline is referenced by the cells shown below;
# presumably intended for back-translation augmentation -- confirm or remove.
en_es = pipeline(
    task='translation',
    model='Helsinki-NLP/opus-mt-en-es',
)
es_en = pipeline(
    task='translation',
    model='Helsinki-NLP/opus-mt-es-en',
)

In [1]:
import itertools
import json

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from tqdm import tqdm


def _bio_tag_entries(annotated_entries):
    """Turn annotated entries into ``[note_id, spans, tags]`` rows.

    Args:
        annotated_entries: mapping stored under the 'annotated_entries' key
            of a clinais JSON file. Each value must provide 'note_id' and
            'boundary_annotation' -> 'gold', a list of items with 'span'
            and 'boundary' keys.

    Returns:
        One ``[note_id, spans, tags]`` list per entry, where ``tags[i]`` is
        'B-<boundary>' when item i carries a boundary and 'I-<boundary>'
        (the most recent boundary seen) when its boundary is None.
    """
    rows = []
    for entry in tqdm(annotated_entries.values()):
        spans = []
        bio_tags = []
        # Stays '' until the first boundary is seen, so spans preceding any
        # boundary are tagged with the bare prefix 'I-'.
        currentboundary = ''
        for g in entry['boundary_annotation']['gold']:
            spans.append(g['span'])
            if g['boundary'] is None:
                bio_tags.append('I-' + currentboundary)
            else:
                currentboundary = g['boundary']
                bio_tags.append('B-' + currentboundary)
        rows.append([entry['note_id'], spans, bio_tags])
    return rows


def _encode_tags(rows, label2id):
    """Replace the string tags of every row by their integer ids.

    Raises KeyError if a tag is missing from ``label2id``.
    """
    return [[note_id, spans, [label2id[t] for t in bio_tags]]
            for note_id, spans, bio_tags in rows]


# --- training split ---------------------------------------------------------
# JSON is read as UTF-8 explicitly so the text does not depend on the
# platform's default encoding.
with open('clinais.train.json', encoding='utf-8') as f:
    data = json.load(f)
finalresult = _bio_tag_entries(data['annotated_entries'])

# The label vocabulary is built from the training split only: sorted unique
# tag strings, numbered in that order.
tags = np.unique(list(itertools.chain.from_iterable(row[2] for row in finalresult)))
id2label = {i: tag for i, tag in enumerate(tags)}
label2id = {tag: i for i, tag in enumerate(tags)}

finalresult = _encode_tags(finalresult, label2id)

# --- development split ------------------------------------------------------
# Encoded with the training vocabulary; a tag that never occurs in the
# training split raises KeyError here.
with open('clinais.dev.json', encoding='utf-8') as f:
    data = json.load(f)
finalresultdev = _encode_tags(_bio_tag_entries(data['annotated_entries']), label2id)

# --- Hugging Face datasets --------------------------------------------------
df = pd.DataFrame(data=finalresult, columns=['id', 'tokens', 'tags'])
dataset_train = Dataset.from_pandas(df)

df = pd.DataFrame(data=finalresultdev, columns=['id', 'tokens', 'tags'])
dataset_val = Dataset.from_pandas(df)

dataset = DatasetDict(train=dataset_train, val=dataset_val)

In [16]:
# Four times the number of tokens in the first training note.
# NOTE(review): the reason for the factor 4 is not stated in the notebook --
# confirm what this estimate is used for.
4 * len(dataset['train'][0]['tokens'])

2028

## Data augmentation

In [86]:
from transformers import pipeline
from random import randint

In [87]:
# Fill-mask pipeline used by augment_data to propose replacement words.
fillmask = pipeline(
    task="fill-mask",
    model="BSC-LT/roberta-base-biomedical-clinical-es",
)

In [88]:
# Mask placeholder string of the fill-mask pipeline's tokenizer; augment_data
# splices it into the text in place of the word to be re-predicted.
mask_token = fillmask.tokenizer.mask_token

In [34]:
def augment_data(examples):
    """Augment a batch of tokenised notes by masked-word substitution.

    For every example one randomly chosen word among its first 250 words is
    replaced by ``mask_token``; the masked text is passed to ``fillmask`` and
    the first three predicted sequences are kept. Words beyond the first 250
    are appended unchanged to each prediction.

    Args:
        examples: batch mapping with parallel "tokens" (list of word lists)
            and "tags" (list of tag lists) entries.

    Returns:
        dict with "tokens" and "tags" lists holding, for each non-empty
        input example, the original tokens followed by three augmented
        variants, each paired with the example's unchanged tag list.
        Examples without tokens are skipped.

    Note:
        The augmented token lists come from splitting the model's decoded
        text on spaces, so their length is not guaranteed to equal the
        number of tags; callers should check alignment.
    """
    max_words = 250      # only this many leading words are sent to the model
    num_variants = 3     # predictions kept per example

    outputs = []
    tags = []
    for words, t in zip(examples["tokens"], examples["tags"]):
        window = min(len(words), max_words)
        if window == 0:
            # Nothing to mask (randint would raise on an empty range).
            continue

        # randint includes both endpoints, so the upper bound has to be
        # window - 1: an index equal to window replaces no word and merely
        # adds the mask, yielding one token more than there are tags.
        # A one-word example can only mask index 0.
        K = randint(1, window - 1) if window > 1 else 0

        masked_sentence = " ".join(words[:K] + [mask_token] + words[K+1:window])
        predictions = fillmask(masked_sentence)
        tail = words[window:]

        # Row 0 of each group: the original text, re-split on single spaces.
        outputs.append(" ".join(words).split(' '))
        for i in range(num_variants):
            sequence = predictions[i]["sequence"]
            if tail:
                sequence = sequence + " " + " ".join(tail)
            # NOTE(review): the first element of the split prediction is
            # dropped; presumably it is an empty string caused by a leading
            # space in the decoded sequence -- confirm with the tokenizer.
            outputs.append(sequence.split(' ')[1:])

        tags += [t] * (num_variants + 1)
    return {"tokens": outputs, "tags": tags}

In [35]:
# Run the augmentation over the training split in batches of 8 notes.
# augment_data returns more rows than it receives, so the original columns
# are removed and replaced by the returned "tokens"/"tags" columns.
augmented_dataset = dataset['train'].map(
    function=augment_data,
    batched=True,
    batch_size=8,
    remove_columns=dataset['train'].column_names,
)

  0%|          | 0/98 [00:00<?, ?ba/s]

In [39]:
# Pair the augmented training rows with the validation split (minus its
# "id" column, so both splits expose the same columns).
# This binding is replaced further down, once rows whose token and tag
# counts disagree have been dropped.
datasetNew = DatasetDict({
    "train": augmented_dataset,
    "val": dataset_val.remove_columns("id"),
})

In [52]:
# Export the augmented split as a plain dict of columns so its rows can be
# filtered in pure Python (despite the name, this is not a DataFrame here).
dftrain = augmented_dataset.to_dict()

In [54]:
# Keep only the rows whose token list and tag list have the same length;
# augmented rows whose re-split text no longer lines up with the tags are
# discarded.
aligned_rows = [
    (row_tokens, row_tags)
    for row_tokens, row_tags in zip(dftrain['tokens'], dftrain['tags'])
    if len(row_tokens) == len(row_tags)
]
tokens = [row_tokens for row_tokens, _ in aligned_rows]
tags = [row_tags for _, row_tags in aligned_rows]

In [58]:
# Rebuild the training split from the length-aligned rows and pair it with
# the validation split (without its "id" column).
newdict = {
    'tokens': tokens,
    'tags': tags,
}
dftrain = pd.DataFrame(data=newdict)
datasetNew = DatasetDict({
    "train": Dataset.from_pandas(dftrain),
    "val": dataset_val.remove_columns("id"),
})

In [59]:
# Spot check: token count and tag count of one kept training row.
sample_row = datasetNew['train'][2]
len(sample_row['tokens']), len(sample_row['tags'])

(507, 507)

In [60]:
# Write both splits to the 'augmented_dataset' directory (relative to the
# notebook's working directory).
datasetNew.save_to_disk('augmented_dataset')

Saving the dataset (0/1 shards):   0%|          | 0/2850 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/127 [00:00<?, ? examples/s]

In [61]:
# Display the splits, their columns and their row counts.
datasetNew

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 2850
    })
    val: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 127
    })
})