# Compiling all of the datasets into one training dataset

At this point, custom-labeled train, val, and test sets have been created using rule-based and statistical tagging with the IOB method:
 - Disease B-DIS
 - Location B-LOC
 - Incidence/Prevalence B-EPI (tagged in case of relation extraction later on)
 - Statistics B-STAT
 
### Additionally
- The val set will remain unchanged
- The test set will be modified by manual curation  

There are also the MISC datasets {CoNLL(PP) location dataset, NCBI-disease, BC5CDR-disease, (i2b2-disease?)}  
All of these have train/val/test (and train_dev for BC5CDR) sets  
Given that my goal is to validate and test identification of Disease | Location | Incidence/Prevalence | Statistics on a realistic dataset, I will fold all splits (train/val/test, plus train_dev for BC5CDR) of the MISC datasets into the training data, and run validation and testing only on my custom dataset.

### Goals:
1. Read in CoNLL(PP) for location
2. Read in NCBI-disease
3. Read in BC5CDR-disease
4. Read in the custom dataset
5. Combine them into a training set
6. Save in a way that the tokenizer can read

## (1) CoNLL(PP)-Location

In [1]:
# One-time setup: uncomment to install the dependencies in a fresh environment
#!pip install transformers
#!pip install datasets
from datasets import load_dataset
# Load the "conllpp" dataset through the Hugging Face `datasets` library
coNLL = load_dataset("conllpp")
# Display the available splits and their features
coNLL

Reusing dataset conllpp (/work/wzkariampuzha/.cache/huggingface/datasets/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [2]:
#NER_tag '5' is B-LOC, '6' is I-LOC
#Keep only the sentences (and tags) that carry location annotations

def read_loc_dataset(dataset):
    """Extract location-only IOB tags from a dataset of tagged sentences.

    Args:
        dataset: iterable of sentence records; each record provides a
            'tokens' list and a parallel 'ner_tags' list of integer tag ids.

    Returns:
        A ``(token_docs, tag_docs)`` tuple of parallel lists. ``token_docs``
        holds the token list of every kept sentence and ``tag_docs`` the
        matching list of 'B-LOC' / 'I-LOC' / 'O' strings.

    Sentences without any location tag are skipped. Sentences whose token
    and tag counts differ are reported and skipped as well (the same policy
    as ``read_dis_dataset``), because a misaligned pair cannot be encoded.
    """
    loc_labels = {5: 'B-LOC', 6: 'I-LOC'}
    token_docs = []
    tag_docs = []
    for sentence in dataset:
        ner_tags = sentence['ner_tags']
        #Only add sentences that actually have location tags (i.e. meaningfully annotated sentences)
        if not any(tag in loc_labels for tag in ner_tags):
            continue

        #Only keep location tags; every other entity type becomes 'O'
        tags = [loc_labels.get(tag, 'O') for tag in ner_tags]

        #Report and drop the sentence if tokens and tags are misaligned
        if len(sentence['tokens']) != len(tags):
            print('mismatch')
            print(sentence['tokens'])
            print(tags)
            continue

        token_docs.append(sentence['tokens'])
        tag_docs.append(tags)

    return token_docs, tag_docs

In [3]:
# Extract location-tagged sentences from each of the three conllpp splits
train_texts_loc, train_tags_loc = read_loc_dataset(coNLL["train"])
val_texts_loc, val_tags_loc = read_loc_dataset(coNLL["validation"])
test_texts_loc, test_tags_loc = read_loc_dataset(coNLL["test"])
# Unused alternative: keep the splits as a list of datasets instead of concatenating
#loc_tokens = [train_texts_loc, val_texts_loc, test_texts_loc]
#loc_tags = [train_tags_loc, val_tags_loc, test_tags_loc]

In [4]:
#Print the first two location sentences, one "token tag" pair per line
for sent_idx in (0, 1):
    for tok_idx, token in enumerate(train_texts_loc[sent_idx]):
        print(token, train_tags_loc[sent_idx][tok_idx])
    print('')

BRUSSELS B-LOC
1996-08-22 O

Germany B-LOC
's O
representative O
to O
the O
European O
Union O
's O
veterinary O
committee O
Werner O
Zwingmann O
said O
on O
Wednesday O
consumers O
should O
buy O
sheepmeat O
from O
countries O
other O
than O
Britain B-LOC
until O
the O
scientific O
advice O
was O
clearer O
. O



## (2) NCBI-disease

In [None]:
# Load the "ncbi_disease" dataset through the Hugging Face `datasets` library
ncbi_dz = load_dataset("ncbi_disease")
# Display the loaded dataset object
ncbi_dz

In [None]:
#NER_tag '1' is B-DIS, '2' is I-DIS
#https://github.com/huggingface/datasets/tree/master/datasets/ncbi_disease

def read_dis_dataset(dataset):
    """Convert integer disease tags to IOB strings for every sentence.

    Args:
        dataset: iterable of sentence records with a 'tokens' list and a
            parallel 'ner_tags' list of integer tag ids.

    Returns:
        A ``(token_docs, tag_docs)`` tuple of parallel lists holding, per
        kept sentence, its tokens and its 'B-DIS' / 'I-DIS' / 'O' tags.
        Sentences whose token and tag counts differ are reported and skipped.
    """
    all_tokens, all_tags = [], []
    for record in dataset:
        iob = [
            'I-DIS' if tag_id == 2 else ('B-DIS' if tag_id == 1 else 'O')
            for tag_id in record['ner_tags']
        ]
        words = record['tokens']

        if len(words) == len(iob):
            all_tokens.append(words)
            all_tags.append(iob)
            continue

        #Report misaligned sentences instead of keeping them
        print('mismatch')
        print(len(words))
        print(len(iob))

    return all_tokens, all_tags

In [None]:
# Convert each of the three ncbi_disease splits to token lists and IOB tag lists
train_texts_dis, train_tags_dis = read_dis_dataset(ncbi_dz["train"])
val_texts_dis, val_tags_dis = read_dis_dataset(ncbi_dz["validation"])
test_texts_dis, test_tags_dis = read_dis_dataset(ncbi_dz["test"])

In [None]:
#Show the data: print the first two disease sentences, one "token tag" pair per line
for i in range(2): #for each of the first two sentences
    # Iterate the disease sentence itself; sizing the loop from the location
    # data printed the wrong number of tokens or ran past the end.
    for token, tag in zip(train_texts_dis[i], train_tags_dis[i]): #for token in sentence
        print(token, tag)
    print('')

## (3) BC5CDR-disease

In [None]:
# Paths of the four BC5CDR-disease TSV files, keyed by split name
files = {'train':"./datasets/NER/BC5CDR-disease/train.tsv", 
         'test':"./datasets/NER/BC5CDR-disease/test.tsv",
         'train_dev':"./datasets/NER/BC5CDR-disease/train_dev.tsv",
         'devel':"./datasets/NER/BC5CDR-disease/devel.tsv"}

Since these datasets are not split into different abstracts, only sentences, I am going to make a single list out of the four sets so that the hierarchy is the same as the other datasets.

In [None]:
import csv
# Read every BC5CDR-disease file into one flat list of sentences.
# Each non-blank row is "token<TAB>tag" (tag is 'B', 'I' or anything else for
# outside); a blank row terminates the current sentence.
BC5CDR_tokens,BC5CDR_tags = [],[]
for key,value in files.items():
    with open(value,'r') as file:
        # QUOTE_NONE: quote characters are ordinary token text, not CSV quoting
        reader = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
        sentence_tokens, sentences_tags=[],[]
        for row in reader:
            if not row:
                # Blank row: store the finished sentence and start a new one
                BC5CDR_tokens.append(sentence_tokens)
                BC5CDR_tags.append(sentences_tags)
                sentence_tokens, sentences_tags = [], []
            elif len(row)%2==0:
                sentence_tokens.append(row[0])
                if row[1]=='I':
                    sentences_tags.append('I-DIS')
                elif row[1]=='B':
                    sentences_tags.append('B-DIS')
                else:
                    sentences_tags.append('O')
            else:
                print('bad row',row)
        # Keep the last sentence when the file does not end with a blank row
        if sentence_tokens:
            BC5CDR_tokens.append(sentence_tokens)
            BC5CDR_tags.append(sentences_tags)
    # The with-block has already closed the file here
    print('Done with',key)

In [None]:
#Preview: first half of each of the first two BC5CDR sentences
for sent_idx in (0, 1):
    half_len = len(BC5CDR_tokens[sent_idx]) // 2
    for tok_idx in range(half_len):
        print(BC5CDR_tokens[sent_idx][tok_idx], BC5CDR_tags[sent_idx][tok_idx])
    print('')

## (4) Read custom training dataset

In [5]:
import csv
# Read the custom training set: one "token<TAB>tag" row per token, with a
# blank row terminating each sentence.
epi_train_abstracts, epi_train_labels= [],[]
with open('epi_train_setV2.tsv','r') as f:
    # QUOTE_NONE (as in the other loaders): a token such as '"' is data and
    # must not be interpreted as the start of a quoted CSV field
    reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    sentence_tokens,sentences_tags=[],[]
    for row in reader:
        if not row:
            # Blank row: store the finished sentence and start a new one
            epi_train_abstracts.append(sentence_tokens)
            epi_train_labels.append(sentences_tags)
            sentence_tokens,sentences_tags=[],[]
        elif len(row)%2==0:
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        # Rows with an odd number of fields are ignored
    # Keep the last sentence when the file does not end with a blank row
    if sentence_tokens:
        epi_train_abstracts.append(sentence_tokens)
        epi_train_labels.append(sentences_tags)

In [6]:
#Preview: first half of each of the first two custom training sentences
for sent_idx in (0, 1):
    n_shown = len(epi_train_abstracts[sent_idx]) // 2
    for tok_idx in range(n_shown):
        token = epi_train_abstracts[sent_idx][tok_idx]
        label = epi_train_labels[sent_idx][tok_idx]
        print(token, label)
    print('')

Background O
Chemotherapy O
- O
induced O
cardiomyopathy O
( O
CICM O
) O
and O
heart O
failure O
are O
major O

There O
is O
limited O
data O
on O
the O
incidence B-EPI
and O
risk O
factors O



## (5) Combine

In [None]:
#V1: every location, NCBI-disease and BC5CDR sentence plus the custom training set
train_texts = [
    *train_texts_loc, *train_texts_dis,
    *val_texts_loc, *val_texts_dis,
    *test_texts_loc, *test_texts_dis,
    *BC5CDR_tokens,
    *epi_train_abstracts,
]
train_tags = [
    *train_tags_loc, *train_tags_dis,
    *val_tags_loc, *val_tags_dis,
    *test_tags_loc, *test_tags_dis,
    *BC5CDR_tags,
    *epi_train_labels,
]

# Unused alternative: keep each source as its own list instead of concatenating
#train_texts = [train_texts_loc, train_texts_dis, val_texts_loc, val_texts_dis, test_texts_loc, test_texts_dis, BC5CDR_tokens, epi_train_abstracts]
#train_tags = [train_tags_loc, train_tags_dis, val_tags_loc, val_tags_dis, test_tags_loc, test_tags_dis, BC5CDR_tags, epi_train_labels]

In [7]:
#V2: custom training set first, followed by all three location splits
train_texts = [*epi_train_abstracts, *train_texts_loc, *val_texts_loc, *test_texts_loc]
train_tags = [*epi_train_labels, *train_tags_loc, *val_tags_loc, *test_tags_loc]

In [8]:
# Sanity check: print the number of sentences and the number of tag lists
print(len(train_texts),len(train_tags))

12289 12289


## (6) Save

In [9]:
# Validate alignment before touching the output file, so a mismatch cannot
# leave a half-written TSV behind.
if len(train_texts) != len(train_tags):
    raise ValueError(f'{len(train_texts)} sentences but {len(train_tags)} tag lists')
for i, (tokens, tags) in enumerate(zip(train_texts, train_tags)):
    if len(tokens) != len(tags):
        raise ValueError(f'sentence {i}: {len(tokens)} tokens but {len(tags)} tags')

# Write one "token<TAB>tag" line per token; a blank line ends each sentence
with open('training_setV2.tsv', "w") as f:
    for i, (tokens, tags) in enumerate(zip(train_texts, train_tags)): #for sentence in training set
        for token, tag in zip(tokens, tags): #for token in sentence
            output = str(token) +'\t' +str(tag)+'\n'
            f.write(output)
            if i<3:
                # Echo the first three sentences as a visual check
                print(output)
        f.write('\n')
        if i%500==0:
            # Progress indicator
            print(i)

Background	O

Chemotherapy	O

-	O

induced	O

cardiomyopathy	O

(	O

CICM	O

)	O

and	O

heart	O

failure	O

are	O

major	O

complications	O

of	O

cancer	O

therapeutics	O

and	O

can	O

result	O

in	O

significant	O

morbidity	O

and	O

mortality	O

.	O

0
There	O

is	O

limited	O

data	O

on	O

the	O

incidence	B-EPI

and	O

risk	O

factors	O

of	O

CICM	O

in	O

African	O

American	O

and	O

Afro	O

-	O

Caribbean	O

patients	O

.	O

Methods	O

We	O

performed	O

a	O

retrospective	O

chart	O

review	O

to	O

evaluate	O

the	O

baseline	O

characteristics	O

that	O

may	O

predispose	O

to	O

CICM	O

.	O

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000


In [None]:
# Alternative writer for when train_texts/train_tags hold a list of datasets
# instead of one concatenated list. It is disabled: the code sits inside a
# string literal, so only the final print runs.
'''
with open('training_set.tsv', "w") as f:
    for s in range(len(train_texts)): #for set in training sets
        for i in range(len(train_texts[s])): #For sentence in set
            for j in range(len(train_texts[s][i])): #for token in sentence
                output = str(train_texts[s][i][j]) +'\t' +str(train_tags[s][i][j])+'\n'
                f.write(output)
            f.write('\n')
            if i%500==0:
                print(i)
f.close()
'''
print('')

## Testing adding in the data

In [None]:
#Load the training data: one "token<TAB>tag" row per token, blank row ends a sentence
# NOTE(review): this reads 'training_set.tsv', while the save cell above writes
# 'training_setV2.tsv' — confirm which file is intended.
import csv
train_texts, train_tags= [],[]
with open('training_set.tsv','r') as f:
    # QUOTE_NONE: quote characters are ordinary token text, not CSV quoting
    reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    sentence_tokens, sentences_tags=[],[]
    for row in reader:
        if not row:
            # Blank row: store the finished sentence and start a new one
            train_texts.append(sentence_tokens)
            train_tags.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
        elif len(row)%2==0:
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        # Rows with an odd number of fields are ignored
    # Keep the last sentence when the file does not end with a blank row
    if sentence_tokens:
        train_texts.append(sentence_tokens)
        train_tags.append(sentences_tags)

In [None]:
#Preview: first half of each of the first two training sentences
for sent_idx in (0, 1):
    for tok_idx in range(len(train_texts[sent_idx]) // 2):
        print(train_texts[sent_idx][tok_idx], train_tags[sent_idx][tok_idx])
    print('')

In [None]:
#Load the validation data: one "token<TAB>tag" row per token, blank row ends a sentence
import csv
val_texts, val_tags= [],[]
with open('epi_val_set.tsv','r') as f:
    # QUOTE_NONE (as in the training/test loaders): a token such as '"' is
    # data and must not be interpreted as the start of a quoted CSV field
    reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    sentence_tokens, sentences_tags=[],[]
    for row in reader:
        if not row:
            # Blank row: store the finished sentence and start a new one
            val_texts.append(sentence_tokens)
            val_tags.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
        elif len(row)%2==0:
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        # Rows with an odd number of fields are ignored
    # Keep the last sentence when the file does not end with a blank row
    if sentence_tokens:
        val_texts.append(sentence_tokens)
        val_tags.append(sentences_tags)

In [None]:
#Preview: first half of each of the first two validation sentences
for sent_idx in (0, 1):
    shown = len(val_texts[sent_idx]) // 2
    for tok_idx in range(shown):
        print(val_texts[sent_idx][tok_idx], val_tags[sent_idx][tok_idx])
    print('')

In [None]:
#Load the testing data: one "token<TAB>tag" row per token, blank row ends a sentence
import csv
test_texts, test_tags= [],[]
with open('epi_test_set.tsv','r') as f:
    # QUOTE_NONE: quote characters are ordinary token text, not CSV quoting
    reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    sentence_tokens, sentences_tags=[],[]
    for row in reader:
        if not row:
            # Blank row: store the finished sentence and start a new one
            test_texts.append(sentence_tokens)
            test_tags.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
        elif len(row)%2==0:
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        # Rows with an odd number of fields are ignored
    # Keep the last sentence when the file does not end with a blank row
    if sentence_tokens:
        test_texts.append(sentence_tokens)
        test_tags.append(sentences_tags)

In [None]:
#Preview: first half of each of the first two test sentences
for sent_idx in (0, 1):
    tokens_to_show = len(test_texts[sent_idx]) // 2
    for tok_idx in range(tokens_to_show):
        print(test_texts[sent_idx][tok_idx], test_tags[sent_idx][tok_idx])
    print('')

In [None]:
# Collect every tag string that occurs in any of the three splits
unique_tags = set(tag for doc in (train_tags+val_tags+test_tags) for tag in doc)
# Number the tags in sorted order. Iterating the set directly gives an order
# that can differ between interpreter runs (string hash randomization), so
# the tag->id mapping could disagree with a model saved by an earlier run.
tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

print(unique_tags)
print(tag2id)
print(id2tag)

In [None]:
#Modified
from transformers import BertTokenizerFast

# Fast tokenizer loaded from the 'dmis-lab/biobert-large-cased-v1.1' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-large-cased-v1.1')

# The texts are already lists of words (is_split_into_words=True). Offsets are
# requested because encode_tags reads encodings.offset_mapping.
# NOTE(review): truncation=True is passed without max_length; presumably the
# tokenizer falls back to the model's maximum length — confirm.
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [None]:
import numpy as np

def encode_tags(tags, encodings, which_set):
    """Align word-level tags with the tokenizer's output positions.

    For every document, positions whose offset is ``(0, n)`` with ``n != 0``
    receive the word's tag id; every other position is set to -100.

    Documents whose number of such positions differs from their number of
    tags are reported and dropped. Dropped documents are removed in place
    from ``encodings``, from ``tags`` and from the global text list selected
    by ``which_set``, so that texts, tags, encodings and the returned labels
    stay index-aligned. (Previously texts were popped with stale indices, the
    re-tokenized encodings were bound to function locals and discarded, and
    tags were never dropped.)

    Args:
        tags: list of per-document lists of tag strings; each tag must be a
            key of the global ``tag2id``. Mutated when documents are dropped.
        encodings: tokenizer output exposing ``offset_mapping``. Assumed to
            hold plain Python lists per field (no ``return_tensors``) so that
            rows can be deleted in place — TODO confirm. Mutated when
            documents are dropped.
        which_set: 'train', 'val' or 'test'; selects the global
            ``train_texts`` / ``val_texts`` / ``test_texts`` list to prune.

    Returns:
        List of per-document label-id lists, one per kept document.
    """
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    dropped = []  # indices of documents that could not be aligned
    for i, (doc_labels, doc_offset) in enumerate(zip(labels, encodings.offset_mapping)):
        arr_offset = np.array(doc_offset)
        # positions whose first offset is 0 and whose second offset is not 0
        labelled = (arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)
        n_labelled = np.count_nonzero(labelled)
        if n_labelled != len(doc_labels):
            print(n_labelled)
            print(len(doc_labels))
            print('-----------------------------------')
            dropped.append(i)
            continue
        # start from an array of -100 and fill in the labelled positions
        doc_enc_labels = np.full(len(doc_offset), -100, dtype=int)
        doc_enc_labels[labelled] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    if dropped:
        if which_set =='train':
            texts = train_texts
        elif which_set =='val':
            texts = val_texts
        elif which_set =='test':
            texts = test_texts
        else:
            texts = None
        columns = [encodings[key] for key in encodings.keys()]
        # Delete from the back so the remaining indices stay valid
        for i in reversed(dropped):
            for column in columns:
                del column[i]
            del tags[i]
            if texts is not None:
                del texts[i]

    return encoded_labels

In [None]:
# Encode the training tags, then print the label count next to the text count
train_labels = encode_tags(train_tags, train_encodings,'train')
print(len(train_labels), len(train_texts))

In [None]:
# Encode the validation tags, then print the label count next to the text count
val_labels = encode_tags(val_tags, val_encodings,'val')
print(len(val_labels), len(val_texts))

In [None]:
# Encode the test tags, then print the label count next to the text count
test_labels = encode_tags(test_tags, test_encodings,'test')
print(len(test_labels), len(test_texts))

In [None]:
import torch

class Format_Dataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with label-id lists.

    ``encodings`` maps field names to per-example sequences and ``labels``
    holds one label sequence per example; the dataset length is the number
    of label sequences.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the labels under 'labels'
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Remove the offsets from each encoding before building the datasets
train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
test_encodings.pop("offset_mapping")

# Wrap each split's encodings and labels in a torch dataset
train_dataset = Format_Dataset(train_encodings, train_labels)
val_dataset = Format_Dataset(val_encodings, val_labels)
test_dataset = Format_Dataset(test_encodings, test_labels)

In [None]:
from transformers import BertForTokenClassification
# Token-classification model loaded from the same checkpoint as the tokenizer,
# with one output label per distinct tag
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-large-cased-v1.1', num_labels=len(unique_tags))

In [None]:
#modified
from transformers import EvalPrediction
from torch import nn
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray): # -> Tuple[List[int], List[int]]
    """Turn model scores and label ids into per-example tag-string lists.

    Args:
        predictions: scores indexed (example, position, label); the argmax
            over the last axis is taken as the predicted label id.
        label_ids: label ids indexed (example, position). Positions equal to
            ``nn.CrossEntropyLoss().ignore_index`` are skipped.

    Returns:
        ``(preds_list, out_label_list)``: for each example, the predicted and
        the reference tag strings (looked up in the global ``id2tag``) at the
        positions that are not ignored.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    # Look the sentinel up once instead of building a loss module per token
    ignore_index = nn.CrossEntropyLoss().ignore_index

    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != ignore_index:
                out_label_list[i].append(id2tag[label_ids[i][j]])
                preds_list[i].append(id2tag[preds[i][j]])

    return preds_list, out_label_list

from seqeval.metrics import f1_score, precision_score, recall_score
def compute_metrics(p: EvalPrediction): #-> Dict
    """Return precision, recall and F1 for one batch of predictions.

    The predictions and label ids in ``p`` are converted to tag-string lists
    with ``align_predictions`` and scored against each other.
    """
    predicted, gold = align_predictions(p.predictions, p.label_ids)
    precision = precision_score(gold, predicted)
    recall = recall_score(gold, predicted)
    f1 = f1_score(gold, predicted)
    return {"precision": precision, "recall": recall, "f1": f1}

In [None]:
import os
# Set the tokenizers parallelism flag for this process.
# NOTE(review): the tokenizer has already been used in an earlier cell;
# confirm the flag still has the intended effect when set this late.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

from transformers import Trainer, TrainingArguments

# Training configuration; checkpoints and results go to ./results, logs to ./logs
training_args = TrainingArguments(
    output_dir=f'./results',          # output directory
    overwrite_output_dir = True,
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=f'./logs',            # directory for storing logs
    logging_steps=10,
    seed = 1
)

# Train on the combined training set and evaluate on the custom validation set
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # compute metric defined above
)

In [None]:
from transformers import set_seed
#Seed is a helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if installed).
# NOTE(review): this runs after the model was instantiated in an earlier cell;
# presumably the model's initialisation is not covered by this seed — confirm.
set_seed(training_args.seed)

In [None]:
# NOTE(review): resume_from_checkpoint=True presumably requires an existing
# checkpoint in training_args.output_dir — confirm before a first run.
trainer.train(resume_from_checkpoint=True)

In [None]:
# Save the model through the trainer
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
if trainer.is_world_process_zero():
    tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Reload the model from ./results/.
# NOTE(review): `trainer` was constructed with the earlier `model` object;
# rebinding the name here presumably does not change what the later
# trainer.evaluate()/trainer.predict() calls use — confirm.
model = BertForTokenClassification.from_pretrained('./results/', num_labels=len(unique_tags))

In [None]:
# Evaluation: run the trainer's evaluation and write each metric as a
# "key = value" line to eval_results.txt in the output directory
import os
results = {}
result = trainer.evaluate()
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
if trainer.is_world_process_zero():
    with open(output_eval_file, "w") as writer:
        for key, value in result.items():
            writer.write(f"{key} = {value}\n")
    # Merge the metrics once, rather than once per written line
    results.update(result)

Testing & Predictions

In [None]:
#Predictions: run the trainer on the test dataset and unpack its three results
predictions, label_ids, metrics = trainer.predict(test_dataset)

In [None]:
#Align Predictions: keep only the predicted tag strings; the reference tags are discarded
preds_list, _ = align_predictions(predictions, label_ids)

In [None]:
# Write the test metrics as "key = value" lines to test_results.txt

output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
if trainer.is_world_process_zero():
    with open(output_test_results_file, "w") as writer:
        writer.writelines(
            "%s = %s\n" % (metric_name, metric_value)
            for metric_name, metric_value in metrics.items()
        )
''''''

In [None]:
#Write every test token with its predicted tag as "token<TAB>tag" lines

output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
if trainer.is_world_process_zero():
    with open(output_test_predictions_file, "w") as writer:
        for sent_idx, sentence in enumerate(test_texts):
            for tok_idx, token in enumerate(sentence):
                writer.write(token +'\t' +preds_list[sent_idx][tok_idx]+'\n')
''''''