## Data Reformatting (brat to ConLL)

In [1]:
# from google.colab import drive

# drive.mount('/content/drive')

In [2]:
path = '/scratch/js12684/final/'

In [3]:
#retrieve paths for text and ann data

import glob
notes_orig =glob.glob(path + 'data/*.txt')
ann_orig = glob.glob(path + 'data/*.ann' )

print(len(notes_orig))
print(len(ann_orig))

400
400


In [4]:
!mkdir '/scratch/js12684/final/ann_ref_disp'

mkdir: cannot create directory '/scratch/js12684/final/ann_ref_disp': File exists


In [5]:
#reformat BRAT annotation to CoNLL for drug mentions:

for file in ann_orig:
    with open(file, 'r') as fin:
        outfile = path + 'ann_ref_disp/' + file.split('/')[-1]
        with open(outfile, 'w') as fileout:
            for line in fin:
                if line.startswith('T'):
                    # newline = line.replace('NoDisposition', 'Drug').replace('Disposition', 'Drug').replace('Temporality', 'Drug').replace('Undetermined', "Drug")
                    fileout.write(line)

In [6]:
ann_disp = glob.glob(path + 'ann_ref_disp/*.ann')
print(len(ann_disp))

400


In [7]:
#convert reformatted ann file into tab separated ConLL format
import numpy as np
from urllib.parse import ParseResult
from urllib.parse import ParseResultBytes

outfile = path + '/output_disp.txt'
with open(outfile, 'w') as output_file:
  out_list = []
  for ann_file in ann_disp:
    with open(ann_file, 'r') as ann_in:
      txt_file = path + f'data/{ann_file[-10:-4]}.txt'
      with open(txt_file, 'r') as txt_in:
        start_end = []
        count = 0
        disp = {}
        
        #reading each line of .ann file and extracting start and end indices 
        for line in ann_in:
            entry = line.split()
            disp[count] = entry[1]
            count += 1
            start_end.append([int(entry[2]), int(entry[3])])
            
        #reading each line of .txt file and extracting drug names using list of indices
        text_tokens = txt_in.read().split()
        drug_names = []
        for indicies in start_end:
          start = indicies[0]
          end = indicies[1]

          with open(txt_file, 'r') as txt_in:
            drug_names.append(txt_in.read()[indicies[0]:indicies[1]])
        
        #appending "O" or Disp label to each token in list of txt tokens

        for token in text_tokens:
          token_alnum = ''.join(ch for ch in token if ch.isalnum())
          if token_alnum in drug_names:
            idx = drug_names.index(token_alnum)
            if token == text_tokens[0]:
              out_list.append(f'{ann_file[-10:-4]}\t')
            else:
              out_list.append(f'{np.nan}\t')
            out_list.append(f'{token}\t{disp[idx]}\n')
          else:
            if token == text_tokens[0]:
              out_list.append(f'{ann_file[-10:-4]}\t')
            else:
              out_list.append(f'{np.nan}\t')
            out_list.append(f'{token}\tO\n')
        out_list.append('\n')
  out_string = ''.join(out_list)
  output_file.write(out_string)

## Data Processing for SimpleTransformers

In [8]:
#read in ConLL annotation file as Csv
import pandas as pd

data = pd.read_csv(path + 'output_disp.txt', sep='\t', lineterminator='\n', names = ['Note', 'Token', 'Label'])

#fill missing note numbers
data = data.fillna(method = 'ffill')

In [9]:
token_count = []
for note in data['Note'].unique():
  df = data.loc[data['Note']== note]
  count = 0
  for token in df['Token']:
    token_count.append(count)
    count += 1

data['Token Count'] = token_count
data.sort_values(by = ['Note', 'Token Count'], inplace=True)
data.reset_index(drop = True, inplace = True)

In [10]:
#encode note number
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
note_en = le.fit_transform(data['Note'])

data['Note'] = note_en

In [11]:
#Convert column names to be compatible with model
data.rename(columns = {'Note':'sentence_id', 'Token':'words', 'Label':'labels'}, inplace=True)

In [12]:
#train test split
from sklearn.model_selection import train_test_split

# X = data[['sentence_id','words']]
# y = data['labels'] 

train, test = train_test_split(data, test_size = 0.2, shuffle=False, random_state = 42 )
train, val = train_test_split(train, test_size = 0.125, shuffle=False, random_state = 42 )

print(train.shape)
print(val.shape)
print(test.shape)

(171901, 4)
(24558, 4)
(49115, 4)


In [13]:
#building up train data and test data
train_data = pd.DataFrame({"sentence_id":train["sentence_id"],"words":train["words"],"labels":train['labels']})
val_data = pd.DataFrame({"sentence_id":val["sentence_id"],"words":val["words"],"labels":val['labels']})
test_data = pd.DataFrame({"sentence_id":test["sentence_id"],"words":test["words"],"labels":test['labels']})

In [14]:
label = data["labels"].unique().tolist()
label

['O', 'NoDisposition', 'Disposition', 'Undetermined']

In [15]:
train_data

Unnamed: 0,sentence_id,words,labels
0,0,Record,O
1,0,date:,O
2,0,2106-02-12,O
3,0,Campbell,O
4,0,Orthopedic,O
...,...,...,...
171896,276,"33.8,",O
171897,276,PLT,O
171898,276,601,O
171899,276,(H),O


## Model Setup

In [16]:
#label counts in train data
train_data['labels'].value_counts()

O                167754
NoDisposition      3043
Disposition         850
Undetermined        254
Name: labels, dtype: int64

In [17]:
#calculate weights for class imbalance
n_samples = len(train_data)
n_classes = 4
freq_O = train_data['labels'].value_counts()[0]
freq_nodisp = train_data['labels'].value_counts()[1]
freq_disp = train_data['labels'].value_counts()[2]
freq_und = train_data['labels'].value_counts()[3]

freq_list = [freq_O, freq_nodisp, freq_disp, freq_und]

weights = [n_samples/(n_classes * freq) for freq in freq_list]
print(weights)

[0.2561801805024023, 14.122658560630956, 50.55911764705883, 169.1938976377953]


In [18]:
# model parameters
from simpletransformers.ner import NERModel,NERArgs

args = NERArgs()
args.learning_rate = 1e-4
args.scheduler = 'constant_schedule'
args.num_train_epochs = 10
args.train_batch_size = 32
args.eval_batch_size = 32
args.overwrite_output_dir = True
args.manual_seed = 42

bert_emb_layers = [
    'bert.embeddings.word_embeddings.weight',
    'bert.embeddings.position_embeddings.weight',
    'bert.embeddings.token_type_embeddings.weight',
    'bert.embeddings.LayerNorm.weight',
    'bert.embeddings.LayerNorm.bias',
    ]

args.custom_parameter_groups = [
    {
        'params': bert_emb_layers,
        'lr': 0
    }
]

# args.logging_steps = 10
# args.evaluate_during_training_steps = 2137
# args.evaluate_during_training = True
# args.evaluate_during_training_verbose = True
# args.use_cached_eval_features = True

## Baseline BERT Model

In [19]:
# #model parameters
# !pip install simpletransformers
# from simpletransformers.ner import NERModel,NERArgs

# bl_args = NERArgs()
# bl_args.num_train_epochs = 10
# bl_args.learning_rate = 1e-5
# bl_args.overwrite_output_dir =True
# bl_args.train_batch_size = 32
# bl_args.eval_batch_size = 32
# bl_args.manual_seed = 0

In [20]:
#instantiate model
bl_model = NERModel('bert', 'bert-base-cased', labels = label ,args = args, use_cuda=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [21]:
#train and eval model with train + val data
bl_model.train_model(train_data, eval_data = val_data)

  0%|          | 0/1 [00:00<?, ?it/s]



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/9 [00:00<?, ?it/s]



Running Epoch 1 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

(90, 0.09179081859248173)

In [22]:
#test model on test data
bl_result, bl_model_outputs, bl_preds_list = bl_model.eval_model(test_data)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]



In [23]:
bl_result

{'eval_loss': 0.03656905268629392,
 'precision': 0.5238095238095238,
 'recall': 0.2391304347826087,
 'f1_score': 0.32835820895522383}

## biobert Model

In [24]:
args = NERArgs()
args.learning_rate = 1e-4
args.scheduler = 'constant_schedule'
args.num_train_epochs = 10
args.train_batch_size = 32
args.eval_batch_size = 32
args.overwrite_output_dir = True
args.manual_seed = 42

bert_emb_layers = [
    'bert.embeddings.word_embeddings.weight',
    'bert.embeddings.position_embeddings.weight',
    'bert.embeddings.token_type_embeddings.weight',
    'bert.embeddings.LayerNorm.weight',
    'bert.embeddings.LayerNorm.bias',
    ]

args.custom_parameter_groups = [
    {
        'params': bert_emb_layers,
        'lr': 0
    }
]

In [25]:
#instantiate model
bb_model = NERModel('bert', 'dmis-lab/biobert-base-cased-v1.2', labels = label ,args = args, use_cuda=True)

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [26]:
#train and eval model with train + val data
bb_model.train_model(train_data, eval_data = val_data)

  0%|          | 0/1 [00:00<?, ?it/s]



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/9 [00:00<?, ?it/s]



Running Epoch 1 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

(90, 0.08911965339082396)

In [27]:
#test model on test data
bb_result, bb_model_outputs, bb_preds_list = bb_model.eval_model(test_data)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]



In [28]:
bb_result

{'eval_loss': 0.021825300995260477,
 'precision': 0.6774193548387096,
 'recall': 0.45652173913043476,
 'f1_score': 0.5454545454545454}

## scibert Model

In [29]:
#model parameters

args = NERArgs()
args.learning_rate = 1e-4
args.scheduler = 'constant_schedule'
args.num_train_epochs = 10
args.train_batch_size = 32
args.eval_batch_size = 32
args.overwrite_output_dir = True
args.manual_seed = 42

bert_emb_layers = [
    'bert.embeddings.word_embeddings.weight',
    'bert.embeddings.position_embeddings.weight',
    'bert.embeddings.token_type_embeddings.weight',
    'bert.embeddings.LayerNorm.weight',
    'bert.embeddings.LayerNorm.bias',
    ]

args.custom_parameter_groups = [
    {
        'params': bert_emb_layers,
        'lr': 0
    }
]

In [30]:
#instantiate model
sb_model = NERModel('bert', 'allenai/scibert_scivocab_cased', labels = label ,args = args, use_cuda=True)

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initial

In [31]:
#train and eval model with train + val data
sb_model.train_model(train_data, eval_data = val_data)

  0%|          | 0/1 [00:00<?, ?it/s]



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/9 [00:00<?, ?it/s]



Running Epoch 1 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

(90, 0.08566471091018886)

In [32]:
#test model on test data
sb_result, sb_model_outputs, sb_preds_list = sb_model.eval_model(test_data)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]



In [33]:
sb_result

{'eval_loss': 0.022358637613554794,
 'precision': 0.6511627906976745,
 'recall': 0.5384615384615384,
 'f1_score': 0.5894736842105263}

## clinicalbert Model

In [34]:
# #model parameters

args = NERArgs()
args.learning_rate = 1e-4
args.scheduler = 'constant_schedule'
args.num_train_epochs = 10
args.train_batch_size = 32
args.eval_batch_size = 32
args.overwrite_output_dir = True
args.manual_seed = 42

bert_emb_layers = [
    'bert.embeddings.word_embeddings.weight',
    'bert.embeddings.position_embeddings.weight',
    'bert.embeddings.token_type_embeddings.weight',
    'bert.embeddings.LayerNorm.weight',
    'bert.embeddings.LayerNorm.bias',
    ]

args.custom_parameter_groups = [
    {
        'params': bert_emb_layers,
        'lr': 0
    }
]

In [35]:
#instantiate model
cb_model = NERModel('bert', 'emilyalsentzer/Bio_ClinicalBERT', labels = label ,args = args, use_cuda=True)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint 

In [36]:
#train and eval model with train + val data
cb_model.train_model(train_data, eval_data = val_data)

  0%|          | 0/1 [00:00<?, ?it/s]



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/9 [00:00<?, ?it/s]



Running Epoch 1 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 2 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 3 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 4 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 5 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 6 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 7 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 8 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 9 of 10:   0%|          | 0/9 [00:00<?, ?it/s]

(90, 0.10224618039404353)

In [37]:
#test model on test data
cb_result, cb_model_outputs, cb_preds_list = cb_model.eval_model(test_data)

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]



In [38]:
cb_result

{'eval_loss': 0.0231072132786115,
 'precision': 0.6451612903225806,
 'recall': 0.43478260869565216,
 'f1_score': 0.5194805194805194}