In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import os
import warnings
warnings.filterwarnings('ignore')
import nltk
import string
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
import contractions
import re

In [3]:
def preprocess(text):
    text = text.replace("\n", " ").replace("\t", " ").replace("\xa0", " ").replace("\r", " ").replace("[", "").replace("]", "").replace("\ufeff", "").replace("\u200b", "").replace("©", "").replace("|", "").replace("#", "").lower().strip()
    text = re.sub('[%s]' % re.escape(string.punctuation), '' , text)
    text = re.sub('[0-9]', '', text)
    text = re.sub(' +', ' ', text)

    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text = contractions.fix(text)
    
    words = set(nltk.corpus.words.words())
    text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
    return text

In [4]:
df = pd.read_csv('../desease_data/dataset.csv')
for i in range(len(df)):
    for j in range(1, len(df.columns)):
        if not pd.isna(df.iloc[i, j]):
            df.iloc[i,j] = df.iloc[i,j].replace(' ', '').replace('_', ' ')

df["Symptoms"] = 0
records = df.shape[0]
for i in range(records):
    values = df.iloc[i].values
    values = values.tolist()
    if 0 in values:
        df["Symptoms"][i] = values[1:values.index(0)]
    else:
        df["Symptoms"][i] = values[1:]
for i in range(len(df)):
    df.Symptoms[i] = " ".join([s for s in df.Symptoms[i] if not pd.isna(s)])

df = df[["Disease","Symptoms"]]
df.head()

Unnamed: 0,Disease,Symptoms
0,Fungal infection,itching skin rash nodal skin eruptions dischro...
1,Fungal infection,skin rash nodal skin eruptions dischromic patches
2,Fungal infection,itching nodal skin eruptions dischromic patches
3,Fungal infection,itching skin rash dischromic patches
4,Fungal infection,itching skin rash nodal skin eruptions


In [5]:
desc = pd.read_csv('../desease_data/symptom_description.csv')
df["Description"] = ' '
desc.Description = desc.Description.apply(lambda x: preprocess(x))
#for every Disease in df, we add the corresponding description of the Disease in desc
for i in range(len(df)):
    for j in range(len(desc)):
        if df.Disease[i] == desc.Disease[j]:
            df.Description[i] = desc.Description[j]
            break

# combine the symptoms and description of the disease
df["Symptoms"] = df["Symptoms"] + " " + df["Description"]
df.drop("Description", axis=1, inplace=True)
df.head()

Unnamed: 0,Disease,Symptoms
0,Fungal infection,itching skin rash nodal skin eruptions dischro...
1,Fungal infection,skin rash nodal skin eruptions dischromic patc...
2,Fungal infection,itching nodal skin eruptions dischromic patche...
3,Fungal infection,itching skin rash dischromic patches human fun...
4,Fungal infection,itching skin rash nodal skin eruptions human f...


In [6]:
possible_labels = df.Disease.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
    
df['label'] = df.Disease.replace(label_dict)
df.head()

Unnamed: 0,Disease,Symptoms,label
0,Fungal infection,itching skin rash nodal skin eruptions dischro...,0
1,Fungal infection,skin rash nodal skin eruptions dischromic patc...,0
2,Fungal infection,itching nodal skin eruptions dischromic patche...,0
3,Fungal infection,itching skin rash dischromic patches human fun...,0
4,Fungal infection,itching skin rash nodal skin eruptions human f...,0


In [6]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df.label.values,
                                                  shuffle=True)

df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['Disease', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Symptoms
Disease,label,data_type,Unnamed: 3_level_1
(vertigo) Paroymsal Positional Vertigo,36,train,102
(vertigo) Paroymsal Positional Vertigo,36,val,18
AIDS,6,train,102
AIDS,6,val,18
Acne,37,train,102
...,...,...,...
Urinary tract infection,38,val,18
Varicose veins,30,train,102
Varicose veins,30,val,18
hepatitis A,19,train,102


In [7]:
tokenizer = BertTokenizer.from_pretrained(
    'emilyalsentzer/Bio_ClinicalBERT',
    do_lower_case=True)

encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].Symptoms.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].Symptoms.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [8]:
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [9]:
batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [10]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)

epochs = 3

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [12]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }
        with torch.no_grad():
            outputs = model(**inputs)
        
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()
        
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [13]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    # torch.save(model.state_dict(), f'data_volume/third/finetuned_BERT_epoch_{epoch}.model')
    # save the model
    output_dir = f'./model_save/clinical_bert/epoch_{epoch}'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print("Saving model to %s" % output_dir)
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Saving model to ./model_save/clinical_bert/epoch_1


  0%|          | 0/3 [08:07<?, ?it/s]


Epoch 1
Training loss: 1.497508171887018


 33%|███▎      | 1/3 [08:30<17:00, 510.13s/it]

Validation loss: 0.255397422890353
F1 Score (Weighted): 1.0




Saving model to ./model_save/clinical_bert/epoch_2


 33%|███▎      | 1/3 [16:36<17:00, 510.13s/it]


Epoch 2
Training loss: 0.14697912169441346


 67%|██████▋   | 2/3 [16:58<08:29, 509.20s/it]

Validation loss: 0.036586275331797154
F1 Score (Weighted): 1.0




Saving model to ./model_save/clinical_bert/epoch_3


 67%|██████▋   | 2/3 [25:05<08:29, 509.20s/it]


Epoch 3
Training loss: 0.03841289622145284


100%|██████████| 3/3 [25:27<00:00, 509.27s/it]

Validation loss: 0.021324776873991983
F1 Score (Weighted): 1.0





In [21]:
message = preprocess('itching skin rash')
# tokenize and predict class of input_sentence
tokenized_sentence = tokenizer.encode(message)
input_ids = torch.tensor([tokenized_sentence]).cuda()
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=1) 
print(label_indices)

[16]
