In [1]:
!git clone https://github.com/indichealth/indic-health-demo.git

Cloning into 'indic-health-demo'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 231 (delta 55), reused 103 (delta 43), pack-reused 110[K
Receiving objects: 100% (231/231), 1.24 MiB | 19.85 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [2]:
%cd indic-health-demo/Dataset

/content/indic-health-demo/Dataset


In [3]:
import pandas as pd
import os

In [51]:
data_path = 'IHQID-WebMD'

In [52]:
train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [53]:
src = 'hindi'
tar = 'english'

In [54]:
train_df = train_df.drop(columns=f'question_{tar}')
test_df = test_df.drop(columns=f'question_{tar}')

In [55]:
train_df['question_hindi'][0]

'निस्टेटिन किस लिए निर्धारित किया गया है?'

In [56]:
!pip install deep-translator



In [57]:
from deep_translator import GoogleTranslator

def translate_queries(data, src, tar):
  def translate(sentence, src, tar):
    return GoogleTranslator(source=src, target=tar).translate(sentence)

  s, t = src[:2], tar[:2]
  data[f'question_{tar}'] = data[f'question_{src}'].apply(lambda x: translate(x, s, t))
  return data

train = translate_queries(train_df, src, tar).drop(columns=[f'question_{src}'])
test = translate_queries(test_df, src, tar).drop(columns=[f'question_{src}'])

In [10]:
!pip install transformers[torch] datasets

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transforme

In [58]:
from transformers import AutoTokenizer
model_id = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [59]:
def biotagging(data: pd.DataFrame, lang):
  data = data.fillna('') # replace nan values with empty string
  # add column for tokens from tokenizer
  def tokens(text):
    encoded_input = tokenizer.encode(text, return_tensors='pt')
    decoded_input = tokenizer.convert_ids_to_tokens(encoded_input[0])
    decoded_input = [token.lower() for token in decoded_input]
    return decoded_input

  cols = [f'question_{lang}',
          f'disease_{lang}',
          f'drug_{lang}',
          f'treatment_{lang}',]
  for col in cols:
    data[col + '_tokens'] = data[col].apply(tokens)

  def biotag(row):
    qn_tokens = row[f'question_{lang}_tokens'][1:-1]
    dis_tokens = row[f'disease_{lang}_tokens'][1:-1]      # for disease, drug and treatment remove <s> and </s> tokens for proper matching (NOTE : for empty string, this gives empty list)
    drug_tokens = row[f'drug_{lang}_tokens'][1:-1]
    treat_tokens = row[f'treatment_{lang}_tokens'][1:-1]

    i = 0
    biotags = []
    while i < len(qn_tokens):
      token = qn_tokens[i]
      # if current token matches with the starting tokens of annotated disease, drug or treatment
      if ((len(dis_tokens) > 0 and (token in dis_tokens[0] or dis_tokens[0] in token)) or
         (len(drug_tokens) > 0 and (token in drug_tokens[0] or drug_tokens[0] in token)) or
         (len(treat_tokens) > 0 and (token in treat_tokens[0] or treat_tokens[0] in token))):

        entity = ""   # label for detected entity
        if len(dis_tokens) > 0 and (token in dis_tokens[0] or dis_tokens[0] in token):
          entity_tokens = dis_tokens
          entity = "disease"
        elif len(drug_tokens) > 0 and (token in drug_tokens[0] or drug_tokens[0] in token):
          entity_tokens = drug_tokens
          entity = "drug"
        elif len(treat_tokens) > 0 and (token in treat_tokens[0] or treat_tokens[0] in token):
          entity_tokens = treat_tokens
          entity = "treatment"

        # define matching function to compute similarity of entity and question subpart's tokens
        def match_tokens(ent_toks, qn_toks, thresh):
          cnt = 0
          match_toks = []   # store the set of question tokens which match with entity tokens
          for ent_tok in ent_toks:
            for qn_tok in qn_toks:
              if (ent_tok in qn_tok) or (qn_tok in ent_tok):
                cnt += 1
                match_toks.append(qn_tok)
                break

          match_toks = list(set(match_toks))
          return cnt/len(ent_toks), match_toks

        # if detected entity approximately matches with current substring based on threshold value
        thresh = 0.8
        # print(entity_tokens)
        # print(qn_tokens[i:min(i+len(entity_tokens), len(qn_tokens))])
        f_match, match_toks = match_tokens(entity_tokens, qn_tokens[i:min(i+len(entity_tokens), len(qn_tokens))], thresh)

        # print("Match ", f_match)
        # print("Match toks ", match_toks)
        if f_match >= thresh:
          idx = i   # store current value of idx
          # add B, I tags for all tokens matching with those of the detected entity
          while i < min(idx + len(entity_tokens), len(qn_tokens)):
            if qn_tokens[i] in match_toks:
              if i == idx:
                biotags.append(f'B-{entity}')
              else:
                biotags.append(f'I-{entity}')
            else:
              biotags.append('O')
            i += 1
          continue  # to prevent skipping an extra index
        else: # since entity thought to match doesn't actually match completely, it is not the start token of some entity. So, add O biotag to the current token only
          biotags.append('O')

      else:
        biotags.append('O')
      i+=1

    # The above approach might cause some intermediate tokens for any entity to be labelled O, even though its surrounding tokens are B/I tokens. We fix such tokens as postprocessing
    i = 0
    while i < len(biotags):
      biotag = biotags[i]
      if (i>0 and i<len(biotags)-1):
        prev_tag = biotags[i-1]
        next_tag = biotags[i+1]
        if (('B-' in prev_tag or 'I-' in prev_tag) and 'I-' in next_tag):
          entity = prev_tag[2:]
          biotags[i] = 'I-' + entity
      i+=1
    return biotags

  data[f'question_{lang}_biotags'] = data.apply(biotag, axis=1)
  return data

In [60]:
x = biotagging(train.iloc[[18]], 'english')
print("Question : ", x.iloc[0][['question_english']].values[0])
print("Disease : ", x.iloc[0][['disease_english']].values[0])
print("Drug : ", x.iloc[0][['drug_english']].values[0])
print("Treatment : ", x.iloc[0][['treatment_english']].values[0])
print(x.iloc[0][['question_english_biotags']].values[0])

Question :  Does scoliosis affect the stomach and breathing.
Disease :  Scoliosis
Drug :  
Treatment :  
['B-disease', 'I-disease', 'I-disease', 'I-disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [61]:
train['question_english']

0                       What is nystatin prescribed for?
1      Can douching after sex prevent me from getting...
2                        Does Percocet Cause Weight Gain
3      Does 2 or 2 1/2 glasses of wine a day cause hi...
4                  Can too much buttermilk cause thrush?
                             ...                        
715    Would the Affordable Care Act require an insur...
716         How can I use duct tape to get rid of warts?
717    What facial exercises can be done to help corr...
718                         Is prenatal ultrasound safe?
719    How can I reduce inguinal hernia symptoms unti...
Name: question_english, Length: 720, dtype: object

In [62]:
train = biotagging(train, 'english')
test = biotagging(test, 'english')

In [63]:
from datasets import Dataset

tag2label = {
    'O': 0,
    'B-disease':1,
    'I-disease':2,
    'B-drug':3,
    'I-drug':4,
    'B-treatment':5,
    'I-treatment':6
}

def preprocess(data: pd.DataFrame, lang):
  sents = list(data[f'question_{lang}'])
  labels = list(data[f'question_{lang}_biotags'].apply(lambda x:[tag2label[tag] for tag in x]))

  dataset = Dataset.from_dict(
      {
          'sentence': sents,
          'labels': labels
      }
  )

  max_length = 512
  special_token_label = -100    # define label for special token
  def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True, max_length=max_length)

  def align_labels(examples):
    labels = examples['labels']
    for sent_labels in labels:
      sent_labels.extend([special_token_label] * (max_length - len(sent_labels)))   # add special token labels at end
    return examples

  dataset = dataset.map(tokenize_function, batched=True)
  dataset = dataset.map(align_labels, batched=True)
  return dataset

tokenized_train = preprocess(train, 'english')
tokenized_test = preprocess(test, 'english')

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

In [64]:
tokenized_train['labels'][0]

[0,
 0,
 3,
 4,
 4,
 0,
 0,
 0,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100

# Model Creation

In [65]:
from transformers import AutoModelForTokenClassification

model_name = "emilyalsentzer/Bio_ClinicalBERT"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels = len(tag2label) + 1)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [66]:
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="ner_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=10,
    load_best_model_at_end=True,
    learning_rate=1e-5,
    push_to_hub=False,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test
)

In [67]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=450, training_loss=0.43155853271484373, metrics={'train_runtime': 645.7409, 'train_samples_per_second': 11.15, 'train_steps_per_second': 0.697, 'total_flos': 1881438702796800.0, 'train_loss': 0.43155853271484373, 'epoch': 10.0})

In [68]:
# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the evaluation results
print(results)

{'eval_loss': 0.5026588439941406, 'eval_runtime': 8.433, 'eval_samples_per_second': 28.578, 'eval_steps_per_second': 7.234, 'epoch': 10.0}


In [69]:
predictions = trainer.predict(tokenized_test)

In [70]:
pred_vals = predictions.predictions

In [71]:
pred_vals[0].shape

(512, 8)

In [72]:
import numpy as np
predicted_labels = np.argmax(pred_vals, axis=2)
predicted_labels.shape

(241, 512)

In [73]:
predicted_labels[3]

array([0, 0, 0, 3, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2,
       2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 2, 2, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 2, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 2,
       2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,

In [74]:
tokenized_test['labels'][3]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -1

In [75]:
len(tokenized_test['labels'][0])

512

In [76]:
from sklearn.metrics import classification_report, confusion_matrix

pred_labels = predicted_labels.flatten()
actual_labels = np.array(tokenized_test['labels']).flatten()

# remove special tokens before computing token classification accuracy
special_token_label = -100
pred_filtered_labels = []
actual_filtered_labels = []
for pred_label, actual_label in zip(pred_labels, actual_labels):
  if actual_label == special_token_label:
    continue
  pred_filtered_labels.append(pred_label)
  actual_filtered_labels.append(actual_label)

print(classification_report(actual_filtered_labels, pred_filtered_labels))
print(confusion_matrix(actual_filtered_labels, pred_filtered_labels))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2748
           1       0.55      0.62      0.58       125
           2       0.54      0.67      0.60       208
           3       0.52      0.57      0.54        54
           4       0.51      0.56      0.54       140
           5       0.51      0.47      0.49        43
           6       0.56      0.57      0.57        93

    accuracy                           0.84      3411
   macro avg       0.59      0.62      0.60      3411
weighted avg       0.85      0.84      0.84      3411

[[2463   46  103   22   68   12   34]
 [  41   77    1    2    0    4    0]
 [  51    9  140    1    1    1    5]
 [  18    2    0   31    1    2    0]
 [  56    0    2    1   79    0    2]
 [  16    5    0    2    0   20    0]
 [  21    0   13    1    5    0   53]]


In [77]:
tag2label

{'O': 0,
 'B-disease': 1,
 'I-disease': 2,
 'B-drug': 3,
 'I-drug': 4,
 'B-treatment': 5,
 'I-treatment': 6}