# Data Preprocessing

In [1]:
# Fetch the indic-health-demo repository; later cells read CSV files from its Dataset directory.
!git clone https://github.com/indichealth/indic-health-demo.git

Cloning into 'indic-health-demo'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 231 (delta 55), reused 103 (delta 43), pack-reused 110[K
Receiving objects: 100% (231/231), 1.24 MiB | 4.62 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [2]:
# Make the cloned repository's Dataset directory the working directory so the
# relative data paths used below resolve.
%cd indic-health-demo/Dataset

/content/indic-health-demo/Dataset


In [3]:
import pandas as pd
import os

In [50]:
# Directory (relative to the current working directory) that holds train.csv and test.csv.
data_path = 'IHQID-WebMD'

In [51]:
# Read the train and test splits from data_path into DataFrames.
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [52]:
def biotagging(data: pd.DataFrame, lang, thresh: float = 0.4):
  """Tokenize the question/entity columns and attach token-level BIO tags.

  The frame must contain the columns ``question_{lang}``, ``disease_{lang}``,
  ``drug_{lang}`` and ``treatment_{lang}``. Each of them is lower-cased and
  split on whitespace into a ``<column>_tokens`` column. Every question token
  is then tagged ``O``, ``B-<entity>`` or ``I-<entity>`` (entity being
  ``disease``, ``drug`` or ``treatment``) by fuzzy substring matching against
  the annotated entity tokens.

  Args:
    data: input frame; it is not modified (``fillna`` returns a new frame).
    lang: suffix used to build the column names, e.g. ``'english'``.
    thresh: minimum fraction of an entity's tokens that must be found in the
      question window starting at the current token for the window to be
      tagged as that entity.

  Returns:
    A new DataFrame with NaN replaced by ``''``, the four ``*_tokens``
    columns and a ``question_{lang}_biotags`` column (one list of tags per
    row, same length as the question's token list).
  """
  data = data.fillna('')  # replace nan values with empty string

  def tokens(text):
    # Whitespace tokenization; punctuation stays attached to the token.
    return [token.lower() for token in text.split()]

  # Order matters: when a token could start several entities, the first
  # entity in this tuple wins.
  entity_names = ('disease', 'drug', 'treatment')
  cols = [f'question_{lang}'] + [f'{name}_{lang}' for name in entity_names]
  for col in cols:
    data[col + '_tokens'] = data[col].apply(tokens)

  def overlaps(a, b):
    # Fuzzy token equality: one token is a substring of the other.
    return a in b or b in a

  def match_tokens(ent_toks, qn_toks):
    # Return (fraction of entity tokens found in qn_toks, set of the question
    # tokens that were matched). Each entity token is matched with the first
    # overlapping question token only.
    cnt = 0
    matched = set()
    for ent_tok in ent_toks:
      for qn_tok in qn_toks:
        if overlaps(ent_tok, qn_tok):
          cnt += 1
          matched.add(qn_tok)
          break
    return cnt / len(ent_toks), matched

  def biotag(row):
    qn_tokens = row[f'question_{lang}_tokens']
    candidates = [(name, row[f'{name}_{lang}_tokens']) for name in entity_names]

    i = 0
    biotags = []
    while i < len(qn_tokens):
      token = qn_tokens[i]
      # Pick the first entity whose FIRST token overlaps the current token.
      entity, entity_tokens = next(
          ((name, toks) for name, toks in candidates
           if toks and overlaps(token, toks[0])),
          (None, None))
      if entity is None:
        biotags.append('O')
        i += 1
        continue

      # Compare the entity against a question window of the same length.
      end = min(i + len(entity_tokens), len(qn_tokens))
      f_match, match_toks = match_tokens(entity_tokens, qn_tokens[i:end])
      if f_match < thresh:
        # The first token looked like an entity start but the window does not
        # match well enough, so only the current token is tagged O.
        biotags.append('O')
        i += 1
        continue

      # Tag the whole window: matched tokens get B-/I-, the others O.
      for j in range(i, end):
        if qn_tokens[j] in match_toks:
          biotags.append(f'B-{entity}' if j == i else f'I-{entity}')
        else:
          biotags.append('O')
      i = end  # resume right after the window

    # The window tagging can leave an O between a B-/I- tag and an I- tag of
    # the same entity; fill those gaps. Only O tags are rewritten, so a B- tag
    # that starts a new entity right after another entity is left intact.
    for k in range(1, len(biotags) - 1):
      prev_tag, next_tag = biotags[k - 1], biotags[k + 1]
      if (biotags[k] == 'O'
          and prev_tag[:2] in ('B-', 'I-')
          and next_tag.startswith('I-')):
        biotags[k] = 'I-' + prev_tag[2:]

    assert len(biotags) == len(qn_tokens)
    return biotags

  data[f'question_{lang}_biotags'] = data.apply(biotag, axis=1)
  return data

In [53]:
# Sanity check: tag a single training row and show its annotations, tokens and tags.
x = biotagging(train.iloc[[18]], 'english')
sample_row = x.iloc[0]
print("Question : ", sample_row['question_english'])
print("Disease : ", sample_row['disease_english'])
print("Drug : ", sample_row['drug_english'])
print("Treatment : ", sample_row['treatment_english'])
print(sample_row['question_english_tokens'])
print(sample_row['question_english_biotags'])

Question :  scoliosis. does it effect the stomach and breathing.
Disease :  Scoliosis
Drug :  
Treatment :  
['scoliosis.', 'does', 'it', 'effect', 'the', 'stomach', 'and', 'breathing.']
['B-disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [54]:
# Tokenize and BIO-tag both splits; biotagging returns a new frame, which replaces
# the raw one under the same name.
train = biotagging(train, 'english')
test = biotagging(test, 'english')

# Model Creation

In [6]:
# Download the en_core_web_lg spaCy model that is loaded in the next cell.
!python -m spacy download en_core_web_lg

2023-11-11 15:11:38.673111: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-11 15:11:38.673183: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-11 15:11:38.673228: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-11 15:11:38.686638: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-lg==3.6.0
  Downloading htt

In [7]:
import spacy
import numpy as np

# Load the en_core_web_lg spaCy pipeline as `nlp`; it supplies the token vectors used below.
nlp = spacy.load('en_core_web_lg')

In [8]:
# Calculate the dimensionality of nlp
embedding_dim = nlp.vocab.vectors_length
print(embedding_dim)

300


In [55]:
def encode_tokens(tokenized_sentences):
  """Embed every token of every sentence with the spaCy pipeline.

  Args:
    tokenized_sentences: iterable of token lists, one list per sentence.

  Returns:
    A float array of shape (total number of tokens, embedding_dim) whose
    rows follow the tokens in sentence order; row k is ``nlp(token).vector``
    of the k-th token. The total token count is printed as a side effect.
  """
  flat_tokens = [tok for sentence in tokenized_sentences for tok in sentence]
  print(len(flat_tokens))

  X = np.zeros((len(flat_tokens), embedding_dim))
  for row, tok in enumerate(flat_tokens):
    # Each token is run through the pipeline on its own.
    X[row, :] = nlp(tok).vector
  return X

# Build one embedding row per question token for each split (sentence boundaries are dropped).
train_token_emb = encode_tokens(train['question_english_tokens'])
test_token_emb = encode_tokens(test['question_english_tokens'])

7127
2513


# Model Training

In [57]:
# Integer class id for every BIO tag produced by biotagging.
tag2label = {
    tag: idx
    for idx, tag in enumerate([
        'O',
        'B-disease',
        'I-disease',
        'B-drug',
        'I-drug',
        'B-treatment',
        'I-treatment',
    ])
}

def flatten_labels(labels):
  """Concatenate per-sentence tag lists and map each tag to its integer id.

  Args:
    labels: iterable of tag lists, one list per sentence.

  Returns:
    A flat list of ints looked up in ``tag2label``, in sentence order. The
    number of tags is printed as a side effect. An unknown tag raises
    ``KeyError``.
  """
  flat_tags = [tag for sent_tags in labels for tag in sent_tags]
  print(len(flat_tags))
  return [tag2label[tag] for tag in flat_tags]

# Flatten the per-question tag lists into one integer label per token for each split.
train_labels = flatten_labels(train['question_english_biotags'])
test_labels = flatten_labels(test['question_english_biotags'])

7127
2513


In [58]:
# Every embedding row must have exactly one label before fitting/evaluating.
assert len(train_token_emb) == len(train_labels)
assert len(test_token_emb) == len(test_labels)

In [59]:
from sklearn.svm import SVC

def train_svc(embeddings, labels):
    """Fit a support vector classifier (``SVC`` with ``C=1``) and return it.

    Args:
        embeddings: feature matrix, one row per sample.
        labels: target label for each row of ``embeddings``.

    Returns:
        The fitted ``SVC`` instance.
    """
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return SVC(C=1).fit(embeddings, labels)

# Fit the token-level classifier on the training embeddings and labels.
model = train_svc(train_token_emb, train_labels)

# Model Evaluation

In [60]:
from sklearn.metrics import classification_report, confusion_matrix

def test_svc(model, embeddings, labels):

    pred_labels = model.predict(embeddings)

    # Count the number of correct predictions
    correct = 0
    for i in range(len(labels)):
      if pred_labels[i] == labels[i]:
        correct += 1

    return pred_labels

# Predict on the test tokens, then print per-class precision/recall/F1 and the
# confusion matrix (rows are true labels, columns are predicted labels).
pred_labels = test_svc(model, test_token_emb, test_labels)
print(classification_report(test_labels, pred_labels))
print(confusion_matrix(test_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91      2045
           1       0.62      0.16      0.26       160
           2       0.36      0.06      0.10        83
           3       0.65      0.19      0.30        89
           4       0.60      0.13      0.21        46
           5       0.00      0.00      0.00        52
           6       0.80      0.21      0.33        38

    accuracy                           0.83      2513
   macro avg       0.55      0.25      0.30      2513
weighted avg       0.78      0.83      0.78      2513

[[2019    8    8    5    3    0    2]
 [ 133   26    0    1    0    0    0]
 [  73    5    5    0    0    0    0]
 [  71    1    0   17    0    0    0]
 [  37    1    0    2    6    0    0]
 [  50    1    0    1    0    0    0]
 [  28    0    1    0    1    0    8]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
