# Data Preprocessing

In [None]:
!git clone https://github.com/indichealth/indic-health-demo.git

Cloning into 'indic-health-demo'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 231 (delta 55), reused 103 (delta 43), pack-reused 110[K
Receiving objects: 100% (231/231), 1.24 MiB | 4.98 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [None]:
# Work from the repository's Dataset folder so the relative data paths used below resolve.
%cd indic-health-demo/Dataset

/content/indic-health-demo/Dataset


In [None]:
import pandas as pd
import os

In [None]:
# Dataset folder (relative to the current working directory) that holds train.csv and test.csv.
data_path = 'IHQID-WebMD'

In [None]:
# Read the train and test splits (in that order) from their CSV files under data_path.
train_df, test_df = (
    pd.read_csv(os.path.join(data_path, f'{split}.csv'))
    for split in ('train', 'test')
)

In [None]:
# Keep only the Bengali question text and its intent label. The explicit .copy()
# makes each frame independent of the full CSV frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
columns_to_keep = ['question_bengali', 'Manual_Intent']
train_df = train_df[columns_to_keep].copy()
test_df = test_df[columns_to_keep].copy()

In [None]:
# Number of distinct intent labels in the training split; used as num_labels when the classifier is created.
num_classes = len(set(train_df['Manual_Intent']))

In [None]:
# Show the first Bengali question (row label 0) as a quick sanity check of the loaded text.
train_df.loc[0, 'question_bengali']

'নিস্টাটিন কি জন্য নির্ধারিত হয়?'

In [None]:
!pip install deep-translator

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4


In [None]:
from deep_translator import GoogleTranslator

def translate_queries(data, src, tar):
  """Add a ``question_<tar>`` column holding the translation of ``question_<src>``.

  Args:
    data: DataFrame with a ``question_<src>`` text column.
    src: Full source-language name as used in the column name (e.g. 'bengali').
    tar: Full target-language name (e.g. 'hindi'); also names the new column.

  Returns:
    A new DataFrame with the extra ``question_<tar>`` column. ``data`` itself is
    left untouched, so the calling cell can be re-run safely.
  """
  # Pass the full language names to the translator. Truncating them to two
  # letters is not a valid way to derive a language code: 'bengali'[:2] is 'be',
  # the code for Belarusian, whereas Bengali is 'bn'.
  translator = GoogleTranslator(source=src, target=tar)  # one client for the whole column
  translated = data[f'question_{src}'].apply(translator.translate)
  return data.assign(**{f'question_{tar}': translated})

# Pivot translation: source -> bridge -> target. After each hop, the column of
# the language that was just translated from is dropped.
src, bridge, tar = 'bengali', 'hindi', 'english'

# Hop 1: source -> bridge
train_ = translate_queries(train_df, src, bridge)
train_ = train_.drop(columns=[f'question_{src}'])
test_ = translate_queries(test_df, src, bridge)
test_ = test_.drop(columns=[f'question_{src}'])

# Hop 2: bridge -> target
train = translate_queries(train_, bridge, tar)
train = train.drop(columns=[f'question_{bridge}'])
test = translate_queries(test_, bridge, tar)
test = test.drop(columns=[f'question_{bridge}'])

In [None]:
# Preview the translated training frame: intent label plus the English question text.
train.head()

Unnamed: 0,Manual_Intent,question_english
0,drug,What is nystatin prescribed for?
1,other,Can washing hands after sex prevent me from ge...
2,drug,Does Percocet cause weight gain?
3,disease,Can a glass of wine cause high blood pressure?
4,disease,Is too much buttermilk the cause?


# Model Preparation

In [None]:
!pip install transformers[torch]
!pip install datasets
!pip install accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m89.3 MB/s

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Pretrained checkpoint to fine-tune for intent classification.
model_id = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# num_labels sizes the classification head to the number of intent classes; as the
# load message below states, that head is newly initialized and must be trained.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_classes)

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

def preprocess(data, label_encoder=None, max_length=512):
  """Turn a translated query frame into a tokenized Hugging Face dataset.

  Args:
    data: DataFrame with 'question_english' (text) and 'Manual_Intent' (label) columns.
    label_encoder: Optional, already fitted LabelEncoder. When given it is reused, so
      every split shares one label -> integer mapping. When None, a new encoder is
      fitted on this frame's labels (the previous behaviour).
    max_length: Number of tokens every query is padded or truncated to.

  Returns:
    (tokenized_dataset, labels): the tokenized Dataset and the integer label array.
  """
  queries = list(data['question_english'])
  intents = list(data['Manual_Intent'])
  if label_encoder is None:
    labels = LabelEncoder().fit_transform(intents)
  else:
    # Raises ValueError on a label the encoder has never seen, instead of
    # silently producing a mismatched encoding.
    labels = label_encoder.transform(intents)

  dataset = Dataset.from_dict(
    {
        'text': queries,
        'label': labels
    }
  )

  def tokenize_function(examples):
    # Fixed-length padding keeps every example the same size for batching.
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

  tokenized_dataset = dataset.map(tokenize_function, batched=True)
  return tokenized_dataset, labels

# Fit the label mapping once, on the training labels, and reuse it for the test split.
# Fitting a separate encoder per split would assign different integers to the same
# intent whenever the two splits do not contain exactly the same set of labels.
label_encoder = LabelEncoder().fit(list(train['Manual_Intent']))
tokenized_train_dataset, train_labels = preprocess(train, label_encoder)
tokenized_test_dataset, test_labels = preprocess(test, label_encoder)

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized training split (feature names and row count).
# NOTE(review): the saved output below reports 305 rows although the map step above
# processed 720 examples, so the stored outputs appear to come from different runs.
# Re-run the notebook top to bottom before trusting the numbers.
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 305
})

# Model Training

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate, checkpoint and log once per epoch. With the previous eval_steps=500
    # (and the default 500-step save interval) the saved 900-step run was evaluated
    # and checkpointed only once, so load_best_model_at_end could only ever restore
    # the step-500 weights and the remaining training was discarded.
    evaluation_strategy="epoch",
    save_strategy="epoch",    # must match evaluation_strategy for load_best_model_at_end
    logging_strategy="epoch", # report a training loss next to every validation loss
    save_total_limit=2,       # cap the disk used by per-epoch checkpoints
    num_train_epochs=10,
    load_best_model_at_end=True,
    learning_rate=5e-5,
    push_to_hub=False,
)

# Create a Trainer for training and evaluation.
# NOTE(review): the test split doubles as the evaluation set, so the "best" checkpoint
# is selected on test data; consider holding out a validation split from the training
# data for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
500,0.4233,1.815706


TrainOutput(global_step=900, training_loss=0.2434483920203315, metrics={'train_runtime': 696.724, 'train_samples_per_second': 10.334, 'train_steps_per_second': 1.292, 'total_flos': 1894433616691200.0, 'train_loss': 0.2434483920203315, 'epoch': 10.0})

In [None]:
# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the evaluation results
print(results)

{'eval_loss': 1.8157060146331787, 'eval_runtime': 8.3511, 'eval_samples_per_second': 28.858, 'eval_steps_per_second': 3.712, 'epoch': 10.0}


# Model Evaluation

In [None]:
# Run inference on the test split. predictions.predictions holds one row of
# per-class scores per example (presumably raw logits — confirm).
predictions = trainer.predict(tokenized_test_dataset)
# The predicted class is the index of the highest score in each row.
predicted_labels = predictions.predictions.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall/F1. Classes appear as the integer codes produced by the
# label encoding step, not as the original intent names.
print(classification_report(test_labels, predicted_labels))
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75        76
           1       0.68      0.74      0.71        53
           2       0.80      0.59      0.68        73
           3       0.60      0.82      0.70        39

    accuracy                           0.71       241
   macro avg       0.71      0.72      0.71       241
weighted avg       0.72      0.71      0.71       241

[[57  2  4 13]
 [ 5 39  7  2]
 [11 13 43  6]
 [ 4  3  0 32]]
