# Data Preprocessing

In [1]:
!git clone https://github.com/indichealth/indic-health-demo.git

Cloning into 'indic-health-demo'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 231 (delta 55), reused 103 (delta 43), pack-reused 110[K
Receiving objects: 100% (231/231), 1.24 MiB | 16.71 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [2]:
%cd indic-health-demo/Dataset

/kaggle/working/indic-health-demo/Dataset


In [22]:
data_path = 'IHQID-WebMD'

In [23]:
import pandas as pd
import os

train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test.csv'))

train = train_df[['question_english','Manual_Intent']]
test = test_df[['question_english','Manual_Intent']]

In [24]:
num_classes = len(set(train['Manual_Intent']))

# Model Creation

In [7]:
!pip install transformers[torch]
!pip install datasets
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.22.0
    Uninstalling accelerate-0.22.0:
      Successfully uninstalled accelerate-0.22.0
Successfully installed accelerate-0.24.1


In [8]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [9]:
model_id = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_classes)

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

def preprocess(data):
  lenc = LabelEncoder()
  queries = list(data['question_english'])
  labels = lenc.fit_transform(list(data['Manual_Intent']))

  dataset = Dataset.from_dict(
    {
        'text': queries,
        'label': labels
    }
  )

  max_length = 512
  def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

  tokenized_dataset = dataset.map(tokenize_function, batched=True)
  return tokenized_dataset, labels

tokenized_train_dataset, train_labels = preprocess(train)
tokenized_test_dataset, test_labels = preprocess(test)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [26]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 720
})

# Model Training

In [27]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=10,
    load_best_model_at_end=True,
    learning_rate=5e-5,
    push_to_hub=False,
)

# Create a Trainer for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
500,0.3967,1.466208


TrainOutput(global_step=900, training_loss=0.22914926528930665, metrics={'train_runtime': 405.5992, 'train_samples_per_second': 17.752, 'train_steps_per_second': 2.219, 'total_flos': 1894433616691200.0, 'train_loss': 0.22914926528930665, 'epoch': 10.0})

# Model Evaluation

In [28]:
# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the evaluation results
print(results)

{'eval_loss': 1.4662076234817505, 'eval_runtime': 4.4226, 'eval_samples_per_second': 54.493, 'eval_steps_per_second': 7.009, 'epoch': 10.0}


In [29]:
predictions = trainer.predict(tokenized_test_dataset)
predicted_labels = predictions.predictions.argmax(axis=1)

In [30]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(test_labels, predicted_labels))
print(confusion_matrix(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.79      0.75      0.77        76
           1       0.87      0.75      0.81        53
           2       0.82      0.79      0.81        73
           3       0.60      0.79      0.68        39

    accuracy                           0.77       241
   macro avg       0.77      0.77      0.77       241
weighted avg       0.78      0.77      0.77       241

[[57  1  5 13]
 [ 4 40  6  3]
 [ 7  3 58  5]
 [ 4  2  2 31]]
