# Data Preprocessing

In [1]:
!git clone https://github.com/indichealth/indic-health-demo.git

Cloning into 'indic-health-demo'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 231 (delta 55), reused 103 (delta 43), pack-reused 110[K
Receiving objects: 100% (231/231), 1.24 MiB | 19.85 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [2]:
%cd indic-health-demo/Dataset

/kaggle/working/indic-health-demo/Dataset


In [17]:
data_path = 'IHQID-WebMD'

In [18]:
import pandas as pd
import os

train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
test_df = pd.read_csv(os.path.join(data_path, 'test.csv'))

train = train_df[['question_hindi','Manual_Intent']]
test = test_df[['question_hindi','Manual_Intent']]

In [19]:
num_classes = len(set(train['Manual_Intent']))

In [20]:
train['question_hindi'][0]

'निस्टेटिन किस लिए निर्धारित किया गया है?'

# Model Creation

In [7]:
!pip install transformers[torch]
!pip install datasets
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.22.0
    Uninstalling accelerate-0.22.0:
      Successfully uninstalled accelerate-0.22.0
Successfully installed accelerate-0.24.1


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
# Load pre-trained RoBERTa model and tokenizers
model_id = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_classes)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

def preprocess(data):
  lenc = LabelEncoder()
  queries = list(data['question_hindi'])
  labels = lenc.fit_transform(list(data['Manual_Intent']))

  dataset = Dataset.from_dict(
    {
        'text': queries,
        'label': labels
    }
  )

  max_length = 512
  def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=max_length)

  tokenized_dataset = dataset.map(tokenize_function, batched=True)
  return tokenized_dataset, labels

tokenized_train_dataset, train_labels = preprocess(train)
tokenized_test_dataset, test_labels = preprocess(test)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 720
})

In [23]:
sent = "इट्रालेस १०० कैप्सूल किसके लिए प्रयोग किया जाता है?"
tokenized_output = tokenizer(sent, padding='max_length', truncation=True, max_length=128)
tokens = tokenizer.convert_ids_to_tokens(tokenized_output['input_ids'])
print(tokens)

['<s>', '▁इ', 'ट्रा', 'ले', 'स', '▁१००', '▁कै', 'प्स', 'ूल', '▁किस', 'के', '▁लिए', '▁प्रयोग', '▁किया', '▁जाता', '▁है', '?', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

# Model Training

In [24]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=10,
    load_best_model_at_end=True,
    learning_rate=5e-5,
    push_to_hub=False,
)

# Create a Trainer for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
500,0.8204,1.154393


TrainOutput(global_step=900, training_loss=0.5620482805040148, metrics={'train_runtime': 436.9858, 'train_samples_per_second': 16.477, 'train_steps_per_second': 2.06, 'total_flos': 1894433616691200.0, 'train_loss': 0.5620482805040148, 'epoch': 10.0})

# Model Evaluation

In [25]:
# Evaluate the model on the test dataset
results = trainer.evaluate()

# Print the evaluation results
print(results)

{'eval_loss': 1.1543928384780884, 'eval_runtime': 4.3029, 'eval_samples_per_second': 56.009, 'eval_steps_per_second': 7.204, 'epoch': 10.0}


In [26]:
predictions = trainer.predict(tokenized_test_dataset)
predicted_labels = predictions.predictions.argmax(axis=1)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(test_labels, predicted_labels))
print(confusion_matrix(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.65      0.75      0.70        76
           1       0.67      0.79      0.72        53
           2       0.85      0.55      0.67        73
           3       0.56      0.62      0.59        39

    accuracy                           0.68       241
   macro avg       0.68      0.68      0.67       241
weighted avg       0.70      0.68      0.68       241

[[57  6  3 10]
 [ 7 42  2  2]
 [16 10 40  7]
 [ 8  5  2 24]]
