# Data Preprocessing

In [1]:
# Fetch the indic-health-demo repository; its Dataset/ directory is entered in the next cell.
!git clone https://github.com/indichealth/indic-health-demo.git

Cloning into 'indic-health-demo'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (121/121), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 231 (delta 55), reused 103 (delta 43), pack-reused 110[K
Receiving objects: 100% (231/231), 1.24 MiB | 25.40 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [2]:
# Make the cloned repo's Dataset folder the working directory so the relative data paths
# used later resolve. The path is relative, so this cell is meant to run once per session
# from the directory that contains the clone.
%cd indic-health-demo/Dataset

/content/indic-health-demo/Dataset


In [3]:
import pandas as pd
import os

In [18]:
# Dataset subdirectory (relative to the current working directory) holding train.csv and test.csv.
data_path = 'IHQID-WebMD'

In [19]:
# Read the train and test splits from the dataset directory in one pass.
train_df, test_df = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [20]:
# Keep only the two columns used downstream: the English question text (model input)
# and the `Manual_Intent` column (classification target).
train, test = (
    split_df[['question_english', 'Manual_Intent']]
    for split_df in (train_df, test_df)
)

In [21]:
# Number of distinct intent labels in the training split; used to size the classifier head.
distinct_intents = set(train['Manual_Intent'])
num_classes = len(distinct_intents)

# Model Creation

In [8]:
!pip install transformers[torch]
!pip install datasets
!pip install accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m80.8 MB/s

In [9]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer

In [10]:
# Seed torch before loading: the classification head is not part of the roberta-base
# checkpoint and is newly initialized here, so its starting weights depend on the RNG
# state at this point (the Trainer only applies its own seed later, when it is created).
SEED = 42
torch.manual_seed(SEED)

# Load the pre-trained RoBERTa tokenizer and a sequence-classification model whose
# output layer has one unit per intent class.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

def preprocess(data, label_encoder=None):
  """Turn a dataframe of questions and intents into a tokenized `Dataset`.

  Args:
    data: DataFrame with a 'question_english' column (text) and a
      'Manual_Intent' column (class label).
    label_encoder: An already-fitted `LabelEncoder` used to map intents to
      integer ids. Pass the same encoder for every split so that a given
      intent always maps to the same id. If None, a new encoder is fitted on
      `data` itself (the original behaviour).

  Returns:
    A tuple `(tokenized_dataset, labels)` where `tokenized_dataset` contains
    the 'text' and 'label' columns plus the tokenizer outputs, and `labels`
    is the array of integer-encoded intents in row order.

  Raises:
    ValueError: If a supplied `label_encoder` meets an intent it was not
      fitted on (raised by `LabelEncoder.transform`).
  """
  intents = list(data['Manual_Intent'])
  if label_encoder is None:
    label_encoder = LabelEncoder().fit(intents)

  queries = list(data['question_english'])
  labels = label_encoder.transform(intents)

  dataset = Dataset.from_dict(
    {
        'text': queries,
        'label': labels
    }
  )

  def tokenize_function(examples):
    # Pad/truncate every example to the tokenizer's maximum length so batches are rectangular.
    return tokenizer(examples['text'], padding='max_length', truncation=True)

  tokenized_dataset = dataset.map(tokenize_function, batched=True)
  return tokenized_dataset, labels

# Fit the intent -> id mapping once, on the training split, and reuse it for the test
# split. Fitting a separate encoder per split can silently assign different ids to the
# same intent whenever the two splits do not contain exactly the same set of labels.
label_encoder = LabelEncoder().fit(list(train['Manual_Intent']))

tokenized_train_dataset, train_labels = preprocess(train, label_encoder)
tokenized_test_dataset, test_labels = preprocess(test, label_encoder)

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/241 [00:00<?, ? examples/s]

In [23]:
# Display the tokenized training split (its feature columns and row count) as a sanity check.
tokenized_train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 720
})

# Model Training

In [24]:
from transformers import TrainingArguments, Trainer

# NOTE(review): the test split is used as the evaluation set, and load_best_model_at_end=True
# restores a checkpoint chosen on that same split, so metrics later reported on it may be
# optimistic. Consider carving a validation split out of the training data.
training_args = TrainingArguments(
    # Output location; nothing is uploaded to the Hub.
    output_dir="./output",
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate every 500 steps and reload the best checkpoint once training finishes.
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
)

# The Trainer bundles the model, the arguments and both datasets for training and evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

trainer.train()

Step,Training Loss,Validation Loss
500,0.5626,1.266372


TrainOutput(global_step=900, training_loss=0.34623223198784725, metrics={'train_runtime': 696.3402, 'train_samples_per_second': 10.34, 'train_steps_per_second': 1.292, 'total_flos': 1894433616691200.0, 'train_loss': 0.34623223198784725, 'epoch': 10.0})

In [25]:
# Evaluate on the Trainer's eval_dataset, which was set to the tokenized test split above.
results = trainer.evaluate()

# Show the returned metrics dictionary (loss, runtime and throughput figures).
print(results)

{'eval_loss': 1.2663716077804565, 'eval_runtime': 7.2328, 'eval_samples_per_second': 33.32, 'eval_steps_per_second': 4.286, 'epoch': 10.0}


# Model Evaluation

In [26]:
# Run inference on the test split, then take the argmax over axis 1 of the returned
# prediction array to get one predicted class id per example.
predictions = trainer.predict(tokenized_test_dataset)
class_scores = predictions.predictions
predicted_labels = class_scores.argmax(axis=1)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

# Print per-class precision/recall/F1 first, then the confusion matrix, both comparing the
# encoded test labels against the predicted class ids.
for metric_fn in (classification_report, confusion_matrix):
  print(metric_fn(test_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.89      0.71      0.79        76
           1       0.71      0.94      0.81        53
           2       0.82      0.75      0.79        73
           3       0.70      0.77      0.73        39

    accuracy                           0.78       241
   macro avg       0.78      0.79      0.78       241
weighted avg       0.80      0.78      0.78       241

[[54 11  6  5]
 [ 1 50  1  1]
 [ 6  5 55  7]
 [ 0  4  5 30]]
