In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline, DataCollatorWithPadding, Trainer, TrainingArguments, BertForSequenceClassification, pipeline
from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model
import torch
import pandas as pd
import numpy as np
import os

In [6]:
data_files = {"train": "train.jsonl", "test": "test.jsonl"}
dataset = load_dataset("gretelai/symptom_to_diagnosis", data_files=data_files)
dataset = dataset.rename_column("output_text", "label")
print(dataset)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 853 examples [00:00, 25658.29 examples/s]
Generating test split: 212 examples [00:00, 23689.06 examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_text'],
        num_rows: 853
    })
    test: Dataset({
        features: ['label', 'input_text'],
        num_rows: 212
    })
})





In [7]:
for entry in dataset['train'].select(range(5)):
    print('INPUT: {} \nOUTPUT: {}\n'.format(entry['input_text'], entry['label']))

INPUT: I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak. 
OUTPUT: cervical spondylosis

INPUT: I have a rash on my face that is getting worse. It is red, inflamed, and has blisters that are bleeding clear pus. It is really painful. 
OUTPUT: impetigo

INPUT: I have been urinating blood. I sometimes feel sick to my stomach when I urinate. I often feel like I have a fever. 
OUTPUT: urinary tract infection

INPUT: I have been having trouble with my muscles and joints. My neck is really tight and my muscles feel weak. I have swollen joints and it is hard to move around without becoming stiff. It is also really uncomfortable to walk. 
OUTPUT: arthritis

INPUT: I have been feeling really sick. My body hurts a lot and I have no appetite. I have also developed rashes on my arms and face. The back of my eyes hurt a lot. 
OUTPUT: dengue



In [8]:
train_counts = pd.DataFrame({'Diagnosis': dataset['train']['label']})
train_counts = train_counts.groupby('Diagnosis').size().reset_index(name='train_set')

test_counts = pd.DataFrame({'Diagnosis': dataset['test']['label']})
test_counts = test_counts.groupby('Diagnosis').size().reset_index(name='test_set')

display(train_counts.merge(test_counts, on='Diagnosis'))

Unnamed: 0,Diagnosis,train_set,test_set
0,allergy,40,10
1,arthritis,40,10
2,bronchial asthma,40,10
3,cervical spondylosis,40,10
4,chicken pox,40,10
5,common cold,39,10
6,dengue,40,10
7,diabetes,40,10
8,drug reaction,40,8
9,fungal infection,39,9


In [9]:
sorted_labels = sorted(set(dataset['train']['label']))
label2id = dict(zip(sorted_labels, range(0, len(sorted_labels))))
id2label = dict(zip(range(0, len(sorted_labels)), sorted_labels))

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
foundation_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                                      num_labels=len(label2id),
                                                                      label2id=label2id,
                                                                      id2label=id2label)

classifier = pipeline("text-classification", model=foundation_model, tokenizer=tokenizer)
predicted_labels = classifier(dataset['test']['input_text'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [10]:
test_array = np.asarray(dataset['test']['label'])
pred_array = np.asarray([item['label'] for item in predicted_labels])
foundation_accuracy = round(sum(test_array == pred_array)*100/len(test_array), 2)
print(f"Foundation Model Accuracy: {foundation_accuracy}%")

Foundation Model Accuracy: 4.72%


In [11]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, r=64, lora_alpha=1, lora_dropout=0.1
)

peft_model = get_peft_model(foundation_model, lora_config)
print(peft_model.bert)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): lora.Linear(
              (base_layer): Linear(in_features=768, out_features=768, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=768, out_features=64, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=64, out_features=768, bias=False)
              )
              (lora_embedding_A

In [12]:
peft_model.print_trainable_parameters()

trainable params: 2,376,214 || all params: 111,875,372 || trainable%: 2.1240


In [13]:
def preprocess_function(examples):
    """Preprocess the dataset by returning tokenized examples."""
    tokens = tokenizer(examples["input_text"], padding="max_length", truncation=True)
    tokens['label'] = [label2id[l] for l in examples["label"]]
    return tokens

splits = ['train', 'test']

tokenized_ds = {}

for split in splits:
    tokenized_ds[split] = dataset[split].map(preprocess_function, batched=True)

print(tokenized_ds)

Map: 100%|██████████| 853/853 [00:00<00:00, 2556.04 examples/s]
Map: 100%|██████████| 212/212 [00:00<00:00, 2509.33 examples/s]

{'train': Dataset({
    features: ['label', 'input_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 853
}), 'test': Dataset({
    features: ['label', 'input_text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 212
})}





In [14]:
print("A tokenized training input example:")
print(tokenized_ds["train"][0]["input_ids"])
print("\n")
print("A tokenized training label example:")
print(tokenized_ds["train"][0]["label"])

A tokenized training input example:
[101, 1045, 1005, 2310, 2042, 2383, 1037, 2843, 1997, 3255, 1999, 2026, 3300, 1998, 2067, 1012, 1045, 1005, 2310, 2036, 2042, 2383, 4390, 2007, 2026, 5703, 1998, 12016, 1012, 1045, 1005, 2310, 2042, 21454, 1037, 2843, 1998, 2026, 10726, 2514, 5410, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()*100}


# The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="bert-lora",
        learning_rate=2e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=15,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

print("Starting to train...")
trainer.train()

  trainer = Trainer(


Starting to train...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
                                                  
  7%|▋         | 214/3210 [00:43<09:05,  5.49it/s]

{'eval_loss': 2.2811317443847656, 'eval_accuracy': 24.056603773584907, 'eval_runtime': 4.1581, 'eval_samples_per_second': 50.985, 'eval_steps_per_second': 12.746, 'epoch': 1.0}


                                                    
 13%|█▎        | 428/3210 [01:28<08:52,  5.23it/s]

{'eval_loss': 1.088592529296875, 'eval_accuracy': 63.20754716981132, 'eval_runtime': 4.3514, 'eval_samples_per_second': 48.72, 'eval_steps_per_second': 12.18, 'epoch': 2.0}


 16%|█▌        | 501/3210 [01:42<08:36,  5.24it/s]  

{'loss': 2.2113, 'grad_norm': 5.403955459594727, 'learning_rate': 0.001688473520249221, 'epoch': 2.34}


                                                  
 20%|██        | 642/3210 [02:13<08:08,  5.26it/s]

{'eval_loss': 0.7085069417953491, 'eval_accuracy': 78.30188679245283, 'eval_runtime': 4.4125, 'eval_samples_per_second': 48.046, 'eval_steps_per_second': 12.011, 'epoch': 3.0}


                                                  
 27%|██▋       | 856/3210 [03:00<07:42,  5.08it/s]

{'eval_loss': 0.45238178968429565, 'eval_accuracy': 84.90566037735849, 'eval_runtime': 4.4228, 'eval_samples_per_second': 47.934, 'eval_steps_per_second': 11.983, 'epoch': 4.0}


 31%|███       | 1000/3210 [03:28<07:08,  5.15it/s]

{'loss': 0.5806, 'grad_norm': 3.637144088745117, 'learning_rate': 0.0013769470404984424, 'epoch': 4.67}


                                                   
 33%|███▎      | 1070/3210 [03:46<06:53,  5.17it/s]

{'eval_loss': 0.41411706805229187, 'eval_accuracy': 88.67924528301887, 'eval_runtime': 4.4011, 'eval_samples_per_second': 48.17, 'eval_steps_per_second': 12.043, 'epoch': 5.0}


                                                   
 40%|████      | 1284/3210 [04:32<06:15,  5.13it/s]

{'eval_loss': 0.278963565826416, 'eval_accuracy': 91.0377358490566, 'eval_runtime': 4.4072, 'eval_samples_per_second': 48.103, 'eval_steps_per_second': 12.026, 'epoch': 6.0}


                                                   
 47%|████▋     | 1498/3210 [05:18<05:37,  5.07it/s]

{'eval_loss': 0.24168986082077026, 'eval_accuracy': 95.28301886792453, 'eval_runtime': 4.4877, 'eval_samples_per_second': 47.24, 'eval_steps_per_second': 11.81, 'epoch': 7.0}


 47%|████▋     | 1500/3210 [05:19<29:35,  1.04s/it]

{'loss': 0.1852, 'grad_norm': 0.18383334577083588, 'learning_rate': 0.0010654205607476634, 'epoch': 7.01}


                                                   
 53%|█████▎    | 1712/3210 [06:06<04:52,  5.13it/s]

{'eval_loss': 0.27212199568748474, 'eval_accuracy': 94.81132075471697, 'eval_runtime': 4.518, 'eval_samples_per_second': 46.923, 'eval_steps_per_second': 11.731, 'epoch': 8.0}


                                                   
 60%|██████    | 1926/3210 [06:54<04:12,  5.08it/s]

{'eval_loss': 0.24958951771259308, 'eval_accuracy': 95.75471698113208, 'eval_runtime': 4.5339, 'eval_samples_per_second': 46.759, 'eval_steps_per_second': 11.69, 'epoch': 9.0}


 62%|██████▏   | 2000/3210 [07:09<04:03,  4.98it/s]

{'loss': 0.0629, 'grad_norm': 0.21583600342273712, 'learning_rate': 0.0007538940809968847, 'epoch': 9.35}


                                                   
 67%|██████▋   | 2140/3210 [07:42<03:38,  4.90it/s]

{'eval_loss': 0.31020864844322205, 'eval_accuracy': 93.86792452830188, 'eval_runtime': 4.7446, 'eval_samples_per_second': 44.682, 'eval_steps_per_second': 11.17, 'epoch': 10.0}


                                                   
 73%|███████▎  | 2354/3210 [08:31<02:54,  4.90it/s]

{'eval_loss': 0.2134338617324829, 'eval_accuracy': 95.28301886792453, 'eval_runtime': 4.7409, 'eval_samples_per_second': 44.717, 'eval_steps_per_second': 11.179, 'epoch': 11.0}


 78%|███████▊  | 2500/3210 [09:01<02:22,  4.97it/s]

{'loss': 0.0323, 'grad_norm': 0.12193091958761215, 'learning_rate': 0.0004423676012461059, 'epoch': 11.68}


                                                   
 80%|████████  | 2568/3210 [09:20<02:08,  4.99it/s]

{'eval_loss': 0.2574020326137543, 'eval_accuracy': 95.28301886792453, 'eval_runtime': 4.8794, 'eval_samples_per_second': 43.448, 'eval_steps_per_second': 10.862, 'epoch': 12.0}


                                                   
 87%|████████▋ | 2782/3210 [10:08<01:26,  4.94it/s]

{'eval_loss': 0.24277755618095398, 'eval_accuracy': 95.28301886792453, 'eval_runtime': 4.7472, 'eval_samples_per_second': 44.658, 'eval_steps_per_second': 11.165, 'epoch': 13.0}


                                                   
 93%|█████████▎| 2996/3210 [10:57<00:43,  4.91it/s]

{'eval_loss': 0.21860791742801666, 'eval_accuracy': 96.22641509433963, 'eval_runtime': 4.7715, 'eval_samples_per_second': 44.43, 'eval_steps_per_second': 11.108, 'epoch': 14.0}


 93%|█████████▎| 3001/3210 [10:59<01:53,  1.84it/s]

{'loss': 0.01, 'grad_norm': 0.009843357838690281, 'learning_rate': 0.0001308411214953271, 'epoch': 14.02}


                                                   
100%|██████████| 3210/3210 [11:47<00:00,  4.95it/s]

{'eval_loss': 0.21643158793449402, 'eval_accuracy': 96.22641509433963, 'eval_runtime': 4.6499, 'eval_samples_per_second': 45.592, 'eval_steps_per_second': 11.398, 'epoch': 15.0}


100%|██████████| 3210/3210 [11:48<00:00,  4.53it/s]

{'train_runtime': 708.4145, 'train_samples_per_second': 18.061, 'train_steps_per_second': 4.531, 'train_loss': 0.480703278679714, 'epoch': 15.0}





TrainOutput(global_step=3210, training_loss=0.480703278679714, metrics={'train_runtime': 708.4145, 'train_samples_per_second': 18.061, 'train_steps_per_second': 4.531, 'total_flos': 3460510521077760.0, 'train_loss': 0.480703278679714, 'epoch': 15.0})

In [None]:
config = PeftConfig.from_pretrained('fine-tuned-peft-model-weights/')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                      num_labels=22)
model = PeftModel.from_pretrained(model, 'fine-tuned-peft-model-weights/')

trainer = Trainer(
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

test_predictions = trainer.predict(tokenized_ds['test'])
print(test_predictions)