In [1]:
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.8.2
!pip3 install -q -U trl==0.7.10
!pip3 install -q -U accelerate==0.27.1
!pip3 install -q -U datasets==2.17.0
!pip3 install -q -U transformers==4.38.0

In [3]:
import os
import transformers
from peft import LoraConfig
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoTokenizer , AutoModelForCausalLM
from transformers import BitsAndBytesConfig, GemmaTokenizer

In [5]:
model_id = "google/gemma-2b"
bnb_config = BitsAndBytesConfig(load_in_4bit= True,
                                bnb_4bit_quant_type='nf4',
                                bnb_4bit_compute_dtype= torch.bfloat16)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id , token = HF_TOKEN)



In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_id , quantization_config = bnb_config ,
    device_map = {"" :0},
    token = HF_TOKEN
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
text = "Artificial Intelligence is used in many domains"
device = "cuda:0"
inputs = tokenizer(text , return_tensors = "pt").to(device)
output = model.generate(**inputs , max_new_tokens = 50)
tokenizer.decode(output[0] , skip_special_tokens = True)

'Artificial Intelligence is used in many domains, including the medical field. It is used to diagnose and treat diseases. It is also used to improve the quality of life of patients.\n\nArtificial Intelligence is used in the medical field to diagnose and treat diseases. It is used to improve the quality'

In [8]:
os.environ["WANDB_DISABLED"] = "false"

In [9]:
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [10]:
import pandas as pd

df = pd.read_csv("/content/train_data_chatbot.csv")

In [11]:
df['combined_text'] = df['short_question'] + " " + df['short_answer']
df['tokenized'] = df['combined_text'].apply(lambda x: tokenizer(x, padding=True, truncation=True))
print(df.head())


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


                                      short_question  \
0  can an antibiotic through an iv give you a ras...   
1  can you test positive from having the hep b va...   
2  what are the dietary restrictions for celiac d...   
3  can i transmit genital warts seventeen years a...   
4                          is all vitamin d the same   

                                        short_answer                   tags  \
0  yes it can even after you have finished the pr...  ['rash' 'antibiotic']   
1  test positive for what if you had a hep b vacc...        ['hepatitis b']   
2  omitting gluten from the diet is the key to co...     ['celiac disease']   
3  famotidine pepcid products is in a drug class ...               ['wart']   
4  hi this means you do not have hepatitis b and ...          ['vitamin d']   

   label                                      combined_text  \
0    1.0  can an antibiotic through an iv give you a ras...   
1    1.0  can you test positive from having the hep b va...   

In [12]:
def formatting_func(example):
    text = f": {example['short_question'][0]}\nshort_answer: {example['short_answer'][0]}\ntags : {example['tags'][0]}"
    return [text]

In [13]:
print(df.columns)


Index(['short_question', 'short_answer', 'tags', 'label', 'combined_text',
       'tokenized'],
      dtype='object')


In [14]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

df = pd.read_csv("/content/train_data_chatbot.csv", delimiter=",")
print(df.columns)
print(df.head())

df['combined_text'] = df['short_question'] + " " + df['short_answer']
tokenized_outputs = df['combined_text'].apply(lambda x: tokenizer(x, padding=True, truncation=True))

df['input_ids'] = tokenized_outputs.apply(lambda x: x['input_ids'])
df['attention_mask'] = tokenized_outputs.apply(lambda x: x['attention_mask'])

data = Dataset.from_pandas(df[['input_ids', 'attention_mask']])


Index(['short_question', 'short_answer', 'tags', 'label'], dtype='object')
                                      short_question  \
0  can an antibiotic through an iv give you a ras...   
1  can you test positive from having the hep b va...   
2  what are the dietary restrictions for celiac d...   
3  can i transmit genital warts seventeen years a...   
4                          is all vitamin d the same   

                                        short_answer                   tags  \
0  yes it can even after you have finished the pr...  ['rash' 'antibiotic']   
1  test positive for what if you had a hep b vacc...        ['hepatitis b']   
2  omitting gluten from the diet is the key to co...     ['celiac disease']   
3  famotidine pepcid products is in a drug class ...               ['wart']   
4  hi this means you do not have hepatitis b and ...          ['vitamin d']   

   label  
0    1.0  
1    1.0  
2    1.0  
3   -1.0  
4   -1.0  


In [17]:
!huggingface-cli logout

Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 52, in main
    service.run()
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/user.py", line 103, in run
    logout()
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/_login.py", line 143, in logout
    raise EnvironmentError(
OSError: Token has been deleted from your machine but you are still logged in.
To log out, you must clear out both `HF_TOKEN` and `HUGGING_FACE_HUB_TOKEN` environment variables.


In [18]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGr

In [19]:
!git config --global credential.helper store


In [21]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments

df = pd.read_csv("/content/train_data_chatbot.csv", delimiter=",")
df['combined_text'] = df['short_question'] + " " + df['short_answer']

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

def tokenize_function(examples):
    return tokenizer(examples["combined_text"], padding=True, truncation=True)

data = Dataset.from_pandas(df[['combined_text', 'short_answer', 'tags']])
data = data.map(tokenize_function, batched=True)

def formatting_func(example):
    text = f"question: {example['combined_text']}\n answer: {example['short_answer']}\n tags: {example['tags']}"
    return [text]


training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_steps=2,
    max_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    optim="paged_adamw_8bit",
    hub_strategy="every_save",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    args=training_args,
    peft_config=lora_config,
    formatting_func=formatting_func,
)





Map:   0%|          | 0/47603 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/47603 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [22]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkhushi2003p[0m ([33mkhushi2003p-guru-gobind-singh-indraprastha-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,3.1502
2,3.0282
3,3.1827
4,2.8564
5,3.0676
6,2.7859
7,3.0742
8,2.8795
9,2.8991
10,3.0695


TrainOutput(global_step=100, training_loss=2.5503887343406677, metrics={'train_runtime': 523.2598, 'train_samples_per_second': 0.764, 'train_steps_per_second': 0.191, 'total_flos': 4894777933824000.0, 'train_loss': 2.5503887343406677, 'epoch': 8.33})

In [24]:
trainer.push_to_hub(commit_message="Training completed!")



adapter_model.safetensors:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

events.out.tfevents.1730140839.73e32cd9c879.24693.0:   0%|          | 0.00/18.4k [00:00<?, ?B/s]

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

events.out.tfevents.1730141421.73e32cd9c879.31329.0:   0%|          | 0.00/26.1k [00:00<?, ?B/s]

events.out.tfevents.1730135491.73e32cd9c879.1277.0:   0%|          | 0.00/26.1k [00:00<?, ?B/s]

events.out.tfevents.1730145038.73e32cd9c879.38805.0:   0%|          | 0.00/8.32k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

events.out.tfevents.1730145659.73e32cd9c879.48423.0:   0%|          | 0.00/26.1k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Khushiee/outputs/commit/ec7be600ce02e59a2c7b5d50e302b4cdaf0f8b95', commit_message='Training completed!', commit_description='', oid='ec7be600ce02e59a2c7b5d50e302b4cdaf0f8b95', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
text = "question: I have slight fever which medicine do you prefer;"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

question: I have slight fever which medicine do you prefer; paracetamol or ibuprofen', 'i have a sore throat and a fever what should i do', "


In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "Khushiee/medical-gemma2b-chatbot"
model = AutoModelForCausalLM.from_pretrained(model_id)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)
text = "question: I am having headache and have nausea problem;"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=20)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

question: I am having headache and have nausea problem; i have been diagnosed with depression and anxiety i have been taking sertraline for 2 months and
