In [1]:
!pip install torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate einops 


Collecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl.metadata (21 kB)
Collecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl.metadata (9.8 kB)
Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.4.7
  Downloading trl-0.4.7-py3-none-any.whl.metadata (10 kB)
Collecting accelerate
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting safetensors (from peft==0.4.0)
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.24.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.3

In [2]:
!pip install tqdm scipy



In [3]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from datasets import load_from_disk
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)
from tqdm.notebook import tqdm

from trl import SFTTrainer

In [4]:
from huggingface_hub import interpreter_login


In [5]:
interpreter_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config 

In [6]:
dataset = load_dataset("MattBastar/Medicine_Details", split="train")


Downloading data:   0%|          | 0.00/4.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11825 [00:00<?, ? examples/s]

In [7]:
dataset

Dataset({
    features: ['Medicine Name', 'Composition', 'Uses', 'Side_effects', 'Image URL', 'Manufacturer', 'Excellent Review %', 'Average Review %', 'Poor Review %'],
    num_rows: 11825
})

In [8]:
import pandas as pd
# Convert to DataFrame
df = pd.DataFrame(dataset)

# Display the first few rows of the DataFrame
df.head(2)

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Image URL,Manufacturer,Excellent Review %,Average Review %,Poor Review %
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Roche Products India Pvt Ltd,22,56,22
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glaxo SmithKline Pharmaceuticals Ltd,47,35,18


In [9]:
# Function to transform the row into desired format
def format_row(row):
    question = row['Medicine Name']
    answer = row['Uses']
    formatted_string = f"[INST] {question} [/INST] {answer} "
    return formatted_string

# Apply the function to each row of the dataframe
df['Formatted'] = df.apply(format_row, axis=1)

# Display the formatted column
df['Formatted']

0        [INST] Avastin 400mg Injection [/INST]  Cancer...
1        [INST] Augmentin 625 Duo Tablet [/INST] Treatm...
2        [INST] Azithral 500 Tablet [/INST] Treatment o...
3        [INST] Ascoril LS Syrup [/INST] Treatment of C...
4        [INST] Aciloc 150 Tablet [/INST] Treatment of ...
                               ...                        
11820    [INST] Zilarta-CT 40/6.25 Tablet [/INST]  Hype...
11821    [INST] Zipcoz Tablet [/INST]  Polycystic ovari...
11822    [INST] Zestasil 100 Tablet [/INST] Treatment o...
11823    [INST] Zedruff Shampoo [/INST] Treatment of Da...
11824    [INST] Zedruff Shampoo [/INST] Treatment of Da...
Name: Formatted, Length: 11825, dtype: object

In [11]:
df.head()

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Image URL,Manufacturer,Excellent Review %,Average Review %,Poor Review %,Formatted
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Roche Products India Pvt Ltd,22,56,22,[INST] Avastin 400mg Injection [/INST] Cancer...
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glaxo SmithKline Pharmaceuticals Ltd,47,35,18,[INST] Augmentin 625 Duo Tablet [/INST] Treatm...
2,Azithral 500 Tablet,Azithromycin (500mg),Treatment of Bacterial infections,Nausea Abdominal pain Diarrhea,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Alembic Pharmaceuticals Ltd,39,40,21,[INST] Azithral 500 Tablet [/INST] Treatment o...
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...,Treatment of Cough with mucus,Nausea Vomiting Diarrhea Upset stomach Stomach...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glenmark Pharmaceuticals Ltd,24,41,35,[INST] Ascoril LS Syrup [/INST] Treatment of C...
4,Aciloc 150 Tablet,Ranitidine (150mg),Treatment of Gastroesophageal reflux disease (...,Headache Diarrhea Gastrointestinal disturbance,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Cadila Pharmaceuticals Ltd,34,37,29,[INST] Aciloc 150 Tablet [/INST] Treatment of ...


In [13]:
# Rename the 'Formatted' column to 'Text'
new_df = df.rename(columns={'Formatted': 'Text'})

new_df.head()

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Image URL,Manufacturer,Excellent Review %,Average Review %,Poor Review %,Text
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Roche Products India Pvt Ltd,22,56,22,[INST] Avastin 400mg Injection [/INST] Cancer...
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glaxo SmithKline Pharmaceuticals Ltd,47,35,18,[INST] Augmentin 625 Duo Tablet [/INST] Treatm...
2,Azithral 500 Tablet,Azithromycin (500mg),Treatment of Bacterial infections,Nausea Abdominal pain Diarrhea,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Alembic Pharmaceuticals Ltd,39,40,21,[INST] Azithral 500 Tablet [/INST] Treatment o...
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...,Treatment of Cough with mucus,Nausea Vomiting Diarrhea Upset stomach Stomach...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glenmark Pharmaceuticals Ltd,24,41,35,[INST] Ascoril LS Syrup [/INST] Treatment of C...
4,Aciloc 150 Tablet,Ranitidine (150mg),Treatment of Gastroesophageal reflux disease (...,Headache Diarrhea Gastrointestinal disturbance,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Cadila Pharmaceuticals Ltd,34,37,29,[INST] Aciloc 150 Tablet [/INST] Treatment of ...


In [14]:
new_df = new_df[['Text']]

In [15]:
new_df.head(3)


Unnamed: 0,Text
0,[INST] Avastin 400mg Injection [/INST] Cancer...
1,[INST] Augmentin 625 Duo Tablet [/INST] Treatm...
2,[INST] Azithral 500 Tablet [/INST] Treatment o...


In [16]:
# If you want to save the new dataframe to a CSV file:
new_df.to_csv('formatted_data.csv', index=False)

In [17]:
new_df.shape

(11825, 1)

In [18]:
final_df = pd.read_csv("formatted_data.csv")


In [19]:
training_dataset = load_dataset("csv", data_files="formatted_data.csv", split="train")


Generating train split: 0 examples [00:00, ? examples/s]

In [22]:
base_model = "microsoft/phi-2"
new_model = "phi-2-medicine"

tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # use_flash_attention_2=True, # Phi does not support yet.
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)

model.config.use_cache = False
model.config.pretraining_tp = 1

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=32,
    evaluation_strategy="steps",
    eval_steps=2000,
    logging_steps=15,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_steps=2000,
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_steps=-1
)

peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ["Wqkv", "fc1", "fc2" ] # ["Wqkv", "out_proj", "fc1", "fc2" ], - 41M params
    # modules_to_save=["embed_tokens","lm_head"]
)

trainer = SFTTrainer(
    model=model,
    train_dataset=training_dataset,
    peft_config=peft_config,
    dataset_text_field="Text",
    max_seq_length=690,
    tokenizer=tokenizer,
    args=training_arguments,
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [23]:
trainer.train()


You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


TrainOutput(global_step=368, training_loss=1.8862748327462568, metrics={'train_runtime': 2015.2529, 'train_samples_per_second': 11.735, 'train_steps_per_second': 0.183, 'total_flos': 5208177663436800.0, 'train_loss': 1.8862748327462568, 'epoch': 1.99})

In [25]:
from transformers import pipeline
# Run text generation pipeline with our next model
prompt = "Azithral 500 Tablet?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=250)
result = pipe(f"[INST] {prompt} [/INST]")


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[INST] Azithral 500 Tablet? [/INST] Treatment of Bacterial infections erythroid cell replacement in anemia due to chronic kidney disease erythroid cell replacement in anemia due to cancer chemotherapy erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythroid cell deficiency erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus ery


In [26]:
print(result[0]['generated_text'].split("[/INST]")[1])

 Treatment of Bacterial infections erythroid cell replacement in anemia due to chronic kidney disease erythroid cell replacement in anemia due to cancer chemotherapy erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythroid cell deficiency erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus erythematosus (SLE) erythematosus ery


In [29]:
prompt = "Crocin 500 tablet?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=250)
result = pipe(f"[INST] {prompt} [/INST]")

In [30]:
print(result[0]['generated_text'].split("[/INST]")[1])

 Treatment of Bacterial infections erythematosus erythematosus (COSA) erythematosus erythematosus (COSA) erythematosus erythematosus (COSA) erythematosus (Aplorid) erythematosus erythematosus (Aplasma) erythematosus erythematosus (Aplasia) erythematosus erythematosus erythematosus erythematosus (Aplasia) erythematosus erythematosus (Aplasia) erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus erythematosus 


In [44]:
import torch

# Define the path where you want to save the .pth file
model_save_path = 'Fine-Tunning-phi-2.pth'

# Save the model's state dictionary
torch.save(model.state_dict(), model_save_path)


In [46]:
from huggingface_hub import HfApi

# Set your Hugging Face token
token = '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'

# Initialize API with token
api = HfApi(token=token)


In [71]:
from huggingface_hub import login

# Replace 'YOUR_HUGGINGFACE_TOKEN' with your actual token
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [74]:
from huggingface_hub import HfApi

api = HfApi()
repo_id = "Kartik12/Fine-tunning-phi-2"  # Replace with your desired repository name

# Create the new repository
api.create_repo(repo_id=repo_id, repo_type='model')


RepoUrl('https://huggingface.co/Kartik12/Fine-tunning-phi-2', endpoint='https://huggingface.co', repo_type='model', repo_id='Kartik12/Fine-tunning-phi-2')

In [75]:
from huggingface_hub import HfApi, HfFolder
from pathlib import Path

# Define file details
model_file_path = "/teamspace/studios/this_studio/Fine-Tunning-phi-2/Fine-Tunning-phi-2.pth"  # Path to your model file
repo_id = "Kartik12/Fine-tunning-phi-2"  # Replace with your repository ID

# Initialize API
api = HfApi()

# Upload the file to the repository
api.upload_file(
    path_or_fileobj=model_file_path,
    path_in_repo="model.pth",  # Name of the file in the repository
    repo_id=repo_id
)

print("File uploaded successfully!")


Fine-Tunning-phi-2.pth:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

File uploaded successfully!
