<a href="https://colab.research.google.com/github/hrstbangera/NLP-with-Python/blob/master/bloom_3b_finetune_entity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (py

In [44]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    load_in_8bit=True,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

In [45]:
for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

In [46]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, #attention heads
    lora_alpha=32, #alpha scaling
    # target_modules=["q_proj", "v_proj"], #if you know the
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4915200 || all params: 3007472640 || trainable%: 0.1634329082375293


In [47]:
# from datasets import load_dataset
# data1 = load_dataset("Abirate/english_quotes")
# data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

In [48]:
# data1

In [49]:
import json

# Data to be written to the .jsonl file
data = [
    {"input": "What color is the sky?", "output": "The sky is blue."},
    {"input": "Where is the best place to get cloud GPUs?", "output": "Brev.dev"},
    {"input": "What is the capital of France?", "output": "The capital of France is Paris."},
    {"input": "How many minutes are in an hour?", "output": "There are 60 minutes in an hour."},
    {"input": "What is H2O commonly known as?", "output": "H2O is commonly known as water."},
    {"input": "Who wrote 'Romeo and Juliet'?", "output": "William Shakespeare wrote 'Romeo and Juliet'."}
]

# File name for the .jsonl file
filename = 'data.jsonl'

# Writing to the .jsonl file
with open(filename, 'w') as file:
    for item in data:
        json.dump(item, file)
        file.write('\n')

In [50]:
from datasets import load_dataset

train_dataset = load_dataset('json', data_files='data.jsonl', split='train')

Generating train split: 0 examples [00:00, ? examples/s]

In [51]:
# train_dataset = train_dataset.map(lambda samples: tokenizer(samples['quote']), batched=True)

In [52]:
train_dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 6
})

In [53]:
data

[{'input': 'What color is the sky?', 'output': 'The sky is blue.'},
 {'input': 'Where is the best place to get cloud GPUs?', 'output': 'Brev.dev'},
 {'input': 'What is the capital of France?',
  'output': 'The capital of France is Paris.'},
 {'input': 'How many minutes are in an hour?',
  'output': 'There are 60 minutes in an hour.'},
 {'input': 'What is H2O commonly known as?',
  'output': 'H2O is commonly known as water.'},
 {'input': "Who wrote 'Romeo and Juliet'?",
  'output': "William Shakespeare wrote 'Romeo and Juliet'."}]

In [54]:
# def merge_columns(example):
#     example["prediction"] = example["quote"] + " ->: " + str(example["tags"])
#     return example

# data['train'] = data['train'].map(merge_columns)
# data['train']["prediction"][:5]

# data = data.map(lambda samples: tokenizer(samples['prediction']), batched=True)

In [55]:
data

[{'input': 'What color is the sky?', 'output': 'The sky is blue.'},
 {'input': 'Where is the best place to get cloud GPUs?', 'output': 'Brev.dev'},
 {'input': 'What is the capital of France?',
  'output': 'The capital of France is Paris.'},
 {'input': 'How many minutes are in an hour?',
  'output': 'There are 60 minutes in an hour.'},
 {'input': 'What is H2O commonly known as?',
  'output': 'H2O is commonly known as water.'},
 {'input': "Who wrote 'Romeo and Juliet'?",
  'output': "William Shakespeare wrote 'Romeo and Juliet'."}]

In [57]:
# train_data = data["train"]

In [59]:
# import pandas as pd
# # Tokenize the quotes
# tokenized_data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

# # Convert to pandas DataFrame
# df = pd.DataFrame(tokenized_data['train'])

In [61]:
# df.head(1)

In [62]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

# # # Load the model and tokenizer
# # model = AutoModelForCausalLM.from_pretrained(
# #     "bigscience/bloom-3b",
# #     load_in_8bit=True,
# #     device_map='auto',
# # )

# # tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-3b")

# # Your data
# data = [
#     {"input": "What color is the sky?", "output": "The sky is blue."},
#     {"input": "Where is the best place to get cloud GPUs?", "output": "Brev.dev"},
#     {"input": "What is the capital of France?", "output": "The capital of France is Paris."},
#     {"input": "How many minutes are in an hour?", "output": "There are 60 minutes in an hour."},
#     {"input": "What is H2O commonly known as?", "output": "H2O is commonly known as water."},
#     {"input": "Who wrote 'Romeo and Juliet'?", "output": "William Shakespeare wrote 'Romeo and Juliet'."}
# ]

# class MyDataset(Dataset):
#     def __init__(self, data, tokenizer, max_length):
#         self.tokenizer = tokenizer
#         self.data = data
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         item = self.data[idx]
#         text = f"{item['input']} {tokenizer.eos_token} {item['output']}"
#         return self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)

# # Create the dataset
# max_seq_length = 512  # or whatever the max length for BLOOM is
# dataset = MyDataset(data, tokenizer, max_seq_length)

# # # Fine-tuning setup (Assuming you have already defined peft_config and training_arguments)
# # trainer = SFTTrainer(
# #     model=model,
# #     train_dataset=dataset,
# #     peft_config=peft_config,
# #     dataset_text_field="text",
# #     max_seq_length=max_seq_length,
# #     tokenizer=tokenizer,
# #     args=training_arguments,
# # )

# # # Start fine-tuning
# # trainer.train()


In [63]:
# Your synthetic data for entity extraction
data = [
    {"input": "Alice works at Acme Corp.", "output": "Name: Alice, Organization: Acme Corp"},
    {"input": "Alice works at Globex.", "output": "Name: Alice, Organization: Globex"},
    {"input": "Alice works at Initech.", "output": "Name: Alice, Organization: Initech"},
    {"input": "Bob works at Acme Corp.", "output": "Name: Bob, Organization: Acme Corp"},
    {"input": "Bob works at Globex.", "output": "Name: Bob, Organization: Globex"},
]

# Assuming you have a tokenizer and a max_seq_length defined
class MyDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = f"{item['input']} {self.tokenizer.eos_token} {item['output']}"
        return self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length)

# Create the dataset
max_seq_length = 512  # or whatever the max length for BLOOM is
dataset = MyDataset(data, tokenizer, max_seq_length)


In [None]:
!pip install transformers



In [64]:
import transformers
trainer = transformers.Trainer(
    model=model,
    train_dataset=dataset,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=50,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='bloom_in_out'
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
1,4.7409
2,1.5673
3,3.1898
4,3.3033
5,1.4357
6,4.8946
7,4.6955
8,1.5501
9,3.2475
10,3.0442


TrainOutput(global_step=50, training_loss=2.037128016203642, metrics={'train_runtime': 75.8198, 'train_samples_per_second': 5.276, 'train_steps_per_second': 0.659, 'total_flos': 1213414543196160.0, 'train_loss': 2.037128016203642, 'epoch': 33.33})

In [23]:
!pip install huggingface-hub



In [24]:
import huggingface_hub

In [25]:
# !export HF_TOKEN=<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>

In [26]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# notebook_login()

In [65]:
trainer.push_to_hub()

events.out.tfevents.1706121015.2335d3f91aa7.1443.2:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HarshithNLP/bloom_in_out/commit/3bb66268ddc9c5088d98e5f263609efca9f38a20', commit_message='End of training', commit_description='', oid='3bb66268ddc9c5088d98e5f263609efca9f38a20', pr_url=None, pr_revision=None, pr_num=None)

In [66]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "HarshithNLP/bloom_in_out"
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

adapter_model.safetensors:   0%|          | 0.00/19.7M [00:00<?, ?B/s]

In [67]:
batch = tokenizer("Alice works at Acme Corp.", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 Alice works at Acme Corp. Name: Alice, Organization: Acme Corp., Name: Alice, Organization: Acme Corp., Name: Alice, Organization: Acme Corp. 
