In [1]:
!pip install peft
!pip install datasets
!pip install accelerate -U

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26

In [1]:
import os
import random
import string
import torch
import transformers
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, GenerationConfig, TrainingArguments, Trainer
from datasets import Dataset

In [2]:
#loading model and the tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, torch_dtype=torch.bfloat16)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Printing model trainable and non trainable parameters

In [3]:
all_params = 0
trainable_params = 0
for _, params in pretrained_model.named_parameters():
  all_params += params.numel()
  if params.requires_grad:
    trainable_params += params.numel()

print(f"Total parameters in FLAN-t5 base: {all_params},\nTrainable parameters in FLAN-t5 base: {trainable_params},\nNon-Trainable parameters in FLAN-t5 base: {all_params - trainable_params} ")

Total parameters in FLAN-t5 base: 247577856,
Trainable parameters in FLAN-t5 base: 247577856,
Non-Trainable parameters in FLAN-t5 base: 0 


Performing Inference

In [4]:
prompt = """Instruction:

Write an SQL query to find the total number of rows in a table named 'orders'.

Output:
"""

inputs = tokenizer(prompt, return_tensors='pt')

output = tokenizer.decode(
        pretrained_model.generate(
            inputs["input_ids"],
            max_new_tokens=50,
        )[0],
        skip_special_tokens=True
    )

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'MODEL GENERATION:\n{output}\n')

##ouput not so close

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruction:

Write an SQL query to find the total number of rows in a table named 'orders'.

Output:

---------------------------------------------------------------------------------------------------
MODEL GENERATION:
n = 0 for i in range(1, n + 1): n += i * n - 1



#Full Fine-tune

#Loading Dataset

In [5]:
import pandas as pd

df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,text
0,[INST] Write an SQL query to select all record...
1,[INST] How do you retrieve distinct values fro...
2,[INST] Write an SQL query to find the total nu...
3,[INST] Explain the difference between INNER JO...
4,[INST] How do you update values in a table usi...


In [6]:
 # Given text
text = "[INST] Write an SQL query to select all records from the 'customers' table. [/INST] To select all records from the 'customers' table, you can use the following SQL query: SELECT * FROM customers;"

# Find the start and end indices of the instruction and label
inst_cur = text.find("[INST]")
inst_start = inst_cur + len("[INST]")
sinst_cur = text.find("[/INST]")
label_start = sinst_cur + len("[/INST]")

print(f"Instruction: {text[inst_start + 1: sinst_cur]}")
print(f"Label: {text[label_start + 1: ]}")


Instruction: Write an SQL query to select all records from the 'customers' table. 
Label: To select all records from the 'customers' table, you can use the following SQL query: SELECT * FROM customers;


In [7]:
def process_data(df):

    input_ids_list = []
    labels_list = []

    for text in df['text']:
        start_prompt = 'Instruction:\n\n'
        end_prompt = '\n\nOutput: '

        inst_cur = text.find("[INST]")
        inst_start = inst_cur + len("[INST]")
        sinst_cur = text.find("[/INST]")

        instruction = text[inst_start + 1: sinst_cur]
        output = text[sinst_cur + len("[/INST]") + 1:]

        input_ids_list.append(start_prompt + instruction + end_prompt)
        labels_list.append(output)

    processed_df = pd.DataFrame({'instruction': input_ids_list, 'output': labels_list})

    # Save the DataFrame to CSV
    processed_df.to_csv('processed.csv', index=False)

In [8]:
dataset_dict = process_data(df)

In [9]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="processed.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 85
    })
})

In [11]:
def tokenize_function(example):
  example['input_ids'] = tokenizer(example["instruction"], padding="max_length",  return_tensors="pt").input_ids
  example['labels'] = tokenizer(example["output"], padding="max_length",  return_tensors="pt").input_ids

  return example

In [12]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/85 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output', 'input_ids', 'labels'],
        num_rows: 85
    })
})

In [14]:
tokenized_datasets = tokenized_datasets.remove_columns(['instruction','output'])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 85
    })
})

In [15]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")

Filter:   0%|          | 0/85 [00:00<?, ? examples/s]

Shapes of the datasets:
Training: (1, 2)


In [16]:
import time
import numpy as np

In [26]:
output_dir = f'./test-full-fine-tune-main-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=5
)

In [27]:
trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=tokenized_datasets['train']
)

max_steps is given, it will override any value given in num_train_epochs


In [28]:
trainer.train()

Step,Training Loss
1,38.4955
2,39.9225
3,40.6296
4,38.0525
5,41.6081


TrainOutput(global_step=5, training_loss=39.74162902832031, metrics={'train_runtime': 104.1915, 'train_samples_per_second': 0.384, 'train_steps_per_second': 0.048, 'total_flos': 3423786762240.0, 'train_loss': 39.74162902832031, 'epoch': 5.0})

In [29]:
prompt = """Instruction:

Write an SQL query to find the total number of rows in a table named 'orders'.

Output:
"""

inputs = tokenizer(prompt, return_tensors='pt')

output = tokenizer.decode(
        pretrained_model.generate(
            inputs["input_ids"],
            max_new_tokens=100,
        )[0],
        skip_special_tokens=True
    )

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'MODEL GENERATION:\n{output}\n')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruction:

Write an SQL query to find the total number of rows in a table named 'orders'.

Output:

---------------------------------------------------------------------------------------------------
MODEL GENERATION:
tctl = 0 n = n - 1

