In [1]:
!pip install peft datasets

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import json
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from datasets import Dataset


# Load train and test dataset

In [4]:
def load_data(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            try:
                data.append(json.loads(line))  # Load each line as a JSON object
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_number}: {line}")
                print(f"Error: {e}")
    return pd.DataFrame(data)  # Convert the list of dictionaries to a DataFrame


# Load train and test data
train_df = load_data('/content/train.jsonl')
test_df = load_data('/content/test.jsonl')
dev_df = load_data('/content/eval.jsonl')



# Define Tokenization Function

In [5]:
def preprocess_data(df, tokenizer):
    def tokenize_function(examples):
        inputs = tokenizer(
            examples['input_prompt'],
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )
        targets = tokenizer(
            examples['function_call'],
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )

        # Ensure that targets are converted to lists of integers
        inputs['labels'] = targets['input_ids'].tolist()

        return inputs

    # Create a Dataset from the DataFrame
    dataset = Dataset.from_pandas(df[['input_prompt', 'function_call']])
    return dataset.map(tokenize_function, batched=True, remove_columns=['input_prompt', 'function_call'])

# Load model and tokenizer from huggingface

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")


# Set the padding token
tokenizer.pad_token_id = tokenizer.eos_token_id



# Add pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))


train_dataset = preprocess_data(train_df, tokenizer)
test_dataset = preprocess_data(test_df, tokenizer)
dev_dataset = preprocess_data(dev_df, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/659 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

In [7]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

# Check the shape

In [8]:
print(f"Shape of testset:{test_dataset.shape}")
print(f"Shape of trainset:{train_dataset.shape}")
print(f"Shape of devset:{dev_dataset.shape}")
print(f"Train set: {train_dataset[0]}")


Shape of testset:(102, 3)
Shape of trainset:(659, 3)
Shape of devset:(49, 3)
Train set: {'input_ids': [276, 3171, 3, 9, 1207, 5059, 13, 1085, 147, 8, 657, 215, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [5944, 834, 1047, 834, 4059, 17, 599, 6757, 2423, 31, 7, 4529, 834, 6757, 31,

In [None]:
from transformers import TrainerCallback
import pandas as pd

class LossLoggerCallback(TrainerCallback):
    def __init__(self):
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            # Log training loss
            if 'loss' in logs:
                self.train_losses.append(logs['loss'])
            # Log evaluation loss (validation loss)
            if 'eval_loss' in logs:
                self.eval_losses.append(logs['eval_loss'])

    def save_losses(self, output_dir):
        # Create a DataFrame for the losses
        losses_df = pd.DataFrame({
            'train_loss': self.train_losses,
            'eval_loss': self.eval_losses + [None] * (len(self.train_losses) - len(self.eval_losses))
        })
        # Save to CSV
        losses_df.to_csv(f'{output_dir}/train_dev_loss.csv', index=False)


# Training hyperparameter configuration

In [10]:
loss_logger = LossLoggerCallback()
# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q", "k", "v", "o"]
)
model_lora_config = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/t5-large-lora',
    per_device_train_batch_size=8,  # Increased batch size
    per_device_eval_batch_size=8,
    num_train_epochs=20,  # Increased number of epochs
    logging_dir='/logs',
    logging_steps=25,  # More frequent logging
    evaluation_strategy="epoch",
    eval_steps=25,  # More frequent evaluation
    save_strategy="epoch",
    learning_rate=1e-4,  # Increased learning rate
    weight_decay=0.01,  # Adjust based on performance
    remove_unused_columns=False
)


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    callbacks=[loss_logger]
)




In [11]:
# Train the model
trainer.train()
trainer.save_model()

# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", eval_results)

# Save the losses to CSV files
loss_logger.save_losses(training_args.output_dir)

Epoch,Training Loss,Validation Loss
1,0.678,0.428637
2,0.5193,0.248778
3,0.3623,0.155005
4,0.2559,0.120349
5,0.2549,0.109476
6,0.2128,0.106607
7,0.2053,0.105019
8,0.2035,0.098613
9,0.1835,0.098063
10,0.164,0.096937


Evaluation Results: {'eval_loss': 0.09213370829820633, 'eval_runtime': 3.8844, 'eval_samples_per_second': 12.615, 'eval_steps_per_second': 1.802, 'epoch': 20.0}


In [15]:
repo_id = 'yourusername/reponame'
from huggingface_hub import login

# Replace 'your_huggingface_token' with your actual Hugging Face token
login("your_huggingface_token")

# Push model and tokenizer to Hub
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


model.safetensors:   0%|          | 0.00/3.15G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Homayounsrp/ToolSelection/commit/b4c64cac9a1660174b95b1e34c57e25655971546', commit_message='Upload tokenizer', commit_description='', oid='b4c64cac9a1660174b95b1e34c57e25655971546', pr_url=None, pr_revision=None, pr_num=None)

# Load finetuned model

In [16]:
model.save_pretrained("/content/model/t5-large")

# Test the model

In [42]:
prompt = "Create a dashboard for sales analytics"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")

# Generate with a specified max_new_tokens
outputs = model.generate(input_ids=input_ids, max_new_tokens=50)


# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


connect_and_query_db(db_name='sales_db', query='SELECT * FROM sales')
