In [1]:
#!pip install bitsandbytes

In [2]:
#!pip install trl==0.21.0

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-270m")
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m")
generation_config = GenerationConfig.from_model_config(model.config)

generation_config.temperature = 0
generation_config.do_sample = True

generation_config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


GenerationConfig {
  "bos_token_id": 2,
  "do_sample": true,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "temperature": 0
}

In [5]:
import transformers
#transformers.logging.set_verbosity_info()
input_ids = tokenizer.encode("Hello", return_tensors="pt")
generated_ids = model.generate(input_ids, generation_config=generation_config)
tokenizer.decode(generated_ids[0], skip_special_tokens=True)

`generation_config` default values have been modified to match model-specific defaults: {'cache_implementation': 'hybrid', 'top_k': 64, 'top_p': 0.95}. If this is not desired, please set these values explicitly.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'Hello,\n\nI have a 2005 3.0L V6 4x'

In [6]:
sql_prompt = "What is the total volume of timber sold by each salesperson, sorted by salesperson?"
table_ddl = "CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');"

template = """{% for m in messages -%}
{{ '<|'+m['role']+'|>\\n' + m['content'] + '\\n' }}
{%- endfor %}<|assistant|>"""


messages = [
    {"role": "system", "content": "You are a helpful assistant skilled in translating natural language into SQL Queries given the Table DDL. You will respond only with the correct MySQL Query."},
    {"role": "user", "content": f"Text: {sql_prompt}; \nDDL: {table_ddl}"},
]

tokenizer.chat_template=template
chat = tokenizer.apply_chat_template(messages, tokenize=True, return_tensors="pt")

# Supervised Fine Tuning

## Data Preparation

In [7]:
from datasets import load_dataset

ds = load_dataset("gretelai/synthetic_text_to_sql")

In [8]:
ds["train"][0].keys()

dict_keys(['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'])

In [9]:
user_prompt = """Given the <USER_QUERY> and the <SCHEMA>, generate the corresponding SQL command to retrieve the desired data, considering the query's syntax, semantics, and schema constraints.

<SCHEMA>
{context}
</SCHEMA>

<USER_QUERY>
{question}
</USER_QUERY>
"""

In [10]:
def create_conversation(example):
    return {
        "messages": [
            {"role": "user", "content": user_prompt.format(question=example["sql_prompt"], context=example["sql_context"])},
            {"role": "assistant", "content": example["sql"]}
        ]
    }

In [11]:
features = ds["train"].features

In [12]:
ds = ds.shuffle()

train_ds = ds["train"].select(range(100000))
validation_ds = ds["train"].skip(80000).select(range(10000))

train_ds = train_ds.map(create_conversation, remove_columns=features)
validation_ds = validation_ds.map(create_conversation, remove_columns=features)
test_ds = ds["test"].map(create_conversation, remove_columns=features)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5851 [00:00<?, ? examples/s]

In [13]:
train_ds[0]

{'messages': [{'content': "Given the <USER_QUERY> and the <SCHEMA>, generate the corresponding SQL command to retrieve the desired data, considering the query's syntax, semantics, and schema constraints.\n\n<SCHEMA>\nCREATE TABLE Bridge (id INT, name TEXT, location TEXT, type TEXT); INSERT INTO Bridge (id, name, location, type) VALUES (1, 'Brooklyn Bridge', 'NYC, NY', 'Pedestrian'), (2, 'Manhattan Bridge', 'NYC, NY', 'Pedestrian');\n</SCHEMA>\n\n<USER_QUERY>\nHow many pedestrian bridges are there in New York City?\n</USER_QUERY>\n",
   'role': 'user'},
  {'content': "SELECT COUNT(*) FROM Bridge WHERE location = 'NYC, NY' AND type = 'Pedestrian';",
   'role': 'assistant'}]}

## SFT

Parameters and their meanings:

1. Training Duration Params:
   1. num_train_epochs: controls the total training duration
   2. max_steps: alternative to epochs, sets max number of training steps
2. Batch size params:
   1. per_device_train_batch_size: determines memory usage and training stability
   2. gradient_accumulation_steps: enables larger effctive batch sizes
3. Learning Rate params:
   1. learning_rate: controlla la dimensione degli update dei pesi
   2. warmup_ratio:
4. Monitoring Params:
   1. logging_steps:
   2. eval_steps
   3. save_steps

In [14]:
from datasets import load_dataset
from transformers import BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
import torch
import gc

del model
torch.cuda.empty_cache()
gc.collect()

device = torch.device("cuda")

if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

model_kwargs = {}


In [15]:
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
      torch_dtype=torch_dtype,
)

# BitsAndBytesConfig: Enables 4-bit quantization to reduce model size/memory usage
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_8bit=True,
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m", **model_kwargs)

In [16]:
#leopoldo-todiscozte-
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mleopoldo-todiscozte[0m ([33mleopoldo-todiscozte-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [17]:
%env WANDB_PROJECT=Gemma-Text2SQL

env: WANDB_PROJECT=Gemma-Text2SQL


In [18]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.05,
    r=8,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head", "embed_tokens"]
)

In [19]:
training_args = SFTConfig(
    output_dir="./model/gemma-270m-Text2SQL-Fine-tuned",
    #max_seq_length=512,                     # max sequence length for model and packing of the dataset
    packing=True,                           # Groups multiple samples in the dataset into a single sequence
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    fp16=True if torch_dtype == torch.float16 else False,   # use float16 precision
    bf16=True if torch_dtype == torch.bfloat16 else False,   # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="wandb",
    run_name="Gemma-Text2SQL"
)

In [20]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=validation_ds,
    processing_class=tokenizer,
    peft_config=peft_config
)



Tokenizing train dataset:   0%|          | 0/100000 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/100000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,1.1324
20,0.763
30,0.6855
40,0.6551
50,0.6191
60,0.6167
70,0.6166
80,0.5962
90,0.569
100,0.5628




In [None]:
trainer.save_model()

In [None]:
trainer.push_to_hub("First SFT Completed")

In [None]:
del model
del trainer
torch.cuda.empty_cache()

In [None]:
from peft import PeftModel

# Load Model base model
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-270m", low_cpu_mem_usage=True)

# Merge LoRA and base model and save
peft_model = PeftModel.from_pretrained(model, "./model/gemma-270m-Text2SQL-Fine-tuned")
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("merged_model", safe_serialization=True, max_shard_size="2GB")

processor = AutoTokenizer.from_pretrained("./model/gemma-270m-Text2SQL-Fine-tuned")
processor.save_pretrained("merged_model")

In [None]:
merged_model.push_to_hub("leotod/gemma-270m-Text2SQL-Fine-tuned")

In [None]:
# todo test