In [15]:
!pip install datasets transformers torch accelerate bitsandbytes evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [16]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_dataset, interleave_datasets, load_from_disk
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [17]:
import torch
from datasets import load_dataset


dataset = load_dataset("gretelai/synthetic_text_to_sql")

Using the latest cached version of the dataset since gretelai/synthetic_text_to_sql couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /teamspace/studios/this_studio/.cache/huggingface/datasets/gretelai___synthetic_text_to_sql/default/0.0.0/273a86f5f290e8d61b6767a9ff690c82bc990dc4 (last modified on Tue Mar 11 07:45:00 2025).


In [18]:
from datasets import DatasetDict

# Define split ratio (e.g., 10% of train set for validation)
split_ratio = 0.1

# Split the train dataset into new train & validation sets
train_valid_split = dataset["train"].train_test_split(test_size=split_ratio, seed=42)

# Create new DatasetDict with train, validation, and test
dataset = DatasetDict({
    "train": train_valid_split["train"],
    "validation": train_valid_split["test"],  # New validation split
    "test": dataset["test"]  # Keep test set unchanged
})

# Verify dataset sizes
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 90000
    })
    validation: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
        num_rows: 5851
    })
})


In [19]:
model_name='t5-small'

tokenizer = AutoTokenizer.from_pretrained(model_name)

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to('cuda')

In [20]:
dataset['test'][0]

{'id': 1,
 'domain': 'artificial intelligence',
 'domain_description': 'AI data on algorithmic fairness, AI safety, explainable AI, and creative AI applications.',
 'sql_complexity': 'basic SQL',
 'sql_complexity_description': 'basic SQL with a simple select statement',
 'sql_task_type': 'analytics and reporting',
 'sql_task_type_description': 'generating reports, dashboards, and analytical insights',
 'sql_prompt': "What is the average explainability score of creative AI applications in 'Europe' and 'North America' in the 'creative_ai' table?",
 'sql_context': "CREATE TABLE creative_ai (application_id INT, name TEXT, region TEXT, explainability_score FLOAT); INSERT INTO creative_ai (application_id, name, region, explainability_score) VALUES (1, 'ApplicationX', 'Europe', 0.87), (2, 'ApplicationY', 'North America', 0.91), (3, 'ApplicationZ', 'Europe', 0.84), (4, 'ApplicationAA', 'North America', 0.93), (5, 'ApplicationAB', 'Europe', 0.89);",
 'sql': "SELECT AVG(explainability_score) FRO

In [21]:
def tokenize_function(example):
    
#     print(len(example["question"]))
    start_prompt = "Tables:\n"
    middle_prompt = "\n\nQuestion:\n"
    end_prompt = "\n\nAnswer:\n"
  
    data_zip = zip(example['sql_context'], example['sql_prompt'])
    prompt = [start_prompt + context + middle_prompt + question + end_prompt for context, question in data_zip]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example['sql'], padding="max_length", truncation=True, return_tensors="pt").input_ids
#     print(prompt[0])
#     print()
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
# Check if the tokenized dataset exists before loading
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove unused columns (Keep only tokenized inputs & labels)
tokenized_datasets = tokenized_datasets.remove_columns([
    'id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description',
    'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'
])

print("✅ Tokenization completed. Data stored in memory (CPU).")

✅ Tokenization completed. Data stored in memory (CPU).


In [22]:
print(tokenized_datasets.keys())
print(tokenized_datasets['train'][0].keys())
print(tokenized_datasets['train'][0]['input_ids'][:10])
print(tokenized_datasets['train'][0]['labels'][:10])
print(tokenized_datasets)

dict_keys(['train', 'validation', 'test'])
dict_keys(['input_ids', 'labels'])
[4398, 7, 10, 205, 4386, 6048, 332, 17098, 1761, 6696]
[3, 23143, 14196, 180, 6122, 599, 7484, 324, 663, 834]
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 90000
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5851
    })
})


In [23]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['validation'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)

Shapes of the datasets:
Training: (90000, 2)
Validation: (10000, 2)
Test: (5851, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 90000
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5851
    })
})


## Test the model with zero shot inferencing

In [24]:
index = 0

question = dataset['test'][index]['sql_prompt']
context = dataset['test'][index]['sql_context']
answer = dataset['test'][index]['sql']

prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to('cuda')

output = tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{answer}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Tables:
CREATE TABLE creative_ai (application_id INT, name TEXT, region TEXT, explainability_score FLOAT); INSERT INTO creative_ai (application_id, name, region, explainability_score) VALUES (1, 'ApplicationX', 'Europe', 0.87), (2, 'ApplicationY', 'North America', 0.91), (3, 'ApplicationZ', 'Europe', 0.84), (4, 'ApplicationAA', 'North America', 0.93), (5, 'ApplicationAB', 'Europe', 0.89);

Question:
What is the average explainability score of creative AI applications in 'Europe' and 'North America' in the 'creative_ai' table?

Answer:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
SELECT AVG(explainability_score) FROM creative_ai WHERE region IN ('Europe', 'North America');

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO S

In [25]:
try:
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_model_2_epoch")
    finetuned_model = finetuned_model.to('cuda')
    to_train = False

except:
    to_train = True
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    finetuned_model = finetuned_model.to('cuda')
    tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
%%time

if to_train:
    output_dir = f'./sql-training-{str(int(time.time()))}'

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=5e-3,
        num_train_epochs=2,
        per_device_train_batch_size=16,     # batch size per device during training
        per_device_eval_batch_size=16,      # batch size for evaluation
        weight_decay=0.01,
        logging_steps=50,
        evaluation_strategy='steps',        # evaluation strategy to adopt during training
        eval_steps=500,                     # number of steps between evaluation
    )

    trainer = Trainer(
        model=finetuned_model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
    )
    
    trainer.train()

    finetuned_model.save_pretrained("finetuned_model_2_epoch")

Step,Training Loss,Validation Loss
500,0.057,0.046815
1000,0.0501,0.044052
1500,0.0453,0.040788
2000,0.0452,0.038474
2500,0.0406,0.036216
3000,0.0442,0.034373
3500,0.0411,0.033247
4000,0.0385,0.032296
4500,0.0414,0.030588
5000,0.0368,0.029752


In [26]:
checkpoint_path = "./sql-training-1741679596/checkpoint-6000/"

trainer.train(resume_from_checkpoint=checkpoint_path)


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Step,Training Loss,Validation Loss
6500,0.0313,0.028263
7000,0.0312,0.027976
7500,0.0298,0.027744
8000,0.0322,0.027334
8500,0.0313,0.027046
9000,0.0287,0.026888
9500,0.0317,0.026732
10000,0.0321,0.026716
10500,0.032,0.026702
11000,0.0304,0.026681


TrainOutput(global_step=11250, training_loss=0.014829920789930555, metrics={'train_runtime': 1021.0972, 'train_samples_per_second': 176.281, 'train_steps_per_second': 11.018, 'total_flos': 2.436152426496e+16, 'train_loss': 0.014829920789930555, 'epoch': 2.0})

In [27]:
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("finetuned_model_2_epoch")
finetuned_model = finetuned_model.to('cuda')

## Testing the model with zero shot inferencing

In [36]:
index = 8
# index = len(dataset['test'])-200

question = dataset['test'][index]['sql_prompt']
context = dataset['test'][index]['sql_context']
answer = dataset['test'][index]['sql']

prompt = f"""Tables:
{context}

Question:
{question}

Answer:
"""

inputs = tokenizer(prompt, return_tensors='pt')
inputs = inputs.to('cuda')

output = tokenizer.decode(
    finetuned_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN ANSWER:\n{answer}\n')
print(dash_line)
print(f'FINE-TUNED MODEL - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Tables:
CREATE TABLE marine_species (name TEXT, conservation_status TEXT);

Question:
List all marine species with their conservation status.

Answer:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN ANSWER:
SELECT name, conservation_status FROM marine_species;

---------------------------------------------------------------------------------------------------
FINE-TUNED MODEL - ZERO SHOT:
SELECT name, conservation_status FROM marine_species;
