In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install necessary packages
!pip install -q pandas numpy torch datasets transformers peft bitsandbytes evaluate rapidfuzz

In [2]:
import gc
import random
import time
import math
import re
import pandas as pd
import numpy as np
import torch

from datasets import Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback  
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig 

import evaluate
from rapidfuzz import fuzz


In [3]:
# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

In [4]:
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()


In [5]:
!pip install -q -U transformers bitsandbytes

In [6]:
import os
import gc
import random
import time
import math
import re
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # For progress bar

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from datasets import Dataset, DatasetDict, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from transformers.trainer_callback import EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    BitsAndBytesConfig  
)
import evaluate
from rapidfuzz import fuzz


In [7]:
import gc
import re
import random
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Enable cudnn benchmark for fixed input sizes (can speed up computation)
torch.backends.cudnn.benchmark = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

def preprocess(text: str) -> str:
    if not isinstance(text, str):
        return ""
    return re.sub(r'\s+', ' ', text.replace('\n', ' ')).strip()


def clean_df(df, rename=None, drop=None, select=None):
    if drop:
        df = df.drop(columns=drop, errors='ignore')
    if rename:
        df = df.rename(columns=rename)
    for col in ['query', 'context', 'response']:
        if col in df.columns:
            df[col] = df[col].apply(preprocess)
    if select:
        df = df[select]
    return df

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
max_length_prompt = 500
max_length_response = 250

def tokenize_length_filter(row):
    start_prompt = "Context:\n"
    middle_prompt = "\n\nQuery:\n"
    end_prompt = "\n\nResponse:\n"
    prompt = f"{start_prompt}{row['context']}{middle_prompt}{row['query']}{end_prompt}"
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=True, truncation=False)
    response_tokens = tokenizer.encode(row['response'], add_special_tokens=True, truncation=False)
    return len(prompt_tokens) <= max_length_prompt and len(response_tokens) <= max_length_response

def split_dataframe(df, train_frac=0.85, test_frac=0.1, val_frac=0.05):
    n = len(df)
    train_end = int(n * train_frac)
    test_end = train_end + int(n * test_frac)
    train_df = df.iloc[:train_end].reset_index(drop=True)
    test_df = df.iloc[train_end:test_end].reset_index(drop=True)
    val_df = df.iloc[test_end:].reset_index(drop=True)
    return train_df, test_df, val_df

# --- DATA LOADING AND PROCESSING ---

print("Loading GretelAI dataset...")
df3 = pd.read_parquet("hf://datasets/gretelai/synthetic_text_to_sql/synthetic_text_to_sql_train.snappy.parquet")

df3 = clean_df(
    df3,
    rename={'sql_prompt': 'query', 'sql_context': 'context', 'sql': 'response'},
    select=['query', 'context', 'response']
)

print("Rows before dropping duplicates:", len(df3))
df3 = df3.dropna(subset=['query', 'context', 'response']).drop_duplicates()
print("Rows after dropping duplicates:", len(df3))

df3 = df3[df3.apply(tokenize_length_filter, axis=1)]
print("Rows after token length filtering:", len(df3))

# Split into train/test/val
train_df, test_df, val_df = split_dataframe(df3)
print("Data splits - Train:", len(train_df), "Test:", len(test_df), "Validation:", len(val_df))

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
val_dataset = Dataset.from_pandas(val_df)

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "validation": val_dataset
})

dataset.save_to_disk("gretelai_dataset")
print("Saved GretelAI dataset successfully!")
clear_memory()


Using device: cuda
Loading GretelAI dataset...
Rows before dropping duplicates: 100000
Rows after dropping duplicates: 100000


Token indices sequence length is longer than the specified maximum sequence length for this model (597 > 512). Running this sequence through the model will result in indexing errors


Rows after token length filtering: 99906
Data splits - Train: 84920 Test: 9990 Validation: 4996


Saving the dataset (0/1 shards):   0%|          | 0/84920 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9990 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4996 [00:00<?, ? examples/s]

Saved GretelAI dataset successfully!


In [8]:
# Reload dataset 
dataset = load_from_disk("gretelai_dataset")
print("Example from test set:", dataset["test"][0])

# Function to tokenize a batch of examples (creates prompt from context and query)
def tokenize_function(batch: dict) -> dict:
    start_prompt = "Context:\n"
    middle_prompt = "\n\nQuery:\n"
    end_prompt = "\n\nResponse:\n"
    prompts = [f"{start_prompt}{ctx}{middle_prompt}{qry}{end_prompt}"
               for ctx, qry in zip(batch["context"], batch["query"])]
    tokenized_inputs = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    tokenized_labels = tokenizer(batch["response"], padding="max_length", truncation=True, max_length=256)
    labels = [[-100 if token == tokenizer.pad_token_id else token for token in seq]
              for seq in tokenized_labels["input_ids"]]
    batch["input_ids"] = tokenized_inputs["input_ids"]
    batch["attention_mask"] = tokenized_inputs["attention_mask"]
    batch["labels"] = labels
    return batch

# Try to load the tokenized dataset
try:
    tokenized_datasets = load_from_disk("tokenized_datasets")
    print("Loaded tokenized dataset from disk.")
except Exception as e:
    print("Tokenized dataset not found. Creating a new one...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["query", "context", "response"], num_proc=4)
    tokenized_datasets.save_to_disk("tokenized_datasets")
    print("Tokenized dataset saved.")
tokenized_datasets.set_format("torch")
print("Tokenized dataset splits:", tokenized_datasets.keys())


Example from test set: {'query': 'What is the average CO2 emission for each garment manufacturing process?', 'context': 'CREATE TABLE emissions (emission_id INT, garment_type VARCHAR(50), manufacturing_process VARCHAR(50), co2_emission DECIMAL(10, 2));', 'response': 'SELECT garment_type, AVG(co2_emission) FROM emissions GROUP BY garment_type;'}
Tokenized dataset not found. Creating a new one...


Map (num_proc=4):   0%|          | 0/84920 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/9990 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4996 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/84920 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9990 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4996 [00:00<?, ? examples/s]

Tokenized dataset saved.
Tokenized dataset splits: dict_keys(['train', 'test', 'validation'])


In [9]:
!pip install hf_xet

Collecting hf_xet
  Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 MB[0m [31m164.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: hf_xet
Successfully installed hf_xet-1.0.3


In [10]:
# --- MODEL INITIALIZATION ---

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model_name = "google/flan-t5-base"

# Load the baseline (original) model for inference/testing
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_model = original_model.to(device)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Test a sample prompt from the test set
index = 0
query = dataset["test"][index]["query"]
context = dataset["test"][index]["context"]
response = dataset["test"][index]["response"]
prompt = f"Context:\n{context}\n\nQuery:\n{query}\n\nResponse:\n"

inputs = tokenizer(prompt, return_tensors="pt").to(device)
baseline_output = tokenizer.decode(
    original_model.generate(inputs["input_ids"], max_new_tokens=200)[0],
    skip_special_tokens=True
)

print("-" * 100)
print("INPUT PROMPT:\n", prompt)
print("-" * 100)
print("HUMAN RESPONSE:\n", response)
print("-" * 100)
print("BASELINE MODEL OUTPUT (ZERO SHOT):\n", baseline_output)

clear_memory()

# --- LOAD FINETUNED MODEL OR INITIALIZE FOR QLORA TRAINING ---

to_train = True
try:
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("text2sql_flant5base_finetuned")
    finetuned_model = finetuned_model.to(device)
    print("Fine-tuned model loaded successfully.")
    to_train = False
except Exception as e:
    print("Fine-tuned model not found. Initializing model for QLORA fine-tuning...")

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )

    finetuned_model = prepare_model_for_kbit_training(finetuned_model)

    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q", "v"],
        lora_dropout=0.1,
        bias="none",
        task_type="SEQ_2_SEQ_LM"
    )

    finetuned_model = get_peft_model(finetuned_model, lora_config)
    print("Base model loaded and prepared for QLORA fine-tuning.")
    clear_memory()

print("Finetuned model is on device:", next(finetuned_model.parameters()).device)


----------------------------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE emissions (emission_id INT, garment_type VARCHAR(50), manufacturing_process VARCHAR(50), co2_emission DECIMAL(10, 2));

Query:
What is the average CO2 emission for each garment manufacturing process?

Response:

----------------------------------------------------------------------------------------------------
HUMAN RESPONSE:
 SELECT garment_type, AVG(co2_emission) FROM emissions GROUP BY garment_type;
----------------------------------------------------------------------------------------------------
BASELINE MODEL OUTPUT (ZERO SHOT):
 10 - 2
Fine-tuned model not found. Initializing model for QLORA fine-tuning...
Base model loaded and prepared for QLORA fine-tuning.
Finetuned model is on device: cuda:0


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, get_peft_model_state_dict
import torch
from torch.utils.checkpoint import checkpoint
from torch.utils.data import DataLoader
from tqdm import tqdm

model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

to_train = True
try:
    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    finetuned_model = finetuned_model.to(device)
    print("Fine-tuned model loaded successfully.")
    to_train = False
except Exception as e:
    print("Fine-tuned model not found. Initializing model for QLORA fine-tuning...")

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype=torch.bfloat16,
    )

    # Disable use_cache for compatibility with gradient checkpointing
    finetuned_model.config.use_cache = False

    # Enable gradient checkpointing with explicit use_reentrant=False
    finetuned_model.gradient_checkpointing_enable(use_reentrant=False)

    finetuned_model = prepare_model_for_kbit_training(finetuned_model)

    lora_config = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q", "v"],
        lora_dropout=0.1,
        bias="none",
        task_type="SEQ_2_SEQ_LM"
    )

    finetuned_model = get_peft_model(finetuned_model, lora_config)
    print("Base model loaded and prepared for QLORA fine-tuning.")
    clear_memory()

# TRAINING CONFIG
num_train_epochs = 5  
per_device_train_batch_size = 16
effective_batch_size = 64
accumulation_steps = effective_batch_size // per_device_train_batch_size
learning_rate = 2e-4

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=per_device_train_batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

finetuned_model.train()
optimizer = torch.optim.AdamW(finetuned_model.parameters(), lr=learning_rate)

for epoch in range(num_train_epochs):
    print(f"Epoch {epoch+1}/{num_train_epochs}")
    epoch_loss = 0.0
    optimizer.zero_grad()
    for i, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1}", leave=False)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = finetuned_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps
        loss.backward()

        epoch_loss += loss.item()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

    optimizer.step()
    optimizer.zero_grad()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} average loss: {avg_loss:.4f}")

print("Training completed.")


In [16]:
# --- EVALUATION ON TEST SET ---

print("Evaluating on test set...")
all_human_responses = []
all_model_responses = []

# Create a DataLoader for the test set; here batch size can be higher.
test_dataloader = DataLoader(
    tokenized_datasets["test"],
    batch_size=64,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
)

finetuned_model.eval()
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating", leave=False):
        input_ids = batch["input_ids"]
        if input_ids.dim() == 1:
            input_ids = input_ids.unsqueeze(0)
        attention_mask = batch["attention_mask"]
        if attention_mask.dim() == 1:
            attention_mask = attention_mask.unsqueeze(0)
        # Generate responses
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        generated_ids = finetuned_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=100,
            num_beams=5,
            repetition_penalty=1.2,
            temperature=0.1,
            early_stopping=True,
        )
        # Decode outputs
        outputs_decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        all_model_responses.extend(outputs_decoded)
        # Also keep the human responses from the original dataset (not tokenized)
        all_human_responses.extend(batch["labels"])  # Note: you may wish to decode labels separately

# If needed, you can convert the gold labels back from tokens using your dataset
# For this example, we assume your original 'test' split in dataset has the human responses.
all_human_responses = [ex for ex in dataset["test"]["response"]]

# Compute evaluation metrics using evaluate library
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

orig_rouge = rouge.compute(
    predictions=all_model_responses,
    references=all_human_responses,
    use_aggregator=True,
    use_stemmer=True,
)
orig_bleu = bleu.compute(
    predictions=all_model_responses,
    references=[[ref] for ref in all_human_responses]
)

# Compute a fuzzy match score and exact match accuracy
def normalize_sql(sql):
    return " ".join(sql.strip().lower().split())

def compute_exact_match(preds, refs):
    matches = sum(1 for pred, ref in zip(preds, refs) if normalize_sql(pred) == normalize_sql(ref))
    return 100 * matches / len(preds) if preds else 0

def compute_fuzzy_match(preds, refs):
    scores = [fuzz.token_set_ratio(pred, ref) for pred, ref in zip(preds, refs)]
    return sum(scores) / len(scores) if scores else 0

fuzzy_score = compute_fuzzy_match(all_model_responses, all_human_responses)
exact_match = compute_exact_match(all_model_responses, all_human_responses)

print("=" * 100)
print("Evaluation Metrics:")
print("=" * 100)
print("ROUGE:", orig_rouge)
print("BLEU:", orig_bleu)
print(f"Fuzzy Match Score: {fuzzy_score:.2f}%")
print(f"Exact Match Accuracy: {exact_match:.2f}%")
print("=" * 100)


Evaluating on test set...


                                                             

Evaluation Metrics:
ROUGE: {'rouge1': 0.8177750163270219, 'rouge2': 0.6984209538302064, 'rougeL': 0.7908273408377583, 'rougeLsum': 0.7909292764194}
BLEU: {'bleu': 0.5528253924023702, 'precisions': [0.8523966800184269, 0.7132143843741368, 0.6239086066580287, 0.5472419708545557], 'brevity_penalty': 0.8190255090486739, 'length_ratio': 0.8335833742836408, 'translation_length': 256146, 'reference_length': 307283}
Fuzzy Match Score: 89.36%
Exact Match Accuracy: 26.63%


In [13]:
!pip install -q rouge_score nltk

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Load the BLEU metric
bleu = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [19]:
# ✅ Save LoRA adapters (not the full model)
finetuned_model.save_pretrained("./finetuned_flan_t5_lora", save_adapter=True)
tokenizer.save_pretrained("./finetuned_flan_t5_lora")

('./finetuned_flan_t5_lora/tokenizer_config.json',
 './finetuned_flan_t5_lora/special_tokens_map.json',
 './finetuned_flan_t5_lora/tokenizer.json')

In [22]:
from peft import PeftModel

save_directory = "./finetuned_flan_t5_lora"

# Save ONLY the LoRA adapter and config
finetuned_model.save_pretrained(save_directory, save_adapter=True)
tokenizer.save_pretrained(save_directory)

print(f"LoRA adapters and tokenizer saved to {save_directory}")

LoRA adapters and tokenizer saved to ./finetuned_flan_t5_lora


In [24]:
def infer_index(dataset, idx, tokenizer, model, device="cuda", max_new_tokens=50):
    model.eval()

    sample = dataset[idx]
    input_text = sample["input"] if "input" in sample else sample["text"]

    # Tokenize input
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(device)

    # Generate output
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens
        )

    # Decode prediction
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {
        "input": input_text,
        "prediction": prediction,
        "reference": sample.get("output") or sample.get("target") or None
    }


In [27]:
# Define your save directory
save_directory = "./finetuned_flan_t5_lora_2"

# Save only the LoRA adapters and related configuration (ensures adapter_config.json is included)
finetuned_model.save_pretrained(save_directory, save_adapter=True)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"QLoRA model and tokenizer have been saved to: {save_directory}")


QLoRA model and tokenizer have been saved to: ./finetuned_flan_t5_lora_2


In [29]:
import torch
from transformers import AutoTokenizer
import gc

# Clear memory
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the baseline tokenizer
# (Assuming that the same base tokenizer was used for fine-tuning)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# -------------------------------
# Inference function definition
# -------------------------------

def infer_index(sample: dict, model, tokenizer, device, max_new_tokens=200):
    """
    Given a sample from the dataset (with keys 'context', 'query', and optionally 'response'),
    this function constructs the prompt, tokenizes it, and generates a prediction using `model`.
    """
    # Create the prompt by concatenating context and query
    prompt = f"Context:\n{sample['context']}\n\nQuery:\n{sample['query']}\n\nResponse:\n"
    
    # Tokenize the prompt (ensure proper truncation/padding if necessary)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
    
    # In evaluation mode, generate outputs
    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
        )
    
    # Decode the generated ids to get the text output
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    return {
        "prompt": prompt,
        "prediction": generated_text,
        "reference": sample.get("response", None)
    }


from datasets import load_from_disk
dataset = load_from_disk("gretelai_dataset")
print("Dataset loaded. Number of test samples:", len(dataset["test"]))

sample_idx = 10
test_sample = dataset["test"][sample_idx]

result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

clear_memory()


Using device: cuda
Dataset loaded. Number of test samples: 9990
--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE broadband_services (service_id INT, region VARCHAR(255), revenue DECIMAL(10,2)); INSERT INTO broadband_services (service_id, region, revenue) VALUES (1, 'North', 5000), (2, 'South', 7000);

Query:
What is the total revenue from broadband services for each region?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT region, SUM(revenue) FROM broadband_services GROUP BY region;
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT region, SUM(revenue) FROM broadband_services GROUP BY region;
--------------------------------------------------------------------------------


In [32]:
sample_idx = 22
test_sample = dataset["test"][sample_idx]

result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE container_ships (ship_id INT, ship_name VARCHAR(255), ship_builder VARCHAR(255), year INT, cargo_weight INT);INSERT INTO container_ships (ship_id, ship_name, ship_builder, year, cargo_weight) VALUES (1, 'Ever Given', 'Baosteel', 2010, 210000), (2, 'CMA CGM Marco Polo', 'Daewoo Shipbuilding & Marine Engineering', 2008, 165000);

Query:
What is the total cargo weight handled by container ships built before 2010, grouped by ship builder?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT ship_builder, SUM(cargo_weight) FROM container_ships WHERE year < 2010 GROUP BY ship_builder;
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT ship_builder, SUM(cargo_weight) FROM container_ships WHERE year  2010 GROUP BY ship_builder;
----------------------------

In [33]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 32
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE region_rainfall (region TEXT, date DATE, rainfall INTEGER);

Query:
What is the maximum rainfall recorded for each region in the past year?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT region, MAX(rainfall) as max_rainfall FROM region_rainfall WHERE date >= DATEADD(year, -1, GETDATE()) GROUP BY region;
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT region, MAX(rainfall) FROM region_rainfall WHERE date >= DATEADD(year, -1, GETDATE()) GROUP BY region;
--------------------------------------------------------------------------------


In [34]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 41
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE fish_biomass (species TEXT, population REAL, biomass REAL); INSERT INTO fish_biomass (species, population, biomass) VALUES ('Cod', 10000, 200000), ('Herring', 20000, 300000);

Query:
What is the total biomass of fish in the Barents Sea?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT SUM(biomass) FROM fish_biomass WHERE species IN ('Cod', 'Herring', 'Capelin');
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT SUM(biomass) FROM fish_biomass WHERE species IN ('Cod', 'Herring');
--------------------------------------------------------------------------------


In [36]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 120
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE SpaceLaunchs (LaunchID INT, Country VARCHAR(50), SatelliteID INT); INSERT INTO SpaceLaunchs (LaunchID, Country, SatelliteID) VALUES (1, 'USA', 101), (2, 'Russia', 201), (3, 'China', 301), (4, 'India', 401), (5, 'Japan', 501);

Query:
What is the total number of satellites launched by country in the SpaceLaunchs table?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT Country, COUNT(SatelliteID) AS TotalSatellites FROM SpaceLaunchs GROUP BY Country;
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT Country, COUNT(*) as TotalSatellites FROM SpaceLaunches GROUP BY Country;
--------------------------------------------------------------------------------


In [37]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 190
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE claim (claim_id INT, processed_by VARCHAR(50)); INSERT INTO claim VALUES (1, 'Laura Smith'); INSERT INTO claim VALUES (2, 'Maria Silva');

Query:
Which claims were processed by the claims adjuster 'Maria Silva'?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT claim_id FROM claim WHERE processed_by = 'Maria Silva';
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT claim_id FROM claim WHERE processed_by = 'Maria Silva';
--------------------------------------------------------------------------------


In [38]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 2010
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE environmental_impact (site_name VARCHAR(50), co2_emissions INT, waste_generation INT); INSERT INTO environmental_impact (site_name, co2_emissions, waste_generation) VALUES ('Site Alpha', 1200, 500), ('Site Bravo', 1800, 800), ('Site Charlie', 2500, 1000);

Query:
What are the total CO2 emissions for all mining sites in the 'environmental_impact' table?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT SUM(co2_emissions) FROM environmental_impact;
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT SUM(co2_emissions) FROM environmental_impact;
--------------------------------------------------------------------------------


In [39]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 2100
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE clients (client_id INT, name VARCHAR(50)); CREATE TABLE cases (case_id INT, client_id INT, billing_amount DECIMAL(10,2)); INSERT INTO clients (client_id, name) VALUES (1, 'Smith'), (2, 'Johnson'), (3, 'Williams'), (4, 'Brown'); INSERT INTO cases (case_id, client_id, billing_amount) VALUES (1, 1, 3000.00), (2, 2, 6000.00), (3, 3, 7000.00), (4, 4, 4000.00);

Query:
List all clients who have paid exactly $4000 in total billing amount?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT clients.name FROM clients INNER JOIN cases ON clients.client_id = cases.client_id GROUP BY clients.name HAVING SUM(billing_amount) = 4000;
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT clients.name FROM clients INNER JOIN cases ON clients.client_id = cases.client_

In [40]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 1010
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE Water_Usage (Year INT, Sector VARCHAR(20), Volume INT); INSERT INTO Water_Usage (Year, Sector, Volume) VALUES (2019, 'Industry', 12300000), (2018, 'Industry', 12000000), (2020, 'Industry', 12500000);

Query:
What is the total volume of water consumed by the industrial sector in the state of Florida in 2020?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT SUM(Volume) FROM Water_Usage WHERE Year = 2020 AND Sector = 'Industry';
--------------------------------------------------------------------------------
MODEL PREDICTION:
 SELECT SUM(Volume) FROM Water_Usage WHERE Sector = 'Industry' AND Year = 2020;
--------------------------------------------------------------------------------


In [41]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 1011
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE Dispensaries (id INT, name VARCHAR(255), city VARCHAR(255), state VARCHAR(255));CREATE TABLE Inventory (id INT, dispensary_id INT, weight DECIMAL(10, 2), product_type VARCHAR(255), month INT, year INT);INSERT INTO Dispensaries (id, name, city, state) VALUES (1, 'Green Leaf', 'Denver', 'CO');INSERT INTO Inventory (id, dispensary_id, weight, product_type, month, year) VALUES (1, 1, 250, 'flower', 4, 2021);

Query:
What was the total weight of cannabis flower sold by each dispensary in the city of Denver in the month of April 2021?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT d.name, SUM(i.weight) as total_weight FROM Dispensaries d JOIN Inventory i ON d.id = i.dispensary_id WHERE d.city = 'Denver' AND i.product_type = 'flower' AND i.month = 4 AND i.year = 2021 GROUP BY d.name;
--------------------

In [42]:
# Choose an index from the test set (e.g., first sample)
sample_idx = 1012
test_sample = dataset["test"][sample_idx]

# Generate inference on the sample using the in-memory model (finetuned_model)
result = infer_index(test_sample, finetuned_model, tokenizer, device, max_new_tokens=200)

# Print the results
print("-" * 80)
print("INPUT PROMPT:\n", result["prompt"])
print("-" * 80)
print("REFERENCE RESPONSE:\n", result["reference"])
print("-" * 80)
print("MODEL PREDICTION:\n", result["prediction"])
print("-" * 80)

# Cleanup memory if needed
clear_memory()


--------------------------------------------------------------------------------
INPUT PROMPT:
 Context:
CREATE TABLE Departments (DepartmentID INT PRIMARY KEY, DepartmentName VARCHAR(50), BudgetForDisabilityAccommodations DECIMAL(10,2), NumberOfStudentsWithDisabilities INT); CREATE TABLE Universities (UniversityID INT PRIMARY KEY, UniversityName VARCHAR(50), UniversityLocation VARCHAR(50)); CREATE TABLE UniversityDepartments (UniversityDepartmentID INT PRIMARY KEY, UniversityID INT, DepartmentID INT, FOREIGN KEY (UniversityID) REFERENCES Universities(UniversityID), FOREIGN KEY (DepartmentID) REFERENCES Departments(DepartmentID));

Query:
What is the total budget for disability accommodations in departments with more than 20% of students with disabilities in a university in Canada?

Response:

--------------------------------------------------------------------------------
REFERENCE RESPONSE:
 SELECT SUM(BudgetForDisabilityAccommodations) as TotalBudget FROM UniversityDepartments ud JO