In [1]:
!pip install -q sacrebleu rouge-score bert-score

In [5]:
import warnings
warnings.filterwarnings("ignore")

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
import bert_score

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned model
def load_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
    model.eval()
    return tokenizer, model

# SQL generation function
def generate_sql(model, tokenizer, sql_prompt, sql_context, max_length=128):
    input_text = f"sql_prompt: {sql_prompt} sql_context: {sql_context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(device)
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_new_tokens=max_length, min_new_tokens=5)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Load model and tokenizer
tokenizer, model = load_model("facebook/bart-large-cnn")

# Load test dataset
dataset = load_dataset("gretelai/synthetic_text_to_sql")
test_data = dataset["test"].select(range(1000))

# Compute metrics
exact_matches = []
references = []
predictions = []

for example in tqdm(test_data):
    prompt = example["sql_prompt"]
    context = example["sql_context"]
    ground_truth = example["sql"]

    prediction = generate_sql(model, tokenizer, prompt, context)

    references.append(ground_truth)
    predictions.append(prediction)
    exact_matches.append(int(prediction.strip().lower() == ground_truth.strip().lower()))

# Exact Match Accuracy
em_score = sum(exact_matches) / len(exact_matches)

# BLEU Score
bleu = BLEU()
bleu_score = bleu.corpus_score(predictions, [references]).score

# ROUGE-L
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
rouge_l_scores = [scorer.score(ref, pred)["rougeL"].fmeasure for ref, pred in zip(references, predictions)]
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

# BERTScore
P, R, F1 = bert_score.score(predictions, references, lang="en", device=device)
bert_f1 = F1.mean()

# Print metrics
print("\n=== Evaluation Metrics ===")
print(f"Exact Match: {em_score:.4f}")
print(f"BLEU Score: {bleu_score:.2f}")
print(f"ROUGE-L F1: {avg_rouge_l:.4f}")
print(f"BERTScore F1: {bert_f1.item():.4f}")


100%|██████████| 1000/1000 [13:04<00:00,  1.27it/s]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Evaluation Metrics ===
Exact Match: 0.0000
BLEU Score: 5.17
ROUGE-L F1: 0.2332
BERTScore F1: 0.8346


In [7]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import sqlite3

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from tqdm import tqdm
import sqlite3
import warnings

warnings.filterwarnings("ignore")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


def load_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    model.eval()
    return tokenizer, model

def generate_sql(model, tokenizer, sql_prompt, sql_context, max_length=128):
    input_text = f"sql_prompt: {sql_prompt} sql_context: {sql_context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        output_tokens = model.generate(**inputs, max_new_tokens=max_length, min_new_tokens=5)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)


# -------------------------------
# Execute SQL on SQLite DB
# -------------------------------
def run_query(context, query):
    try:
        conn = sqlite3.connect(":memory:")
        cur = conn.cursor()
        # Split context into separate statements and execute
        for stmt in context.strip().split(";"):
            stmt = stmt.strip()
            if stmt:
                cur.execute(stmt + ";")
        # Run the predicted/ground truth query
        cur.execute(query)
        results = cur.fetchall()
        conn.close()
        return results
    except Exception as e:
        return str(e)

def compute_execution_accuracy(dataset, model, tokenizer):
    correct = 0
    total = len(dataset)

    for sample in tqdm(dataset):
        sql_prompt = sample["sql_prompt"]
        sql_context = sample["sql_context"]
        gt_sql = sample["sql"]

        pred_sql = generate_sql(model, tokenizer, sql_prompt, sql_context)

        result_gt = run_query(sql_context, gt_sql)
        result_pred = run_query(sql_context, pred_sql)

        if result_gt == result_pred:
            correct += 1
        else:
            # print("\n---")
            # print("Prompt:", sql_prompt)
            # print("Predicted:", pred_sql)
            # print("Expected:", gt_sql)
            # print("GT Result:", result_gt)
            # print("Pred Result:", result_pred)
            ff = 0

    return correct / total



tokenizer, model = load_model("nl2sql_epoch2") 
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

dataset = load_dataset("gretelai/synthetic_text_to_sql")["test"]
subset = [dataset[i] for i in range(1000)]
exec_acc = compute_execution_accuracy(subset, model, tokenizer)
print(f"\nExecution Accuracy: {exec_acc:.2%}")


100%|██████████| 1000/1000 [08:01<00:00,  2.08it/s]


Execution Accuracy: 55.10%



