In [1]:
# evaluate_spider.py - Script to generate predictions for the dev set

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset # Used for loading dev.json cleanly
import json
import os
from tqdm import tqdm # For progress bar

# --- Configuration ---
# Directory holding the fine-tuned checkpoint; it should be the location
# main_spider.py saved the trained model to.
MODEL_PATH = "./fine_tuned_t5_spider_sql_generator"
TOKENIZER = T5Tokenizer.from_pretrained(MODEL_PATH)
MODEL = T5ForConditionalGeneration.from_pretrained(MODEL_PATH)

# --- IMPORTANT: DATASET_DIR needs to be the same as in main_spider.py ---
DATASET_DIR = "/home/jaggu/Deep_L/spider_dataset_extracted/spider_data"

# Fail fast, before any generation work starts, if the dataset directory or
# one of the two files this script reads from it is missing.
if not os.path.isdir(DATASET_DIR):
    raise FileNotFoundError(f"DATASET_DIR '{DATASET_DIR}' does not exist. Please check your path.")
for _required_file in ("dev.json", "tables.json"):
    if not os.path.exists(os.path.join(DATASET_DIR, _required_file)):
        raise FileNotFoundError(
            f"{_required_file} not found in '{DATASET_DIR}'. Ensure Spider files are extracted correctly."
        )


# Index every schema entry of tables.json by its db_id for schema lookup.
tables_file_path = os.path.join(DATASET_DIR, 'tables.json')
with open(tables_file_path, 'r', encoding='utf-8') as tables_fh:
    db_schemas = {entry['db_id']: entry for entry in json.load(tables_fh)}

# Run on the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
MODEL.to(device)
MODEL.eval()  # Evaluation mode for inference

print(f"Model loaded from {MODEL_PATH} on: {device}")


# --- Schema Representation Function (MUST be identical to the one used during training) ---
def get_schema_representation(db_id, db_schemas_dict):
    """
    Generates a textual representation of the database schema for a given db_id.
    Includes table names, column names, and their types.

    The output has the form
    "table 0: <name> |   column 1: <col> (<type>); column 2: ... | table 1: ...".
    Column numbers are positions in the schema's global column list. A table
    with no columns contributes only its "table N: <name>" part.

    Args:
        db_id: Key of the schema to render.
        db_schemas_dict: Mapping from db_id to a schema dict providing
            'table_names_original', 'column_names_original' (pairs of
            table index and column name) and 'column_types' (parallel to
            'column_names_original').

    Returns:
        The schema rendered as a single string, parts joined with " | ".

    Raises:
        KeyError: If db_id is not a key of db_schemas_dict.
    """
    schema = db_schemas_dict[db_id]
    table_names = schema['table_names_original']
    table_count = len(table_names)

    # Bucket the column descriptions by owning table in a single pass instead
    # of rescanning the whole column list once per table. Columns whose table
    # index matches no table (e.g. a negative index) are left out, exactly as
    # a per-table equality scan would leave them out.
    columns_by_table = {}
    for col_idx, (col_table_idx, col_name_original) in enumerate(schema['column_names_original']):
        if not 0 <= col_table_idx < table_count:
            continue
        col_type = schema['column_types'][col_idx]
        columns_by_table.setdefault(col_table_idx, []).append(
            f"column {col_idx}: {col_name_original} ({col_type})"
        )

    schema_parts = []
    for table_idx, table_name_original in enumerate(table_names):
        schema_parts.append(f"table {table_idx}: {table_name_original}")
        table_cols = columns_by_table.get(table_idx)
        if table_cols:
            schema_parts.append("  " + "; ".join(table_cols))

    return " | ".join(schema_parts)


# --- SQL Query Generation Function ---
def generate_sql_query(nl_question, db_id, *, max_input_length=512, max_new_tokens=512,
                       num_beams=5, no_repeat_ngram_size=2):
    """
    Generates an SQL query from a natural language question and a database ID.

    The prompt is "generate sql: <schema text> | question: <question>", where
    the schema text comes from get_schema_representation(db_id, db_schemas).
    The prompt is tokenized, moved to the module-level `device`, and decoded
    with beam search by the module-level MODEL.

    Args:
        nl_question: The natural language question.
        db_id: Database identifier used to look up the schema in db_schemas.
        max_input_length: Maximum number of prompt tokens; longer prompts are
            truncated by the tokenizer.
        max_new_tokens: Upper bound on the number of generated tokens.
        num_beams: Beam width passed to MODEL.generate.
        no_repeat_ngram_size: Passed through to MODEL.generate.

    Returns:
        The first returned sequence, decoded with special tokens removed.

    Raises:
        KeyError: If db_id is not present in db_schemas.
    """
    schema_text = get_schema_representation(db_id, db_schemas)
    input_text = f"generate sql: {schema_text} | question: {nl_question}"

    # NOTE(review): the question sits at the end of the prompt, so if the
    # tokenizer truncates on the right (presumably its default - confirm), a
    # long schema can push the question out of the input entirely.
    inputs = TOKENIZER(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # NOTE(review): a non-zero no_repeat_ngram_size forbids repeating any
    # n-gram in the output; SQL often repeats short token pairs (aliases,
    # "column = value" patterns), so consider passing 0 here. The default is
    # kept at 2 so existing results stay reproducible.
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = MODEL.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=no_repeat_ngram_size,
        )

    generated_sql = TOKENIZER.decode(outputs[0], skip_special_tokens=True)
    return generated_sql

# --- Load the Spider Dev Set ---
# Use the same normalization function as in main_spider.py for consistency
def load_and_normalize_spider_split(file_path):
    """
    Load a Spider split from a JSON file and return it as a Dataset.

    Each record's 'query' is normalized to a single value: when it is a list,
    its first element is used (or "" for an empty list). Records that lack
    'question' or 'db_id', or whose normalized query is None, are skipped.

    Args:
        file_path: Path to the JSON file holding a list of example records.

    Returns:
        A Dataset with the columns 'question', 'query' and 'db_id'.

    Raises:
        KeyError: If a record has no 'query' key.
    """
    with open(file_path, 'r', encoding='utf-8') as split_file:
        records = json.load(split_file)

    columns = {'question': [], 'query': [], 'db_id': []}
    for record in records:
        gold_sql = record['query']
        if isinstance(gold_sql, list):
            # Keep only the first entry; an empty list becomes an empty query.
            gold_sql = gold_sql[0] if gold_sql else ""

        # NOTE(review): skipped records shrink the dataset, so row numbers no
        # longer line up with the source file - verify this is acceptable for
        # whatever consumes the predictions.
        if 'question' not in record or 'db_id' not in record or gold_sql is None:
            continue

        columns['question'].append(record['question'])
        columns['query'].append(gold_sql)
        columns['db_id'].append(record['db_id'])

    return Dataset.from_dict(columns)

print("\nLoading Spider dev (validation) set for prediction...")
dev_dataset = load_and_normalize_spider_split(os.path.join(DATASET_DIR, 'dev.json'))
print(f"Loaded {len(dev_dataset)} examples for prediction.")

# --- Generate Predictions and Save to File ---
output_predictions_file = "predictions.sql" # This will be created in your current working directory

print(f"\nGenerating SQL predictions for {len(dev_dataset)} examples...")
generated_sqls = []
# Iterate the examples directly; the position in the dataset is not needed.
for example in tqdm(dev_dataset, total=len(dev_dataset)):
    question = example['question']
    db_id = example['db_id']

    pred_sql = generate_sql_query(question, db_id)
    generated_sqls.append(pred_sql)

# Write predictions to a file, one query per line. Any line break inside a
# prediction is replaced by a space so that a single prediction can never
# occupy more than one line; predictions without line breaks are written
# exactly as before.
with open(output_predictions_file, 'w', encoding='utf-8') as f:
    for sql in generated_sqls:
        f.write(" ".join(sql.strip().splitlines()) + '\n')

print(f"Predictions saved to '{output_predictions_file}'.")
print("\nProceed to Step 2: Set up and run the official Spider evaluation script.")

  from .autonotebook import tqdm as notebook_tqdm


Model loaded from ./fine_tuned_t5_spider_sql_generator on: cuda

Loading Spider dev (validation) set for prediction...
Loaded 1034 examples for prediction.

Generating SQL predictions for 1034 examples...


100%|███████████████████████████████████████| 1034/1034 [02:46<00:00,  6.22it/s]

Predictions saved to 'predictions.sql'.

Proceed to Step 2: Set up and run the official Spider evaluation script.



