In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [4]:
# Read the text-to-SQL pairs from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="spider_text_sql.csv")

In [5]:
# Show the column labels so the names configured below can be checked against the file
available_columns = df.columns
print("Dataset Columns:", available_columns)

Dataset Columns: Index(['text_query', 'sql_command'], dtype='object')


In [6]:
# Column names used by the preprocessing step:
#   query_column_name  -> natural language queries (model input)
#   target_column_name -> SQL commands (model target)
query_column_name, target_column_name = "text_query", "sql_command"


In [7]:
# Remove a leftover "__index_level_0__" column when the loaded frame carries one.
# NOTE(review): this only acts on a column already present in `df` at this point;
# it cannot remove a column that is added by a later conversion step.
index_artifact = "__index_level_0__"
if index_artifact in df.columns:
    df = df.drop(columns=[index_artifact])

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Convert to Hugging Face's Dataset format.
# train_test_split leaves each frame with a shuffled, non-range index, which
# Dataset.from_pandas would otherwise store as an extra "__index_level_0__"
# column. preserve_index=False keeps only the real data columns.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [10]:
# Tokenization function (intended for Dataset.map with batched=True)
def preprocess_data(examples):
    """Tokenize a batch of (natural language query, SQL command) pairs.

    Reads the module-level ``tokenizer``, ``query_column_name`` and
    ``target_column_name``; all three must be defined before this is called.

    Args:
        examples: Batch mapping column names to lists of strings.

    Returns:
        The tokenizer output for the prompted queries, with the token ids of
        the SQL targets stored under ``"labels"``.
    """
    # Prefix every query with the task prompt the model is trained on.
    prompts = [f"translate English to SQL: {query}" for query in examples[query_column_name]]

    # `text_target` tokenizes the targets in target mode and puts their ids in
    # "labels"; it replaces the deprecated `as_target_tokenizer()` context
    # manager. Inputs and targets are both truncated to 512 tokens.
    model_inputs = tokenizer(
        prompts,
        text_target=examples[target_column_name],
        max_length=512,
        truncation=True,
    )
    return model_inputs

In [None]:
# Tokenize both splits in batches (train first, then test).
# NOTE(review): preprocess_data reads a global `tokenizer` that none of the
# visible cells define -- confirm it is created before this cell runs.
train_dataset, test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
)