# Spider Dataset to Azure AI Foundry JSONL Conversion

This notebook converts the Spider text-to-SQL dataset into the JSONL format required for fine-tuning GPT-4o mini on Azure AI Foundry.

**Goal Format:**
```json
{"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
```

In [None]:
import json
import os

## 1. Load Data
We define the paths to the Spider dataset files.

In [None]:
# Input locations, resolved relative to this notebook
# (assumed to live in spider/azureAiFoundary/).
base_path = "../"
tables_path, train_path, dev_path = (
    os.path.join(base_path, file_name)
    for file_name in ("tables.json", "train_spider.json", "dev.json")
)

# Output files are written next to the notebook.
output_path = "spider_finetuning.jsonl"
val_output_path = "spider_validation.jsonl"

# Echo the resolved input paths so a wrong working directory is easy to spot.
print(
    f"Tables path: {tables_path}",
    f"Train path: {train_path}",
    f"Dev path: {dev_path}",
    sep="\n",
)

## 2. Process Schemas
We need to create a string representation of the database schema for each `db_id` to include in the system prompt. This helps the model understand the table structures.

In [None]:
def load_schemas(tables_path):
    """Build a plain-text schema description for each database in a tables file.

    Args:
        tables_path: Path to a JSON file containing a list of database
            entries. Each entry must provide ``db_id``,
            ``table_names_original`` (list of table names) and
            ``column_names_original`` (list of ``[table_index, column_name]``
            pairs).

    Returns:
        A dict mapping each ``db_id`` to a string with one line per table, in
        the form ``Table: <name>, columns: [<col>, <col>, ...]``.
    """
    # JSON is UTF-8 by specification; passing the encoding explicitly avoids
    # decoding with whatever the platform's locale default happens to be.
    with open(tables_path, 'r', encoding='utf-8') as f:
        tables_data = json.load(f)

    schemas = {}
    for db in tables_data:
        table_names = db['table_names_original']

        # One column list per table, addressed by the table's position.
        table_cols = [[] for _ in table_names]
        for table_idx, col_name in db['column_names_original']:
            if table_idx >= 0:  # a negative index (usually the "*" entry) has no table
                table_cols[table_idx].append(col_name)

        # One "Table: ..." line per table, in the order the file lists them.
        schemas[db['db_id']] = "\n".join(
            f"Table: {table_name}, columns: [{', '.join(cols)}]"
            for table_name, cols in zip(table_names, table_cols)
        )

    return schemas

schemas = load_schemas(tables_path)
print(f"Loaded schemas for {len(schemas)} databases.")

# Show one schema as a sanity check. next(iter(...), None) reads the first key
# without copying every key into a list, and yields None instead of raising
# when no databases were loaded.
example_db = next(iter(schemas), None)
if example_db is not None:
    print(f"\nExample Schema ({example_db}):\n{schemas[example_db]}")

## 3. Convert to JSONL
Iterate through the training and validation examples and write each one as a chat-formatted JSON line.

In [None]:
def convert_to_jsonl(input_path, schemas, output_path):
    """Convert a JSON list of text-to-SQL examples into chat-format JSONL.

    Each example that has a known schema becomes one output line of the form
    ``{"messages": [system, user, assistant]}``, where the system message
    carries the database schema, the user message is the question and the
    assistant message is the SQL query.

    Args:
        input_path: Path to a JSON file containing a list of examples, each
            with ``db_id``, ``question`` and ``query`` keys.
        schemas: Dict mapping ``db_id`` to its schema description string.
        output_path: Path of the JSONL file to write (overwritten if present).

    Examples whose ``db_id`` is not in ``schemas`` are skipped with a warning.
    The final summary reports the number of lines actually written.
    """
    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    written = 0
    with open(output_path, 'w', encoding='utf-8') as f_out:
        for item in data:
            db_id = item['db_id']
            question = item['question']
            query = item['query']

            if db_id not in schemas:
                print(f"Warning: Schema not found for {db_id}")
                continue

            schema_context = schemas[db_id]

            # Construct the chat message
            system_message = f"You are a helpful assistant that translates natural language questions into SQL queries.\nThe database schema is as follows:\n{schema_context}"

            jsonl_entry = {
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": query}
                ]
            }

            f_out.write(json.dumps(jsonl_entry) + "\n")
            written += 1

    # Report what was written, not what was read: skipped examples must not
    # inflate the count.
    print(f"Converted {written} examples from {input_path} to {output_path}")
    skipped = len(data) - written
    if skipped:
        print(f"Skipped {skipped} examples with no matching schema")

# Training split: always converted; a missing file fails loudly here.
convert_to_jsonl(train_path, schemas, output_path)

# Validation split: optional, so only warn when the dev file is absent.
if not os.path.exists(dev_path):
    print(f"Warning: {dev_path} not found")
else:
    convert_to_jsonl(dev_path, schemas, val_output_path)

## 4. Verify Output
Print the first two lines of each generated file as a quick sanity check.

In [None]:
def _show_head(path, count=2):
    """Print the first `count` lines of the file at `path`.

    Each line is printed as returned by readline(), i.e. with its trailing
    newline kept, so print() leaves a blank line after every record.
    """
    with open(path, 'r') as handle:
        for _ in range(count):
            print(handle.readline())


print("First 2 lines of training output:")
_show_head(output_path)

print("\nFirst 2 lines of validation output:")
# The validation file only exists when the dev split was available.
if os.path.exists(val_output_path):
    _show_head(val_output_path)