In [None]:
# Dynamic Few-shot Examples

In [2]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import faiss
from google import genai

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# The key is read from the environment; it is None when the variable is unset.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# Client used for every Gemini request made later in the notebook.
client = genai.Client(api_key=GEMINI_API_KEY)

In [7]:
# Load the example dataset from a CSV file in the working directory and show it.
# Later cells read its `sql_prompt` and `sql` columns.
synthetic_dataset = pd.read_csv("synthetic_dataset.csv")
synthetic_dataset

Unnamed: 0,id,domain,domain_description,sql_complexity,sql_complexity_description,sql_task_type,sql_task_type_description,sql_prompt,sql_context,sql,sql_explanation
0,5097,forestry,Comprehensive data on sustainable forest manag...,single join,"only one join (specify inner, outer, cross)",analytics and reporting,"generating reports, dashboards, and analytical...",What is the total volume of timber sold by eac...,"CREATE TABLE salesperson (salesperson_id INT, ...","SELECT salesperson_id, name, SUM(volume) as to...","Joins timber_sales and salesperson tables, gro..."
1,5098,defense industry,"Defense contract data, military equipment main...",aggregation,"aggregation functions (COUNT, SUM, AVG, MIN, M...",analytics and reporting,"generating reports, dashboards, and analytical...",List all the unique equipment types and their ...,CREATE TABLE equipment_maintenance (equipment_...,"SELECT equipment_type, SUM(maintenance_frequen...",This query groups the equipment_maintenance ta...
2,5099,marine biology,"Comprehensive data on marine species, oceanogr...",basic SQL,basic SQL with a simple select statement,analytics and reporting,"generating reports, dashboards, and analytical...",How many marine species are found in the South...,"CREATE TABLE marine_species (name VARCHAR(50),...",SELECT COUNT(*) FROM marine_species WHERE loca...,This query counts the number of marine species...
3,5100,financial services,Detailed financial data including investment s...,aggregation,"aggregation functions (COUNT, SUM, AVG, MIN, M...",analytics and reporting,"generating reports, dashboards, and analytical...",What is the total trade value and average pric...,"CREATE TABLE trade_history (id INT, trader_id ...","SELECT trader_id, stock, SUM(price * quantity)...",This query calculates the total trade value an...
4,5101,energy,Energy market data covering renewable energy s...,window functions,"window functions (e.g., ROW_NUMBER, LEAD, LAG,...",analytics and reporting,"generating reports, dashboards, and analytical...",Find the energy efficiency upgrades with the h...,"CREATE TABLE upgrades (id INT, cost FLOAT, typ...","SELECT type, cost FROM (SELECT type, cost, ROW...",The SQL query uses the ROW_NUMBER function to ...
...,...,...,...,...,...,...,...,...,...,...,...
95,5192,disability services,Comprehensive data on disability accommodation...,basic SQL,basic SQL with a simple select statement,analytics and reporting,"generating reports, dashboards, and analytical...",List the programs and their budgets for mobili...,"CREATE TABLE Programs (Program VARCHAR(20), Bu...","SELECT Program, Budget FROM Programs WHERE Typ...",This SQL query lists the programs and their bu...
96,5193,space exploration,"Spacecraft manufacturing data, space mission r...",aggregation,"aggregation functions (COUNT, SUM, AVG, MIN, M...",analytics and reporting,"generating reports, dashboards, and analytical...",List the space missions that have had astronau...,CREATE TABLE SpaceMissions (mission_name VARCH...,SELECT mission_name FROM SpaceMissions WHERE a...,This query lists the space missions that have ...
97,5194,oceans,"Ocean data on marine conservation, ocean acidi...",basic SQL,basic SQL with a simple select statement,analytics and reporting,"generating reports, dashboards, and analytical...",What is the average depth of all marine protec...,CREATE TABLE marine_protected_areas (name VARC...,SELECT AVG(avg_depth) FROM marine_protected_ar...,This query calculates the average depth of all...
98,5195,oil and gas,"Exploration data, production figures, infrastr...",aggregation,"aggregation functions (COUNT, SUM, AVG, MIN, M...",analytics and reporting,"generating reports, dashboards, and analytical...",Calculate the total production in the Southern...,"CREATE TABLE production (well_id INT, type VAR...","SELECT type, SUM(quantity) as total_production...",This SQL query groups the production data by o...


In [30]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model ('all-MiniLM-L6-v2'). The same `embedder`
# object encodes both the dataset questions and the incoming user question.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

Loading weights: 100%|██████████| 103/103 [00:00<00:00, 936.46it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [31]:
# Take every natural-language question (the `sql_prompt` column) as a plain list
# and embed them all once, so retrieval only has to encode the user question.
dataset_questions = synthetic_dataset['sql_prompt'].tolist()
dataset_embeddings = embedder.encode(dataset_questions, convert_to_numpy=True)

In [32]:
def retrieve_few_shot_examples(user_question, dataset, dataset_embeddings, embedder, k=1):
    """Return the ``k`` dataset rows most similar to ``user_question``.

    Similarity is the cosine between the embedding of ``user_question`` and
    each row of ``dataset_embeddings``.

    Args:
        user_question: Text to find similar examples for.
        dataset: pandas DataFrame. Row ``i`` (positional, looked up with
            ``.iloc``) must correspond to row ``i`` of ``dataset_embeddings``.
        dataset_embeddings: 2-D array-like with one embedding per dataset row.
        embedder: Object exposing ``encode(list_of_str, convert_to_numpy=True)``
            and returning one embedding per input string.
        k: Maximum number of examples to return. Values ``<= 0`` yield ``[]``.

    Returns:
        A list of row dictionaries (``Series.to_dict()``), most similar first.
        It holds fewer than ``k`` items when the dataset has fewer rows.
    """
    # ``seq[-0:]`` is the *whole* sequence, so k == 0 (and negative k) must be
    # handled explicitly instead of falling through to the slice below.
    if k <= 0 or len(dataset_embeddings) == 0:
        return []

    # Encode user question and flatten the (1, dim) result to a plain vector.
    query_vec = np.asarray(
        embedder.encode([user_question], convert_to_numpy=True)
    ).reshape(-1)

    # Compute cosine similarity.
    matrix = np.asarray(dataset_embeddings)
    dots = np.dot(matrix, query_vec)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)

    # A zero-length embedding would give 0/0 = NaN, and argsort places NaN
    # last, i.e. it would be ranked as the *best* match. Score those as 0.
    cos_sim = np.zeros(dots.shape, dtype=float)
    np.divide(dots, norms, out=cos_sim, where=norms > 0)

    # Get top-k similar examples, highest similarity first.
    top_indices = np.argsort(cos_sim)[::-1][:k]
    return [dataset.iloc[i].to_dict() for i in top_indices]

# Example user question
user_question = "Show all orders by Alice"

# Fetch the two dataset rows whose questions are most similar to the user
# question; they are used later as few-shot examples in the prompt.
examples = retrieve_few_shot_examples(user_question, synthetic_dataset, dataset_embeddings, embedder, k=2)
print("Retrieved Examples for DFE:", examples)


Retrieved Examples for DFE: [{'id': 5157, 'domain': 'space', 'domain_description': 'Space data on space exploration, satellite technology, space debris mitigation, and astrobiology.', 'sql_complexity': 'basic SQL', 'sql_complexity_description': 'basic SQL with a simple select statement', 'sql_task_type': 'analytics and reporting', 'sql_task_type_description': 'generating reports, dashboards, and analytical insights', 'sql_prompt': 'Display the names of all satellites launched before 2010', 'sql_context': "CREATE TABLE satellites (id INT, name TEXT, country TEXT, launch_date DATE); INSERT INTO satellites (id, name, country, launch_date) VALUES (1, 'Sentinel-1A', 'France', '2012-04-03'); INSERT INTO satellites (id, name, country, launch_date) VALUES (2, 'Sentinel-1B', 'France', '2014-04-22'); INSERT INTO satellites (id, name, country, launch_date) VALUES (3, 'USA-202', 'USA', '2011-03-24'); INSERT INTO satellites (id, name, country, launch_date) VALUES (4, 'INSAT-3A', 'India', '2003-04-1

In [24]:
# Show the retrieved example rows as this cell's output value.
examples

[{'id': 5157,
  'domain': 'space',
  'domain_description': 'Space data on space exploration, satellite technology, space debris mitigation, and astrobiology.',
  'sql_complexity': 'basic SQL',
  'sql_complexity_description': 'basic SQL with a simple select statement',
  'sql_task_type': 'analytics and reporting',
  'sql_task_type_description': 'generating reports, dashboards, and analytical insights',
  'sql_prompt': 'Display the names of all satellites launched before 2010',
  'sql_context': "CREATE TABLE satellites (id INT, name TEXT, country TEXT, launch_date DATE); INSERT INTO satellites (id, name, country, launch_date) VALUES (1, 'Sentinel-1A', 'France', '2012-04-03'); INSERT INTO satellites (id, name, country, launch_date) VALUES (2, 'Sentinel-1B', 'France', '2014-04-22'); INSERT INTO satellites (id, name, country, launch_date) VALUES (3, 'USA-202', 'USA', '2011-03-24'); INSERT INTO satellites (id, name, country, launch_date) VALUES (4, 'INSAT-3A', 'India', '2003-04-10');",
  'sq

In [26]:
def danke_keyword_match(user_question):
    """Map the keywords found in ``user_question`` to schema elements.

    Demo stand-in for the DANKE keyword matcher: it only knows the keywords
    "Alice" and "orders", and matches them as case-sensitive substrings.

    Args:
        user_question: The natural-language question to scan.

    Returns:
        A dict keyed by each keyword that actually occurs in the question.
        Every value has the form ``{"table": ..., "column": ...}``. The dict
        is empty when no known keyword occurs.
    """
    # Simple demo mapping; in production this comes from DANKE API
    known_keywords = {
        "Alice": {"table": "Customer", "column": "name"},
        "orders": {"table": "Orders", "column": "amount"},
    }
    # Report only the keywords present in the question. Returning the whole
    # mapping on any single hit would pull unrelated tables into the view.
    return {
        keyword: dict(target)
        for keyword, target in known_keywords.items()
        if keyword in user_question
    }

# Run the keyword matcher on the example question; `KM` maps each matched
# keyword to a {"table", "column"} dict and feeds the view synthesis below.
KM = danke_keyword_match(user_question)
print("DANKE keywords:", KM)


DANKE keywords: {'Alice': {'table': 'Customer', 'column': 'name'}, 'orders': {'table': 'Orders', 'column': 'amount'}}


In [27]:
# Show the keyword-match mapping as this cell's output value.
KM

{'Alice': {'table': 'Customer', 'column': 'name'},
 'orders': {'table': 'Orders', 'column': 'amount'}}

In [28]:
# Define database schema for join discovery: table name -> list of column names.
# Orders.customer_id is the column that find_joins pairs with Customer.id.
schema_info = {
    "Customer": ["id", "name", "city"],
    "Orders": ["id", "customer_id", "amount", "order_date"]
}

# Simple join discovery function
def find_joins(tables, schema):
    """Build the FROM-clause text that combines ``tables``.

    Very simple demo: the only relationship known is
    ``Customer.id -> Orders.customer_id``. When both of those tables are
    requested they are joined on that key. Every other table is listed
    after a `` , `` separator (a plain comma join).

    Args:
        tables: Iterable of table names. Sets are sorted first, because the
            iteration order of a set of strings can differ between
            interpreter runs. Ordered inputs such as lists keep their order.
        schema: Accepted for interface compatibility; not consulted by this
            demo implementation.

    Returns:
        The FROM-clause text, or an empty string when ``tables`` is empty.
    """
    if isinstance(tables, (set, frozenset)):
        ordered = sorted(tables)
    else:
        ordered = list(tables)

    if "Customer" in ordered and "Orders" in ordered:
        joined = "Customer JOIN Orders ON Customer.id = Orders.customer_id"
        # Keep any further requested tables instead of silently dropping them.
        extras = [name for name in ordered if name not in ("Customer", "Orders")]
        return " , ".join([joined] + extras)

    return " , ".join(ordered)

def danke_synthesize_view(KM, schema):
    """Create the SQL for a view ``V`` over every table named in ``KM``.

    Args:
        KM: Keyword-match mapping. Each value must be a dict with a
            ``'table'`` entry; only that entry is read here.
        schema: Schema description, passed through unchanged to
            ``find_joins``.

    Returns:
        A ``CREATE VIEW V AS SELECT * FROM ...;`` statement as a string.

    Raises:
        ValueError: If ``KM`` is empty. Without any table the statement would
            be the invalid SQL ``SELECT * FROM ;``.
    """
    # De-duplicate, then sort: the iteration order of a set of strings can
    # change between interpreter runs, which would change the generated SQL.
    tables = sorted({match['table'] for match in KM.values()})
    if not tables:
        raise ValueError("Cannot synthesize a view: no keyword matches were provided.")

    join_clauses = find_joins(tables, schema)
    return f"CREATE VIEW V AS SELECT * FROM {join_clauses};"

# Build the view definition for the matched keywords; `view_sql` is inserted
# into the LLM prompt as the schema description.
view_sql = danke_synthesize_view(KM, schema_info)
print("Generated SQL view:\n", view_sql)


Generated SQL view:
 CREATE VIEW V AS SELECT * FROM Customer JOIN Orders ON Customer.id = Orders.customer_id;


In [22]:
from dotenv import load_dotenv
import os
 

# Load API key
# NOTE(review): this repeats the .env loading and client construction already
# done in the first code cell; it rebinds GEMINI_API_KEY and `client`.
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)

# Construct prompt
# Each retrieved example becomes a "question / SQL" pair; pairs are separated
# by a single newline.
few_shot_text = "\n".join(
    f"Example Question: {shot['sql_prompt']}\nExample SQL: {shot['sql']}"
    for shot in examples
)

# Final prompt: instructions, the synthesized view, the few-shot pairs and the
# user's question, in that order.
prompt = f"""
You are an AI assistant that generates SQL queries.

Database schema view (pre-joined tables):
{view_sql}

Few-shot examples:
{few_shot_text}

User question: {user_question}

Generate the corresponding SQL query:
"""

# Generate SQL
# Send the assembled prompt to the Gemini model with a low sampling temperature.
response = client.models.generate_content(
    model="gemini-3-flash-preview",
    contents=prompt,
    config={"temperature": 0.2}
)

# NOTE(review): `response.text` is used as-is; presumably it can be empty/None
# or wrapped in markdown fences for some responses — confirm against the SDK.
sql_query = response.text
print("Generated SQL:\n", sql_query)


Generated SQL:
 SELECT * FROM V WHERE name = 'Alice';
