# Explainable NL Query Database Agents 



In [151]:

import getpass
import json
import os
import sqlite3
import textwrap

import pandas as pd

In [None]:

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from openai import OpenAI


In [103]:
# Set up the environment with the OpenAI API key.
# SECURITY: never commit the key in the notebook. It is read from the
# OPENAI_API_KEY environment variable; if that is unset, prompt for it
# interactively (getpass does not echo what is typed). Any key that was ever
# hardcoded in this cell should be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:

# Create the OpenAI client used by the test request below.
# No arguments are passed — presumably the SDK reads OPENAI_API_KEY from the
# environment; confirm against the installed openai version.
client = OpenAI()


In [105]:
# Smoke test: send one simple question through the Responses API.
response = client.responses.create(
    model="gpt-5-mini",
    input="how much gold would it take to coat the statue of liberty in a 1mm layer?",
    reasoning={"effort": "minimal"},
)

# Use the SDK's aggregated text accessor instead of indexing output[1]:
# the position of the message item in `response.output` is not fixed (it
# shifts when no reasoning item is emitted), so a hard-coded index can raise
# IndexError/AttributeError.
print(response.output_text)

We need two pieces of information: the statue’s surface area and the volume of gold required for a 1 mm thick coating. Then we convert volume to mass and (optionally) to value.

1) Surface area
- Commonly quoted external surface area for the Statue of Liberty (the statue alone, not the pedestal) is about 2,000–2,500 square feet. More precise and widely used figure: roughly 2,500 ft² ≈ 232 m².
- Some sources give ~57,000 ft² — that includes interior surfaces and pedestal; ignore that for an exterior gold leaf coating. Most reliable estimates for the copper skin are ~250 m² (range 200–300 m²). I'll use 232 m² as a representative value; I’ll also show results for 200–300 m² to give a range.

2) Volume of gold for 1 mm layer
- Thickness t = 1 mm = 0.001 m.
- Volume V = area A × t.

For A = 232 m²:
V = 232 × 0.001 = 0.232 m³.

For A = 200 m²:
V = 0.200 m³.

For A = 300 m²:
V = 0.300 m³.

3) Mass of gold
- Density of gold ρ ≈ 19,300 kg/m³.

Mass m = ρ × V.


In [None]:
# Load the schema file (tables.json).
# The location can be overridden with the SPIDER_TABLES_JSON environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local path is used, so existing behaviour is unchanged.
tables_json_path = os.environ.get(
    "SPIDER_TABLES_JSON",
    r"C:\Users\coffe\OneDrive\Desktop\CITS5553 Capstone Project\spider_data\spider_data\tables.json",
)
if os.path.exists(tables_json_path):
    with open(tables_json_path, 'r', encoding='utf-8') as f:
        tables_data = json.load(f)
    print(f"Number of tables/schemas: {len(tables_data)}")

    # Show the keys of one sample entry
    if tables_data:
        # Index 5 is the sample that was inspected originally; clamp it so a
        # schema file with fewer than six entries cannot raise IndexError.
        first_entry = tables_data[min(5, len(tables_data) - 1)]
        print(f"Keys: {list(first_entry.keys()) if isinstance(first_entry, dict) else 'Not a dictionary'}")
else:
    # tables_data is deliberately left undefined here; later cells check for it.
    print(f"error: schema file not found at {tables_json_path!r}")


Number of tables/schemas: 166
Keys: ['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original']


In [118]:
# Reduce each tables.json entry to its database ID and its table names
def extract_essential_schema(tables_data):
    """Return one ``{'database_name', 'table_names'}`` dict per schema entry.

    ``database_name`` comes from the entry's ``'db_id'`` (``'undefined'`` when
    the key is absent) and ``table_names`` from its ``'table_names'`` (``[]``
    when absent). Column names are not included in the result.
    """
    return [
        {
            'database_name': schema.get('db_id', 'undefined'),
            'table_names': schema.get('table_names', []),
        }
        for schema in tables_data
    ]

# Apply the extraction if tables_data is loaded
if 'tables_data' in locals():
    essential_schemas = extract_essential_schema(tables_data)
    print(f" Extracted data for {len(essential_schemas)} database schemas")

    # Show example of the simplified structure
    if essential_schemas:
        print(f"\n Example of simplified entry:")
        # Index 3 is the entry shown originally; clamp it to the last available
        # entry so a schema file with fewer than four databases cannot raise
        # IndexError (the guard above only guarantees a non-empty list).
        example = essential_schemas[min(3, len(essential_schemas) - 1)]
        print(f"  database_name: {example['database_name']}")
        print(f"  table_names: {example['table_names']}")

else:
    print("tables_data not found.")

 Extracted data for 166 database schemas

 Example of simplified entry:
  database_name: icfp_1
  table_names: ['institution', 'authors', 'papers', 'authorship count']


In [119]:
# Display the full simplified schema list as this cell's output
essential_schemas

[{'database_name': 'perpetrator', 'table_names': ['perpetrator', 'people']},
 {'database_name': 'college_2',
  'table_names': ['classroom',
   'department',
   'course',
   'instructor',
   'section',
   'teaches',
   'student',
   'takes classes',
   'advisor',
   'time slot',
   'prerequisite']},
 {'database_name': 'flight_company',
  'table_names': ['airport', 'operate company', 'flight']},
 {'database_name': 'icfp_1',
  'table_names': ['institution', 'authors', 'papers', 'authorship count']},
 {'database_name': 'body_builder', 'table_names': ['body builder', 'people']},
 {'database_name': 'storm_record',
  'table_names': ['storm', 'region', 'affected region']},
 {'database_name': 'pilot_record',
  'table_names': ['aircraft', 'pilot', 'pilot record']},
 {'database_name': 'race_track', 'table_names': ['race', 'track']},
 {'database_name': 'academic',
  'table_names': ['author',
   'conference',
   'domain',
   'domain author',
   'domain conference',
   'journal',
   'domain journal'

In [120]:

# One descriptive string per database; these strings are what gets embedded.
schema_texts = [
    f"Database: {schema['database_name']} | Table: {schema['table_names']}"
    for schema in essential_schemas
]
schema_texts

["Database: perpetrator | Table: ['perpetrator', 'people']",
 "Database: college_2 | Table: ['classroom', 'department', 'course', 'instructor', 'section', 'teaches', 'student', 'takes classes', 'advisor', 'time slot', 'prerequisite']",
 "Database: flight_company | Table: ['airport', 'operate company', 'flight']",
 "Database: icfp_1 | Table: ['institution', 'authors', 'papers', 'authorship count']",
 "Database: body_builder | Table: ['body builder', 'people']",
 "Database: storm_record | Table: ['storm', 'region', 'affected region']",
 "Database: pilot_record | Table: ['aircraft', 'pilot', 'pilot record']",
 "Database: race_track | Table: ['race', 'track']",
 "Database: academic | Table: ['author', 'conference', 'domain', 'domain author', 'domain conference', 'journal', 'domain journal', 'keyword', 'domain keyword', 'publication', 'domain publication', 'organization', 'publication keyword', 'writes', 'cite']",
 "Database: department_store | Table: ['addresses', 'staff', 'suppliers', 'de

In [121]:
# Embedding model used to vectorise the schema descriptions
# (constructed with library defaults; the model name is not set here).
embeddings = OpenAIEmbeddings()  
# Build a FAISS index over schema_texts using those embeddings
vectorstore = FAISS.from_texts(schema_texts, embeddings)

In [122]:
# Chat model used by the database-selection chain.
# NOTE(review): temperature=0 is requested here — confirm that "gpt-5-mini"
# honours a non-default temperature with the installed langchain_openai version.
llm = ChatOpenAI(
    model="gpt-5-mini",   
    temperature=0
)

In [None]:
# Prompt asking the LLM to choose a database and tables for a user query.
# It takes two inputs: the query and the retrieved schema text. The doubled
# braces around the JSON example are escapes, so they render as literal braces.
prompt_db = PromptTemplate(
    input_variables=["query", "retrieved_schema"],
    template="""
Please selects the most relevant database and table in order to answer user's query.
User query: {query}
Schema info: {retrieved_schema}
Which database and tables has the most relevant information for this query? Respond in JSON format: {{ "db_name": "...", "tables": ["..."] }}
"""
)
# Chain that fills prompt_db and sends it to llm.
# NOTE(review): LLMChain is reported as deprecated in recent LangChain
# releases — confirm against the installed version before relying on it.
db_chain = LLMChain(llm=llm, prompt=prompt_db)

In [160]:
# Database selection agent: vector retrieval followed by an LLM choice
def database_selection_agent(user_query, top_k=5):
    """Pick candidate schemas for a natural-language query and ask the LLM to choose.

    Runs a similarity search for ``user_query`` against ``vectorstore`` (at
    most ``top_k`` hits), formats every hit as one ``score/content`` line, and
    passes the query plus those lines to ``db_chain``.

    Returns a dict with the keys ``user_query``, ``retrieved_schemas`` (the
    formatted hit lines) and ``llm_selection`` (whatever ``db_chain.run``
    returned).
    """
    print(f" User Query: '{user_query}'")

    # Step 1: retrieve the closest schema descriptions together with their scores
    scored_docs = vectorstore.similarity_search_with_score(user_query, k=top_k)
    selected_schema = "".join(
        f"score: {score}, content: {doc.page_content}\n"
        for doc, score in scored_docs
    )

    # Step 2: let the LLM choose the database and tables from those candidates
    llm_answer = db_chain.run(query=user_query, retrieved_schema=selected_schema)

    return {
        "user_query": user_query,
        "retrieved_schemas": selected_schema,
        "llm_selection": llm_answer,
    }

# Sample natural-language queries for exercising database_selection_agent
test_queries = [
    "Show me information about singers and their concerts",
    "I want to see student enrollment data",
    "Find information about car manufacturers and models",
    "What data do you have about movies and actors?",
    "Show me employee salary information",
    "Which produce has the most complaints where the status are still open"  # NOTE(review): "produce" may be a typo for "product" — confirm
]



In [161]:


# Choose which query to test (change the index number)
test_query = test_queries[5]  # 0=singers/concerts, 1=students, 2=cars, 3=movies, 4=employees, 5=product complaints

# Run the selection agent on the chosen query
result = database_selection_agent(test_query)

# Show the query, the retrieved schema candidates with scores, and the LLM's selection
print(result['user_query'])
print(result['retrieved_schemas'])
print(result['llm_selection'])


 User Query: 'Which produce has the most complaints where the status are still open'
Which produce has the most complaints where the status are still open
score: 0.4461432695388794, content: Database: customer_complaints | Table: ['staff', 'customers', 'products', 'complaints']
score: 0.5071680545806885, content: Database: tracking_software_problems | Table: ['problem category codes', 'problem log', 'problem status codes', 'product', 'staff', 'problems']
score: 0.539566159248352, content: Database: manufactory_1 | Table: ['manufacturers', 'products']
score: 0.5415076017379761, content: Database: insurance_and_eClaims | Table: ['customers', 'staff', 'policies', 'claim headers', 'claims documents', 'claims processing stages', 'claims processing']
score: 0.5418848991394043, content: Database: local_govt_mdm | Table: ['customer master index', 'cmi cross references', 'council tax', 'business rates', 'benefits overpayments', 'parking fines', 'rent arrears', 'electoral register']

{ "db_name"

In [10]:
# Collect every tables.json field recorded for one database ID
def get_database_info(db_id, tables_data):
    """
    Gather the schema details stored under a given database ID.

    Scans ``tables_data`` for entries whose ``'db_id'`` equals ``db_id`` and
    concatenates their list-valued fields into one summary dict. A status
    line is printed in both cases; ``None`` is returned when nothing matches.
    """
    matches = [entry for entry in tables_data if entry.get('db_id') == db_id]

    if len(matches) == 0:
        print(f"‚ùå No data found for DB_ID: {db_id}")
        return None

    print(f"‚úÖ Found {len(matches)} entries for DB_ID: {db_id}")

    # Summary key -> key of the source entry that feeds it
    field_sources = {
        'tables': 'table_names_original',
        'columns': 'column_names_original',
        'table_names': 'table_names',
        'column_names': 'column_names',
        'column_types': 'column_types',
        'foreign_keys': 'foreign_keys',
        'primary_keys': 'primary_keys',
    }

    database_summary = {'db_id': db_id}
    for summary_key in field_sources:
        database_summary[summary_key] = []

    # Append each field that is present; fields missing from an entry are skipped
    for entry in matches:
        for summary_key, source_key in field_sources.items():
            if source_key in entry:
                database_summary[summary_key].extend(entry[source_key])

    return database_summary

# Example usage - look up the first ID in unique_db_ids, when that list exists and is non-empty
if 'unique_db_ids' not in locals() or not unique_db_ids:
    print("‚ùå No DB_IDs available. Please run the previous cells first.")
else:
    example_db_id = unique_db_ids[0]
    print(f"üîç Extracting information for DB_ID: {example_db_id}")
    db_info = get_database_info(example_db_id, tables_data)

    # Summarise the lookup only when the database was found
    if db_info:
        print(
            f"\nüìä Database: {db_info['db_id']}",
            f"üìã Tables: {db_info['table_names']}",
            f"üîó Columns: {len(db_info['column_names'])} total columns",
            f"üîë Primary Keys: {db_info['primary_keys']}",
            f"üîó Foreign Keys: {db_info['foreign_keys']}",
            sep="\n",
        )

üîç Extracting information for DB_ID: insurance_fnol
‚úÖ Found 1 entries for DB_ID: insurance_fnol

üìä Database: insurance_fnol
üìã Tables: ['customers', 'services', 'available policies', 'customers policies', 'first notification of loss', 'claims', 'settlements']
üîó Columns: 23 total columns
üîë Primary Keys: [1, 3, 5, 8, 12, 16, 19]
üîó Foreign Keys: [[9, 5], [8, 1], [13, 8], [14, 9], [15, 3], [17, 12], [20, 16]]


In [11]:
# Interactive: Choose a specific DB_ID to explore
# Change this to any DB_ID you want to explore
target_db_id = "concert_singer"  # Example - change this to your desired DB_ID

print(f"üéØ Exploring Database: {target_db_id}")
print("="*50)

if 'tables_data' in locals():
    selected_db_info = get_database_info(target_db_id, tables_data)

    if selected_db_info:
        print(f"\nüìã Complete Database Schema for '{target_db_id}':")
        print(f"Number of tables: {len(selected_db_info['table_names'])}")

        # Show detailed table and column information
        print(f"\nüóÉÔ∏è Tables and their columns:")
        for i, table_name in enumerate(selected_db_info['table_names']):
            print(f"\n  Table {i}: {table_name}")

            # Each column_names item is unpacked as (table index, column name);
            # keep the columns whose table index matches this table. The
            # column's own position is used to look up its type.
            table_columns = []
            for col_idx, (table_idx, col_name) in enumerate(selected_db_info['column_names']):
                if table_idx == i:
                    col_type = selected_db_info['column_types'][col_idx] if col_idx < len(selected_db_info['column_types']) else 'unknown'
                    table_columns.append(f"{col_name} ({col_type})")

            if table_columns:
                for col in table_columns:
                    print(f"    - {col}")
            else:
                print(f"    - No columns found")

        # Show relationships
        print(f"\nüîë Primary Keys: {selected_db_info['primary_keys']}")
        print(f"üîó Foreign Key Relationships: {selected_db_info['foreign_keys']}")

    else:
        # Derive the list of valid IDs from tables_data itself. The previous
        # code read `unique_db_ids`, which is not defined by any cell shown
        # above this one, so this branch could raise NameError on a fresh kernel.
        available_db_ids = sorted({entry.get('db_id', 'undefined') for entry in tables_data})
        print(f"Available DB_IDs: {available_db_ids[:20]}...")  # Show first 20
        print(f"Total available: {len(available_db_ids)}")
else:
    print("‚ùå Please run the previous cells to load the data first.")

üéØ Exploring Database: concert_singer
‚úÖ Found 1 entries for DB_ID: concert_singer

üìã Complete Database Schema for 'concert_singer':
Number of tables: 4

üóÉÔ∏è Tables and their columns:

  Table 0: stadium
    - stadium id (number)
    - location (text)
    - name (text)
    - capacity (number)
    - highest (number)
    - lowest (number)
    - average (number)

  Table 1: singer
    - singer id (number)
    - name (text)
    - country (text)
    - song name (text)
    - song release year (text)
    - age (number)
    - is male (others)

  Table 2: concert
    - concert id (number)
    - concert name (text)
    - theme (text)
    - stadium id (text)
    - year (text)

  Table 3: singer in concert
    - concert id (number)
    - singer id (text)

üîë Primary Keys: [1, 8, 15, 20]
üîó Foreign Key Relationships: [[18, 1], [21, 8], [20, 15]]
