# Explainable NL Query Database Agents 



### Team 9

In [1]:
# Import necessary libraries
import json
import os
import sqlite3
import textwrap
from getpass import getpass

import pandas as pd
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from openai import OpenAI

In [None]:
# Set up the environment with the OpenAI API key.
# The key is never written into the notebook: a value already exported in the
# environment is reused, and the user is prompted (without echo) only when it is missing.
# Previously the variable was overwritten with an empty string, which discarded a valid key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
client = OpenAI()

Collecting git-filter-repo
  Downloading git_filter_repo-2.47.0-py3-none-any.whl.metadata (31 kB)
Downloading git_filter_repo-2.47.0-py3-none-any.whl (76 kB)
Installing collected packages: git-filter-repo
Successfully installed git-filter-repo-2.47.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Smoke test: ask one simple question to make sure the API key and client are working.
response = client.responses.create(
    model="gpt-5-mini",
    input="how much gold would it take to coat the statue of liberty in a 1mm layer?",
    reasoning={"effort": "minimal"},
)
# `output_text` concatenates the text of the output message(s), so the cell no longer
# depends on the message sitting at a fixed position (index 1) inside `response.output`.
print(response.output_text)

Estimate steps and result:

1) Surface area: Common estimates for the Statue of Liberty’s exterior surface area range from about 20,000 to 30,000 square feet. A frequently cited value is ~25,000 ft² (≈ 2,322 m²). I'll use 2,300 m² as a reasonable round number.

2) Volume of 1 mm gold layer: 1 mm = 0.001 m, so volume = area × thickness = 2,300 m² × 0.001 m = 2.3 m³.

3) Mass of that gold: Density of gold ≈ 19,320 kg/m³. Mass = 2.3 m³ × 19,320 kg/m³ ≈ 44,436 kg ≈ 44.4 metric tonnes.

4) Value (optional): At a gold price of, e.g., $2,000 per troy ounce (1 troy ounce = 0.0311035 kg), that’s about $64,396 per kg. So value ≈ 44,436 kg × $64,396/kg ≈ $2.86 billion. (This scales linearly with the gold price you choose.)

If you prefer a different surface area estimate: using 2,000 m² gives ~38.6 tonnes; using 2,500 m² gives ~48.3 tonnes.

Summary: Roughly 40–50 metric tonnes of gold to coat the Statue of Liberty in a 1 mm layer (≈ about 2–3 m³), worth on the order of a few billion dollars at p

In [4]:
# Load the schema file (tables.json) from the local drive.
tables_json_path = r"C:\Users\coffe\OneDrive\Desktop\CITS5553 Capstone Project\spider_data\spider_data\tables.json"
if os.path.exists(tables_json_path):
    with open(tables_json_path, 'r', encoding='utf-8') as f:
        tables_data = json.load(f)
    print(f"Number of tables/schemas: {len(tables_data)}")

    # Show the keys of one schema entry. Index 0 is used because the emptiness
    # guard below only guarantees that the first element exists (index 5 would
    # raise IndexError on a file with fewer than six entries).
    if tables_data:
        first_entry = tables_data[0]
        print(f"Keys: {list(first_entry.keys()) if isinstance(first_entry, dict) else 'Not a dictionary'}")
else:
    print("error.")


Number of tables/schemas: 166
Keys: ['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original']


In [5]:
# Keep only db_id, table_names and column_names from each tables.json entry.
def extract_essential_schema(tables_data):
    """Reduce every schema entry to the three fields used in this notebook.

    Args:
        tables_data: iterable of dict-like schema entries.

    Returns:
        A list with one dict per entry holding 'database_name' (taken from
        'db_id', 'undefined' when absent), 'table_names' and 'column_names'
        (each an empty list when absent).
    """
    return [
        {
            'database_name': schema.get('db_id', 'undefined'),
            'table_names': schema.get('table_names', []),
            'column_names': schema.get('column_names', []),
        }
        for schema in tables_data
    ]

# Apply the extraction only when tables_data was actually created by the loading cell
if 'tables_data' in locals():
    essential_schemas = extract_essential_schema(tables_data)
    print(f" Extracted data for {len(essential_schemas)} database schemas")
    
    # Show an example of the simplified structure (first database only)
    if essential_schemas:
        print(f"\n Example of simplified entry:")
        example = essential_schemas[0]
        print(f"  database_name: {example['database_name']}")
        print(f"  table_names: {example['table_names']}")
        print(f"  column_names (first 3): {example['column_names'][:3]}...")
        print(f"  Total columns: {len(example['column_names'])}")
    
else:
    # NOTE: essential_schemas is not assigned on this path, so later cells that use it will fail
    print("tables_data not found.")

 Extracted data for 166 database schemas

 Example of simplified entry:
  database_name: perpetrator
  table_names: ['perpetrator', 'people']
  column_names (first 3): [[-1, '*'], [0, 'perpetrator id'], [0, 'people id']]...
  Total columns: 14


In [6]:
# Example of one simplified schema entry (the first database in the list)
essential_schemas[0]

{'database_name': 'perpetrator',
 'table_names': ['perpetrator', 'people'],
 'column_names': [[-1, '*'],
  [0, 'perpetrator id'],
  [0, 'people id'],
  [0, 'date'],
  [0, 'year'],
  [0, 'location'],
  [0, 'country'],
  [0, 'killed'],
  [0, 'injured'],
  [1, 'people id'],
  [1, 'name'],
  [1, 'height'],
  [1, 'weight'],
  [1, 'home town']]}

In [7]:
# Walk through every database and print its tables with their columns in an easy-to-read layout.
# This cell is for visual inspection of the schema file only; no agent uses its output.

for db in essential_schemas:
    db_name = db["database_name"]
    table_names = db["table_names"]
    column_names = db["column_names"]

    print(f"Database: {db_name}")

    # column_names holds (table_index, column_name) pairs; group the names by owning table.
    essential_schemas_reshape = {
        table_name: [name for owner, name in column_names if owner == idx]
        for idx, table_name in enumerate(table_names)
    }

    # Pretty print for this database
    for table, cols in essential_schemas_reshape.items():
        print(f"  Table: {table}")
        print(f"    Columns: {', '.join(cols)}")
    print("="*50)

Database: perpetrator
  Table: perpetrator
    Columns: perpetrator id, people id, date, year, location, country, killed, injured
  Table: people
    Columns: people id, name, height, weight, home town
Database: college_2
  Table: classroom
    Columns: building, room number, capacity
  Table: department
    Columns: department name, building, budget
  Table: course
    Columns: course id, title, department name, credits
  Table: instructor
    Columns: id, name, department name, salary
  Table: section
    Columns: course id, section id, semester, year, building, room number, time slot id
  Table: teaches
    Columns: id, course id, section id, semester, year
  Table: student
    Columns: id, name, department name, total credits
  Table: takes classes
    Columns: id, course id, section id, semester, year, grade
  Table: advisor
    Columns: student id, instructor id
  Table: time slot
    Columns: time slot id, day, start hour, start minute, end hour, end minute
  Table: prerequisite

### The part below redefines and reshapes essential_schemas so that it is better suited to an LLM

In [8]:
# Map each database name to a list of [table_name, [column, ...]] pairs.
essential_schemas_reshape = {
    db.get("database_name", "unknown"): [
        [table_name, [col for t_idx, col in db["column_names"] if t_idx == idx]]
        for idx, table_name in enumerate(db["table_names"])
    ]
    for db in essential_schemas
}

essential_schemas_reshape

{'perpetrator': [['perpetrator',
   ['perpetrator id',
    'people id',
    'date',
    'year',
    'location',
    'country',
    'killed',
    'injured']],
  ['people', ['people id', 'name', 'height', 'weight', 'home town']]],
 'college_2': [['classroom', ['building', 'room number', 'capacity']],
  ['department', ['department name', 'building', 'budget']],
  ['course', ['course id', 'title', 'department name', 'credits']],
  ['instructor', ['id', 'name', 'department name', 'salary']],
  ['section',
   ['course id',
    'section id',
    'semester',
    'year',
    'building',
    'room number',
    'time slot id']],
  ['teaches', ['id', 'course id', 'section id', 'semester', 'year']],
  ['student', ['id', 'name', 'department name', 'total credits']],
  ['takes classes',
   ['id', 'course id', 'section id', 'semester', 'year', 'grade']],
  ['advisor', ['student id', 'instructor id']],
  ['time slot',
   ['time slot id',
    'day',
    'start hour',
    'start minute',
    'end hour',


In [9]:
# Add explicit headings (database_name / table_name / columns) so the LLM can tell
# databases, tables and columns apart; the plain nested lists above do not label them.
def reshape_with_headings(essential_schemas):
    """Reshape simplified schema entries into a labelled, per-database structure.

    Args:
        essential_schemas: iterable of dicts with "database_name", "table_names"
            and "column_names" (a list of [table_index, column_name] pairs).

    Returns:
        dict mapping each database name to
        {"database_name": str, "tables": [{"table_name": ..., "columns": [str, ...]}, ...]}.
        Tables keep their original order and columns keep the order in which they
        are listed. Entries that share a database name overwrite each other (last wins).

    Column pairs are ignored when they are malformed, when the table index cannot be
    converted to int or points at no table (this covers the [-1, "*"] wildcard row),
    or when the column name is None or "*".
    """
    out = {}
    for db in essential_schemas:
        db_name = db.get("database_name", "unknown")
        table_names = list(db.get("table_names", []))
        tables = [{"table_name": name, "columns": []} for name in table_names]

        # Single pass over the column list: each valid pair is appended straight to
        # its owning table instead of rescanning every column once per table.
        for pair in db.get("column_names", []):
            if not isinstance(pair, (list, tuple)) or len(pair) != 2:
                continue
            t_idx, col = pair
            try:
                t_idx = int(t_idx)
            except (ValueError, TypeError):
                continue
            if not 0 <= t_idx < len(tables):
                continue
            if col is None or str(col).strip() == "*":
                continue
            tables[t_idx]["columns"].append(str(col))

        out[db_name] = {
            "database_name": db_name,
            "tables": tables,
        }

    return out

In [10]:
# Build the labelled schema structure (keyed by database name) and display it
reshaped_essential_schemas = reshape_with_headings(essential_schemas)
reshaped_essential_schemas


{'perpetrator': {'database_name': 'perpetrator',
  'tables': [{'table_name': 'perpetrator',
    'columns': ['perpetrator id',
     'people id',
     'date',
     'year',
     'location',
     'country',
     'killed',
     'injured']},
   {'table_name': 'people',
    'columns': ['people id', 'name', 'height', 'weight', 'home town']}]},
 'college_2': {'database_name': 'college_2',
  'tables': [{'table_name': 'classroom',
    'columns': ['building', 'room number', 'capacity']},
   {'table_name': 'department',
    'columns': ['department name', 'building', 'budget']},
   {'table_name': 'course',
    'columns': ['course id', 'title', 'department name', 'credits']},
   {'table_name': 'instructor',
    'columns': ['id', 'name', 'department name', 'salary']},
   {'table_name': 'section',
    'columns': ['course id',
     'section id',
     'semester',
     'year',
     'building',
     'room number',
     'time slot id']},
   {'table_name': 'teaches',
    'columns': ['id', 'course id', 'secti

In [11]:
# Finalise the schema format: one compact JSON line per table (database / table / columns).
def format_schema_jsonish(reshaped_essential_schemas):
    """Serialise every table of every database as a compact JSON string.

    Args:
        reshaped_essential_schemas: dict whose values are dicts holding a
            "database_name" and a "tables" list of {"table_name", "columns"} dicts.

    Returns:
        A list of JSON strings with the keys "database", "table" and "columns",
        in database order and then table order. Missing names become "unknown"
        and missing columns an empty list; non-ASCII characters are kept as-is.
    """
    return [
        json.dumps(
            {
                "database": entry.get("database_name", "unknown"),
                "table": table.get("table_name", "unknown"),
                "columns": table.get("columns", []),
            },
            ensure_ascii=False,
            separators=(",", ":"),
        )
        for entry in reshaped_essential_schemas.values()
        for table in entry.get("tables", [])
    ]

In [12]:
# One JSON string per table; these strings are the texts embedded into the vector store below
final_schema_result = format_schema_jsonish(reshaped_essential_schemas)

In [13]:
# Preview the first 10 formatted schema lines
final_schema_result [:10]

['{"database":"perpetrator","table":"perpetrator","columns":["perpetrator id","people id","date","year","location","country","killed","injured"]}',
 '{"database":"perpetrator","table":"people","columns":["people id","name","height","weight","home town"]}',
 '{"database":"college_2","table":"classroom","columns":["building","room number","capacity"]}',
 '{"database":"college_2","table":"department","columns":["department name","building","budget"]}',
 '{"database":"college_2","table":"course","columns":["course id","title","department name","credits"]}',
 '{"database":"college_2","table":"instructor","columns":["id","name","department name","salary"]}',
 '{"database":"college_2","table":"section","columns":["course id","section id","semester","year","building","room number","time slot id"]}',
 '{"database":"college_2","table":"teaches","columns":["id","course id","section id","semester","year"]}',
 '{"database":"college_2","table":"student","columns":["id","name","department name","tota

### Below: embedding the fully prepared final_schema_result

In [14]:
# Embed every formatted schema line (OpenAIEmbeddings is created with no arguments, i.e. its defaults)
embeddings = OpenAIEmbeddings()  
# Build the FAISS index that Agent 1 searches; this calls the embeddings API once for the whole list
vectorstore_2 = FAISS.from_texts(final_schema_result, embeddings)

## Agent 1 Database Selection

#### Set up LLM


In [15]:

# Chat model shared by all three agents (the prompts below are piped into this object)
llm = ChatOpenAI(
    model="gpt-5-mini",   
    temperature=0
)

#### Set up the Agent A prompt template. Note: I've used the new chain syntax with '|'; the commented-out line uses the old syntax, which still works for now but will be deprecated in v1.0.

In [None]:

# Agent 1 prompt: given the user query and the retrieved schema lines, ask the model to pick
# exactly one database and reply with a JSON object holding db_name, tables and columns.
# The doubled braces {{ }} are literal braces in the rendered prompt; single braces are template variables.
# NOTE(review): later cells parse the reply with json.loads, so any text outside the JSON object would break them.
prompt_db = PromptTemplate(
    input_variables=["query", "retrieved_schema"],
    template="""
Please selects the most relevant database and table in order to answer user's query.
User query: {query}
Schema info: {retrieved_schema}
Which database and tables has the most relevant information for this query? Selecting 1 database only. 
Respond the database name, table and column infomation in JSON format: {{ "db_name": "...", "tables": ["..."], "columns":["..."]}}
"""
)
# Old syntax, kept for reference only:
#db_chain = LLMChain(llm=llm, prompt=prompt_db)
# New syntax: pipe the prompt into the chat model
db_chain = prompt_db | llm

#### Set up the main Agent A function, which takes a natural-language question as input. The call was updated from 'db_chain.run' to 'db_chain.invoke' because, above, 'LLMChain(llm=llm, prompt=prompt_db)' was replaced with 'db_chain = prompt_db | llm'.

In [17]:

def database_selection_agent(user_query, top_k=5):
    """Agent 1: retrieve candidate schema lines and let the LLM choose a database.

    Args:
        user_query: the natural-language question.
        top_k: number of schema lines to fetch from the vector store.

    Returns:
        dict with "user_query", "retrieved_schemas" (one "score: ..., content: ..."
        line per retrieved document) and "llm_selection" (the raw reply of db_chain).
    """
    print(f" User Query: '{user_query}'")

    # Step 1: vector search over the embedded schema lines. According to the original
    # author's note the default metric is L2 (Euclidean) distance, which is not easily
    # switched through a parameter.
    scored_docs = vectorstore_2.similarity_search_with_score(user_query, k=top_k)
    selected_schema = "".join(
        f"score: {score}, content: {doc.page_content}\n" for doc, score in scored_docs
    )

    # Step 2: let the LLM pick the best database and tables from the retrieved lines.
    llm_reply = db_chain.invoke({
        "query": user_query,
        "retrieved_schema": selected_schema,
    })

    return {
        "user_query": user_query,
        "retrieved_schemas": selected_schema,
        "llm_selection": llm_reply,
    }


### A list of test queries

In [18]:
# Hand-written natural-language questions used to try the agents interactively;
# the cell below picks one of them by index.
test_queries = [
    "Count the number of rooms that are not in the Lamberton building",
    "Find the name of all students who were in the tryout sorted in alphabetic order",
    "Find the average price of all product clothes.",
    "Show the names of artworks in ascending order of the year they are nominated in.",
    "What is the name of the department with the student that has the lowest GPA?",
    "What are the names and years of the movies that has the top 3 highest rating star?",
    "How many students does each advisor have?",
    "Count flights departing from Dallas in 2017",
    "What are the distinct creation years of the departments managed by a secretary born in state 'Alabama'?",
    "List courses worth more than 3 credits and their departments",
    "For each customer, compute total order value and sort desc.",
    "select all the deaths caused by ship",
    "Show me information about singers and their concerts",
    "I want to see student enrollment data",
    "Find information about car manufacturers and models",
    "What data do you have about movies and actors?",
    "Show me employee salary information",
    "Which produce has the most complaints where the status are still open"
]

#### This block pulls a question from the test query list and returns the result of every step. Ideally the top_k parameter should be something the user can set, in case the top 5 do not return the desired result.

In [19]:
# Pick one question from the list and run Agent 1 with a wider retrieval window (10 instead of the default 5)
test_query = test_queries[1]  
result = database_selection_agent(test_query,top_k=10)
# Show the question, the retrieved schema lines with their scores, and the raw LLM reply
print(result['user_query'])
print(result['retrieved_schemas'])
print(result['llm_selection'])


 User Query: 'Find the name of all students who were in the tryout sorted in alphabetic order'
Find the name of all students who were in the tryout sorted in alphabetic order
score: 0.484733521938324, content: {"database":"school_player","table":"school","columns":["school id","school","location","enrollment","founded","denomination","boys or girls","day or boarding","year entered competition","school colors"]}
score: 0.4936240017414093, content: {"database":"e_learning","table":"students","columns":["student id","date of registration","date of latest logon","login name","password","personal name","middle name","family name"]}
score: 0.49639877676963806, content: {"database":"soccer_2","table":"tryout","columns":["player id","college name","player position","decision"]}
score: 0.506287157535553, content: {"database":"school_player","table":"school performance","columns":["school id","school year","class a","class aa"]}
score: 0.5084279775619507, content: {"database":"student_1","table"

#### Again, because the LangChain call was updated from run to invoke above, the commented-out code below is suitable for the old run command only.

In [20]:
# Old parsing, valid only when the chain was called with `run`:
#db_name = str(json.loads(result['llm_selection'])["db_name"])
# Use the reply's `.content` attribute when it has one; otherwise fall back to its string form
llm_output = result['llm_selection'].content if hasattr(result['llm_selection'], 'content') else str(result['llm_selection'])
# The reply must be pure JSON here; json.loads raises if the model added any surrounding text
db_name = str(json.loads(llm_output)["db_name"])
db_name

'soccer_2'

# Agent 2

In [21]:
# Quickly inspect which tables the selected database contains.
# Raises KeyError when db_name (taken from the LLM reply) is not a key of reshaped_essential_schemas.
reshaped_essential_schemas[db_name]['tables']

[{'table_name': 'college', 'columns': ['college name', 'state', 'enrollment']},
 {'table_name': 'player',
  'columns': ['player id', 'player name', 'yes card', 'training hours']},
 {'table_name': 'tryout',
  'columns': ['player id', 'college name', 'player position', 'decision']}]

#### Again, I have updated the chain to use '|', the same as for Agent 1.

In [22]:
# Agent 2 prompt: given the full schema of the selected database, ask the model which
# tables are relevant to the user query and why. The reply must be pure JSON because
# the calling cell parses it with json.loads and reads "relevant_tables" and "reasons".
list_tables_prompt = PromptTemplate(
    input_variables=["user_query", "db_schema_json"],
    template=(
        # A colon and newline now separate the instruction from the key list.
        "Given the selected database schema, return ONLY valid JSON with exactly these keys:\n"
        '  "relevant_tables": ["..."],\n'
        '  "reasons": "..." \n\n'
        "User query: {user_query}\n"
        "DB schema JSON: {db_schema_json}\n"
        # The closing rule used to mention `all_tables`, a key this prompt never requests.
        "Do not wrap relevant_tables in an extra list. Do not include any text outside JSON."
    ),
)
# New syntax: pipe the prompt into the chat model (replaces LLMChain(llm=llm, prompt=list_tables_prompt))
db_chain_2 = list_tables_prompt | llm

#### Here db_chain_2 from above is used to generate response_2 for Agent B

In [23]:
def table_selection_agent(user_query, db_name):
    """Agent 2: ask the LLM which tables of the selected database are relevant.

    Args:
        user_query: the natural-language question.
        db_name: database chosen by Agent 1; must be a key of reshaped_essential_schemas.

    Returns:
        dict with "User Query:", "Database Name:" and "llm_result" (the raw reply of db_chain_2).

    Raises:
        KeyError: when db_name is not a known database (for example a name invented by the LLM).
    """
    # Take ALL table information of the database selected by Agent A.
    try:
        full_schema = reshaped_essential_schemas[db_name]['tables']
    except KeyError:
        raise KeyError(
            f"Unknown database {db_name!r}: not found in reshaped_essential_schemas"
        ) from None

    response_2 = db_chain_2.invoke({
        "user_query": user_query,
        # The prompt labels this value "DB schema JSON", so send real JSON rather than
        # the Python repr (single quotes) the template would otherwise render.
        "db_schema_json": json.dumps(full_schema, ensure_ascii=False)
    })

    return {
        "User Query:" : user_query,
        "Database Name:" : db_name,
        "llm_result": response_2
    }

#### I've updated the cell below because of the change from run to invoke. result_2 contains all the necessary information from Agent 2: the question and the database name selected by Agent A. The LLM output is then parsed as JSON so that its values can be accessed by key.

In [24]:
# Run Agent 2 on the same question, using the database chosen by Agent 1
result_2 = table_selection_agent(user_query=test_query, db_name = db_name)
# Use the reply's `.content` attribute when it has one; otherwise fall back to its string form
llm_result = result_2['llm_result'].content if hasattr(result_2['llm_result'], 'content') else str(result_2['llm_result'])
# The reply must be pure JSON with the keys requested in list_tables_prompt
result_2_JSON = json.loads(llm_result)
print(f"User Query : " + result_2['User Query:'])
print(f"Selected DB from Agent1 : " + result_2['Database Name:'])
print(f"Selected Tables: " + str(result_2_JSON['relevant_tables']))
# String concatenation: this line raises TypeError if the model returns a non-string "reasons" value
print(f"Reason: " + result_2_JSON['reasons'])
# Disabled: "SQL Code" is no longer requested from Agent 2
#print(f"SQL code: " + result_2_JSON['SQL Code'])

User Query : Find the name of all students who were in the tryout sorted in alphabetic order
Selected DB from Agent1 : soccer_2
Selected Tables: ['player', 'tryout']
Reason: The tryout table lists participating players by player id, while the player table contains player names. Join tryout.player id = player.player id to retrieve the names of all players who were in the tryout, then sort the resulting names alphabetically.


# Agent 3

In [25]:
# Load our own combined schema file from the local drive.
# Note: `complete_combined_schema` holds the file PATH; the parsed content is `complete_schema`.
complete_combined_schema = r"C:\Users\coffe\OneDrive\Desktop\CITS5553 Capstone Project\spider_data\spider_data\complete_combined_schema.json"
if os.path.exists(complete_combined_schema):
    with open(complete_combined_schema, 'r', encoding='utf-8') as f:
        complete_schema = json.load(f)
    # NOTE(review): later cells index complete_schema['schema'][db_name], so len() here presumably
    # counts the top-level keys of the file rather than tables or schemas — confirm the label.
    print(f"Number of tables/schemas: {len(complete_schema)}")
else:
    # complete_schema is not assigned on this path, so Agent 3 cannot run
    print("error.")

Number of tables/schemas: 4


In [26]:
# Tables chosen by Agent 2 for the current question
result_2_JSON['relevant_tables']

['player', 'tryout']

In [27]:
# Show every table of the selected database from our own schema file, not just the ones picked by Agent 2.
complete_schema['schema'][db_name]['tables']

{'College': {'columns': ['cName', 'state', 'enr'],
  'primary_key': ['cName'],
  'foreign_keys': []},
 'Player': {'columns': ['pID', 'pName', 'yCard', 'HS'],
  'primary_key': ['pID'],
  'foreign_keys': []},
 'Tryout': {'columns': ['pID', 'cName', 'pPos', 'decision'],
  'primary_key': ['pID', 'cName'],
  'foreign_keys': [{'from_column': 'cName',
    'ref_table': 'College',
    'ref_column': 'cName'},
   {'from_column': 'pID', 'ref_table': 'Player', 'ref_column': 'pID'}]}}

In [None]:
# Agent 3 prompt: produce the SQL for the user query from the full schema and the tables
# selected by Agent 2. The prompt asks the model to be case insensitive because the Spider
# JSON names are all lower case while our own schema file uses upper case.
# The reply must be pure JSON: the calling cells parse it with json.loads and read
# "relevant_tables", "SQL Code" and "reasons".
produce_sql_prompt = PromptTemplate(
    input_variables=["user_query", "db_schema_json", "selected_tables"],
    template=(
        # A colon and newline now separate the instruction from the key list.
        "Given the selected database schema and selected table names, please be case insensitive, return ONLY valid JSON with exactly these keys:\n"
        '  "relevant_tables": ["..."],\n'
        # The three keys are now listed contiguously; a blank line used to split them.
        '  "SQL Code": "...",\n'
        '  "reasons": "..." \n\n'
        "User query: {user_query}\n"
        "DB schema JSON: {db_schema_json}\n"
        "Selected tables: {selected_tables}\n"
        # The closing rule used to mention `all_tables`, a key this prompt never requests.
        "Do not wrap relevant_tables in an extra list. Do not include any text outside JSON."
    ),
)
db_chain_3 = produce_sql_prompt | llm

In [29]:
def sql_production_agent(user_query, db_name, selected_tables):
    """Agent 3: ask the LLM to write SQL for the query against the selected database.

    Args:
        user_query: the natural-language question.
        db_name: database chosen by Agent 1; must be a key of complete_schema['schema'].
        selected_tables: table names chosen by Agent 2, passed to the prompt as a hint.

    Returns:
        dict with "User Query:", "Database Name:" and "llm_result" (the raw reply of db_chain_3).

    Raises:
        KeyError: when db_name is not present in complete_schema['schema'].
    """
    # The whole database schema is sent, not only the tables selected by Agent 2.
    try:
        final_schema = complete_schema['schema'][db_name]['tables']
    except KeyError:
        raise KeyError(
            f"Unknown database {db_name!r}: not found in complete_schema['schema']"
        ) from None

    response_3 = db_chain_3.invoke({
        "user_query": user_query,
        # The prompt labels this value "DB schema JSON", so send real JSON rather than
        # the Python repr (single quotes) the template would otherwise render.
        "db_schema_json": json.dumps(final_schema, ensure_ascii=False),
        "selected_tables": selected_tables
    })

    return {
        "User Query:" : user_query,
        "Database Name:" : db_name,
        "llm_result": response_3
    }

In [30]:

# Can ignore below. 
# schema_all = complete_schema['schema'][db_name]['tables']
# Pull out selected schemas based on the 
# schema_selected = {k: v for k, v in schema_all.items() if k.lower() in result_2_JSON['relevant_tables']}
# print(schema_selected)


In [None]:
# Run Agent 3 with the question, the selected database and the tables chosen by Agent 2
# (the agent itself sends ALL table schemas of that database to the model).
result_3 = sql_production_agent(user_query=test_query, db_name = db_name,selected_tables = result_2_JSON['relevant_tables'])
# Use the reply's `.content` attribute when it has one; otherwise fall back to its string form.
# Note: this reuses (overwrites) the `llm_result` name from the Agent 2 cell.
llm_result = result_3['llm_result'].content if hasattr(result_3['llm_result'], 'content') else str(result_3['llm_result'])
# The reply must be pure JSON with the keys requested in produce_sql_prompt
result_3_JSON = json.loads(llm_result)
print(f"User Query : " + result_3['User Query:'])
print(f"Selected DB from Agent1 : " + result_3['Database Name:'])
print(f"Selected Tables: " + str(result_3_JSON['relevant_tables']))
print(f"SQL code: " + result_3_JSON['SQL Code'])
print(f"Reason: " + result_3_JSON['reasons'])

User Query : Find the name of all students who were in the tryout sorted in alphabetic order
Selected DB from Agent1 : soccer_2
Selected Tables: ['player', 'tryout']
SQL code: SELECT DISTINCT player.pName FROM player JOIN tryout ON player.pID = tryout.pID ORDER BY player.pName ASC;
Reason: Join player and tryout on pID to get only players who participated in tryouts. Use DISTINCT to avoid duplicate names if a player appears in multiple tryout records. Order results alphabetically by pName using ORDER BY ASC.


In [52]:
import networkx as nx
from pyvis.network import Network

def schema_to_networkx(tables_dict):
    """Turn one database schema into a directed multigraph of tables and columns.

    Args:
        tables_dict: mapping of table name to a dict with optional "columns",
            "primary_key" and "foreign_keys" entries (each foreign key is a dict
            with 'from_column', 'ref_table' and 'ref_column').

    Returns:
        nx.MultiDiGraph with one node per table, one "<table>.<column>" node per
        column, a HAS_COLUMN edge from each table to its columns and an FK_TO edge
        from each foreign-key column to the column it references.
    """
    graph = nx.MultiDiGraph()

    # Pass 1: table nodes; the tooltip lists the primary key, or '(none)' when there is none.
    for table, spec in tables_dict.items():
        pk_text = ', '.join(spec.get('primary_key', [])) or '(none)'
        graph.add_node(
            table,
            label=table,
            nodetype="Table",
            group="Table",
            color="#5BBCD6",
            shape="box",
            title=f"Table: {table} PK: {pk_text}"
        )

    # Pass 2: column nodes, each linked from its owning table.
    for table, spec in tables_dict.items():
        for column in spec.get("columns", []):
            column_id = f"{table}.{column}"
            graph.add_node(
                column_id,
                label=column,
                nodetype="Column",
                group=f"col::{table}",
                color="#a4e6a0",
                shape="ellipse",
                title=f"Column: {column}  Table: {table}"
            )
            graph.add_edge(
                table, column_id,
                type="HAS_COLUMN",
                color="#999999",
                arrows="to",
                title="HAS_COLUMN"
            )

    # Pass 3: foreign-key edges from the referencing column to the referenced column.
    for table, spec in tables_dict.items():
        for fk in spec.get("foreign_keys", []):
            source = f"{table}.{fk['from_column']}"
            target = f"{fk['ref_table']}.{fk['ref_column']}"
            graph.add_edge(
                source, target,
                type="FK_TO",
                color="#FF0000",
                arrows="to",
                smooth=True,
                title=f"FK_TO: {source} → {target}"
            )

    return graph

# Build graph for the currently selected database.
# `schema_all` was previously assigned only in a commented-out cell, so this cell raised
# NameError on a fresh kernel (Restart & Run All); it is now defined right here.
schema_all = complete_schema['schema'][db_name]['tables']
G = schema_to_networkx(schema_all)

# PyVis: use remote or inline resources to avoid the Jupyter warning
net = Network(notebook=True, directed=True, cdn_resources='remote', height="650px", width="100%")
net.from_nx(G)

# Optional: nicer layout options
# net.set_options("""
# const options = {
#   physics: { enabled: true, solver: 'forceAtlas2Based', stabilization: { iterations: 200 } },
#   edges: { arrows: { to: { enabled: true } }, smooth: { type: 'curvedCW', roundness: 0.2 } },
#   nodes: { font: { face: 'Inter', size: 14 }, borderWidth: 1 }
# }
# """)

# Writes the interactive graph to schema.html and displays it in the notebook
net.show("schema.html")

schema.html


### Testing: the cells below systematically test against the training data, using the question + database + SQL code. Please be mindful of the client's cost, and ensure we are not testing all 7,000 questions from the training set.

In [32]:
# Load the Spider training examples (question / database / ground-truth SQL) from the local drive.
tables_json_path_2 = r"C:\Users\coffe\OneDrive\Desktop\CITS5553 Capstone Project\spider_data\spider_data\train_spider.json"
if os.path.exists(tables_json_path_2):
    with open(tables_json_path_2, 'r', encoding='utf-8') as f:
        train_data = json.load(f)
    # The label used to say "tables/schemas", but this file holds training examples.
    print(f"Number of training examples: {len(train_data)}")
    # Index 0 is used because the emptiness guard only guarantees that the first
    # element exists (index 5 would raise IndexError on a shorter file).
    if train_data:
        first_entry = train_data[0]
        print(f"Keys: {list(first_entry.keys()) if isinstance(first_entry, dict) else 'Not a dictionary'}")
else:
    print("error.")

Number of tables/schemas: 7000
Keys: ['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql']


In [None]:
# Keep only the database id, the question and the ground-truth SQL of each training example.
# Non-dict entries are skipped; a missing key yields None because .get() is used.
dbid_question_list = [
    {'db_id': entry.get('db_id'), 'question': entry.get('question'), 'query': entry.get('query')}
    for entry in train_data if isinstance(entry, dict)
]
# Preview the first five records
print(dbid_question_list[:5]) 


[{'db_id': 'department_management', 'question': 'How many heads of the departments are older than 56 ?', 'query': 'SELECT count(*) FROM head WHERE age  >  56'}, {'db_id': 'department_management', 'question': 'List the name, born state and age of the heads of departments ordered by age.', 'query': 'SELECT name ,  born_state ,  age FROM head ORDER BY age'}, {'db_id': 'department_management', 'question': 'List the creation year, name and budget of each department.', 'query': 'SELECT creation ,  name ,  budget_in_billions FROM department'}, {'db_id': 'department_management', 'question': 'What are the maximum and minimum budget of the departments?', 'query': 'SELECT max(budget_in_billions) ,  min(budget_in_billions) FROM department'}, {'db_id': 'department_management', 'question': 'What is the average number of employees of the departments whose rank is between 10 and 15?', 'query': 'SELECT avg(num_employees) FROM department WHERE ranking BETWEEN 10 AND 15'}]


In [34]:
# Take every 250th question/db pair: there are 7,000 training examples and we do not want
# to send all of them to the API. Each sampled item triggers three LLM calls further below.
every_x_th = dbid_question_list[::250]
# Check how many items will be tested in total
len(every_x_th) 

28

In [None]:
# Print the question, database id and ground-truth SQL of every sampled item, one value per line.
for item in every_x_th:
    for field in ('question', 'db_id', 'query'):
        print(item[field])

How many heads of the departments are older than 56 ?
department_management
SELECT count(*) FROM head WHERE age  >  56
Show names of actors and names of musicals they are in.
musical
SELECT T1.Name ,  T2.Name FROM actor AS T1 JOIN musical AS T2 ON T1.Musical_ID  =  T2.Musical_ID
How many students does each advisor have?
allergy_1
SELECT advisor ,  count(*) FROM Student GROUP BY advisor
What are the names and locations of all tracks?
race_track
SELECT name ,  LOCATION FROM track
Return the total and minimum enrollments across all schools.
university_basketball
SELECT sum(enrollment) ,  min(enrollment) FROM university
Return all the apartment numbers sorted by the room count in ascending order.
apartment_rentals
SELECT apt_number FROM Apartments ORDER BY room_count ASC
Show the distinct venues of debates
debate
SELECT DISTINCT Venue FROM debate
What is the age of the tallest person?
gymnast
SELECT Age FROM people ORDER BY Height DESC LIMIT 1
What are the names and headquarters of all com

#### I've updated the cells below so that we can now feed both the ground-truth SQL code and our predicted SQL code to our database for execution, and get the results back.

#### Feed all x questions into Agent A + Agent B + Agent C, and collect the results.

In [68]:

def _message_text(message):
    """Return the text of an LLM reply: its `.content` when present, else `str(message)`."""
    return message.content if hasattr(message, 'content') else str(message)


# Run Agent 1 -> Agent 2 -> Agent 3 for every sampled question and collect the predictions.
# Each question costs three paid LLM calls, so one malformed reply must not abort the
# whole run: a failed question is recorded with SQL_Code None plus an 'error' message.
# prediction_results therefore always has one record per item of every_x_th, in order.
prediction_results = []
for item in every_x_th:
    question = item['question']
    predicted_db = None  # stays None when Agent 1's reply cannot be parsed
    try:
        result_1 = database_selection_agent(question)
        llm_output_1 = _message_text(result_1['llm_selection'])
        db_name = predicted_db = str(json.loads(llm_output_1)["db_name"])

        result_2 = table_selection_agent(question, db_name = db_name)
        llm_result_2 = _message_text(result_2['llm_result'])
        result_2_JSON = json.loads(llm_result_2)

        result_3 = sql_production_agent(user_query=question, db_name = db_name,selected_tables = result_2_JSON['relevant_tables'])
        llm_result_3 = _message_text(result_3['llm_result'])
        result_3_JSON = json.loads(llm_result_3)
        sql_code = result_3_JSON['SQL Code']
    except (json.JSONDecodeError, KeyError, TypeError) as exc:
        # Non-JSON reply, missing key in the reply, or a database name unknown to our schemas.
        prediction_results.append({
            'question': question,
            'db_name': predicted_db,
            'SQL_Code': None,
            'error': f"{type(exc).__name__}: {exc}",
        })
        continue

    prediction_results.append({'question': question, 'db_name': db_name, 'SQL_Code': sql_code })

#print(db_name_results[:5])

 User Query: 'How many heads of the departments are older than 56 ?'
 User Query: 'Show names of actors and names of musicals they are in.'
 User Query: 'How many students does each advisor have?'
 User Query: 'What are the names and locations of all tracks?'
 User Query: 'Return the total and minimum enrollments across all schools.'
 User Query: 'Return all the apartment numbers sorted by the room count in ascending order.'
 User Query: 'Show the distinct venues of debates'
 User Query: 'What is the age of the tallest person?'
 User Query: 'What are the names and headquarters of all companies ordered by descending market value?'
 User Query: 'What is the team with at least 2 technicians?'
 User Query: 'What are the names and years of the movies that has the top 3 highest rating star?'
 User Query: 'How many distinct governors are there?'
 User Query: 'What is the largest payment amount?'
 User Query: 'What is the total number of professors with a Ph.D. ?'
 User Query: 'What are the fu

In [70]:
# Visually inspect the prediction output: one dict per question with the keys
# 'question', 'db_name' and 'SQL_Code'.
prediction_results

[{'question': 'How many heads of the departments are older than 56 ?',
  'db_name': 'department_management',
  'SQL_Code': 'SELECT COUNT(DISTINCT h.head_ID) AS num_heads_older_than_56\nFROM head h\nJOIN management m ON h.head_ID = m.head_ID\nWHERE h.age > 56;'},
 {'question': 'Show names of actors and names of musicals they are in.',
  'db_name': 'musical',
  'SQL_Code': 'SELECT a.Name AS actor_name, m.Name AS musical_name\nFROM actor a\nJOIN musical m ON a.Musical_ID = m.Musical_ID;'},
 {'question': 'How many students does each advisor have?',
  'db_name': 'college_2',
  'SQL_Code': 'SELECT i.ID AS advisor_id, i.name AS advisor_name, COUNT(a.s_ID) AS student_count\nFROM instructor i\nJOIN advisor a ON i.ID = a.i_ID\nGROUP BY i.ID, i.name\nORDER BY student_count DESC;'},
 {'question': 'What are the names and locations of all tracks?',
  'db_name': 'race_track',
  'SQL_Code': 'SELECT Name, Location FROM track;'},
 {'question': 'Return the total and minimum enrollments across all schools

## Below, I first compare only the database names from the ground truth and the predictions and report an accuracy score. Then I extract the ground-truth SQL code and the predicted SQL code, run both against the databases, and return the execution results.


### Compare database name first 

In [None]:
# Pull just the database names out of the sampled questions (ground truth) and
# out of the agent predictions so the two lists can be compared for accuracy.
db_ground_truth = [sample['db_id'] for sample in every_x_th]
db_prediction = [predicted['db_name'] for predicted in prediction_results]

In [40]:
# Show the last five entries of each list: ground truth first, predictions second.
print(db_ground_truth[-5:], db_prediction[-5:], sep="\n")

['dorm_1', 'game_1', 'ship_1', 'scientist_1', 'activity_1']
['dorm_1', 'game_1', 'ship_1', 'scientist_1', 'college_2']


In [41]:
# Element-wise check: True where the predicted database equals the ground truth.
# zip() silently stops at the shorter list, so a stale or partially rebuilt
# prediction list would be compared against the wrong questions without any
# error. Fail loudly instead.
assert len(db_ground_truth) == len(db_prediction), (
    f"length mismatch: {len(db_ground_truth)} ground-truth labels vs "
    f"{len(db_prediction)} predictions"
)
comparison_results = [gt == pred for gt, pred in zip(db_ground_truth, db_prediction)]
print(comparison_results)

[True, True, False, True, False, True, True, True, False, True, False, True, True, True, True, True, True, False, True, True, True, True, False, True, True, True, True, False]


In [42]:
# Fraction (0-1) of the sampled questions whose database was predicted correctly.
comparison_results.count(True) / len(comparison_results)

0.75

In [None]:
# List every question whose predicted database differs from the ground truth.
false_indices = [pos for pos, is_match in enumerate(comparison_results) if not is_match]
for i in false_indices:
    question, ground_truth, prediction = (
        every_x_th[i]['question'],
        db_ground_truth[i],
        db_prediction[i],
    )
    print(f"Q: {question}\nTruth: {ground_truth} | Pred: {prediction}\n{'-'*60}")

Q: How many students does each advisor have?
Truth: allergy_1 | Pred: college_2
------------------------------------------------------------
Q: Return the total and minimum enrollments across all schools.
Truth: university_basketball | Pred: school_finance
------------------------------------------------------------
Q: What are the names and headquarters of all companies ordered by descending market value?
Truth: gas_company | Pred: company_employee
------------------------------------------------------------
Q: What are the names and years of the movies that has the top 3 highest rating star?
Truth: movie_1 | Pred: imdb
------------------------------------------------------------
Q: How many clubs are there?
Truth: club_1 | Pred: sports_competition
------------------------------------------------------------
Q: For each advisor, report the total number of students advised by him or her.
Truth: voter_2 | Pred: college_2
------------------------------------------------------------
Q: Wh

### Compare SQL execution results from both Ground Truth and Prediction.

In [69]:
from Create_Schema import extract_sql_file_paths
import json
# Prerequisite: the dataset must already be placed in the data folder.
# NOTE(review): save_json=True presumably also writes the collected paths to a
# JSON file -- confirm in Create_Schema.extract_sql_file_paths.
sql_file_paths = extract_sql_file_paths(save_json=True)

In [None]:
from SQL_Connector import SQLite_Connector
from Create_Schema import schema_extractor

# Build the connector once and share it (for multiple queries on multiple tables)
connector = SQLite_Connector(sql_file_paths)


def connect_and_query(data_name: str, query: str):
    """Run one SQL statement against the database named ``data_name``.

    Points the module-level ``connector`` at ``data_name`` and submits
    ``query`` as a one-element batch.

    Args:
        data_name: Name of the database to connect to (e.g. "musical").
        query: SQL text to execute.

    Returns:
        Whatever ``connector.execute_queries`` returns for that one-element
        batch.
    """
    connector.connect(data_name)
    batch = [query]
    return connector.execute_queries(batch)


In [None]:
# Execute each ground-truth query against its ground-truth database and keep
# the execution results in question order.
Ground_Truth_Results = [
    connect_and_query(sample['db_id'], sample['query'])
    for sample in every_x_th
]

Connecting to SQLite database at: data\department_management\department_management.sqlite
Connection successful.
Connecting to SQLite database at: data\musical\musical.sqlite
Connection successful.
Connecting to SQLite database at: data\allergy_1\allergy_1.sqlite
Connection successful.
Connecting to SQLite database at: data\race_track\race_track.sqlite
Connection successful.
Connecting to SQLite database at: data\university_basketball\university_basketball.sqlite
Connection successful.
Connecting to SQLite database at: data\apartment_rentals\apartment_rentals.sqlite
Connection successful.
Connecting to SQLite database at: data\debate\debate.sqlite
Connection successful.
Connecting to SQLite database at: data\gymnast\gymnast.sqlite
Connection successful.
Connecting to SQLite database at: data\gas_company\gas_company.sqlite
Connection successful.
Connecting to SQLite database at: data\machine_repair\machine_repair.sqlite
Connection successful.
Connecting to SQLite database at: data\movie

In [58]:
# Print the execution result of every ground-truth query, one after another.
for truth_output in Ground_Truth_Results:
    print(truth_output)

[
    [
        {
            "count(*)": 5
        }
    ]
]
[
    [
        {
            "Name": "The Phantom of the Opera"
        },
        {
            "Name": "The Phantom of the Opera"
        },
        {
            "Name": "Les Mis\u00e9rables"
        },
        {
            "Name": "Les Mis\u00e9rables"
        },
        {
            "Name": "West Side Story"
        },
        {
            "Name": "The Book of Mormon"
        },
        {
            "Name": "The Book of Mormon"
        },
        {
            "Name": "The Book of Mormon"
        },
        {
            "Name": "The Book of Mormon"
        }
    ]
]
[
    [
        {
            "Advisor": 1121,
            "count(*)": 3
        },
        {
            "Advisor": 1148,
            "count(*)": 3
        },
        {
            "Advisor": 2192,
            "count(*)": 4
        },
        {
            "Advisor": 2311,
            "count(*)": 3
        },
        {
            "Advisor": 5718,
   

In [None]:
# Execute each predicted SQL statement against its predicted database and keep
# the execution results in question order.
Prediction_Results = [
    connect_and_query(predicted['db_name'], predicted['SQL_Code'])
    for predicted in prediction_results
]

Connecting to SQLite database at: data\department_management\department_management.sqlite
Connection successful.
Connecting to SQLite database at: data\musical\musical.sqlite
Connection successful.
Connecting to SQLite database at: data\college_2\college_2.sqlite
Connection successful.
Connecting to SQLite database at: data\race_track\race_track.sqlite
Connection successful.
Connecting to SQLite database at: data\school_finance\school_finance.sqlite
Connection successful.
Connecting to SQLite database at: data\apartment_rentals\apartment_rentals.sqlite
Connection successful.
Connecting to SQLite database at: data\debate\debate.sqlite
Connection successful.
Connecting to SQLite database at: data\gymnast\gymnast.sqlite
Connection successful.
Connecting to SQLite database at: data\company_employee\company_employee.sqlite
Connection successful.
Connecting to SQLite database at: data\machine_repair\machine_repair.sqlite
Connection successful.
Connecting to SQLite database at: data\imdb\imdb

In [60]:
# Print the execution result of every predicted query, one after another.
for predicted_output in Prediction_Results:
    print(predicted_output)

[
    [
        {
            "num_heads_over_56": 3
        }
    ]
]
[
    [
        {
            "ActorName": "Ray Meagher",
            "MusicalName": "The Phantom of the Opera"
        },
        {
            "ActorName": "Tom Oliver",
            "MusicalName": "The Phantom of the Opera"
        },
        {
            "ActorName": "Lynne McGranger",
            "MusicalName": "Les Mis\u00e9rables"
        },
        {
            "ActorName": "Kate Ritchie",
            "MusicalName": "Les Mis\u00e9rables"
        },
        {
            "ActorName": "Alan Fletcher",
            "MusicalName": "West Side Story"
        },
        {
            "ActorName": "Jackie Woodburne",
            "MusicalName": "The Book of Mormon"
        },
        {
            "ActorName": "Ryan Moloney",
            "MusicalName": "The Book of Mormon"
        },
        {
            "ActorName": "Ian Smith",
            "MusicalName": "The Book of Mormon"
        },
        {
            "Actor

In [63]:
# Spot check: run the ground-truth query for the "musical" question directly.
# NOTE(review): both selected columns are labelled "Name", and the printed rows
# show only a single "Name" key -- presumably the duplicate label collapses when
# rows are turned into dicts; the aliased variant kept below avoids the clash.
data_name = "musical"
query = "SELECT T1.Name ,  T2.Name FROM actor AS T1 JOIN musical AS T2 ON T1.Musical_ID  =  T2.Musical_ID"
#query = "SELECT a.Name AS ActorName, m.Name AS MusicalName FROM actor a JOIN musical m ON a.Musical_ID = m.Musical_ID"
results = connect_and_query(data_name=data_name, query=query)
print("Query results:", results)



Connecting to SQLite database at: data\musical\musical.sqlite
Connection successful.
Query results: [
    [
        {
            "Name": "The Phantom of the Opera"
        },
        {
            "Name": "The Phantom of the Opera"
        },
        {
            "Name": "Les Mis\u00e9rables"
        },
        {
            "Name": "Les Mis\u00e9rables"
        },
        {
            "Name": "West Side Story"
        },
        {
            "Name": "The Book of Mormon"
        },
        {
            "Name": "The Book of Mormon"
        },
        {
            "Name": "The Book of Mormon"
        },
        {
            "Name": "The Book of Mormon"
        }
    ]
]
