In [7]:
from openai import OpenAI
# Smoke test: send a one-message chat to the OpenAI-compatible endpoint and
# print the assistant message object that comes back.
VLLM_OPENAI_BASE_URL = "http://ftisu.ddns.net:9293/vllm/v1"
CHAT_MODEL = "Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-Int4"

# NOTE(review): the api_key looks like a placeholder token — confirm the
# server does not validate it before sharing this notebook.
client = OpenAI(base_url=VLLM_OPENAI_BASE_URL, api_key="token-abc123")

greeting = [{"role": "user", "content": "Hello!"}]
completion = client.chat.completions.create(model=CHAT_MODEL, messages=greeting)

first_choice = completion.choices[0]
print(first_choice.message)

ChatCompletionMessage(content='Hello! How can I assist you today?', refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None)


In [8]:
from llama_index.llms.vllm import VllmServer
from llama_index.core.llms import ChatMessage

# Build a llama-index client that posts completion requests to the given URL,
# capped at 100 new tokens with temperature 0.
VLLM_GENERATE_URL = "http://ftisu.ddns.net:9293/vllm/generate"
llm = VllmServer(
    api_url=VLLM_GENERATE_URL,
    max_new_tokens=100,
    temperature=0,
)

# NOTE(review): the recorded output of this cell is "KeyError: 'text'".
# Presumably the JSON returned from this URL has no "text" field (e.g. the
# server only exposes the OpenAI-compatible routes) — confirm against the
# deployment before relying on this client.
question = "what is a black hole ?"
llm.complete(question)

KeyError: 'text'

In [9]:
from llama_index.llms.openai_like import OpenAILike
from llama_index.core import PromptTemplate

# Create the LLM client with sampling parameters.
# is_chat_model=True is set because this cell calls llm.chat() on an
# instruction-tuned model: it tells OpenAILike to use the server's
# chat-completions endpoint for chat calls instead of the plain completion
# endpoint (see the OpenAILike documentation for the flag's default).
llm = OpenAILike(
    model="Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-Int8", 
    api_base="http://prepared-anemone-routinely.ngrok-free.app/v1", 
    api_key="fake",
    is_chat_model=True,  # route llm.chat() through the chat endpoint
    temperature=0.7,  # Control randomness (higher = more random)
    max_tokens=100    # Maximum number of tokens to generate
)

# Prompt layout: the DDL of the two tables, then an instruction line carrying
# the {dialect} placeholder, then the natural-language question.
TEXT_TO_SQL_PROMPT = (
    "CREATE TABLE city (\n"
    "    id int4(10),\n"
    "    name text(2147483647),\n"
    "    country_code bpchar(3),\n"
    "    district text(2147483647),\n"
    "    population int4(10),\n"
    "    local_name text(2147483647), -- City local name\n"
    ");\n"
    "\n"
    "CREATE TABLE country (\n"
    "    code bpchar(3),\n"
    "    name text(2147483647),\n"
    "    continent text(2147483647),\n"
    "    region text(2147483647),\n"
    "    surface_area float4(8),\n"
    "    indep_year int2(5),\n"
    "    population int4(10),\n"
    "    life_expectancy float4(8),\n"
    "    gnp numeric(10), -- GNP is Gross national product\n"
    "    gnp_old numeric(10),\n"
    "    local_name text(2147483647),\n"
    "    government_form text(2147483647),\n"
    "    head_of_state text(2147483647),\n"
    "    capital int4(10),\n"
    "    code2 bpchar(2), -- Following ISO 3166-1 alpha-2 code\n"
    ");\n"
    "-- Using valid {dialect} SQL, answer the following questions for the tables provided above.\n"
    "Question: Top 2 city with highest population in each country\n"
)

# Fill in the dialect, send the resulting messages as a chat request and
# print the reply.
TEXT_TO_SQL_TMPL = PromptTemplate(TEXT_TO_SQL_PROMPT)
fmt_messages = TEXT_TO_SQL_TMPL.format_messages(dialect="POSTGRESQL")
chat_response = llm.chat(fmt_messages)
print(str(chat_response))

assistant:  SELECT T2.name ,  T1.code FROM country AS T1 JOIN city AS T2 ON T1.code  =  T2.country_code GROUP BY T1.code ORDER BY count(*) DESC LIMIT 2


In [17]:
import ast
import re

def extract_table_list(response: str) -> list:
    """Extract a list of table names from a model response.

    The response may be wrapped in a markdown code fence (optionally tagged
    ``python``); when a fence is present only its content is parsed.

    Parsing strategy, in order:
      1. Evaluate the text as a Python literal. A list is returned as-is, a
         tuple or set is converted to a list, and a bare string becomes a
         one-element list (an empty string becomes an empty list).
      2. Otherwise collect every single-quoted substring.
      3. Otherwise collect every double-quoted substring.
      4. Otherwise split on commas and strip whitespace and quotes.

    Args:
        response: Raw text returned by the model.

    Returns:
        A list of the extracted items; always a ``list``, possibly empty.
    """
    cleaned_response = response.strip()

    # Prefer the content of the first fenced code block, if there is one.
    code_block_pattern = r"```(?:python)?\s*([\s\S]*?)\s*```"
    code_block_match = re.search(code_block_pattern, cleaned_response)
    if code_block_match:
        cleaned_response = code_block_match.group(1).strip()

    try:
        parsed = ast.literal_eval(cleaned_response)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input; all of them
        # just mean "not a usable literal", so fall through to text parsing.
        parsed = None

    # Normalise the literal so callers always receive a list.
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    if isinstance(parsed, (set, frozenset)):
        # Sets have no stable order; sort for a deterministic result.
        return sorted(parsed, key=str)
    if isinstance(parsed, str):
        return [parsed] if parsed else []

    # Text fallbacks (also used for literals that are not list-like, e.g. a
    # dict or a number): quoted substrings first, then a plain comma split.
    for quoted_pattern in (r"'([^']+)'", r'"([^"]+)"'):
        matches = re.findall(quoted_pattern, cleaned_response)
        if matches:
            return matches

    items = [item.strip().strip("'\"") for item in cleaned_response.split(',')]
    return [item for item in items if item]
    

# Quick check: a reply wrapped in a python-tagged code fence should come back
# as a plain list of names.
fenced_reply = """
```python
["pets"]
```
"""
extract_table_list(fenced_reply)


['pets']

In [18]:
import re
from typing import List

def extract_tables_from_sql(sql_query: str) -> List[str]:
    """
    Extract all table names from a SQL query, excluding tables defined in CTEs.

    This is a regex heuristic, not a SQL parser: it looks at what follows the
    FROM and JOIN keywords. Schema qualifiers are dropped (``public.users``
    becomes ``users``), names are lower-cased, duplicates are removed and the
    order of first appearance is kept (FROM matches first, then JOIN matches).
    Subqueries in a FROM/JOIN position are skipped.

    Args:
        sql_query (str): The SQL query to analyze

    Returns:
        List[str]: List of table names referenced in the query (excluding CTE tables)
    """
    # Normalize whitespace and line breaks so every gap is a single space.
    sql_query = " ".join(sql_query.split())

    # Step 1: collect every CTE name so it can be excluded later.
    cte_tables = _find_cte_names(sql_query)

    # Step 2: collect candidate names from FROM and JOIN clauses.
    identifier = r"[^\s,();]+"
    # An alias is an optional word after the table name; clause keywords are
    # excluded so that e.g. WHERE or JOIN is never consumed as an alias.
    not_keyword = (
        r"(?!(?:FROM|JOIN|INNER|LEFT|RIGHT|FULL|CROSS|NATURAL|ON|USING|WHERE|GROUP|"
        r"ORDER|HAVING|LIMIT|OFFSET|UNION|INTERSECT|EXCEPT|WINDOW|FETCH|FOR)\b)"
    )
    alias = rf"(?:\s+(?:AS\s+)?{not_keyword}\w+)?"
    # \b keeps column names such as "valid_from" from being read as FROM.
    # Group 1 is the first table, group 2 the remaining comma-separated items.
    from_pattern = rf"\bFROM\s+({identifier}){alias}((?:\s*,\s*{identifier}{alias})*)"
    join_pattern = rf"\bJOIN\s+({identifier})"

    candidates: List[str] = []
    for match in re.finditer(from_pattern, sql_query, re.IGNORECASE):
        candidates.append(match.group(1))
        # Additional tables of an implicit (comma) join.
        candidates.extend(re.findall(rf",\s*({identifier})", match.group(2)))
    candidates.extend(re.findall(join_pattern, sql_query, re.IGNORECASE))

    # Step 3: normalise, then drop CTE names, keywords and duplicates.
    sql_keywords = {'select', 'where', 'group', 'order', 'limit', 'offset', 'having', 'union', 'intersect', 'except'}
    unique_tables: List[str] = []
    seen = set()
    for candidate in candidates:
        # Keep only the table part of schema.table and compare case-insensitively.
        table = candidate.split('.')[-1].lower()
        if not table or table in seen:
            continue
        if table in cte_tables or table in sql_keywords:
            continue
        # A name starting with "select" is most likely a subquery fragment.
        if table.startswith('select'):
            continue
        seen.add(table)
        unique_tables.append(table)

    return unique_tables


def _find_cte_names(sql: str) -> set:
    """Return the lower-cased names of all CTEs declared in whitespace-normalised SQL."""
    # A CTE head is: name, optional column list, AS, optional
    # [NOT] MATERIALIZED, opening parenthesis.
    cte_head = r"(\w+)(?:\s*\([^()]*\)\s*|\s+)AS\s*(?:(?:NOT\s+)?MATERIALIZED\s*)?\("
    leading = re.findall(rf"\bWITH\s+(?:RECURSIVE\s+)?{cte_head}", sql, re.IGNORECASE)
    if not leading:
        # Without a WITH clause, ", name AS (" cannot be a CTE declaration.
        return set()
    following = re.findall(rf",\s*{cte_head}", sql, re.IGNORECASE)
    return {name.lower() for name in leading + following}


In [21]:

# SQL fixtures for exercising extract_tables_from_sql. Each string is kept
# exactly as written (including line breaks) because the runner prints it.

# Test case 1: Simple query — a single table in FROM.
# The recorded run returned ['users'].
sql1 = """
SELECT * FROM users WHERE age > 18;
"""

# Test case 2: Query with joins — one FROM table plus JOIN and LEFT JOIN,
# all with implicit aliases.
# The recorded run returned ['users', 'orders', 'order_items'].
sql2 = """
SELECT u.name, o.order_date 
FROM users u 
JOIN orders o ON u.id = o.user_id
LEFT JOIN order_items oi ON o.id = oi.order_id;
"""

# Test case 3: Query with a CTE — the outer query selects from the CTE name,
# which must not be reported as a table.
# The recorded run returned ['users', 'orders'].
sql3 = """
WITH UserOrders AS (
    SELECT u.name, COUNT(o.id) as order_count
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id
    GROUP BY u.name
)
SELECT name, order_count 
FROM UserOrders 
WHERE order_count > 5;
"""

# Test case 4: Query with multiple CTEs — the second CTE reads from the first,
# and the outer query joins the second CTE to a real table.
# The recorded run returned ['orders', 'customers'].
sql4 = """
WITH 
RecentOrders AS (
    SELECT * FROM orders WHERE order_date > '2023-01-01'
),
TopCustomers AS (
    SELECT customer_id, SUM(total) as total_spent
    FROM RecentOrders
    GROUP BY customer_id
    ORDER BY total_spent DESC
    LIMIT 10
)
SELECT c.name, tc.total_spent
FROM TopCustomers tc
JOIN customers c ON tc.customer_id = c.id
ORDER BY tc.total_spent DESC;
"""

# Test case 5: More complex query with subqueries and schema qualifiers —
# scalar subqueries in the SELECT list and schema-qualified names
# (public., hr., company.); public.employees appears twice.
sql5 = """
SELECT 
    d.department_name,
    (SELECT COUNT(*) FROM public.employees e WHERE e.department_id = d.id) as employee_count,
    (SELECT AVG(salary) FROM hr.salaries s JOIN public.employees e ON s.employee_id = e.id WHERE e.department_id = d.id) as avg_salary
FROM 
    company.departments d
WHERE 
    d.active = true
ORDER BY 
    employee_count DESC;
"""

# Test case 6: Query with schema qualifiers and aliases — covers an implicit
# alias, an upper-case AS alias and a lower-case "as" alias.
sql6 = """
SELECT * 
FROM schema1.table1 t1
JOIN schema2.table2 AS t2 ON t1.id = t2.id
JOIN schema3.very_long_table_name as t3 ON t2.id = t3.parent_id;
"""

# Test case 7: Query with recursive CTE — the CTE joins against itself inside
# its own body and is selected from by the outer query.
sql7 = """
WITH RECURSIVE OrgHierarchy AS (
    SELECT id, name, manager_id, 1 as level
    FROM employees
    WHERE manager_id IS NULL
    
    UNION ALL
    
    SELECT e.id, e.name, e.manager_id, oh.level + 1
    FROM employees e
    JOIN OrgHierarchy oh ON e.manager_id = oh.id
)
SELECT id, name, level
FROM OrgHierarchy
ORDER BY level, name;
"""

# Test case 8: Query with multiple CTEs separated by commas — the first CTE
# is declared with an explicit column list, the second without one; both are
# joined in the outer query alongside real tables.
sql8 = """
WITH 
    OrderStats(order_id, total_items) AS (
        SELECT order_id, COUNT(*) 
        FROM order_items 
        GROUP BY order_id
    ),
    CustomerStats AS (
        SELECT customer_id, COUNT(*) as order_count
        FROM orders
        GROUP BY customer_id
    )
SELECT 
    c.name,
    cs.order_count,
    AVG(os.total_items) as avg_items_per_order
FROM customers c
JOIN orders o ON c.id = o.customer_id
JOIN OrderStats os ON o.id = os.order_id
JOIN CustomerStats cs ON c.id = cs.customer_id
GROUP BY c.id, c.name, cs.order_count;
"""

# Pair each fixture with its label, run the extractor on it and print the
# label, the SQL text and the tables that were found.
test_cases = list(zip(
    [
        "Simple query",
        "Query with joins",
        "Query with a CTE",
        "Query with multiple CTEs",
        "Complex query with subqueries",
        "Query with schema qualifiers",
        "Query with recursive CTE",
        "Query with multiple CTEs with commas",
    ],
    [sql1, sql2, sql3, sql4, sql5, sql6, sql7, sql8],
))

for label, query in test_cases:
    found = extract_tables_from_sql(query)
    # One print call, newline-separated, produces the same three output lines.
    print(f"\n{label}:", f"SQL: {query}", f"Tables found: {found}", sep="\n")


Simple query:
SQL: 
SELECT * FROM users WHERE age > 18;

Tables found: ['users']

Query with joins:
SQL: 
SELECT u.name, o.order_date 
FROM users u 
JOIN orders o ON u.id = o.user_id
LEFT JOIN order_items oi ON o.id = oi.order_id;

Tables found: ['users', 'orders', 'order_items']

Query with a CTE:
SQL: 
WITH UserOrders AS (
    SELECT u.name, COUNT(o.id) as order_count
    FROM users u
    LEFT JOIN orders o ON u.id = o.user_id
    GROUP BY u.name
)
SELECT name, order_count 
FROM UserOrders 
WHERE order_count > 5;

Tables found: ['users', 'orders']

Query with multiple CTEs:
SQL: 
WITH 
RecentOrders AS (
    SELECT * FROM orders WHERE order_date > '2023-01-01'
),
TopCustomers AS (
    SELECT customer_id, SUM(total) as total_spent
    FROM RecentOrders
    GROUP BY customer_id
    ORDER BY total_spent DESC
    LIMIT 10
)
SELECT c.name, tc.total_spent
FROM TopCustomers tc
JOIN customers c ON tc.customer_id = c.id
ORDER BY tc.total_spent DESC;

Tables found: ['orders', 'customers']

Com