# SQL Generation with Transformer API

In [1]:
! pip install torch transformers bitsandbytes accelerate sqlparse

Collecting sqlparse
  Downloading sqlparse-0.5.1-py3-none-any.whl.metadata (3.9 kB)
Downloading sqlparse-0.5.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sqlparse
Successfully installed sqlparse-0.5.1
[0m

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [3]:
torch.cuda.is_available()

True

In [4]:
available_memory = torch.cuda.get_device_properties(0).total_memory

In [5]:
print(available_memory)

25425608704


##Download the Model
Use any model on Colab (or any system with >30GB VRAM on your own machine) to load this in f16. If unavailable, use a GPU with minimum 8GB VRAM to load this in 8bit, or with minimum 5GB of VRAM to load in 4bit.

This step can take around 5 minutes the first time. So please be patient :)

In [6]:
model_name = "defog/sqlcoder-7b-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if available_memory > 15e9:
    # if you have atleast 15GB of GPU memory, run load the model in float16
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        device_map="auto",
        use_cache=True,
    )
else:
    # else, load in 8 bits – this is a bit slower
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        # torch_dtype=torch.float16,
        load_in_8bit=True,
        device_map="auto",
        use_cache=True,
    )

tokenizer_config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

##Set the Question & Prompt and Tokenize
Feel free to change the schema in the prompt below to your own schema

In [7]:
prompt = """### Task
Generate a SQL query to answer [QUESTION]{question}[/QUESTION]

### Instructions
- If you cannot answer the question with the available database schema, return 'I do not know'
- Remember that revenue is price multiplied by quantity
- Remember that cost is supply_price multiplied by quantity

### Database Schema
This query will run on a database whose schema is represented in this string:
CREATE TABLE products (
  product_id INTEGER PRIMARY KEY, -- Unique ID for each product
  name VARCHAR(50), -- Name of the product
  price DECIMAL(10,2), -- Price of each unit of the product
  quantity INTEGER  -- Current quantity in stock
);

CREATE TABLE customers (
   customer_id INTEGER PRIMARY KEY, -- Unique ID for each customer
   name VARCHAR(50), -- Name of the customer
   address VARCHAR(100) -- Mailing address of the customer
);

CREATE TABLE salespeople (
  salesperson_id INTEGER PRIMARY KEY, -- Unique ID for each salesperson
  name VARCHAR(50), -- Name of the salesperson
  region VARCHAR(50) -- Geographic sales region
);

CREATE TABLE sales (
  sale_id INTEGER PRIMARY KEY, -- Unique ID for each sale
  product_id INTEGER, -- ID of product sold
  customer_id INTEGER,  -- ID of customer who made purchase
  salesperson_id INTEGER, -- ID of salesperson who made the sale
  sale_date DATE, -- Date the sale occurred
  quantity INTEGER -- Quantity of product sold
);

CREATE TABLE product_suppliers (
  supplier_id INTEGER PRIMARY KEY, -- Unique ID for each supplier
  product_id INTEGER, -- Product ID supplied
  supply_price DECIMAL(10,2) -- Unit price charged by supplier
);

-- sales.product_id can be joined with products.product_id
-- sales.customer_id can be joined with customers.customer_id
-- sales.salesperson_id can be joined with salespeople.salesperson_id
-- product_suppliers.product_id can be joined with products.product_id

### Answer
Given the database schema, here is the SQL query that answers [QUESTION]{question}[/QUESTION]
[SQL]
"""

##Generate the SQL
This can be excruciatingly slow on a T4 in Colab, and can take 10-20 seconds per query. On faster GPUs, this will take ~1-2 seconds

Ideally, you should use `num_beams`=4 for best results. But because of memory constraints, we will stick to just 1 for now.

In [8]:
import sqlparse

def generate_query(question):
    updated_prompt = prompt.format(question=question)
    inputs = tokenizer(updated_prompt, return_tensors="pt").to("cuda")
    generated_ids = model.generate(
        **inputs,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=400,
        do_sample=False,
        num_beams=1,
    )
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    # empty cache so that you do generate more results w/o memory crashing
    # particularly important on Colab – memory management is much more straightforward
    # when running on an inference service
    return sqlparse.format(outputs[0].split("[SQL]")[-1], reindent=True)

In [9]:
question = "What was our revenue by product in the New York region last month?"
generated_sql = generate_query(question)

In [10]:
print(generated_sql)


SELECT p.product_id,
       SUM(s.quantity * p.price) AS revenue
FROM sales s
JOIN salespeople sp ON s.salesperson_id = sp.salesperson_id
JOIN products p ON s.product_id = p.product_id
WHERE sp.region = 'New York'
  AND s.sale_date >= (CURRENT_DATE - INTERVAL '1 month')
GROUP BY p.product_id
ORDER BY revenue DESC NULLS LAST;


# Exercise
 - Complete the prompts similar to what we did in class. 
     - Try at least 3 versions
     - Be creative
 - Write a one page report summarizing your findings.
     - Were there variations that didn't work well? i.e., where GPT either hallucinated or wrong
 - What did you learn?

In [11]:
question = "Who was the sales person and the customer with the highest order from 2 months ago?"
generated_sql = generate_query(question)

In [12]:
print(generated_sql)


SELECT s.salesperson_id,
       s.customer_id,
       p.name AS product_name,
       row_number() OVER(PARTITION BY s.salesperson_id, s.customer_id
                         ORDER BY s.sale_date DESC) AS recent_sale_rank
FROM sales s
JOIN products p ON s.product_id = p.product_id
WHERE s.sale_date >= (CURRENT_DATE - interval '2 months')
ORDER BY recent_sale_rank;


In [13]:
prompt = """### Task
Generate a SQL query to answer [QUESTION]{question}[/QUESTION]

### Instructions
- If you cannot answer the question with the available database schema, return 'Dude, I have no idea'
- Remember

### Database Schema
This query will run on a database whose schema is represented in this string:
CREATE TABLE products (
  product_id INTEGER PRIMARY KEY, -- Unique ID for each product
  name VARCHAR(50), -- Name of the product
  price DECIMAL(10,2), -- Price of each unit of the product
  session INTEGER  -- Current sessions possible per week
);

CREATE TABLE customers (
   customer_id INTEGER PRIMARY KEY, -- Unique ID for each customer
   name VARCHAR(50), -- Name of the customer
   address VARCHAR(100) -- Mailing address of the customer
);

CREATE TABLE trainers (
  trainer_id INTEGER PRIMARY KEY, -- Unique ID for each trainer
  name VARCHAR(50), -- Name of the salesperson
  gym VARCHAR(50) -- Gym of the trainer
);

CREATE TABLE sales (
  sale_id INTEGER PRIMARY KEY, -- Unique ID for each sale
  product_id INTEGER, -- ID of product sold
  customer_id INTEGER,  -- ID of customer who made purchase
  trainer_id INTEGER, -- ID of salesperson who made the sale
  sale_date DATE, -- Date the sale occurred
  sessions INTEGER -- Quantity of sessions sold
);

-- sales.product_id can be joined with products.product_id
-- sales.customer_id can be joined with customers.customer_id
-- sales.trainer_id can be joined with trainers.trainer_id

### Answer
Given the database schema, here is the SQL query that answers [QUESTION]{question}[/QUESTION]
[SQL]
"""

In [14]:
question = "Who is the trainer that books more sessions per month?"
generated_sql = generate_query(question)
print(generated_sql)


SELECT t.name,
       to_char(s.sale_date, 'YYYY-MM') AS year_month,
       SUM(s.sessions) AS total_sessions
FROM sales s
JOIN trainers t ON s.trainer_id = t.trainer_id
GROUP BY t.name,
         year_month
ORDER BY total_sessions DESC;


In [15]:
prompt = """### Task
Generate a SQL query to answer [QUESTION]{question}[/QUESTION]

### Instructions
- If you cannot answer the question with the available database schema, return 'Dude, I have no idea'
- 

### Database Schema
-- Database schema for a Boulder Gym

-- Employees Table
CREATE TABLE employees (
    employee_id INTEGER PRIMARY KEY, -- Unique ID for each employee
    name VARCHAR(50), -- Name of the employee
    role VARCHAR(50), -- Role of the employee (e.g., office, counter, trainer, cleaning, routesetting)
    shift_type VARCHAR(20), -- Shift type (e.g., full-time, part-time)
    preferred_shift VARCHAR(20), -- Preferred shift (morning or evening)
    shift_weight INTEGER, -- Weight for shift preferences to balance out good and bad shifts
    phone VARCHAR(15), -- Contact phone number
    email VARCHAR(50) -- Contact email
);

-- Customers Table
CREATE TABLE customers (
    customer_id INTEGER PRIMARY KEY, -- Unique ID for each customer
    name VARCHAR(50), -- Name of the customer
    phone VARCHAR(15), -- Contact phone number
    billing_info VARCHAR(100) -- Billing information for payments
);

-- Shifts Table
CREATE TABLE shifts (
    shift_id INTEGER PRIMARY KEY, -- Unique ID for each shift
    employee_id INTEGER, -- Employee assigned to the shift
    shift_date DATE, -- Date of the shift
    shift_time VARCHAR(20), -- Shift time (morning or evening)
    job VARCHAR(50), -- Assigned job for the shift
    shift_weight INTEGER, -- Weight of the shift for fairness
    FOREIGN KEY (employee_id) REFERENCES employees(employee_id)
);

-- Kids Courses Table
CREATE TABLE kids_courses (
    course_id INTEGER PRIMARY KEY, -- Unique ID for each kids course
    course_date DATE, -- Date of the course
    course_time VARCHAR(20), -- Time of the course (morning or evening)
    trainer_id INTEGER, -- Trainer assigned to the course
    FOREIGN KEY (trainer_id) REFERENCES employees(employee_id)
);

-- Group Training Sessions Table
CREATE TABLE group_sessions (
    session_id INTEGER PRIMARY KEY, -- Unique ID for each group training session
    session_date DATE, -- Date of the session
    session_time VARCHAR(20), -- Time of the session
    trainer_id INTEGER, -- Trainer assigned to the session
    FOREIGN KEY (trainer_id) REFERENCES employees(employee_id)
);

-- Personal Training Table
CREATE TABLE personal_training (
    training_id INTEGER PRIMARY KEY, -- Unique ID for each personal training session
    customer_id INTEGER, -- Customer receiving personal training
    trainer_id INTEGER, -- Trainer assigned to the session
    training_date DATE, -- Date of the training
    training_time VARCHAR(20), -- Time of the training
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id),
    FOREIGN KEY (trainer_id) REFERENCES employees(employee_id)
);

-- Shift Preferences Table
CREATE TABLE shift_preferences (
    preference_id INTEGER PRIMARY KEY, -- Unique ID for each shift preference
    employee_id INTEGER, -- Employee with the preference
    preferred_day VARCHAR(20), -- Preferred day for the shift (e.g., Monday, Tuesday)
    preferred_shift VARCHAR(20), -- Preferred shift time (morning or evening)
    preferred_job VARCHAR(50), -- Preferred job for the shift
    FOREIGN KEY (employee_id) REFERENCES employees(employee_id)
);

-- Attendance Table
CREATE TABLE attendance (
    attendance_id INTEGER PRIMARY KEY, -- Unique ID for attendance record
    session_type VARCHAR(20), -- Type of session (shift, group training, personal training, kids course)
    session_id INTEGER, -- ID of the session (could reference shifts, group_sessions, etc.)
    attendee_id INTEGER, -- Employee or customer ID
    attended BOOLEAN, -- Whether the attendee was present (true/false)
    FOREIGN KEY (session_id) REFERENCES shifts(shift_id),
    FOREIGN KEY (attendee_id) REFERENCES employees(employee_id)
);

-- Memberships Table
CREATE TABLE memberships (
    membership_id INTEGER PRIMARY KEY, -- Unique ID for each membership
    customer_id INTEGER, -- Customer with the membership
    membership_type VARCHAR(50), -- Type of membership (e.g., monthly, yearly, premium)
    start_date DATE, -- Start date of membership
    end_date DATE, -- End date of membership
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
);

-- Shift Planning Table
CREATE TABLE shift_plan (
    plan_id INTEGER PRIMARY KEY, -- Unique ID for shift plan
    shift_id INTEGER, -- Shift being planned
    employee_id INTEGER, -- Employee assigned for the shift
    plan_date DATE, -- Date when shift was assigned
    assigned_by VARCHAR(50), -- Manager who assigned the shift
    FOREIGN KEY (shift_id) REFERENCES shifts(shift_id),
    FOREIGN KEY (employee_id) REFERENCES employees(employee_id)
);

-- Equipment Inventory Table
CREATE TABLE equipment (
    equipment_id INTEGER PRIMARY KEY, -- Unique ID for each equipment
    name VARCHAR(50), -- Name of the equipment
    status VARCHAR(20), -- Status (available, under maintenance, etc.)
    last_service_date DATE -- Date of the last maintenance
);

-- Feedback and Reviews Table
CREATE TABLE feedback (
    feedback_id INTEGER PRIMARY KEY, -- Unique ID for each feedback
    customer_id INTEGER, -- Customer providing the feedback
    session_id INTEGER, -- Training session or service the feedback is related to
    rating INTEGER, -- Rating (1-5)
    comments TEXT, -- Additional comments
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
);

-- Payments Table
CREATE TABLE payments (
    payment_id INTEGER PRIMARY KEY, -- Unique ID for each payment
    customer_id INTEGER, -- Customer making the payment
    amount DECIMAL(10, 2), -- Amount paid
    payment_date DATE, -- Date of payment
    payment_method VARCHAR(50), -- Payment method (e.g., credit card, cash)
    FOREIGN KEY (customer_id) REFERENCES customers(customer_id)
);

-- This schema covers:
-- 1. Employees with different roles, shift preferences, and a weighting system for shift fairness.
-- 2. Customers with their contact and billing info.
-- 3. Shifts assigned to employees, with a focus on balancing good and bad shifts.
-- 4. Kids courses, group training sessions, and personal training with trainer assignments.
-- 5. Attendance tracking for both employees and customers.
-- 6. Memberships for customers with different types and durations.
-- 7. Shift planning to manage and assign shifts effectively.
-- 8. Equipment inventory for maintenance and availability tracking.
-- 9. Feedback system for customers to provide reviews on trainers and sessions.
-- 10. Payment history for tracking customer transactions.

-- Table Joins:
-- 1. employees.employee_id can be joined with shifts.employee_id, kids_courses.trainer_id, group_sessions.trainer_id, personal_training.trainer_id, shift_preferences.employee_id, shift_plan.employee_id, attendance.attendee_id, and feedback.session_id.
-- 2. customers.customer_id can be joined with personal_training.customer_id, memberships.customer_id, feedback.customer_id, and payments.customer_id.
-- 3. shifts.shift_id can be joined with shift_plan.shift_id and attendance.session_id.
-- 4. kids_courses.course_id, group_sessions.session_id, and personal_training.training_id can be used in attendance.session_id for tracking attendance.
-- 5. attendance.session_id can be used to track attendance for various session types.
-- 6. shift_preferences.employee_id can be used to match preferences for specific employees when planning shifts.
-- 7. equipment.equipment_id is independent but can be referenced in other systems like maintenance logs or tracking systems.

### Answer
Given the database schema, here is the SQL query that answers [QUESTION]{question}[/QUESTION]
[SQL]
"""

In [21]:
question = "Who is the employee with more morning shifts?"
generated_sql = generate_query(question)
print(generated_sql)


SELECT e.name,
       COUNT(*) AS morning_shift_count
FROM employees e
JOIN shifts s ON e.employee_id = s.employee_id
WHERE s.shift_time = 'Morning'
GROUP BY e.name
ORDER BY morning_shift_count DESC
LIMIT 1;
