# SQL query from table names - Continued

In [1]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

OPENAI_API_KEY  = os.getenv('OPENAI_API_KEY')

## The old Prompt

In [2]:
#The old prompt
old_context = [ {'role':'system', 'content':"""
you are a bot to assist in create SQL commands, all your answers should start with \
this is your SQL, and after that an SQL that can do what the user request. \
Your Database is composed by a SQL database with some tables. \
Try to maintain the SQL order simple.
Put the SQL command in white letters with a black background, and just after \
a simple and concise text explaining how it works.
If the user ask for something that can not be solved with an SQL Order \
just answer something nice and simple, maximum 10 words, asking him for something that \
can be solved with SQL.
"""} ]

old_context.append( {'role':'system', 'content':"""
first table:
{
  "tableName": "employees",
  "fields": [
    {
      "nombre": "ID_usr",
      "tipo": "int"
    },
    {
      "nombre": "name",
      "tipo": "varchar"
    }
  ]
}
"""
})

old_context.append( {'role':'system', 'content':"""
second table:
{
  "tableName": "salary",
  "fields": [
    {
      "nombre": "ID_usr",
      "type": "int"
    },
    {
      "name": "year",
      "type": "date"
    },
    {
      "name": "salary",
      "type": "float"
    }
  ]
}
"""
})

old_context.append( {'role':'system', 'content':"""
third table:
{
  "tablename": "studies",
  "fields": [
    {
      "name": "ID",
      "type": "int"
    },
    {
      "name": "ID_usr",
      "type": "int"
    },
    {
      "name": "educational_level",
      "type": "int"
    },
    {
      "name": "Institution",
      "type": "varchar"
    },
    {
      "name": "Years",
      "type": "date"
    }
    {
      "name": "Speciality",
      "type": "varchar"
    }
  ]
}
"""
})

## New Prompt.
We are going to improve it by following the recommendations of a paper from The Ohio State University: [How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings](https://arxiv.org/abs/2305.11853). I recommend reading that paper.

For each table, we will define the structure using the same syntax as in a SQL create table command, and add the sample rows of the content.

Finally, at the end of the prompt, we'll include some example queries with the SQL that the model should generate. This technique is called Few-Shot Samples, in which we provide the prompt with some examples to assist it in generating the correct SQL.


In [9]:
# New prompt, part 1: a single system message that describes the database.
# It contains, in order: the schema as CREATE TABLE statements, a few sample
# rows as INSERT statements, and a per-column summary of the values.
context = [
    dict(
        role='system',
        content="""
-- Normalized Database Schema with Relationships
CREATE TABLE employees (
    ID_usr INT PRIMARY KEY,
    name VARCHAR(100)
);

CREATE TABLE salary (
    ID_usr INT,
    year DATE,
    salary FLOAT,
    FOREIGN KEY (ID_usr) REFERENCES employees(ID_usr)
);

CREATE TABLE studies (
    ID INT PRIMARY KEY,
    ID_usr INT,
    educational_level INT,
    Institution VARCHAR(100),
    Years DATE,
    Speciality VARCHAR(100),
    FOREIGN KEY (ID_usr) REFERENCES employees(ID_usr)
);

-- Sample Rows
INSERT INTO employees (ID_usr, name) VALUES (1, 'Alice'), (2, 'Bob'), (3, 'Charlie');
INSERT INTO salary (ID_usr, year, salary) VALUES (1, '2022-01-01', 70000), (2, '2022-01-01', 80000), (3, '2022-01-01', 90000);
INSERT INTO studies (ID, ID_usr, educational_level, Institution, Years, Speciality) VALUES 
(1, 1, 5, 'University A', '2015-01-01', 'Engineering'), 
(2, 2, 6, 'University B', '2016-01-01', 'Science'), 
(3, 3, 4, 'University C', '2017-01-01', 'Arts');

-- Column Values (SelectCol)
employees.ID_usr: [1, 2, 3]
employees.name: [STRING]
salary.year: [DATE]
salary.salary: [FLOAT]
studies.educational_level: [1-6]
studies.Institution: [STRING]
studies.Speciality: [STRING]
""",
    )
]



In [10]:
#FEW SHOT SAMPLES
# New prompt, part 2: a system message with four worked examples. Each example
# gives a question title, the expected SQL, and a short "how it works" note.
few_shot_message = {'role':'system', 'content':"""
-- Maintain the SQL order simple and efficient as you can, using valid SQLite, answer the following questions for the table provided above.

-- Query 1: Find highest salary
-- this is your SQL
SELECT MAX(salary) AS highest_salary
FROM salary;
-- how it works
-- This query finds the maximum salary value in the salary table.

-- Query 2: Find institution with highest average salary
-- this is your SQL
SELECT st.Institution, AVG(s.salary) AS avg_salary
FROM studies st
JOIN salary s ON st.ID_usr = s.ID_usr
GROUP BY st.Institution
ORDER BY avg_salary DESC
LIMIT 1;
-- how it works
-- Joins studies and salary tables, calculates average salary per institution, returns highest.

-- Query 3: Count employees with highest salary
-- this is your SQL
SELECT COUNT(*) AS count_highest_salary
FROM salary
WHERE salary = (SELECT MAX(salary) FROM salary);
-- how it works
-- Counts how many employees have the maximum salary value.

-- Query 4: Average salary by years of experience
-- this is your SQL
WITH experience AS (
    SELECT ID_usr,
           (julianday('now') - julianday(MIN(Years))) / 365.25 AS years_exp
    FROM studies
    GROUP BY ID_usr
)
SELECT ROUND(AVG(s.salary), 2) AS avg_salary,
       ROUND(e.years_exp, 0) AS years_experience
FROM experience e
JOIN salary s ON e.ID_usr = s.ID_usr
GROUP BY ROUND(e.years_exp, 0)
ORDER BY years_experience;
-- how it works
-- Calculates years of experience from studies dates, then finds average salary per experience group.
"""}

# Keep the cell idempotent: an unconditional append would add another copy of
# the examples to `context` every time this cell is re-executed.
if few_shot_message not in context:
    context.append(few_shot_message)

In [11]:
#Function to call the model.
def return_CCRMSQL(user_message, context, model="gpt-3.5-turbo", temperature=0):
    """Send `user_message` to the chat model after the messages in `context`.

    Args:
        user_message: The natural-language question. It is sent as a user
            message prefixed with "question: ".
        context: List of chat messages (dicts with 'role' and 'content') that
            precede the question. The list itself is not modified; the
            question is appended to a shallow copy.
        model: Name of the chat model passed to the API. Defaults to
            "gpt-3.5-turbo".
        temperature: Sampling temperature passed to the API. Defaults to 0.

    Returns:
        The message content of the first choice in the API response.
    """
    client = OpenAI(
        # This is the default and can be omitted
        api_key=OPENAI_API_KEY,
    )

    # Work on a copy so the caller's prompt list is not extended by the question.
    messages = list(context)
    messages.append({'role':'user', 'content':"question: " + user_message})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

    return response.choices[0].message.content

## NL2SQL Samples
We're going to review some examples generated with the old prompt and others with the new prompt.

In [12]:
# New prompt (CREATE TABLE schema + few-shot samples).
# `context_user` is a copy of `context`; it is reused by the later "new" cells.
context_user = context.copy()
print(return_CCRMSQL("Whats the highest salary?", context_user))

highest_salary: 90000.0


In [13]:
# Old prompt (instructions + JSON-like table descriptions).
# `old_context_user` is a copy of `old_context`; it is reused by the later "old" cells.
old_context_user = old_context.copy()
print(return_CCRMSQL("Whats the highest salary?", old_context_user))

This is your SQL:
```sql
SELECT MAX(salary) AS highest_salary FROM salary;
```

This SQL query selects the maximum (highest) salary from the "salary" table and aliases it as "highest_salary".


In [14]:
# New prompt: institution with the highest average salary.
print(return_CCRMSQL("Whats the Institution with a higher average salary", context_user))

The institution with the highest average salary is "University C" with an average salary of $90,000.


In [15]:
# Old prompt: institution with the highest average salary.
print(return_CCRMSQL("Whats the Institution with a higher average salary", old_context_user))

This is your SQL:
```sql
SELECT s.Institution, AVG(s.salary) AS avg_salary
FROM salary s
JOIN employees e ON s.ID_usr = e.ID_usr
GROUP BY s.Institution
ORDER BY avg_salary DESC
LIMIT 1;
```

This SQL query retrieves the institution with the highest average salary by joining the `salary` table with the `employees` table on the `ID_usr` column. It calculates the average salary for each institution, orders the results in descending order of average salary, and limits the output to the institution with the highest average salary.


In [16]:
# New prompt: how many employees have the highest salary.
print(return_CCRMSQL("How many have a highest salary?", context_user))

There is 1 employee who has the highest salary.


In [17]:
# Old prompt: how many employees have the highest salary.
print(return_CCRMSQL("How many have a highest salary?", old_context_user))


This is your SQL:
```sql
SELECT COUNT(*) 
FROM salary 
WHERE salary = (SELECT MAX(salary) FROM salary);
```

This SQL query counts the number of employees who have the highest salary in the `salary` table. It uses a subquery to find the maximum salary in the table and then compares each employee's salary to that maximum value to count how many employees have the highest salary.


In [19]:
# New prompt: average salary by years of experience.
# Note: this question matches "Query 4" of the few-shot samples in the prompt.
print(return_CCRMSQL("Find average salary by years of experience", context_user))

```sql
WITH experience AS (
    SELECT ID_usr,
           (julianday('now') - julianday(MIN(Years))) / 365.25 AS years_exp
    FROM studies
    GROUP BY ID_usr
)
SELECT ROUND(AVG(s.salary), 2) AS avg_salary,
       ROUND(e.years_exp, 0) AS years_experience
FROM experience e
JOIN salary s ON e.ID_usr = s.ID_usr
GROUP BY ROUND(e.years_exp, 0)
ORDER BY years_experience;
```


In [21]:
# Old prompt: average salary by years of experience.
print(return_CCRMSQL("Find average salary by years of experience", old_context_user))

This is your SQL:
```sql
SELECT AVG(s.salary) AS average_salary, e.Years
FROM salary s
JOIN employees e ON s.ID_usr = e.ID_usr
GROUP BY e.Years;
```

This SQL query retrieves the average salary of employees based on their years of experience. It joins the "salary" table with the "employees" table on the employee ID, calculates the average salary for each group of years of experience, and displays the result.


In [23]:
# New prompt: the same "average salary by years of experience" question asked again.
print(return_CCRMSQL("Find average salary by years of experience", context_user))


-- Query 4: Average salary by years of experience
WITH experience AS (
    SELECT ID_usr,
           (julianday('now') - julianday(MIN(Years))) / 365.25 AS years_exp
    FROM studies
    GROUP BY ID_usr
)
SELECT ROUND(AVG(s.salary), 2) AS avg_salary,
       ROUND(e.years_exp, 0) AS years_experience
FROM experience e
JOIN salary s ON e.ID_usr = s.ID_usr
GROUP BY ROUND(e.years_exp, 0)
ORDER BY years_experience;

-- how it works
-- This query calculates the years of experience for each employee based on their earliest study year, then calculates the average salary for each group of years of experience.


In [24]:
# Old prompt: salary trend over time (a question not covered by the few-shot samples).
print(return_CCRMSQL("Show me the salary trend over time ", old_context_user))

This is your SQL:
```sql
SELECT year, AVG(salary) AS average_salary
FROM salary
GROUP BY year
ORDER BY year;
```

This SQL query selects the year and the average salary for each year from the "salary" table. It calculates the average salary for each year and groups the results by year, then orders the results by year in ascending order.


In [25]:
# New prompt: salary trend over time (a question not covered by the few-shot samples).
print(return_CCRMSQL("Show me the salary trend over time ", context_user))

-- Query: Salary trend over time
SELECT s.year, AVG(s.salary) AS avg_salary
FROM salary s
GROUP BY s.year
ORDER BY s.year;
-- This query calculates the average salary for each year, showing the salary trend over time.


# Exercise
 - Complete the prompts similar to what we did in class. 
     - Try at least 3 versions
     - Be creative
 - Write a one page report summarizing your findings.
     - Were there variations that didn't work well, i.e., where GPT either hallucinated or was wrong?
     - What did you learn?


## What I learned

The new prompt produces more accurate and detailed SQL queries than the old one.

The new prompt explains the queries step by step, formats them nicely, and makes them easier to understand.

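Issues: Both prompts sometimes guess table relationships or make mistakes, but the old prompt makes more errors, such as using the wrong columns (for example, selecting `Institution` from the `salary` table, or `Years` from the `employees` table). The new prompt, in turn, sometimes returned a direct answer (e.g. "highest_salary: 90000.0") instead of a SQL query.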

What Works Best:
Clear table info helps.
Writing clear, detailed prompts with examples leads to better SQL queries.