# SQL query from table names - Continued

In [7]:
from openai import OpenAI
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load its variables into the
# process environment; the return value is not needed.
_ = load_dotenv(find_dotenv())

# API key used by the OpenAI client; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

## The old Prompt

In [9]:
# The old prompt: one system message with the instructions, followed by one
# system message per table describing its fields as JSON.
# Every table description is valid JSON and uses the same keys throughout
# ("tableName", and "name"/"type" for each field), so the model is not given
# a malformed or inconsistent schema.
old_context = [
    {'role': 'system', 'content': """
you are a bot to assist in create SQL commands, all your answers should start with \
this is your SQL, and after that an SQL that can do what the user request. \
Your Database is composed by a SQL database with some tables. \
Try to maintain the SQL order simple.
Put the SQL command in white letters with a black background, and just after \
a simple and concise text explaining how it works.
If the user ask for something that can not be solved with an SQL Order \
just answer something nice and simple, maximum 10 words, asking him for something that \
can be solved with SQL.
"""},
    {'role': 'system', 'content': """
first table:
{
  "tableName": "employees",
  "fields": [
    {
      "name": "ID_usr",
      "type": "int"
    },
    {
      "name": "name",
      "type": "varchar"
    }
  ]
}
"""},
    {'role': 'system', 'content': """
second table:
{
  "tableName": "salary",
  "fields": [
    {
      "name": "ID_usr",
      "type": "int"
    },
    {
      "name": "year",
      "type": "date"
    },
    {
      "name": "salary",
      "type": "float"
    }
  ]
}
"""},
    {'role': 'system', 'content': """
third table:
{
  "tableName": "studies",
  "fields": [
    {
      "name": "ID",
      "type": "int"
    },
    {
      "name": "ID_usr",
      "type": "int"
    },
    {
      "name": "educational_level",
      "type": "int"
    },
    {
      "name": "Institution",
      "type": "varchar"
    },
    {
      "name": "Years",
      "type": "date"
    },
    {
      "name": "Speciality",
      "type": "varchar"
    }
  ]
}
"""},
]

## New Prompt.
We are going to improve it following the recommendations of a paper from The Ohio State University: [How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings](https://arxiv.org/abs/2305.11853). I recommend you read that paper.

For each table, we will define the structure using the same syntax as in a SQL create table command, and add the sample rows of the content.

Finally, at the end of the prompt, we'll include some example queries with the SQL that the model should generate. This technique is called Few-Shot Samples, in which we provide the prompt with some examples to assist it in generating the correct SQL.


In [14]:
# Message list for the new prompt. It starts empty: the unfilled template text
# "CREATE SEVERAL (3+) TABLES HERE" must not be sent to the model as a system
# instruction. System messages are appended to this list further down.
context = []

In [16]:
# Few-shot samples: a single system message holding the instructions, the
# CREATE TABLE definitions with sample rows, and three example query/SQL pairs.
# The text ends with a literal "{question}" placeholder.
context.append(dict(role='system', content="""
    You are an assistant designed to help generate SQL commands. All your answers should start with:
    'This is your SQL:', followed by an SQL query that satisfies the user's request.
    
    Your database consists of several tables, defined below. For any SQL generation task:
    - Keep the SQL syntax simple.
    - Display the SQL command in white letters with a black background.
    - After the SQL command, provide a concise explanation of how it works.
    
    If the user asks for something that cannot be solved with SQL, respond politely with a maximum of 10 words asking for a SQL-solvable request.

    Here are the tables in your database:

    CREATE TABLE employees (
        ID_usr INT PRIMARY KEY,
        name VARCHAR(100)
    );

    INSERT INTO employees (ID_usr, name) VALUES
    (1, 'John Doe'),
    (2, 'Jane Smith');

    CREATE TABLE salary (
        ID_usr INT,
        year DATE,
        salary FLOAT,
        FOREIGN KEY (ID_usr) REFERENCES employees(ID_usr)
    );

    INSERT INTO salary (ID_usr, year, salary) VALUES
    (1, '2022-01-01', 50000),
    (2, '2022-01-01', 60000);

    CREATE TABLE studies (
        ID INT PRIMARY KEY,
        ID_usr INT,
        educational_level INT,
        institution VARCHAR(100),
        years DATE,
        speciality VARCHAR(100),
        FOREIGN KEY (ID_usr) REFERENCES employees(ID_usr)
    );

    INSERT INTO studies (ID, ID_usr, educational_level, institution, years, speciality) VALUES
    (1, 1, 2, 'Harvard', '2020-01-01', 'Computer Science'),
    (2, 2, 3, 'MIT', '2021-01-01', 'Mechanical Engineering');

    Now, let's see some example queries and their corresponding SQL outputs:

    -- Example 1: Get all employee names
    Query: "Get all employee names"
    SQL: SELECT name FROM employees;

    -- Example 2: Get the salary of employees for the year 2022
    Query: "Get the salary of employees for the year 2022"
    SQL: SELECT e.name, s.salary FROM employees e JOIN salary s ON e.ID_usr = s.ID_usr WHERE s.year = '2022-01-01';

    -- Example 3: Get the educational level and institution of employees
    Query: "Get the educational level and institution of employees"
    SQL: SELECT e.name, st.educational_level, st.institution FROM employees e JOIN studies st ON e.ID_usr = st.ID_usr;

    Based on the above table structures and examples, generate the SQL for the following query:
    Query: {question}
    """))

In [17]:
# Function to call the model.
def return_CCRMSQL(user_message, context, model="gpt-3.5-turbo"):
    """Send ``context`` plus the user's question to the chat model and return its answer.

    Args:
        user_message: The natural-language request, as a string.
        context: List of chat message dicts (``'role'`` / ``'content'``) holding
            the prompt. The list and its dicts are not modified.
        model: Name of the chat model to call. Defaults to ``"gpt-3.5-turbo"``.

    Returns:
        The text content of the first choice in the model's response.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    # Build new message dicts instead of mutating the caller's context, and
    # fill any literal "{question}" placeholder so the model never receives
    # the unfilled template text. str.replace is used rather than str.format
    # so other braces in the prompt are left alone.
    messages = []
    for message in context:
        content = message.get('content')
        if isinstance(content, str):
            content = content.replace("{question}", user_message)
        messages.append({**message, 'content': content})

    messages.append({'role': 'user', 'content': "question: " + user_message})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,  # keeps the sampling temperature at its lowest setting
    )

    return response.choices[0].message.content

## NL2SQL Samples
We're going to review some examples generated with the old prompt and others with the new prompt.

In [22]:
# New prompt: ask for the top earners together with their level of education.
context_user = list(context)
print(
    return_CCRMSQL(
        "please give me the names of the employees with the highest salery and level of eductaion",
        context_user,
    )
)

This is your SQL:
```sql
SELECT e.name, st.educational_level
FROM employees e
JOIN studies st ON e.ID_usr = st.ID_usr
JOIN (
    SELECT ID_usr, MAX(salary) AS max_salary
    FROM salary
    GROUP BY ID_usr
) s ON e.ID_usr = s.ID_usr
WHERE salary = max_salary;
```

This query retrieves the names of employees with the highest salary and their corresponding educational level. It achieves this by joining the `employees` table with the `studies` table and a subquery that calculates the maximum salary for each employee. The final result is filtered to only include employees whose salary matches the maximum salary calculated.


In [20]:
# Old prompt: ask for the employees with the highest salary.
old_context_user = list(old_context)
print(
    return_CCRMSQL(
        "please give me the names of the employees with the highest salery",
        old_context_user,
    )
)

This is your SQL:
```sql
SELECT e.name
FROM employees e
JOIN salary s ON e.ID_usr = s.ID_usr
WHERE s.salary = (SELECT MAX(salary) FROM salary);
```

This SQL query retrieves the names of the employees with the highest salary by joining the "employees" table with the "salary" table on the employee ID. It then filters the result to only include employees whose salary matches the maximum salary found in the "salary" table.


In [21]:
# New prompt: same question as the old-prompt cell, so both prompts are
# compared on identical input (a stray leading "Y" was removed).
print(return_CCRMSQL("please give me the names of the employees with the highest salery", context_user))

This is your SQL:
```sql
SELECT e.name
FROM employees e
JOIN salary s ON e.ID_usr = s.ID_usr
WHERE s.salary = (SELECT MAX(salary) FROM salary);
```

This query retrieves the names of employees who have the highest salary. It joins the `employees` and `salary` tables on the employee ID, then filters the result to only include employees whose salary matches the maximum salary in the `salary` table.


In [39]:
# Old prompt: the template placeholder "YOUR QUERY HERE" is replaced by a real
# question. NOTE(review): the wording is inferred from the recorded output
# (institution with the highest average salary) - confirm the original question.
print(return_CCRMSQL("Which institution has the highest average salary?", old_context_user))

This is your SQL:
```sql
SELECT s.Institution
FROM studies s
JOIN salary sa ON s.ID_usr = sa.ID_usr
GROUP BY s.Institution
ORDER BY AVG(sa.salary) DESC
LIMIT 1;
```

This SQL query joins the "studies" and "salary" tables on the ID_usr column. It then calculates the average salary for each institution, orders the results in descending order based on the average salary, and returns the institution with the highest average salary.


# Exercise
 - Complete the prompts similar to what we did in class. 
     - Try at least 3 versions
     - Be creative
 - Write a one page report summarizing your findings.
     - Were there variations that didn't work well, e.g., where GPT either hallucinated or was wrong?
     - What did you learn?

In [23]:
# Variation 1: boxing database (Boxers, Wins and Losses tables) with two
# example queries. The prompt ends with a literal "{question}" placeholder.
context_boxing = [
    dict(role='system', content="""
    You are an assistant designed to help generate SQL commands. All your answers should start with:
    'This is your SQL:', followed by an SQL query that satisfies the user's request.
    
    Your database consists of several tables, defined below. Keep the SQL syntax simple.

    CREATE TABLE Boxers (
        boxer_id INT PRIMARY KEY,
        name VARCHAR(100),
        nationality VARCHAR(100),
        weight_class VARCHAR(50)
    );

    INSERT INTO Boxers (boxer_id, name, nationality, weight_class) VALUES
    (1, 'Mike Tyson', 'USA', 'Heavyweight'),
    (2, 'Muhammad Ali', 'USA', 'Heavyweight'),
    (3, 'Floyd Mayweather', 'USA', 'Welterweight');

    CREATE TABLE Wins (
        boxer_id INT,
        wins INT,
        FOREIGN KEY (boxer_id) REFERENCES Boxers(boxer_id)
    );

    INSERT INTO Wins (boxer_id, wins) VALUES
    (1, 50),
    (2, 56),
    (3, 50);

    CREATE TABLE Losses (
        boxer_id INT,
        losses INT,
        FOREIGN KEY (boxer_id) REFERENCES Boxers(boxer_id)
    );

    INSERT INTO Losses (boxer_id, losses) VALUES
    (1, 6),
    (2, 5),
    (3, 0);

    -- Example Queries
    -- Example 1: Get all boxer names and their total wins
    Query: "Get all boxer names and their total wins"
    SQL: SELECT b.name, w.wins FROM Boxers b JOIN Wins w ON b.boxer_id = w.boxer_id;

    -- Example 2: Get the name of the boxer with the most wins
    Query: "Get the name of the boxer with the most wins"
    SQL: SELECT b.name FROM Boxers b JOIN Wins w ON b.boxer_id = w.boxer_id ORDER BY w.wins DESC LIMIT 1;

    Based on the provided tables and examples, generate the SQL for the following query:
    Query: {question}
    """)
]

# Preview of the prompt text once the placeholder is filled with a sample question.
query_boxing = "Get me the names of boxers with more than 50 wins"
filled_prompt_boxing = query_boxing.join(context_boxing[0]['content'].split("{question}"))
print(filled_prompt_boxing)



    You are an assistant designed to help generate SQL commands. All your answers should start with:
    'This is your SQL:', followed by an SQL query that satisfies the user's request.
    
    Your database consists of several tables, defined below. Keep the SQL syntax simple.

    CREATE TABLE Boxers (
        boxer_id INT PRIMARY KEY,
        name VARCHAR(100),
        nationality VARCHAR(100),
        weight_class VARCHAR(50)
    );

    INSERT INTO Boxers (boxer_id, name, nationality, weight_class) VALUES
    (1, 'Mike Tyson', 'USA', 'Heavyweight'),
    (2, 'Muhammad Ali', 'USA', 'Heavyweight'),
    (3, 'Floyd Mayweather', 'USA', 'Welterweight');

    CREATE TABLE Wins (
        boxer_id INT,
        wins INT,
        FOREIGN KEY (boxer_id) REFERENCES Boxers(boxer_id)
    );

    INSERT INTO Wins (boxer_id, wins) VALUES
    (1, 50),
    (2, 56),
    (3, 50);

    CREATE TABLE Losses (
        boxer_id INT,
        losses INT,
        FOREIGN KEY (boxer_id) REFERENCES Boxers(boxe

In [25]:
# Ask an open-ended question against the boxing prompt.
print(
    return_CCRMSQL(
        "who was the best boxer",
        context_boxing,
    )
)

This is your SQL: SELECT b.name 
FROM Boxers b 
JOIN Wins w ON b.boxer_id = w.boxer_id 
ORDER BY w.wins DESC 
LIMIT 1;


In [28]:
# Variation 2: philosophy database (Philosophers and Works tables) with two
# example queries. The prompt ends with a literal "{question}" placeholder.
context_philosophy = [
    dict(role='system', content="""
    You are an assistant designed to help generate SQL commands for a philosophy database.

    Your database consists of several tables, defined below. Keep the SQL syntax simple.

    CREATE TABLE Philosophers (
        philosopher_id INT PRIMARY KEY,
        name VARCHAR(100),
        era VARCHAR(50)
    );

    INSERT INTO Philosophers (philosopher_id, name, era) VALUES
    (1, 'Socrates', 'Ancient'),
    (2, 'Aristotle', 'Ancient'),
    (3, 'Nietzsche', 'Modern');

    CREATE TABLE Works (
        philosopher_id INT,
        title VARCHAR(100),
        publication_year INT,
        FOREIGN KEY (philosopher_id) REFERENCES Philosophers(philosopher_id)
    );

    INSERT INTO Works (philosopher_id, title, publication_year) VALUES
    (1, 'Apology', -399),
    (2, 'Metaphysics', -350),
    (3, 'Thus Spoke Zarathustra', 1883);

    -- Example Queries
    -- Example 1: Get all philosopher names and their eras
    Query: "Get all philosopher names and their eras"
    SQL: SELECT name, era FROM Philosophers;

    -- Example 2: Get the names of philosophers with works published after 1800
    Query: "Get the names of philosophers with works published after 1800"
    SQL: SELECT p.name FROM Philosophers p JOIN Works w ON p.philosopher_id = w.philosopher_id WHERE w.publication_year > 1800;

    Based on the provided tables and examples, generate the SQL for the following query:
    Query: {question}
    """)
]

# Preview of the prompt text once the placeholder is filled with a sample question.
query_philosophy = "Get the titles of works published before 0 AD"
filled_prompt_philosophy = query_philosophy.join(context_philosophy[0]['content'].split("{question}"))
print(filled_prompt_philosophy)


    You are an assistant designed to help generate SQL commands for a philosophy database.

    Your database consists of several tables, defined below. Keep the SQL syntax simple.

    CREATE TABLE Philosophers (
        philosopher_id INT PRIMARY KEY,
        name VARCHAR(100),
        era VARCHAR(50)
    );

    INSERT INTO Philosophers (philosopher_id, name, era) VALUES
    (1, 'Socrates', 'Ancient'),
    (2, 'Aristotle', 'Ancient'),
    (3, 'Nietzsche', 'Modern');

    CREATE TABLE Works (
        philosopher_id INT,
        title VARCHAR(100),
        publication_year INT,
        FOREIGN KEY (philosopher_id) REFERENCES Philosophers(philosopher_id)
    );

    INSERT INTO Works (philosopher_id, title, publication_year) VALUES
    (1, 'Apology', -399),
    (2, 'Metaphysics', -350),
    (3, 'Thus Spoke Zarathustra', 1883);

    -- Example Queries
    -- Example 1: Get all philosopher names and their eras
    Query: "Get all philosopher names and their eras"
    SQL: SELECT name, e

In [30]:
# Ask for the philosophers of one era against the philosophy prompt.
print(
    return_CCRMSQL(
        "give me a list of modern philosophers",
        context_philosophy,
    )
)

SQL: SELECT name FROM Philosophers WHERE era = 'Modern';


In [32]:
# Variation 3: kid education database (Students and Subjects tables) with two
# example queries. The prompt ends with a literal "{question}" placeholder.
context_kid_education = [
    dict(role='system', content="""
    You are an assistant designed to help generate SQL commands for a kid education database.

    Your database consists of several tables, defined below. Keep the SQL syntax simple.

    CREATE TABLE Students (
        student_id INT PRIMARY KEY,
        name VARCHAR(100),
        age INT
    );

    INSERT INTO Students (student_id, name, age) VALUES
    (1, 'Alice', 8),
    (2, 'Bob', 10),
    (3, 'Charlie', 9);

    CREATE TABLE Subjects (
        student_id INT,
        subject_name VARCHAR(100),
        grade CHAR(1),
        FOREIGN KEY (student_id) REFERENCES Students(student_id)
    );

    INSERT INTO Subjects (student_id, subject_name, grade) VALUES
    (1, 'Math', 'A'),
    (2, 'Science', 'B'),
    (3, 'History', 'A');

    -- Example Queries
    -- Example 1: Get all student names and their ages
    Query: "Get all student names and their ages"
    SQL: SELECT name, age FROM Students;

    -- Example 2: Get the students who have an A grade in any subject
    Query: "Get the students who have an A grade in any subject"
    SQL: SELECT s.name FROM Students s JOIN Subjects su ON s.student_id = su.student_id WHERE su.grade = 'A';

    Based on the provided tables and examples, generate the SQL for the following query:
    Query: {question}
    """)
]

# Preview of the prompt text once the placeholder is filled with a sample question.
query_kid_education = "Get the names of students who are older than 9 years"
filled_prompt_kid_education = query_kid_education.join(context_kid_education[0]['content'].split("{question}"))
print(filled_prompt_kid_education)



    You are an assistant designed to help generate SQL commands for a kid education database.

    Your database consists of several tables, defined below. Keep the SQL syntax simple.

    CREATE TABLE Students (
        student_id INT PRIMARY KEY,
        name VARCHAR(100),
        age INT
    );

    INSERT INTO Students (student_id, name, age) VALUES
    (1, 'Alice', 8),
    (2, 'Bob', 10),
    (3, 'Charlie', 9);

    CREATE TABLE Subjects (
        student_id INT,
        subject_name VARCHAR(100),
        grade CHAR(1),
        FOREIGN KEY (student_id) REFERENCES Students(student_id)
    );

    INSERT INTO Subjects (student_id, subject_name, grade) VALUES
    (1, 'Math', 'A'),
    (2, 'Science', 'B'),
    (3, 'History', 'A');

    -- Example Queries
    -- Example 1: Get all student names and their ages
    Query: "Get all student names and their ages"
    SQL: SELECT name, age FROM Students;

    -- Example 2: Get the students who have an A grade in any subject
    Query: "Get 

In [33]:
# Ask for each student's Math grade against the kid education prompt
# (a stray leading "g" was removed from the question).
print(return_CCRMSQL("Get all student names and their Math grades", context_kid_education))

SQL: SELECT s.name, su.grade 
FROM Students s 
JOIN Subjects su ON s.student_id = su.student_id 
WHERE su.subject_name = 'Math';


## report

All the tasks work really well. This is a very good structure to generate SQL queries.