# SQL query from table names - Continued

In [None]:
import os

from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Locate the lesson's .env file and load its variables into the process environment.
_ = load_dotenv(
    find_dotenv('../IHAI-lessons/000_lesson_data/044_llm/.env')
)

# API key handed to the OpenAI client; None when the variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

## The old Prompt

In [None]:
# The old prompt: a system instruction followed by one system message per table,
# each describing the table as a JSON document.
old_context = [ {'role':'system', 'content':"""
you are a bot to assist in create SQL commands, all your answers should start with \
this is your SQL, and after that an SQL that can do what the user requests. \
Your Database is composed by a SQL database with some tables. \
Try to maintain the SQL order simple.
Put the SQL command in white letters with a black background, and just after \
a simple and concise text explaining how it works.
If the user asks for something that can not be solved with an SQL Order \
just answer something nice and simple, maximum 10 words, asking him for something that \
can be solved with SQL.
"""} ]

old_context.append( {'role':'system', 'content':"""
first table:
{
  "tableName": "employees",
  "fields": [
    {
      "employee_ID": "ID_usr",
      "type": "int"
    },
    {
      "employee_name": "name",
      "type": "varchar"
    }
  ]
}
"""
})

old_context.append( {'role':'system', 'content':"""
second table:
{
  "tableName": "salary",
  "fields": [
    {
      "employee_ID": "ID_usr",
      "type": "int"
    },
    {
      "year": "year",
      "type": "date"
    },
    {
      "salary": "salary",
      "type": "float"
    }
  ]
}
"""
})

# The table description below must stay valid JSON and use the same "tableName"
# key as the other tables, so all three descriptions are read the same way.
old_context.append( {'role':'system', 'content':"""
third table:
{
  "tableName": "studies",
  "fields": [
    {
      "study_ID": "ID",
      "type": "int"
    },
    {
      "employee_ID": "ID_usr",
      "type": "int"
    },
    {
      "edu_level": "educational_level",
      "type": "int"
    },
    {
      "institution": "Institution",
      "type": "varchar"
    },
    {
      "years": "Years",
      "type": "date"
    },
    {
      "speciality": "Speciality",
      "type": "varchar"
    }
  ]
}
"""
})

## New Prompt.
We are going to improve it following the recommendations of a paper from The Ohio State University: [How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings](https://arxiv.org/abs/2305.11853). I recommend you read that paper.

For each table, we will define the structure using the same syntax as in a SQL create table command, and add the sample rows of the content.

Finally, at the end of the prompt, we'll include some example queries with the SQL that the model should generate. This technique is called Few-Shot Samples, in which we provide the prompt with some examples to assist it in generating the correct SQL.


In [None]:
# The new prompt: the same system instruction, followed by the schema written as
# CREATE TABLE statements with three sample rows per table.
context = [ {'role':'system', 'content':"""
you are a bot to assist in create SQL commands, all your answers should start with
this is your SQL, and after that an SQL that can do what the user requests.
Your Database is composed by a SQL database with some tables.
Try to maintain the SQL order simple.
Put the SQL command in white letters with a black background, and just after
a simple and concise text explaining how it works.
If the user asks for something that can not be solved with an SQL Order
just answer something nice and simple, maximum 10 words, asking him for something that
can be solved with SQL.
"""} ]

# Every sample query ends with ';' and every sample header uses the column names
# exactly as declared, so the three table descriptions follow one format.
context.append( {'role':'system', 'content':"""
             
create table employees(
    ID_Usr INT primary key,
    name VARCHAR);
             
    /*3 example rows
    select * from employees limit 3;
    ID_Usr    name
    1456      Amanda Nunes
    2345      Khabib Nurmagomedov
    1678      Israel Adesanya
    */

create table salary(
    ID_Usr INT,
    year DATE,
    salary FLOAT,
    foreign key (ID_Usr) references employees(ID_Usr));
             
    /*3 example rows
    select * from salary limit 3;
    ID_Usr    year          salary
    1456      01/01/2022    72000
    2345      01/01/2023    85000
    1678      01/01/2023    65000
    */

create table studies(
    ID_study INT,
    ID_Usr INT,
    educational_level INT,  /* 5=phd, 4=Master, 3=Bachelor */
    Institution VARCHAR,
    Years DATE,
    Speciality VARCHAR,
    primary key (ID_study, ID_Usr),
    foreign key(ID_Usr) references employees (ID_Usr));
             
    /*3 example rows
    select * from studies limit 3;
    ID_study ID_Usr educational_level Institution         Years       Speciality
    3451     1456   3                 University of Miami 01/01/2011  Bachelor of Arts in Sociology
    4567     2345   5                 Stanford University 01/01/2019  PhD in Computer Science
    5678     1678   4                 Oxford University   01/01/2021  Master of Business Administration
    */
"""} )

In [None]:
# Few-shot examples: two question/SQL pairs appended to the prompt as a system message.
context.append(dict(
    role='system',
    content="""
-- Maintain the SQL order simple and efficient as you can, using valid SQL Lite, answer the following questions for the tables provided above.

Question: What is the average salary for employees with a PhD?
SELECT AVG(s.salary) AS average_salary
FROM employees e
INNER JOIN salary s ON e.ID_Usr = s.ID_Usr
INNER JOIN studies st ON e.ID_Usr = st.ID_Usr
WHERE st.educational_level = 5;

Question: List all employees who graduated from 'Stanford University' and their corresponding salary.
SELECT e.name, s.salary
FROM employees e
INNER JOIN salary s ON e.ID_Usr = s.ID_Usr
INNER JOIN studies st ON e.ID_Usr = st.ID_Usr
WHERE st.Institution = 'Stanford University';
""",
))

In [None]:
# Function to call the model

def return_CCRMSQL(user_message, context, model='gpt-3.5-turbo', temperature=0):
    """Send a prompt plus one user question to an OpenAI chat model and return the reply.

    Args:
        user_message: The natural-language request. It is sent as a user
            message prefixed with 'question: '.
        context: Sequence of chat message dicts that make up the prompt. It is
            not modified; the question is added to a copy.
        model: Name of the chat model to call. Defaults to 'gpt-3.5-turbo'.
        temperature: Sampling temperature forwarded to the API. Defaults to 0.

    Returns:
        The message content of the first choice in the completion response.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    # Build a fresh list so repeated calls never grow the caller's prompt.
    messages = [*context, {'role': 'user', 'content': 'question: ' + user_message}]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )

    return response.choices[0].message.content

## NL2SQL Samples
We're going to review some examples generated with the old prompt and others with the new prompt.

In [None]:
# First sample question; it is sent to both the new and the old prompt in the following cells.
question_1 = 'Name the employees who earned more than 70000 in the year 2023 and their education.'

In [None]:
# Answer to question_1 using the new prompt (`context`).
print(return_CCRMSQL(question_1, context))

In [None]:
# Answer to question_1 using the old prompt (`old_context`).
print(return_CCRMSQL(question_1, old_context))

In [None]:
# Second sample question; it is sent to both the new and the old prompt in the following cells.
question_2 = 'Find the total number of employees with a Master’s degree and a salary less than 60000.'

In [None]:
# Answer to question_2 using the new prompt (`context`).
print(return_CCRMSQL(question_2, context))

In [None]:
# Answer to question_2 using the old prompt (`old_context`).
print(return_CCRMSQL(question_2, old_context))

# Exercise
 - Complete the prompts similar to what we did in class. 
     - Try at least 3 versions
     - Be creative

In [None]:
# VERSION 1

question_v1 = 'List the names of employees who graduated in 2020 and their corresponding salaries.'

# Query each prompt in turn, printing its answer before the next request is made.
answer_new_v1 = return_CCRMSQL(question_v1, context)
print('USING NEW CONTEXT:\n', answer_new_v1, '\n')

answer_old_v1 = return_CCRMSQL(question_v1, old_context)
print('USING OLD CONTEXT:\n', answer_old_v1)

In [None]:
# VERSION 2

question_v2 = "What is the highest salary earned by an employee with a Bachelor's degree?"

# Query each prompt in turn, printing its answer before the next request is made.
answer_new_v2 = return_CCRMSQL(question_v2, context)
print('USING NEW CONTEXT:\n', answer_new_v2, '\n')

answer_old_v2 = return_CCRMSQL(question_v2, old_context)
print('USING OLD CONTEXT:\n', answer_old_v2)

In [None]:
# VERSION 3

question_v3 = 'Retrieve the employees who have worked for more than 10 years since their graduation.'

# Query each prompt in turn, printing its answer before the next request is made.
answer_new_v3 = return_CCRMSQL(question_v3, context)
print('USING NEW CONTEXT:\n', answer_new_v3, '\n')

answer_old_v3 = return_CCRMSQL(question_v3, old_context)
print('USING OLD CONTEXT:\n', answer_old_v3)

 - Write a report summarizing your findings.
     - Were there variations that didn't work well? i.e., where GPT either hallucinated or was wrong.
     - What did you learn?

In [None]:
# REPORT AND FINDINGS

# The old approach makes the model more flexible and reusable for generating SQL commands.
# Without example rows, the model works more abstractly, which might lead to less accurate outputs.

# The new approach helps the model better understand table relationships, making queries more context-aware.  
# Including example rows in the new approach improves query accuracy.