In [None]:
import re 
import token

In [None]:
# SQL output format
## Number of attributes : CDC data = 12 ; FRED = 2 ; WEGOVY = 2  
'''
1) SELECT ... WHERE ... 

2) SELECT ... GROUP BY ... 
 2.1) SELECT ... GROUP BY ... HAVING ...

3) SELECT ... WHERE ... ORDER BY ... LIMIT ... 

4) Aggregation functions: 
 4.1) SELECT SUM()... FROM ...
 4.2) SELECT AVG()... FROM ...  
 4.3) SELECT MIN()... FROM ...
 4.4) SELECT MAX()... FROM ...
 4.5) SELECT COUNT()... FROM ...

5) Patterns of WHERE clauses -- 
 5.1) Basic comparison: 
    SELECT * FROM table_name WHERE column_name = 'value';
 
 5.2) Range Queries: 
    SELECT * FROM table_name WHERE column_name BETWEEN value1 AND value2;
    SELECT * FROM table_name WHERE column_name >= value;

 5.3) Pattern matching: 
    SELECT * FROM table_name WHERE column_name LIKE '%pattern%';
    SELECT * FROM table_name WHERE column_name LIKE '_A_';

 5.4) Checking for NULL: 
    SELECT * FROM table_name WHERE column_name IS NULL;
    SELECT * FROM table_name WHERE column_name IS NOT NULL;

 5.5) Using IN: 
    SELECT * FROM table_name WHERE column_name IN (value1, value2, value3);

 5.6) Logical condition:   
    SELECT * FROM table_name 
    WHERE condition1 AND condition2;

    SELECT * FROM table_name 
    WHERE condition1 OR condition2;

    SELECT * FROM table_name 
    WHERE NOT condition; 

 5.7) Using EXISTS
    SELECT * FROM table_name WHERE EXISTS (subquery);
 
 5.8) Subqueries 
    SELECT * FROM table_name WHERE column_name = (SELECT value FROM another_table WHERE condition);

 5.9) Conditional Aggregations 
    SELECT field, COUNT(*) FROM employees 
    GROUP BY field 
    HAVING COUNT(*) > 5;

 5.10) Case-Insensitive Filtering 
    SELECT * FROM table_name WHERE UPPER(column_name) = 'VALUE';

'''


'\n1) SELECT ... WHERE ... \n\n2) SELECT ... GROUP BY ... \n2.1) SELECT ... GROUP BY ... HAVING ...\n\n3) SELECT ... WHERE ... ORDER BY ... LIMIT ... \n\n4.1) SELECT SUM()... FROM ...\n4.2) SELECT AVG()... FROM ...  \n4.3) SELECT MIN()... FROM ...\n4.4) SELECT MAX()... FROM ...\n4.5) SELECT COUNT()... FROM ...\n\n'

In [42]:
def parse(sample_nlq, split_criteria):
    """Split ``sample_nlq`` once, at the first occurrence of ``split_criteria``.

    Args:
        sample_nlq: Text to split.
        split_criteria: Delimiter handed to ``str.split``; it is not included
            in either returned piece.

    Returns:
        A ``(left_token, right_token)`` tuple holding the text before and
        after the first delimiter.

    Raises:
        ValueError: If the text cannot be cut into exactly two pieces, i.e.
            the delimiter does not occur in ``sample_nlq``.
    """
    pieces = sample_nlq.split(split_criteria, 1)
    # With maxsplit=1 a successful split always yields exactly two pieces;
    # anything else means the delimiter was missing. Fail with a message that
    # names the delimiter instead of a bare tuple-unpacking error.
    if len(pieces) != 2:
        raise ValueError(
            "cannot split {!r}: delimiter {!r} not found".format(
                sample_nlq, split_criteria
            )
        )
    left_token, right_token = pieces
    return left_token, right_token

class nlq_format_checker:
    """Accumulate table/attribute mentions and SELECT/FROM text from NLQs.

    Each call to ``add_template`` inspects one natural-language query (NLQ),
    records which known tables and attributes it mentions, and appends a
    ``{"FROM": ..., "SELECT": ...}`` dictionary to ``context_group_set``.
    State accumulates across calls on the same instance.
    """

    def __init__(self):
        # Attribute (column) names known for each table.
        self.all_att_list = {'FRED': ['date', 'income'],
                             'WEGOVY': ['quarter', 'sales'],
                             'CDC': ['SUBTOPIC', 'SUBTOPIC_ID', 'CLASSIFICATION', 'CLASSIFICATION_ID', 'GROUP_NAME', 'GROUP_ID',
                                     'SUBGROUP', 'SUBGROUP_ID', 'ESTIMATE_TYPE', 'ESTIMATE_TYPE_ID', 'TIME_PERIOD',
                                     'TIME_PERIOD_ID', 'ESTIMATE', 'STANDARD_ERROR']}
        self.all_table_list = ['FRED', 'WEGOVY', 'CDC']

        # Filled by add_template; matches are appended on every call, so a
        # name mentioned by several queries appears several times.
        self.required_att_list = []
        self.required_table_list = []
        self.context_group_set = []

        # The fields below are initialised here but not read or written by
        # add_template.
        self.subquery = ""
        self.where = ""
        self.groupby = ""

        self.start_nlq = ["find", "group", "show", "retrieve", "calculate", "identify", "count", "list", "display"]
        self.att_nlq = []

    def add_template(self, sample_nlq):
        """Record the tables, attributes and SELECT/FROM text of one NLQ.

        The query is cut at the first occurrence of the substring "in the":
        the part before it (minus the leading word) becomes SELECT, and the
        part after it becomes FROM.

        Args:
            sample_nlq: The natural-language query text.

        Raises:
            ValueError: If the query has no "in the", or if either side of it
                lacks the space needed to separate the leading word. Nothing
                is recorded on the instance in that case.
        """
        # Validate the query shape first so a malformed query cannot leave
        # the required_* lists half-updated.
        # NOTE(review): this is a plain substring match, so "in the" inside
        # "within the" / "obtain the" also splits — confirm whether a
        # word-boundary match is wanted.
        before, marker, after = sample_nlq.partition("in the")
        if not marker:
            raise ValueError(
                "expected 'in the' in query: {!r}".format(sample_nlq)
            )
        _verb, verb_gap, selected = before.partition(" ")
        _lead, source_gap, source = after.partition(" ")
        if not verb_gap or not source_gap:
            raise ValueError(
                "cannot separate SELECT/FROM text around 'in the' in query: "
                "{!r}".format(sample_nlq)
            )

        # Tokenise on word characters so names followed by punctuation
        # ("CDC," or "CDC.") are still recognised; underscores stay inside
        # tokens, which keeps names such as SUBTOPIC_ID intact.
        words = set(re.findall(r"\w+", sample_nlq))

        self.required_table_list.extend(
            table for table in self.all_table_list if table in words
        )
        # Walk the attribute names themselves; iterating the dict directly
        # would only yield its keys, i.e. the table names.
        self.required_att_list.extend(
            attribute
            for attributes in self.all_att_list.values()
            for attribute in attributes
            if attribute in words
        )

        # A fresh dictionary per query, so earlier entries are never shared.
        context_group = {}
        # Drop only the sentence-ending period(s); periods inside the text
        # (e.g. "2.5 mg") must survive.
        context_group["FROM"] = source.rstrip().rstrip(".")
        # NOTE(review): SELECT is stored as a one-element tuple with its
        # trailing whitespace retained (existing behaviour), while FROM is a
        # plain string — confirm whether a plain string was intended here.
        context_group["SELECT"] = (selected,)

        self.context_group_set.append(context_group)

In [44]:
# Create one checker instance and feed it the basic "SELECT ... WHERE ..." sample query.
checker = nlq_format_checker()
checker.add_template("Find all records in the CDC data where the age is 65.")
# Bare last expression: displays the context groups collected so far.
checker.context_group_set 

[{'FROM': 'CDC data where the age is 65', 'SELECT': ('all records ',)}]

In [45]:
# Add the GROUP BY sample to the same checker; its context group is appended
# after the one already stored, so the displayed list grows by one entry.
checker.add_template("Group all entries in the FRED dataset by state and display each state's population.")
checker.context_group_set

[{'FROM': 'CDC data where the age is 65', 'SELECT': ('all records ',)},
 {'FROM': "FRED dataset by state and display each state's population",
  'SELECT': ('all entries ',)}]

In [46]:
# Add the range-query sample ("between 20 and 30") to the same checker and
# display every context group accumulated so far.
checker.add_template("Retrieve all records in the CDC data for patients aged between 20 and 30.")
checker.context_group_set

[{'FROM': 'CDC data where the age is 65', 'SELECT': ('all records ',)},
 {'FROM': "FRED dataset by state and display each state's population",
  'SELECT': ('all entries ',)},
 {'FROM': 'CDC data for patients aged between 20 and 30',
  'SELECT': ('all records ',)}]

Example queries:

In [None]:
### 1) `SELECT ... WHERE ...`
   - **Example Query:** "Find all records in the CDC data where the age is 65."

### 2) `SELECT ... GROUP BY ...`
   - **Example Query:** "Group all entries in the FRED dataset by state and display each state's population."

#### 2.1) `SELECT ... GROUP BY ... HAVING ...`
   - **Example Query:** "Show all cities in the CDC data that have more than 10,000 cases, grouped by city."

### 3) `SELECT ... WHERE ... ORDER BY ... LIMIT ...`
   - **Example Query:** "Retrieve the top 5 states from the WEGOVY dataset where the usage is highest, sorted by usage in descending order."

### 4) Aggregation functions:

#### 4.1) `SELECT SUM() ... FROM ...`
   - **Example Query:** "Calculate the total number of vaccinations recorded in the CDC data."

#### 4.2) `SELECT AVG() ... FROM ...`
   - **Example Query:** "Find the average number of hospitalizations in each region of the FRED dataset."

#### 4.3) `SELECT MIN() ... FROM ...`
   - **Example Query:** "Identify the minimum dosage amount in the WEGOVY dataset."

#### 4.4) `SELECT MAX() ... FROM ...`
   - **Example Query:** "Find the highest population among all counties in the FRED dataset."

#### 4.5) `SELECT COUNT() ... FROM ...`
   - **Example Query:** "Count the total number of entries in the CDC data."

### 5) Patterns of WHERE clauses:

#### 5.1) Basic Comparison
   - **Example Query:** "List all records in the WEGOVY dataset where the dosage is exactly 2 mg."

#### 5.2) Range Queries
   - **Example Query:** "Retrieve all records in the CDC data for patients aged between 20 and 30."
   - **Example Query:** "Find entries in the CDC dataset where the blood pressure reading is greater than 120."

#### 5.3) Pattern Matching
   - **Example Query:** "List all counties in the FRED data that contain 'New' in their names."
   - **Example Query:** "Find all records in the CDC dataset where the region code is 'CA'."

#### 5.4) Checking for NULL
   - **Example Query:** "Show all entries in the CDC data where the vaccination status is unknown."

#### 5.5) Using IN
   - **Example Query:** "Display all records in the FRED dataset where the state is either 'California', 'Texas', or 'New York'."

#### 5.6) Logical Conditions
   - **Example Query (AND):** "List all CDC records where age is above 50 and has received a vaccination."
   - **Example Query (OR):** "Retrieve FRED dataset records where the state is 'Texas' or population is above 1 million."
   - **Example Query (NOT):** "Find CDC data entries where the vaccination status is not 'complete'."

#### 5.7) Using EXISTS
   - **Example Query:** "List all patients in the CDC dataset who have a recorded entry in the FRED dataset."

#### 5.8) Subqueries
   - **Example Query:** "Retrieve all records in the CDC data where the population is equal to the maximum population recorded in the FRED dataset."

#### 5.9) Conditional Aggregations
   - **Example Query:** "List all regions in the CDC data where there are more than 1,000 cases recorded, grouped by region."

#### 5.10) Case-Insensitive Filtering
   - **Example Query:** "Find all CDC entries where the condition name matches 'covid' (case-insensitive)."
