In [81]:
from textwrap import dedent

In [103]:
import sys
sys.path.append('../utils/')
sys.path.append('../queryProcessing/')

from utils import *
from TableMapper import TableMapper

from tqdm.notebook import tqdm
tqdm.pandas()

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

class TQLRunner():

    def __init__(self, schema_id):

        if(schema_id is None):
            raise Exception("Schema ID is needed")

        self.query, self.schema = get_spider_schema_table_files()
        self.tableMapper = TableMapper(self.query, self.schema)

        self.s, self.t = self.tableMapper.get_filtered_schema(schema_id)

        print('All libraries loaded')

        # self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # self.tokenizer = T5Tokenizer.from_pretrained('t5-small')
        # self.model = T5ForConditionalGeneration.from_pretrained("/home/jupyter/model")
        # self.model.to(self.device)

        print('LLM Model initialized')


    def create_schema_natural_language(self, row):

        schema_id = row['schema_id']
        table_name = row['table_name']
        primary_key = row['primary_key']
        column_list = eval(row['column_list_original'])
        datatype_list = eval(row['column_datatypes'])
        foreign_key = eval(row['foreign_keys'])

        column_list_with_datatype = []
        for column, datatype in zip(column_list, datatype_list):
            column_list_with_datatype.append(' '.join([column, datatype]))


        schema_natural_language = f"CREATE TABLE {table_name} ({', '.join(column_list_with_datatype)}) which has {primary_key} as primary key"

        return schema_natural_language

    
    def get_schema_details(self):
        return self.schema


    def get_table_prompt(self, input_text):

        table_names_from_tql = self.tableMapper.get_table_names_tql(self.s, input_text)

        if(len(table_names_from_tql) == 0):
            raise Exception("No tables found, please repharse the query and try again")

        prompt_tables = []
        for i in table_names_from_tql:
            prompt_tables.append(
                self.s[self.s['table_name_original'] == i].apply(
                    self.create_schema_natural_language, axis = 1
                ).reset_index(drop = True).iloc[0]
            )

        return '\n\n'.join(prompt_tables)


    def get_final_prompt(self, input_text):

        table_prompt = self.get_table_prompt(input_text)
        
        prompt = f"""\
            ### Instructions:
            Your task is convert a question into a SQL query, given a Postgres database schema.\
            Adhere to these rules:
            - **Deliberately go through the question and database schema word by word** to appropriately answer the question
            - **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.
            - When creating a ratio, always cast the numerator as float

            ### Input:
            Generate a SQL query that answers the question '{input_text}`.\n
            This query will run on a database whose schema is represented in this string:

            {table_prompt}

            ### Response:
            Based on your instructions, here is the SQL query I have generated to answer the question `{input_text}`:
            ```sql
        """

        return prompt


    def get_SQL_query(self, input_text):

        prompt = self.get_final_prompt(input_text)
        print(prompt)
#         tokens = self.tokenizer(prompt, 
#                            return_tensors="pt", max_length=512, 
#                            truncation=True, padding="max_length")

#         outputs = self.model.generate(input_ids=tokens.input_ids.to(self.device), max_new_tokens = 512)
#         predicted_query = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return '' #predicted_query

In [104]:
tqlRunner = TQLRunner('college_2')

All libraries loaded
LLM Model initialized


In [105]:
tqlRunner.get_SQL_query('What are the room numbers and corresponding buildings for classrooms which can seat between 50 to 100 students?')

            ### Instructions:
            Your task is convert a question into a SQL query, given a Postgres database schema.            Adhere to these rules:
            - **Deliberately go through the question and database schema word by word** to appropriately answer the question
            - **Use Table Aliases** to prevent ambiguity. For example, `SELECT table1.col1, table2.col1 FROM table1 JOIN table2 ON table1.id = table2.id`.
            - When creating a ratio, always cast the numerator as float

            ### Input:
            Generate a SQL query that answers the question 'What are the room numbers and corresponding buildings for classrooms which can seat between 50 to 100 students?`.

            This query will run on a database whose schema is represented in this string:

            CREATE TABLE classroom (building text, room_number text, capacity number) which has building as primary key

CREATE TABLE student (ID text, name text, dept_name text, tot_cred number) wh

''

In [6]:
tqlRunner = TQLRunner('yelp')

All libraries loaded
LLM Model initialized


In [7]:
input_text = 'How many reviews are there in the database'
tqlRunner.get_SQL_query(input_text)

SQL query for: How many reviews are there in the database with tables: CREATE TABLE review (rid number, business_id text, user_id text, rating number, text text, year number, month text) which has rid as primary key


NameError: name 'predicted_query' is not defined

In [8]:
input_text = 'How many businesses are there in "Vegas"'
tqlRunner.get_SQL_query(input_text)

SQL query for: How many businesses are there in "Vegas" with tables: CREATE TABLE business (bid number, business_id text, name text, full_address text, city text, latitude text, longitude text, review_count number, is_open number, rating number, state text) which has bid as primary key


NameError: name 'predicted_query' is not defined

In [None]:
tqlRunner.get_SQL_query('How many professors are there in chem department')

In [None]:
# tqlRunner.get_SQL_query('How many professors are there in chem dept')

In [None]:
tqlRunner.get_SQL_query('students name beginning with "Nam"')

In [None]:
tqlRunner.get_SQL_query('What is the name and building of the departments whose budget is more than the average budget?')

In [None]:
tqlRunner = TQLRunner('student_assessment')

In [None]:
tqlRunner.get_SQL_query('What are the id of students who registered courses or attended courses?')