In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoModelForSeq2SeqLM
import torch
import bitsandbytes
import pandas as pd
from sqlglot import parse_one, exp, parse, table, column, to_identifier
from Levenshtein import distance
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

# --- Vietnamese -> English translation model -------------------------------
translation_model_name = 'VietAI/envit5-translation'
translation_tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name).to('cuda')

# --- Sentence-embedding model -----------------------------------------------
# Loaded before the SQL model so that the free-memory check below already
# accounts for every other model that shares the GPU.
embedding_model_name = 'mixedbread-ai/mxbai-embed-large-v1'
embedding_model = SentenceTransformer(embedding_model_name, device='cuda')

# --- Text-to-SQL model --------------------------------------------------------
sql_coder_name = "defog/sqlcoder-7b-2"
sql_tokenizer = AutoTokenizer.from_pretrained(sql_coder_name)

# Minimum free GPU memory required to load the SQL model in fp16.
# The model name suggests ~7B parameters, i.e. roughly 13 GiB of fp16 weights;
# the rest is headroom for generation. This is an estimate -- tune if needed.
# A previous run of this notebook on a 14.75 GiB GPU ended in CUDA
# OutOfMemoryError, so smaller GPUs fall back to 4-bit weights instead.
SQL_MODEL_FP16_MIN_FREE_BYTES = 16 * 1024 ** 3

sql_model_kwargs = dict(
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map='auto',
)
if torch.cuda.mem_get_info()[0] < SQL_MODEL_FP16_MIN_FREE_BYTES:
    # 4-bit weights with fp16 compute (bitsandbytes).
    sql_model_kwargs['quantization_config'] = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

sql_model = AutoModelForCausalLM.from_pretrained(sql_coder_name, **sql_model_kwargs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [None]:
def convert_to_mysql(query):
    """
    Convert SQL queries to be MySQL compatible

    The following rewrites are applied, all of them only OUTSIDE single-quoted
    string literals so that data values are never modified:

    * double-quoted identifiers  -> backtick-quoted identifiers
    * ``ILIKE``                  -> ``LIKE`` (any letter case)
    * ``true`` / ``false``       -> ``TRUE`` / ``FALSE`` (any letter case)
    * ``LIMIT n OFFSET m``       -> ``LIMIT m, n`` (any letter case/spacing)
    * ``a || b || c``            -> ``CONCAT(a, b, c)``

    The ``||`` rewrite is deliberately conservative: a chain is converted only
    when every operand is a string literal, a number, a (qualified) identifier
    or a function call without nested parentheses, and the chain is not glued
    to another operator. Anything more complex is left exactly as written
    rather than risk producing a different expression.

    Args:
        query (str): The input SQL query

    Returns:
        str: MySQL compatible query
    """
    # Imported locally: the notebook's import cell does not import `re`.
    import re

    query = query.strip()
    # Sentinel produced by the model when it cannot answer; pass it through.
    if query == "SELECT 'I do not know'":
        return query

    # A single-quoted SQL string literal; '' is an escaped quote inside it.
    literal = r"'(?:[^']|'')*'"

    # re.split with one capturing group yields alternating pieces:
    # even indexes are code, odd indexes are string literals.
    pieces = re.split("(" + literal + ")", query)
    for idx in range(0, len(pieces), 2):
        code = pieces[idx].replace('"', '`')
        code = re.sub(r"\bILIKE\b", "LIKE", code, flags=re.IGNORECASE)
        code = re.sub(r"\btrue\b", "TRUE", code, flags=re.IGNORECASE)
        code = re.sub(r"\bfalse\b", "FALSE", code, flags=re.IGNORECASE)
        code = re.sub(
            r"\bLIMIT\s+(\d+)\s+OFFSET\s+(\d+)",
            r"LIMIT \2, \1",
            code,
            flags=re.IGNORECASE,
        )
        pieces[idx] = code
    query = "".join(pieces)

    # --- `||` concatenation -> CONCAT(...) ---------------------------------
    identifier = r"(?:`[^`]*`|[A-Za-z_]\w*)"
    qualified_name = identifier + r"(?:\." + identifier + r")*"
    call_args = r"\((?:[^()']|" + literal + r")*\)"
    operand = (
        r"(?:" + literal
        + r"|" + qualified_name + r"(?:" + call_args + r")?"
        + r"|\d+(?:\.\d+)?)"
    )
    chain_or_literal = re.compile(
        # Must not start in the middle of a token ...
        r"(?<![\w.`'])(" + operand + r"(?:\s*\|\|\s*" + operand + r")+)"
        # ... nor stop in the middle of one, nor right before another operator.
        + r"(?![\w`'(\[.:])(?!\s*(?:\|\||[-+*/%]))"
        # Stand-alone literals are matched (and kept) so that a `||` inside
        # a string value is never seen as an operator.
        + r"|" + literal
    )

    def _to_concat(match):
        chain = match.group(1)
        if chain is None:
            return match.group(0)
        preceding = match.string[:match.start(1)].rstrip()
        if preceding and preceding[-1] in ")]|+-*/%:":
            # The chain continues a larger expression; leave it untouched.
            return match.group(0)
        return "CONCAT(" + ", ".join(re.findall(operand, chain)) + ")"

    return chain_or_literal.sub(_to_concat, query)

def generate_query(prompt, device='cuda'):
    """
    Run the SQL model on a prompt and return the text after the last "[SQL]" tag.

    Args:
        prompt (str): Full prompt text handed to the tokenizer.
        device (str): Device the tokenized inputs are moved to.

    Returns:
        str: The part of the first decoded sequence that follows the last
        "[SQL]" marker (the whole decoded text if the marker is absent).
    """
    encoded = sql_tokenizer(prompt, return_tensors="pt").to(device)

    # Greedy decoding: one beam, no sampling, a single returned sequence.
    generation_options = {
        "num_return_sequences": 1,
        "eos_token_id": sql_tokenizer.eos_token_id,
        "pad_token_id": sql_tokenizer.eos_token_id,
        "max_new_tokens": 400,
        "do_sample": False,
        "num_beams": 1,
    }
    token_ids = sql_model.generate(**encoded, **generation_options)

    decoded = sql_tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    first_sequence = decoded[0]
    # rpartition keeps everything after the final marker, or the full text
    # when the marker does not occur.
    return first_sequence.rpartition("[SQL]")[2]

class queryPostprocessing:
    """
    Repair table and column names of a generated SQL query against table metadata.

    The query is upper-cased and cut at the first ';' on construction.
    `table_metadata` is a dict with the keys 'table_name' (str) and
    'columns' (list of str).

    Attributes:
        query (str): Upper-cased first statement of the input query.
        table_metadata (dict): Metadata the query is checked against.
        col_mapping (dict): query column name -> column name to use.
        table_mapping (dict): query table name -> table name to use.
        embedding_model_name (str): Name of the sentence-embedding model.
        embedding_model: The SentenceTransformer, loaded on first access.
    """

    # Loaded embedding models keyed by model name. Shared by all instances so
    # that constructing a new post-processor does not load another copy.
    _embedding_models = {}

    def __init__(self, query, table_metadata, embedding_model_name):
        self.query = query.upper().split(';')[0]
        self.table_metadata = table_metadata
        self.col_mapping = {}
        self.table_mapping = {}
        self.embedding_model_name = embedding_model_name
        self._embedding_model = None

    @property
    def embedding_model(self):
        """SentenceTransformer used for column matching (loaded lazily, cached per name)."""
        if self._embedding_model is None:
            cache = queryPostprocessing._embedding_models
            if self.embedding_model_name not in cache:
                cache[self.embedding_model_name] = SentenceTransformer(
                    self.embedding_model_name, device='cuda'
                )
            self._embedding_model = cache[self.embedding_model_name]
        return self._embedding_model

    @embedding_model.setter
    def embedding_model(self, model):
        # Allows callers to hand in an already loaded model.
        self._embedding_model = model

    def getIncorrectColumns(self):
        """
        Fill and return `col_mapping` for every column referenced in the query.

        Returns:
            dict: query column name -> column name from the table metadata.
        """
        query_columns = [
            col.name
            for col in parse_one(self.query).find_all(exp.Column)
            # Skip `t.*`-style columns: they carry no identifier to repair.
            if isinstance(col.this, exp.Identifier)
        ]
        return self._map_columns(query_columns)

    def _map_columns(self, query_columns):
        """Map each query column to an exact, case-insensitive, or most similar table column."""
        table_columns = list(self.table_metadata['columns'])

        # The query is upper-cased in __init__, so metadata given in another
        # letter case must still count as an exact match.
        canonical = {}
        for name in table_columns:
            canonical.setdefault(name.upper(), name)

        taken = set()
        unknown = []
        # Sorted so that the greedy assignment below is deterministic.
        for col in sorted(set(query_columns)):
            if col in table_columns:
                target = col
            elif col.upper() in canonical:
                target = canonical[col.upper()]
            else:
                unknown.append(col)
                continue
            self.col_mapping[col] = target
            taken.add(target)

        available = [name for name in table_columns if name not in taken]
        for col in unknown:
            if not available:
                # More unknown columns than spare table columns: nothing is
                # left to substitute, so keep the name as written.
                self.col_mapping[col] = col
                continue
            closest_col = self.embedding_distance(col, available)
            self.col_mapping[col] = closest_col
            available.remove(closest_col)

        return self.col_mapping

    def getIncorrectTables(self):
        """
        Map the alphabetically first table of the query to the metadata table name.

        Returns:
            dict: query table name (without alias) -> metadata table name.
        """
        table_names = set()
        for tbl in parse_one(self.query).find_all(exp.Table):
            # The tree is a throwaway parse, so dropping the alias is safe here.
            tbl.set('alias', None)
            table_names.add(tbl.sql())

        if table_names:
            self.table_mapping[min(table_names)] = self.table_metadata['table_name']

        return self.table_mapping

    def lv_distance(self, col, available_set):
        """
        Return the entry of `available_set` with the smallest Levenshtein distance to `col`.

        Raises:
            ValueError: if `available_set` is empty.
        """
        if len(available_set) == 0:
            raise ValueError("available_set must contain at least one column name")
        distances = [distance(col, candidate) for candidate in available_set]
        return available_set[int(np.argmin(distances))]

    def embedding_distance(self, col, available_set):
        """
        Return the entry of `available_set` whose embedding is most cosine-similar to `col`.

        Raises:
            ValueError: if `available_set` is empty.
        """
        if len(available_set) == 0:
            raise ValueError("available_set must contain at least one column name")
        col_embedding = self.embedding_model.encode([col])
        available_set_embedding = self.embedding_model.encode(available_set)
        similarity_list = cosine_similarity(col_embedding, available_set_embedding)
        return available_set[int(np.argmax(similarity_list))]

    @staticmethod
    def _replace_names(text, mapping):
        """Replace whole-word occurrences of the mapping keys in one pass."""
        import re

        names = sorted((name for name in mapping if name), key=len, reverse=True)
        if not names:
            return text
        # Lookarounds instead of \b: keys may start or end with quote characters.
        pattern = re.compile(
            r"(?<!\w)(?:" + "|".join(re.escape(name) for name in names) + r")(?!\w)"
        )
        return pattern.sub(lambda match: mapping[match.group(0)], text)

    def formatQuery(self):
        """
        Rewrite table and column names by text substitution.

        Names are replaced only as whole words, and each mapping is applied in
        a single pass so that one replacement cannot feed into another.

        Returns:
            str: The rewritten query text.
        """
        self.getIncorrectColumns()
        self.getIncorrectTables()

        updated_query = self._replace_names(self.query, self.table_mapping)
        updated_query = self._replace_names(updated_query, self.col_mapping)
        return updated_query

    def formatQuerySQLglot(self):
        """
        Rewrite table and column names on the parsed syntax tree.

        Returns:
            str: The rewritten query rendered in the MySQL dialect.
        """
        self.getIncorrectColumns()
        self.getIncorrectTables()
        query_ast = parse_one(self.query)

        # Names introduced by `AS ...` are not table columns; never rename them.
        for aliased in query_ast.find_all(exp.Alias):
            self.col_mapping.pop(aliased.alias, None)

        # Materialise the node list first: nodes are replaced inside the loop.
        for tbl in list(query_ast.find_all(exp.Table)):
            alias_node = tbl.args.get('alias')
            table_alias = tbl.alias if 'alias' in tbl.args else None
            # Temporarily drop the alias to obtain the bare table name.
            tbl.set('alias', None)
            table_name = tbl.sql()
            if table_name in self.table_mapping:
                new_table = table(table=self.table_mapping[table_name], quoted=False, alias=table_alias)
                tbl.replace(new_table)
            elif alias_node is not None:
                # Table is kept as is: put its alias back.
                tbl.set('alias', alias_node)

        for col in query_ast.find_all(exp.Column):
            column_name = col.this.this
            if column_name in self.col_mapping:
                col.this.set('this', self.col_mapping[column_name])

        return query_ast.sql(dialect='mysql')

In [None]:
# --- 1. Translate the user question ------------------------------------------
# The question is passed to the translation model with a 'vi: ' prefix.
inputs = ['vi: Tôi đang tìm kiếm những công việc liên quan tới excel']
outputs = translation_model.generate(translation_tokenizer(inputs, return_tensors='pt',padding=True).input_ids.to('cuda'),max_length=512)
# Keep the first decoded sequence and drop its first four characters
# (presumably a language tag such as 'en: ' mirroring the 'vi: ' input prefix -- TODO confirm).
outputs = translation_tokenizer.batch_decode(outputs, skip_special_tokens=True)[0][4:]

# --- 2. Load sample rows and describe the schema -----------------------------
df = pd.read_csv('./result.csv')
# Rename the CSV columns 'pic' and 'dc' to the names used in the schema below.
database_sample = df.rename(columns={'pic':'contributor',
                                     'dc':'design_center'})

# DDL text that is inserted verbatim into the prompt.
schema = """
    create table jidouka (
	  id int not null primary key auto_increment,
    innovation_name nvarchar(50) not null,
    task_type nvarchar(20) not null,
    tool nvarchar(50) not null,
    describe_innovation text,
    software nvarchar(50),
    product nvarchar(50),
    contributor nvarchar(30),
    design_center nvarchar(10),
    num_tasks int,
    saved_hours int,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    information text
)
"""

# The translated text becomes the question in the prompt.
question = outputs #they consider Python as software

# --- 3. Build the prompt -----------------------------------------------------
# Placeholders: {question}, {db_schema}, {database_sample}. The template ends
# with the "[SQL]" marker that generate_query splits the model output on.
prompt_template = """### Task
Generate a SQL query to answer [QUESTION]{question}[/QUESTION]

### instructions
- If you cannot answer the question with the available database schema, return 'I do not know'
- Follow Database Schema, if you meet the name that are not in schema, try to find synonyms in the schema
- Think step by step and always check schema and question and the column names before writing the
query.
## Database Schema
This query will run on a database whose schema is represented in this string: {db_schema}

## Database sample
You will see same sample based on the actual database to see which data need to be consider to each column: {database_sample}

##Type of SQL query
You need to create a valid command for MYSQL server


### Answer
Given the database schema, here is the SQL query that answer [QUESTION]{question}[/QUESTION] [SQL]
"""

# NOTE(review): `database_sample` is the whole renamed DataFrame, so its string
# form (not an explicit subset of rows) is what ends up in the prompt.
prompt = prompt_template.format(question=question, database_sample = database_sample,db_schema=schema)

# --- 4. Generate SQL ---------------------------------------------------------
generated_sql = generate_query(prompt, device='cuda')

# --- 5. Post-process table and column names ----------------------------------
# Table metadata handed to queryPostprocessing; the column names repeat those
# of the `schema` string above.
table_name = 'jidouka'
columns = ['id','innovation_name','task_type','tool','describe_innovation','software','product','contributor','design_center','num_tasks','saved_hours','created_at','information']

print('Running post-processing...')
try:
    # The query is upper-cased here and again inside queryPostprocessing.__init__.
    qp = queryPostprocessing(generated_sql.upper(),
                             {'table_name':table_name, 'columns':columns},
                             embedding_model_name='mixedbread-ai/mxbai-embed-large-v1')
    processed_query = qp.formatQuerySQLglot()
except Exception as e:
    # Best effort: on any failure print the error and fall back to the raw model output.
    print(e)
    processed_query = generated_sql

# Collapse doubled double-quotes into a single double-quote.
processed_query = processed_query.replace('""', '"')
print(f"Done! Processed query: {processed_query}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 13.06 MiB is free. Process 103301 has 14.73 GiB memory in use. Of the allocated memory 14.38 GiB is allocated by PyTorch, and 236.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)