In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


# Use the "meta-llama/Llama-3.2-1B-Instruct" model
model_name = "Qwen/Qwen2-0.5B-Instruct"  # Llama 1B instruction-following model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Define the text-generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)




tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cuda:0


In [2]:
# Before Fine Tuning

# System instruction for formatting SQL and explanation
system_instruction = (
    "Translate the natural language query into SQL. "
    "Return the SQL code in a separate code block with an explanation."
    "Explain the SQL code in simple words"
)

# User query with schema and request
user_query = """
What is the total number of animals adopted by each community?

CREATE TABLE CommunityEducation(Community VARCHAR(20), AnimalsAdopted INT);

INSERT INTO CommunityEducation VALUES ('CommunityA', 35), ('CommunityB', 28), ('CommunityC', 42);

"""


# Expected: SELECT Community, SUM(AnimalsAdopted) FROM CommunityEducation GROUP BY Community;
# Construct prompt
prompt = f"{system_instruction}\n\nUser Query:\n{user_query}"

# Generate SQL output
with torch.no_grad():  # Disable gradient tracking to save memory
    response = pipe(prompt, max_length=1024, do_sample=True, temperature=0.7)

# Print the generated SQL query and explanation
print(response[0]['generated_text'])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Translate the natural language query into SQL. Return the SQL code in a separate code block with an explanation.Explain the SQL code in simple words

User Query:

What is the total number of animals adopted by each community?

CREATE TABLE CommunityEducation(Community VARCHAR(20), AnimalsAdopted INT);

INSERT INTO CommunityEducation VALUES ('CommunityA', 35), ('CommunityB', 28), ('CommunityC', 42);

SELECT Community, COUNT(*) AS TotalAnimalsAdopted
FROM CommunityEducation
GROUP BY Community;


In [3]:

# Expected: SELECT Community, SUM(AnimalsAdopted) FROM CommunityEducation GROUP BY Community;
# Construct prompt
prompt = f"{system_instruction}\n\nUser Query:\n{user_query}"

# Generate SQL output
with torch.no_grad():  # Disable gradient tracking to save memory
    response = pipe(prompt, max_length=1024, do_sample=True, temperature=0.7)

# Print the generated SQL query and explanation
print(response[0]['generated_text'])


Translate the natural language query into SQL. Return the SQL code in a separate code block with an explanation.Explain the SQL code in simple words

User Query:

What is the total number of animals adopted by each community?

CREATE TABLE CommunityEducation(Community VARCHAR(20), AnimalsAdopted INT);

INSERT INTO CommunityEducation VALUES ('CommunityA', 35), ('CommunityB', 28), ('CommunityC', 42);

SELECT Community, SUM(AnimalsAdopted) AS TotalAnimalsAdopted
FROM CommunityEducation
GROUP BY Community;


## Get the data

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [5]:
from datasets import load_dataset
import pandas as pd

# Load the dataset
dataset = load_dataset("gretelai/synthetic_text_to_sql", split='train')

# Filter rows where 'domain' is 'wildlife conservation'
filtered_dataset = dataset.filter(lambda x: x['domain'] == 'wildlife conservation')

# Convert filtered dataset to a pandas DataFrame for easier manipulation
df = filtered_dataset.to_pandas()

# Remove rows with any null values
df_cleaned = df.dropna()

# Take only the first 250 rows
# df_cleaned = df_cleaned.head(500)

# Convert back to a Hugging Face Dataset for further processing
cleaned_dataset = dataset.from_pandas(df_cleaned)

# Check the number of rows after cleaning
print(f"Number of rows after removing nulls: {len(df_cleaned)}")

# Now you can use the cleaned dataset for fine-tuning


README.md:   0%|          | 0.00/8.18k [00:00<?, ?B/s]

(…)nthetic_text_to_sql_train.snappy.parquet:   0%|          | 0.00/32.4M [00:00<?, ?B/s]

(…)ynthetic_text_to_sql_test.snappy.parquet:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5851 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100000 [00:00<?, ? examples/s]

Number of rows after removing nulls: 996


In [6]:
# Set the pad_token to the eos_token
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to the end of sequence token


def tokenize_function(examples):
    # Tokenize the examples and shift the labels by one for causal language modeling
    inputs = tokenizer(examples['sql_prompt'], truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(examples['sql'], truncation=True, padding="max_length", max_length=128)

    # Shift labels to the right by one position
    inputs['labels'] = labels['input_ids']

    return inputs

# Tokenize dataset and add labels
tokenized_dataset = cleaned_dataset.map(tokenize_function, batched=True)


print(tokenized_dataset[0])

# Check the modified dataset
print(tokenized_dataset['input_ids'])
print(tokenized_dataset['labels'])



Map:   0%|          | 0/996 [00:00<?, ? examples/s]

{'id': 5123, 'domain': 'wildlife conservation', 'domain_description': 'Animal population data, habitat preservation efforts, and community education programs.', 'sql_complexity': 'basic SQL', 'sql_complexity_description': 'basic SQL with a simple select statement', 'sql_task_type': 'analytics and reporting', 'sql_task_type_description': 'generating reports, dashboards, and analytical insights', 'sql_prompt': 'Retrieve the names and species of all animals that weigh more than 500 kg', 'sql_context': "CREATE TABLE animals (id INT, name VARCHAR(20), species VARCHAR(20), weight DECIMAL(5, 2)); INSERT INTO animals (id, name, species, weight) VALUES (1, 'Elephant', 'African', 6000), (2, 'Lion', 'African', 400), (3, 'Hippo', 'African', 3000), (4, 'Tiger', 'Asian', 300), (5, 'Crane', 'African', 100), (6, 'Rhinoceros', 'African', 2000), (7, 'Zebra', 'African', 450), (8, 'Giraffe', 'African', 1200), (9, 'Bear', 'Eurasian', 600), (10, 'Crocodile', 'Nile', 700);", 'sql': 'SELECT name, species FROM

In [8]:



from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,                      # Rank of the LoRA matrices
    lora_alpha=16,            # Scaling factor
    lora_dropout=0.1,         # Dropout probability
    target_modules=["q_proj", "v_proj"],  # Target specific layers (LLM attention heads)
    bias="none",
    task_type="CAUSAL_LM",    # For causal language modeling
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_loraQ",
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=8,
    save_steps=300,
    fp16=True,
    save_total_limit=2,
    push_to_hub=False,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

tokenizer.pad_token = tokenizer.eos_token  # Set padding token to the end of sequence token

# Train the model
trainer.train()

# Save LoRA adapter
model.save_pretrained("./fine_tuned_loraQ")
tokenizer.save_pretrained("./fine_tuned_loraQ")



  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkaushikred293[0m ([33mkaushikred293-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,3.3791
1000,1.6222


('./fine_tuned_loraQ/tokenizer_config.json',
 './fine_tuned_loraQ/special_tokens_map.json',
 './fine_tuned_loraQ/vocab.json',
 './fine_tuned_loraQ/merges.txt',
 './fine_tuned_loraQ/added_tokens.json',
 './fine_tuned_loraQ/tokenizer.json')

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

# Load the base model
base_model_name = "Qwen/Qwen2-0.5B-Instruct"    # Replace with your model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the fine-tuned LoRA adapter
fine_tuned_model = PeftModel.from_pretrained(base_model, "./fine_tuned_loraQ")

# Merge LoRA adapter into base model (optional, for inference)
fine_tuned_model = fine_tuned_model.merge_and_unload()

# Move the model to GPU (if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model.to(device)

# Define the text-generation pipeline
pipe_fine_tuned = pipeline("text-generation", model=fine_tuned_model, tokenizer=tokenizer)

# System instruction for formatting SQL and explanation
system_instruction = (
    "Translate the natural language query into SQL. "
    "Return the SQL code in a separate code block with an explanation."

)

# User query with schema and request
user_query = """
What is the total number of animals adopted by each community?

CREATE TABLE CommunityEducation(Community VARCHAR(20), AnimalsAdopted INT);

INSERT INTO CommunityEducation VALUES ('CommunityA', 35), ('CommunityB', 28), ('CommunityC', 42);
"""

# Construct prompt
prompt = f"{system_instruction}\n\nUser Query:\n{user_query}"

# Generate SQL output from the fine-tuned model
with torch.no_grad():  # Disable gradient tracking to save memory
    response = pipe_fine_tuned(prompt,
                           max_length=1024,
                           do_sample=True,
                           temperature=0.7,
                           repetition_penalty=1.2,
                           top_k=50,
                           top_p=0.9)

# Print the generated SQL query and explanation from the fine-tuned model
print(response[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Translate the natural language query into SQL. Return the SQL code in a separate code block with an explanation.

User Query:

What is the total number of animals adopted by each community?

CREATE TABLE CommunityEducation(Community VARCHAR(20), AnimalsAdopted INT);

INSERT INTO CommunityEducation VALUES ('CommunityA', 35), ('CommunityB', 28), ('CommunityC', 42);
SELECT Community, COUNT(*) AS TotalAnimalsAdoption
FROM CommunityEducation
GROUP BY Community;


In [12]:
!pip install gradio -q

import gradio as gr

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.1/322.1 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m131.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:

def generate_sql(query, context):
    system_instruction = (
        "Translate the natural language query into SQL. "
        "Return the SQL code in a separate code block with an explanation."
    )
    prompt = f"{system_instruction}\n\nContext:\n{context}\n\nUser Query:\n{query}"

    with torch.no_grad():  # Disable gradient tracking to save memory
        response = pipe_fine_tuned(prompt,
                               max_length=1024,
                               do_sample=True,
                               temperature=0.7,
                               repetition_penalty=1.2,
                               top_k=50,
                               top_p=0.9)

    return response[0]['generated_text']

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# SQL Query Generator Qwen")
    gr.Markdown("Enter your natural language query and database context to generate SQL")

    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(
                label="Natural Language Query",
                placeholder="Find all customers who made purchases over $1000",
                lines=3
            )

            context_input = gr.Textbox(
                label="Database Context",
                placeholder="Tables: customers(id, name, email), orders(id, customer_id, amount, date)",
                lines=5
            )

            submit_btn = gr.Button("Generate SQL")

        with gr.Column():
            output = gr.Markdown(label="Generated SQL")

    submit_btn.click(
        fn=generate_sql,
        inputs=[query_input, context_input],
        outputs=output
    )

# Launch the interface
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e10091f299dfec9454.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [21]:
import gradio as gr
import torch

def generate_sql(query, context):
    system_instruction = (
        "Translate the natural language query into SQL. "
        "Return the SQL code in a separate code block with an explanation."
    )
    prompt = f"{system_instruction}\n\nContext:\n{context}\n\nUser Query:\n{query}"

    with torch.no_grad():  # Disable gradient tracking to save memory
        response = pipe(prompt,
                               max_length=2048,
                               do_sample=True,
                               temperature=0.7,
                               repetition_penalty=1.2,
                               top_k=50,
                               top_p=0.9)

    return response[0]['generated_text']

# Custom CSS for styling
custom_css = """
.gradio-container {
    background-color: #000000;
}
.gradio-header {
    background: linear-gradient(to right, #4B79A1, #283E51);
    color: white !important;
}
.gradio-button {
    background-color: #4B79A1 !important;
    color: white !important;
}
.gradio-button:hover {
    background-color: #283E51 !important;
}
.gradio-input-box, .gradio-output-box {
    border: 1px solid #000000 !important;
    border-radius: 8px !important;
    background-color: black !important;
}
"""

# Create the Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🔍 SQL Query Generator Qwen Pre-FT")
    gr.Markdown("Enter your natural language query and database context to generate SQL")

    with gr.Row():
        with gr.Column():
            query_input = gr.Textbox(
                label="Natural Language Query",
                placeholder="Find all customers who made purchases over $1000",
                lines=3,
                elem_classes="gradio-input-box"
            )

            context_input = gr.Textbox(
                label="Database Context",
                placeholder="Tables: customers(id, name, email), orders(id, customer_id, amount, date)",
                lines=5,
                elem_classes="gradio-input-box"
            )

            submit_btn = gr.Button("Generate SQL", elem_classes="gradio-button")

        with gr.Column():
            output = gr.Markdown(label="Generated SQL", elem_classes="gradio-output-box")

    submit_btn.click(
        fn=generate_sql,
        inputs=[query_input, context_input],
        outputs=output
    )

    gr.HTML("""
    <div style="text-align: center; margin-top: 20px; padding: 10px; background-color: #283E51; color: white; border-radius: 8px;">
        <p>SQL Query Generator powered by Transformer Models</p>
    </div>
    """)

# Launch the interface
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://45cf11bb91c5af5f71.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


