# Fine-Tuning GPT2 for SQL

In [1]:
import json
import os

import datasets
import pandas as pd
import torch
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling

2024-10-06 13:39:27.847377: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Environment
Check the environment: use a CUDA GPU if one is available, otherwise fall back to the CPU.

In [2]:
# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cpu


## Data
Load the first 1% of the training split of the `b-mc2/sql-create-context` dataset.

In [6]:
# Load only the first 1% of the training split to keep this run small.
# The loader is called through the `datasets` module rather than the bare
# `load_dataset` name: a helper with that same name is defined further down in
# this notebook, so once that cell has run the bare name no longer refers to
# the Hugging Face loader and re-running this cell would fail.
dataset = datasets.load_dataset("b-mc2/sql-create-context", split="train[:1%]")
dataset

Using the latest cached version of the dataset since b-mc2/sql-create-context couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/johnmoses/.cache/huggingface/datasets/b-mc2___sql-create-context/default/0.0.0/9d80a6a118b838d9defc3798d659a54a2ac2ff37 (last modified on Mon Sep 16 10:40:01 2024).


Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 786
})

In [7]:
# Inspect the first record to see the fields each example provides.
dataset[0]

{'context': 'CREATE TABLE head (age INTEGER)',
 'question': 'How many heads of the departments are older than 56 ?',
 'answer': 'SELECT COUNT(*) FROM head WHERE age > 56'}

Write the question/answer pairs from the dataset to a plain-text training file.

In [8]:
def generate_text_dataset(data, train_name):
    """Write question/answer pairs to a plain-text training file.

    Every entry is written in this layout, followed by a blank line::

        [Q] <question>
        [A] <answer>

    Args:
        data: Iterable of mappings that each provide ``'question'`` and
            ``'answer'`` keys. It is traversed exactly once, so one-shot
            iterables such as generators are handled correctly.
        train_name: Path of the text file to create. An existing file is
            overwritten.

    Raises:
        KeyError: If an entry lacks ``'question'`` or ``'answer'``. The pairs
            are collected before the file is opened, so in that case the
            target file is left untouched.
    """
    # Collect the pairs in a single pass over `data`, before touching the file.
    pairs = [(entry['question'], entry['answer']) for entry in data]

    # Pin UTF-8 so the bytes written do not depend on the platform's default
    # text encoding (non-ASCII questions would otherwise be written
    # differently, or fail to encode, on some systems).
    with open(train_name, 'w', encoding='utf-8') as text_file:
        for q, a in pairs:
            text_file.write(f"[Q] {q}\n[A] {a}\n\n")

In [9]:
# Write the training corpus to train.txt (relative path, so it lands in the
# current working directory).
generate_text_dataset(dataset, 'train.txt')

## Model
Define the base model, the output directory, and the training hyperparameters.

In [10]:
# Training corpus: train.txt inside the current working directory.
train_file_path = os.path.join(os.getcwd(), "train.txt")

# Pretrained checkpoint to start from, and where results are written.
model_name = "gpt2"
output_dir = "gpt2-text2sql-v1"
overwrite_output_dir = False

# Training hyperparameters handed to train().
per_device_train_batch_size = 8
num_train_epochs = 20
save_steps = 50_000

Utility functions

In [11]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a plain-text file.

    NOTE(review): this definition rebinds the name ``load_dataset`` that the
    notebook's import cell brought in from ``datasets``; after this cell runs,
    the bare name refers to this helper instead.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer forwarded as ``tokenizer``.
        block_size: Forwarded as ``block_size`` (default 128); presumably the
            number of tokens per training example — confirm against the
            ``TextDataset`` documentation.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

In [12]:
def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for the given tokenizer.

    Args:
        tokenizer: Tokenizer forwarded as ``tokenizer``.
        mlm: Forwarded as ``mlm``; defaults to ``False``.

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [13]:
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 language model on a text file and save it.

    Args:
        train_file_path: Path of the plain-text training corpus.
        model_name: Name or path of the pretrained checkpoint used for both
            the tokenizer and the model.
        output_dir: Directory the tokenizer, model, and training outputs are
            written to.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. It was previously
            accepted but ignored, so the library's default save interval
            applied regardless of the value passed in.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not handed the tokenizer, so store it explicitly
    # next to the model files.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights to output_dir; presumably
    # trainer.save_model() below replaces them once training finishes.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Honour the caller's checkpoint interval instead of dropping it.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [14]:
%%time
# Run fine-tuning with the settings defined in the configuration cell above;
# the %%time cell magic reports CPU and wall-clock time for the whole run.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



  0%|          | 0/820 [00:00<?, ?it/s]

{'loss': 0.9177, 'grad_norm': 4.5685930252075195, 'learning_rate': 1.9512195121951222e-05, 'epoch': 12.2}
{'train_runtime': 543.0562, 'train_samples_per_second': 12.08, 'train_steps_per_second': 1.51, 'train_loss': 0.7455828224740377, 'epoch': 20.0}
CPU times: user 4min 28s, sys: 23.1 s, total: 4min 51s
Wall time: 9min 5s


## Inference

## Evaluation