# Fine-Tuning GPT2 for SQL

In [1]:
import os
import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import TextDataset, DataCollatorForLanguageModeling

2024-09-27 08:02:56.662011: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Load the SQL create-context dataset (`b-mc2/sql-create-context`), using only the first 1% of the training split

In [20]:
import datasets

# Call the loader through the module instead of importing the bare name
# `load_dataset`: this notebook later defines its own `load_dataset` helper,
# which would replace an imported function of the same name and make this
# cell fail when it is re-run after that definition.
dataset = datasets.load_dataset("b-mc2/sql-create-context", split='train[:1%]')

In [21]:
# Show the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 786
})

In [22]:
def train_text_generate_question_answer(data, train_name):
    """Write question/answer pairs to a plain-text training file.

    Every entry is written as one record of the form::

        [Q] <question>
        [A] <answer>
        <blank line>

    Args:
        data: Iterable of entries indexable by the keys 'question' and
            'answer'. It is consumed in a single pass, so one-shot iterables
            such as generators are supported.
        train_name: Path of the text file to create; an existing file is
            overwritten.
    """
    # Format all records before opening the file: a malformed entry then
    # raises before any existing file at `train_name` is truncated.
    records = [
        f"[Q] {entry['question']}\n[A] {entry['answer']}\n\n"
        for entry in data
    ]

    # Explicit UTF-8 keeps the output independent of the platform's default
    # locale encoding, which may not be able to represent every character.
    with open(train_name, 'w', encoding='utf-8') as text_file:
        text_file.writelines(records)

In [25]:
# Export the loaded question/answer pairs to 'train.txt' (relative path, so it
# lands in the current working directory).
train_text_generate_question_answer(data=dataset, train_name='train.txt')

In [26]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a `TextDataset` over a plain-text file.

    This is a local helper wrapping `transformers.TextDataset`; it is not the
    `load_dataset` function of the Hugging Face `datasets` library.

    Args:
        file_path: Path of the text file, forwarded as `file_path`.
        tokenizer: Tokenizer instance, forwarded as `tokenizer`.
        block_size: Forwarded as `block_size` (defaults to 128).

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(
        file_path=file_path,
        tokenizer=tokenizer,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm = False):
    """Create the `DataCollatorForLanguageModeling` used for batching.

    Args:
        tokenizer: Tokenizer instance, forwarded as `tokenizer`.
        mlm: Forwarded as the collator's `mlm` option (defaults to False).

    Returns:
        The constructed `DataCollatorForLanguageModeling`.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 language model on a plain-text file.

    Loads the tokenizer and model identified by `model_name`, builds the
    training dataset and collator with this notebook's `load_dataset` and
    `load_data_collator` helpers, runs `Trainer.train()`, and saves the
    result with `Trainer.save_model()`.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path passed to `from_pretrained` for both the
            tokenizer and the model.
        output_dir: Directory receiving the tokenizer, the model and any
            checkpoints.
        overwrite_output_dir: Forwarded to `TrainingArguments`.
        per_device_train_batch_size: Forwarded to `TrainingArguments`.
        num_train_epochs: Forwarded to `TrainingArguments`.
        save_steps: Forwarded to `TrainingArguments`; controls how often
            intermediate checkpoints are written.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model files in output_dir.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights to output_dir before training.
    # NOTE(review): presumably replaced by trainer.save_model() below, which
    # is called without a path - confirm it targets the same directory.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by this function but never forwarded, so the
        # library default checkpoint interval was used instead of the
        # caller's value.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [27]:
# --- Fine-tuning configuration ---------------------------------------------

# Input text file and pretrained checkpoint to start from.
train_file_path = os.path.abspath("train.txt")
model_name = 'gpt2'

# Where the tokenizer, model and checkpoints are written.
output_dir = 'model'
overwrite_output_dir = False

# Optimisation schedule.
per_device_train_batch_size = 8
num_train_epochs = 20
save_steps = 50_000

In [28]:
%%time
# Run fine-tuning with the settings assigned in the configuration cell.
train(
    train_file_path=train_file_path, model_name=model_name,
    output_dir=output_dir, overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs, save_steps=save_steps,
)



  0%|          | 0/820 [00:00<?, ?it/s]

{'loss': 0.9177, 'grad_norm': 4.568658828735352, 'learning_rate': 1.9512195121951222e-05, 'epoch': 12.2}
{'train_runtime': 543.0141, 'train_samples_per_second': 12.081, 'train_steps_per_second': 1.51, 'train_loss': 0.7455823758753335, 'epoch': 20.0}
CPU times: user 4min 24s, sys: 26.4 s, total: 4min 50s
Wall time: 9min 5s
