In [None]:
import torch
from datasets import Dataset
import pandas as pd
import numpy as np
import json
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

In [None]:
# Release cached, currently unused CUDA memory before loading data.
torch.cuda.empty_cache()

# Read the payload list as plain text instead of CSV. CSV parsing treats the
# commas and quotes that SQLi payloads are full of as delimiters/quoting, so
# such rows were silently skipped (on_bad_lines='skip') or merged, and strings
# like "NULL" were converted to NaN, which the tokenizer cannot handle.
# NOTE(review): assumes the file holds one payload per line — confirm.
MAX_PAYLOADS = 1000000  # same cap the original nrows= applied
payload_rows = []
with open('./data/SQLi/sqli.txt', encoding='utf-8') as payload_file:
    for raw_line in payload_file:
        if len(payload_rows) >= MAX_PAYLOADS:
            break
        payload_text = raw_line.rstrip('\r\n')
        # Blank lines carry no payload; skip them as read_csv did.
        if payload_text.strip():
            payload_rows.append(payload_text)

# Single string column named "payloads", as the rest of the notebook expects.
payloads = pd.DataFrame({"payloads": payload_rows})

In [None]:
# Convert the DataFrame to a Hugging Face Dataset. preserve_index=False keeps
# the pandas index from ever becoming an extra column, which matters because
# training runs with remove_unused_columns=False.
dataset = Dataset.from_pandas(payloads, preserve_index=False)

# 80/20 train/test split with a fixed seed so the partition (and therefore the
# evaluation numbers) is reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Checkpoint identifier shared by the tokenizer and the model weights.
model_name = "EleutherAI/gpt-neo-125m"

# Load the tokenizer and the causal-LM model from the same checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

In [None]:
# Load the attack-grammar vocabulary. JSON is read as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
# NOTE(review): the path originally started with '.data/' (a hidden directory)
# while the payload file is read from './data/'; treated as a typo — confirm.
with open('./data/attack-grammars/sqli/vocab.json/vocab.json', encoding='utf-8') as file:
    vocab = json.load(file)

# The keys of the vocabulary mapping are the token strings to add.
special_tokens = list(vocab.keys())
print(special_tokens)

In [None]:
# Register the grammar tokens with the tokenizer, then grow the model's
# embedding matrix to match the enlarged vocabulary.
tokenizer.add_tokens(special_tokens)
extended_vocab_size = len(tokenizer)
model.resize_token_embeddings(extended_vocab_size)

# Use the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the model for multi-GPU data parallelism. The isinstance guard keeps the
# cell idempotent: re-running it must not nest DataParallel inside
# DataParallel, because later code reaches the real model through a single
# `.module` hop.
if not isinstance(model, torch.nn.DataParallel):
    model = torch.nn.DataParallel(model)
model.to(device)

In [None]:
def _labels_ignoring_padding(token_ids, attention_mask):
    """Copy token_ids as labels, replacing padding with -100 except the first pad after real tokens."""
    labels = []
    previous_was_real = False
    for token_id, mask_bit in zip(token_ids, attention_mask):
        is_real = bool(mask_bit)
        # Keep real tokens, plus the single pad that directly follows them:
        # the pad token is the EOS token here, so that one position teaches
        # the model where a payload ends. Every other pad is ignored.
        labels.append(token_id if (is_real or previous_was_real) else -100)
        previous_was_real = is_real
    return labels


def tokenize_function(examples, max_length=128):
    """Tokenize a batch of payloads and build causal-LM labels.

    Args:
        examples: Batch mapping with a "payloads" entry holding the texts.
        max_length: Length every sequence is truncated/padded to (default 128).

    Returns:
        The tokenizer output with an added "labels" entry. Labels mirror
        "input_ids", but padding positions are set to -100 so they are
        excluded from the loss; without this, short payloads padded to
        max_length make the loss mostly about predicting padding.
    """
    encoding = tokenizer(
        examples["payloads"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    # Build fresh lists so labels never alias the input_ids lists.
    encoding["labels"] = [
        _labels_ignoring_padding(token_ids, attention_mask)
        for token_ids, attention_mask in zip(encoding["input_ids"], encoding["attention_mask"])
    ]
    return encoding

In [None]:
# Tokenize both splits in batches; the raw text column is dropped afterwards
# so only the tokenizer outputs and labels remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["payloads"],
)

In [None]:
# Display the tokenized splits as a sanity check before training.
tokenized_dataset

In [None]:
# Training configuration. Effective batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps = 16.
training_args = TrainingArguments(
    output_dir="./models/pretrain-models/gpt-neo-checkpoints",
    per_device_train_batch_size=2,   # Adjust batch size based on your GPU memory
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    learning_rate=2e-5,
    gradient_accumulation_steps=8,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./models/pretrain-models/gpt-neo-checkpoints/logs",
    logging_steps=10,
    # Mixed precision needs a CUDA device; the notebook explicitly supports a
    # CPU fallback, so only enable fp16 when a GPU is actually available.
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
    report_to="none"
)

In [None]:
# Hand Trainer the underlying model rather than a DataParallel wrapper.
# Trainer inspects the model class (e.g. its forward signature) to detect
# label inputs and whether a loss is returned, and it applies its own
# multi-GPU wrapping; a pre-wrapped model hides that information and can end
# up wrapped twice. The wrapper and the inner module share the same
# parameters, so the global `model` still reflects the trained weights.
trainer_model = model.module if isinstance(model, torch.nn.DataParallel) else model

trainer = Trainer(
    model=trainer_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer
)

In [None]:
# Free cached, unused CUDA memory first, then run the fine-tuning loop.
torch.cuda.empty_cache()
train_output = trainer.train()
# Leave the training summary as the cell's displayed result.
train_output

In [None]:
# Persist the fine-tuned weights and the extended tokenizer side by side so
# they can be reloaded together with from_pretrained.
SAVE_DIR = './models/pretrain-models/gpt_neo_1m'

# Unwrap DataParallel when present; fall back to the model itself so this cell
# also works when the model was never wrapped.
model_to_save = getattr(model, "module", model)
model_to_save.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)