# Finetuning FratGPT

Sources: 
- https://github.com/arham-kk/gpt2-finetune
- https://huggingface.co/docs/datasets/v1.12.1/use_dataset.html
- https://huggingface.co/docs/transformers/model_doc/gpt2

In [1]:
import os
import csv
import torch
from datasets import load_dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\jthoe\miniconda3\envs\fratgpt\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\jthoe\miniconda3\envs\fratgpt\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\jthoe\miniconda3\envs\fratgpt\Lib\site-packages\ipykernel\kernelapp.py"

In [2]:
from config import OUTPUT_TRAINING_DATA_DIRECTORY_PATH

# File name of the training CSV inside the output-data directory.
# Change this to be your csv.
csv_name: str = "output_data_2024-06-21-14-23-30.csv"
data_file_path = os.path.join(OUTPUT_TRAINING_DATA_DIRECTORY_PATH, csv_name)

In [3]:
from config import MODELS_DIRECTORY_PATH

# Directory the fine-tuned model is written to at the end of the notebook.
model_save_path = os.path.join(
    MODELS_DIRECTORY_PATH,
    "fratgpt",
)

In [None]:
# Checkpoint identifier shared by the tokenizer and the model so the two
# always come from the same pretrained release.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [None]:
# Report which accelerator is visible: CUDA when available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# The tokenize step pads every example to a fixed length, which requires the
# tokenizer to have a pad token; reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Read only the header row to learn the column names.
# - encoding: decode explicitly instead of with the platform locale (e.g.
#   cp1252 on Windows), which can garble or reject non-ASCII headers;
#   "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark so
#   the first column name is not polluted by it.
# - newline="": required by the csv module so quoted fields containing line
#   breaks are parsed correctly.
with open(
    data_file_path, mode="r", encoding="utf-8-sig", newline=""
) as data_file_csv:
    csv_reader = csv.reader(data_file_csv, delimiter=",")
    column_names = next(csv_reader)

# The column names are passed explicitly, so the header line in the file is
# skipped (skiprows=1) to keep it from being loaded as a data row.
dataset = load_dataset(
    "csv",
    data_files=data_file_path,
    column_names=column_names,
    delimiter=",",
    skiprows=1,
)

# Deterministic 90/10 split: every 10th row (indices 0, 10, 20, ...) is held
# out for evaluation and all remaining rows are used for training.
full_split = dataset["train"]
num_rows = len(full_split)

eval_indices = list(range(0, num_rows, 10))
train_indices = [idx for idx in range(num_rows) if idx % 10]

train_data = full_split.select(train_indices)
eval_data = full_split.select(eval_indices)

In [None]:
def tokenize(examples):
    """Tokenize examples and attach causal-LM labels that ignore padding.

    Args:
        examples: Mapping with a "text" entry. With ``Dataset.map(...,
            batched=True)`` this is a list of strings; a single string is
            also accepted.

    Returns:
        The tokenizer output (truncated/padded to 512 tokens) with an added
        "labels" entry: a copy of "input_ids" in which every padded position
        is replaced by -100.
    """

    def mask_padding(token_ids, attention_mask):
        # -100 is the label value the loss skips, so padded positions do not
        # contribute to training. Padding is found via the attention mask
        # rather than by token id because the pad token is the EOS token, and
        # a genuine EOS inside the text must stay a valid target.
        return [
            token_id if attended else -100
            for token_id, attended in zip(token_ids, attention_mask)
        ]

    inputs = tokenizer(
        examples["text"], padding="max_length", max_length=512, truncation=True
    )

    if isinstance(examples["text"], str):
        # Single example: the tokenizer returns flat lists.
        inputs["labels"] = mask_padding(
            inputs["input_ids"], inputs["attention_mask"]
        )
    else:
        # Batch: one list of ids / mask values per example.
        inputs["labels"] = [
            mask_padding(token_ids, attention_mask)
            for token_ids, attention_mask in zip(
                inputs["input_ids"], inputs["attention_mask"]
            )
        ]
    return inputs

# Tokenize both splits; batched=True hands tokenize() a batch of rows per call
# instead of a single row.
train_data = train_data.map(tokenize, batched=True)
eval_data = eval_data.map(tokenize, batched=True)

In [None]:
# Configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./model",
    overwrite_output_dir=True,
    logging_dir="./logs",
    # Training length and batch size.
    num_train_epochs=0.5,
    per_device_train_batch_size=2,
    # Step-based cadence: evaluate and log every 100 steps, save every 500.
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=500,
)

# Wire the model, the configuration and the two tokenized splits together,
# then run the training loop.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
)
trainer.train()

In [None]:
# Persist the fine-tuned model. The Trainer above was created without a
# tokenizer, so save_model() does not write tokenizer files; save them
# explicitly so the output directory can be loaded on its own later.
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)