# Model Fine-Tuning
## Song Lyric Generation with GPT-2
**Data Mining Final Project**<br>
Khyatee Desai<br>Dec. 19, 2022

In [32]:
# !pip install torch
# !pip install transformers

In [33]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer


In [22]:
def load_dataset(file_path, tokenizer, block_size = 100):
    """Create a ``TextDataset`` for the given text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer instance handed to ``TextDataset``.
        block_size: Forwarded unchanged as ``TextDataset``'s ``block_size``
            argument (defaults to 100).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)


def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used to batch examples.

    Args:
        tokenizer: Tokenizer instance handed to the collator.
        mlm: Forwarded unchanged as the collator's ``mlm`` flag
            (defaults to ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,
          model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file and save the result.

    Args:
        train_file_path: Path of the training text file (passed to ``load_dataset``).
        model_name: Name or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the initial model
            weights, and the ``Trainer`` outputs.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.

    Returns:
        None. The model is written to ``output_dir`` via ``trainer.save_model()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir is self-contained.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights to output_dir before training starts.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir = output_dir,
        overwrite_output_dir = overwrite_output_dir,
        per_device_train_batch_size = per_device_train_batch_size,
        num_train_epochs = num_train_epochs,
        # The caller's save_steps must be passed on here; otherwise the
        # argument is accepted by train() but has no effect.
        save_steps = save_steps,
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        data_collator = data_collator,
        train_dataset = train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [23]:
# Training parameters: each name below is passed by keyword to train().

# Where the data comes from and where results are written
train_file_path = "training_lyrics.txt"
output_dir = 'results'
overwrite_output_dir = True

# Pretrained checkpoint to start from
model_name = 'gpt2-medium'

# Training-loop settings
num_train_epochs = 20
per_device_train_batch_size = 8
save_steps = 500

In [26]:
# Run fine-tuning with the training parameters defined in the cell above.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

In [39]:
# Load and display the saved training arguments (the output below is a TrainingArguments repr).
# NOTE(review): this reads from 'results_report/', whereas the training cell uses
# output_dir = 'results' — confirm the directory was renamed or copied after training.
# NOTE(review): torch.load unpickles the file; only load training_args.bin from a trusted source.
torch.load('results_report/training_args.bin')

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_nam