In [1]:
# !pip install torch
# !pip install transformers

In [1]:
import pandas as pd
import numpy as np
import re
import torch

from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer


In [2]:
def load_dataset(file_path, tokenizer, block_size = 100):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer instance, forwarded to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``
            (defaults to 100).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer instance, forwarded to the collator.
        mlm: Forwarded to the collator's ``mlm`` argument (defaults to False).

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 language-model head on a text file and save the result.

    Loads the tokenizer and model named ``model_name``, builds the dataset and
    collator via ``load_dataset`` / ``load_data_collator``, runs
    ``Trainer.train()`` and finally writes the model with
    ``Trainer.save_model()``.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory the tokenizer, the model and the trainer
            outputs are written to.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. Previously this
            argument was accepted but ignored, so the library default was
            always used.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so persist it explicitly
    # next to the model files.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-fine-tuned weights to output_dir; trainer.save_model()
    # at the end saves into the same directory again after training.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [15]:
# Fixed seed so the held-out sample is the same on every Restart & Run All.
SPLIT_SEED = 42

df = pd.read_parquet('beatles_lyrics.parquet').drop_duplicates('lyrics')
# Keep only rows whose lyrics do not contain the substring 'Speech'.
# na=False treats missing lyrics as non-matching instead of raising; a boolean
# mask replaces the in-place drop so re-running the cell is idempotent.
df = df[~df['lyrics'].str.contains('Speech', na=False)]
# Hold out 35 rows for testing; everything else is used for training.
test_set = df.sample(35, random_state=SPLIT_SEED)
train_set = df.drop(test_set.index)


In [17]:
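# One-off export of the split (uncomment to regenerate the files).
# NOTE: the training-parameters cell reads 'training_lyrics.txt', so this must
# have been run at least once. The file is written inside a `with` block so it
# is flushed and closed before training reads it.
# test_set.to_parquet('testing_lyrics.parquet')
# train_set.to_parquet('training_lyrics.parquet')
# with open('training_lyrics.txt', 'w') as f:
#     train_string = ' '.join(list(train_set.lyrics.values))
#     f.write(train_string)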


238706

In [3]:
# --- Fine-tuning configuration ---

# Pretrained model to start from and the plain-text file to train on.
model_name = 'gpt2-medium'
train_file_path = "training_lyrics.txt"

# Output location, and whether an existing directory may be overwritten.
output_dir = 'results'
overwrite_output_dir = True

# Training schedule.
num_train_epochs = 20
per_device_train_batch_size = 8
save_steps = 500

In [4]:
# Launch fine-tuning with the hyperparameters from the training-parameters cell.
train(
    train_file_path, model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)

***** Running training *****
  Num examples = 881
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2220
  Number of trainable parameters = 354823168


Step,Training Loss
500,1.9145
1000,0.6273
1500,0.2456
2000,0.1446


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to results
Configuration saved in results/config.json
Model weights saved in results/pytorch_model.bin
