In [1]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.25.1-py3-none-any.whl (5.8 MB)
Collecting tqdm>=4.27
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m643.8 kB/s[0m eta [36m0:00:00[0m1m575.0 kB/s[0m eta [36m0:00:01[0m
[?25hCollecting filelock
  Using cached filelock-3.9.0-py3-none-any.whl (9.7 kB)
Collecting regex!=2019.12.17
  Using cached regex-2022.10.31-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (770 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Collecting pyyaml>=5.1
  Using cached PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (682 kB)
Collecting huggingface-hub<1.0,>=0.10.0
  Using cached huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
Installing collected packages: tokenizers, tqdm, r

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [3]:
def load_dataset(file_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over a text file.

    Args:
        file_path: Path of the text file, forwarded as ``file_path``.
        tokenizer: Tokenizer object, forwarded as ``tokenizer``.
        block_size: Forwarded unchanged as ``block_size`` (default 128).

    Returns:
        The ``TextDataset`` instance constructed from these arguments.
    """
    dataset_options = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_options)

def load_data_collator(tokenizer, mlm=False):
    """Create the batch collator used for language-model training.

    Args:
        tokenizer: Tokenizer object, forwarded as ``tokenizer``.
        mlm: Forwarded as ``mlm``; the default ``False`` turns masked
            language modelling off in the collator.

    Returns:
        The ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 language model on a text file.

    The tokenizer and the model are both loaded from ``model_name`` and are
    written to ``output_dir`` before training starts. A ``Trainer`` is then
    built from the dataset and collator helpers, run, and asked to save the
    trained model.

    Args:
        train_file_path: Path of the training text file, handed to
            ``load_dataset``.
        model_name: Identifier passed to ``GPT2Tokenizer.from_pretrained``
            and ``GPT2LMHeadModel.from_pretrained``.
        output_dir: Directory that receives the tokenizer, the model and
            the trainer's output.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # The Trainer below is not given the tokenizer, so its files are written
    # to output_dir explicitly here.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Writes the not-yet-trained weights to output_dir; trainer.save_model()
    # at the end saves the trained model.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        # Previously accepted by train() but never forwarded, so the caller's
        # checkpoint interval was silently ignored.
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [4]:
# Run configuration: each name below is passed to train() under the same keyword.
train_file_path = "./song_dataset.txt"  # training text file, relative to the working directory
model_name = 'gpt2'  # identifier given to the tokenizer/model from_pretrained() calls
output_dir = './result'  # directory that receives the tokenizer, model and checkpoints
overwrite_output_dir = False  # forwarded to TrainingArguments(overwrite_output_dir=...)
per_device_train_batch_size = 8  # forwarded to TrainingArguments
num_train_epochs = 5.0  # forwarded to TrainingArguments
save_steps = 500  # passed to train() as save_steps

In [5]:
# Start fine-tuning with the configuration values assigned above.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 2781
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1740
  Number of trainable parameters = 124439808


Step,Training Loss
500,3.1004
1000,2.7809
1500,2.6093


Saving model checkpoint to /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-500
Configuration saved in /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-500/config.json
Model weights saved in /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-1000
Configuration saved in /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-1000/config.json
Model weights saved in /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-1500
Configuration saved in /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-1500/config.json
Model weights saved in /home/iddqd/Projects/Python/OOP/OOP_2/result/checkpoint-1500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /home/iddqd/Projects/Python/OOP/