In [19]:
import os
import sys
import requests
import re
import pickle
import json

import numpy as np
import torch
torch.cuda.empty_cache()
from tqdm.notebook import tqdm as bar
import pathlib

In [20]:
# !pip install transformers
# !pip install datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

In [8]:
COLAB = True

USE_CUDA = False
if COLAB:
    from google.colab import drive 
    drive.mount('/content/gdrive')
    PATH = 'gdrive/MyDrive/lyricGenerator/'
    sys.path.append('gdrive/MyDrive/lyricGenerator/')

    USE_CUDA = torch.cuda.is_available()

    if USE_CUDA:
        DEVICE = torch.device('cuda')
        print("Using cuda.")
    else:
        DEVICE = torch.device('cpu')
        print("Using cpu.")

    os.chdir(os.path.join(os.getcwd(),'gdrive/MyDrive/lyricGenerator'))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Using cuda.


FileNotFoundError: ignored

In [46]:
def get_datasets(artist_name):
    if artist_name.find(' ') > -1:
        artist_name = '-'.join(artist_name.lower().split(' '))
    ds_file = f'./models/{artist_name}/datasets.p'
    if os.path.exists(ds_file):
        with open(ds_file, 'rb') as handle:
            dataset = pickle.load(handle)
        return dataset, os.getcwd() + f'/models/{artist_name}'
    else:
        print("Cant find data associated with this artist! Please try again!")
        return None, None


In [48]:
lm_datasets, artist_folder = get_datasets('The National')
model_name = artist_folder[artist_folder.rfind('/') + 1:]

In [49]:
trainer_state_path = f'{artist_folder}/output/trainer_state.json'

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2", cache_dir=pathlib.Path('cache').resolve())

EPOCHS = 5
num_train_epochs = EPOCHS
if os.path.isfile(trainer_state_path):
  f = open (trainer_state_path, "r")
  trainer_state = json.loads(f.read()) 
  f.close()
  epoch = trainer_state['epoch']
  num_train_epochs += epoch

  model = AutoModelForCausalLM.from_pretrained(f'{artist_folder}/output')


Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/6c0e6080953db56375760c0471a8c5f2929baf11/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-g

In [50]:
training_args = TrainingArguments(output_dir = artist_folder + '/output',
                                  evaluation_strategy = 'epoch',
                                  learning_rate=5e-5,
                                  weight_decay=0.01,
                                  logging_strategy = 'epoch',
                                  num_train_epochs=num_train_epochs,
                                  save_strategy = 'epoch',
                                  save_total_limit=10,
                                  load_best_model_at_end=True)

trainer = Trainer(
    model=model,
    # tokenizer=tokenizer,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["valid"]
)

from transformers import get_cosine_schedule_with_warmup
train_dataloader = trainer.get_train_dataloader()
num_train_steps = len(train_dataloader)
trainer.create_optimizer_and_scheduler(num_train_steps)
trainer.lr_scheduler = get_cosine_schedule_with_warmup(
      trainer.optimizer,
      num_warmup_steps=0,
      num_training_steps=num_train_steps
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [51]:
if os.path.isfile(trainer_state_path):
  try:
    data = trainer.train(resume_from_checkpoint=True)
  except:
    data = trainer.train()
else:
  data = trainer.train()
print(data)

try:
  with open(f'{artist_folder}/output/evaluation.txt') as json_file:
      evaluation = json.load(json_file)
  eval_loss = evaluation['eval_loss']
except:
  eval_loss = 9999999
  
evaluation = trainer.evaluate()
if evaluation['eval_loss'] < eval_loss:
  save_model = True
  with open(f'{artist_folder}/output/evaluation.txt', 'w') as outfile:
    json.dump(evaluation, outfile)
  print("Saved evaluation results")
else:
  save_model = False

trainer.save_model(f'{artist_folder}/output')
trainer.save_state()

***** Running training *****
  Num examples = 185
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 120


Epoch,Training Loss,Validation Loss
1,2.6346,2.763496
2,2.4728,2.733399
3,2.3129,2.715698
4,2.2662,2.710329
5,2.2032,2.724017


***** Running Evaluation *****
  Num examples = 23
  Batch size = 8
Saving model checkpoint to /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-24
Configuration saved in /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-24/config.json
Model weights saved in /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-24/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23
  Batch size = 8
Saving model checkpoint to /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-48
Configuration saved in /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-48/config.json
Model weights saved in /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-48/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23
  Batch size = 8
Saving model checkpoint to /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/checkpoint-72
Con

TrainOutput(global_step=120, training_loss=2.377942117055257, metrics={'train_runtime': 105.2428, 'train_samples_per_second': 8.789, 'train_steps_per_second': 1.14, 'total_flos': 120847564800000.0, 'train_loss': 2.377942117055257, 'epoch': 5.0})


Saving model checkpoint to /content/gdrive/MyDrive/lyricGenerator/models/the-national/output
Configuration saved in /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/config.json


Saved evaluation results


Model weights saved in /content/gdrive/MyDrive/lyricGenerator/models/the-national/output/pytorch_model.bin


In [52]:
evaluation = trainer.evaluate()
evaluation

***** Running Evaluation *****
  Num examples = 23
  Batch size = 8


{'eval_loss': 2.710329055786133,
 'eval_runtime': 0.6102,
 'eval_samples_per_second': 37.696,
 'eval_steps_per_second': 4.917,
 'epoch': 5.0}