In [1]:
from random import randint
from datasets import Dataset
from datasets import load_dataset

# Specify dataset
single_dataset_file = "dataset_512.txt"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import GPT2TokenizerFast

full_train_dataset = load_dataset("text", data_files=single_dataset_file)
print(full_train_dataset)

def get_training_corpus(dataset=full_train_dataset):
      data = dataset["train"]["text"]
      for measure in data:
          yield measure

# Train tokenizer if necessary 
tokenizer = None
try: 
  tokenizer = GPT2TokenizerFast.from_pretrained("./music-gpt2-tokenizer")
except:
  tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
  tokenizer = tokenizer.train_new_from_iterator(get_training_corpus(), 1000)
  tokenizer.save_pretrained("music-gpt2-tokenizer")

Using custom data configuration default-884e21f88136b3d0


Downloading and preparing dataset text/default to /Users/jonathan/.cache/huggingface/datasets/text/default-884e21f88136b3d0/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 6626.07it/s]
Extracting data files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 462.34it/s]
                                                                  

Dataset text downloaded and prepared to /Users/jonathan/.cache/huggingface/datasets/text/default-884e21f88136b3d0/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 179.84it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 33290
    })
})





In [3]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = full_train_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.shuffle()
tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=0.001)
print(tokenized_datasets)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:14<00:00,  2.40ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 33256
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 34
    })
})





In [4]:
import numpy as np
import evaluate

from transformers import GPT2LMHeadModel, GPT2Config, TrainingArguments, Trainer, DataCollatorForLanguageModeling

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    return {"accuracy": 0}

model = GPT2LMHeadModel.from_pretrained("music-gpt2-2_2")
training_args = TrainingArguments("trainer", 
    label_names=["input_ids", "attention_mask"],
    evaluation_strategy="steps", 
    eval_steps=10000,
    save_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    num_train_epochs=1,    
    per_device_train_batch_size=2, 
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=1)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
tokenizer.pad_token = tokenizer.eos_token
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [5]:
trainer.train()
trainer.save_model("music-gpt2-2.3")

Step,Training Loss,Validation Loss


In [9]:
from transformers import pipeline

generator = pipeline('text-generation', model='./music-gpt2-2.0')
generator("4/4 F4 1.0 0.0 56 G4 1.0 1.0 60", max_length=256)

loading configuration file ./music-gpt2-2.0/config.json
Model config GPT2Config {
  "_name_or_path": "./music-gpt2-2.0",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "use_cache": true,
  "vocab_size": 1000
}

loading configuration file ./music-gpt2-2.0/config.json
Model config GPT2Config {
  "_name_or_path": "./music-gpt2-2.0",
  "activa

[{'generated_text': '4/4 F4 1.0 0.0 56 G4 1.0 1.0 60 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 53 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G4 1.0 1.0 57 G'}]

In [17]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

import numpy as np

tokenizer = GPT2TokenizerFast.from_pretrained("./music-gpt2-tokenizer")
model = GPT2LMHeadModel.from_pretrained("./music-gpt2-2.0/")
input = tokenizer("4/4", return_tensors="pt")
output = model(**input)
print(output.last_hidden_state.shape)
predictions = np.argmax(output.logits.detach().numpy(), axis=-1)
tokenizer.decode(predictions[0])

AttributeError: 'CausalLMOutputWithCrossAttentions' object has no attribute 'last_hidden_state'