In [None]:
import torch

### Train the tokenizer

In [None]:
# For convenience, `tokenizers` provides high-level classes that encapsulate
# the whole pipeline for various well-known tokenization algorithms.
# Everything described below can be replaced by the ByteLevelBPETokenizer class.

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# Start from an empty (untrained) Byte-Pair Encoding model wrapped in a Tokenizer.
_tokenizer = Tokenizer(BPE())

# The pre-tokenizer converts the raw input to a byte-level representation
# before the BPE model sees it.
_tokenizer.pre_tokenizer = ByteLevel()

# The matching byte-level decoder lets us map tokenized output back to the original text.
_tokenizer.decoder = ByteLevelDecoder()

In [None]:
from tokenizers.trainers import BpeTrainer

# Configure the BPE trainer with the vocabulary we want to generate.
# It is named `bpe_trainer` (not `trainer`) so it cannot be confused with, or
# clobbered by, the `transformers.Trainer` instance created later in this notebook.
bpe_trainer = BpeTrainer(
    vocab_size=1000,
    show_progress=True,
    # Seed the vocabulary with the byte-level alphabet.
    initial_alphabet=ByteLevel.alphabet(),
    # Special tokens reserved in the vocabulary.
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)
_tokenizer.train(files=["tokenized_tunes.txt"], trainer=bpe_trainer)



In [None]:
# Directory in which the trained tokenizer model is saved
import os
save_path = 'tokenized_data'

In [None]:
# exist_ok=True creates the directory when it is missing and is a no-op otherwise,
# which avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(save_path, exist_ok=True)
_tokenizer.model.save(save_path)

In [None]:
_tokenizer

In [None]:
#_tokenizer.enable_truncation(max_length=512)

In [None]:
_tokenizer

### Convert the tokenizer 

In [None]:
# Load the BPE model files previously saved under `save_path` into a
# `transformers` GPT-2 tokenizer so it can be used with the model and Trainer.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(save_path)

In [None]:
# Tell the GPT-2 tokenizer which role each special token plays.
# Key order is kept as-is in case it influences how tokens are registered.
special_token_roles = dict(
    eos_token="</s>",
    bos_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
tokenizer.add_special_tokens(special_token_roles)

In [None]:
# Sanity-check the converted tokenizer on one training line.
# `dataset` is only created in the "Load Dataset" section further down, so
# referencing it here fails on a fresh kernel; read the first line of the raw
# training file instead (presumably the same record as dataset['train']['text'][0]).
with open("tokenized_tunes.txt", encoding="utf-8") as tunes_file:
    sample_text = tunes_file.readline().rstrip("\n")
tokenizer.convert_ids_to_tokens(tokenizer.encode(sample_text))

### Load Dataset

In [None]:
# NOTE(review): newer `datasets` releases appear to expose this enum as
# `DownloadMode` — confirm against the installed version before upgrading.
from datasets import load_dataset, GenerateMode

# Load the tunes file with the generic 'text' loader.
# FORCE_REDOWNLOAD is requested so a previously cached copy is not reused
# (presumably to pick up changes to the local file — confirm).
dataset = load_dataset('text', data_files='tokenized_tunes.txt', download_mode = GenerateMode.FORCE_REDOWNLOAD)


In [None]:
# Hold out 20% of the examples for evaluation. A fixed seed makes the split
# reproducible across kernel restarts.
# NOTE: the name `datasets` shadows the imported `datasets` package; it is kept
# because later cells refer to this variable by that name.
datasets = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [None]:
def _tokenize_batch(batch):
    """Run the GPT-2 tokenizer over the 'text' column of one batch."""
    return tokenizer(batch['text'])


# Tokenize both splits in batches across 4 worker processes; the raw text column
# is dropped once it has been tokenized.
tokenized_dataset = datasets.map(
    _tokenize_batch,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

In [None]:
from itertools import chain

# Number of tokens in each training example produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from feature name (e.g. ``"input_ids"``) to a list of
            per-example lists, as supplied by a batched ``map`` call.

    Returns:
        A dict with the same keys in which every value is a list of chunks of
        exactly ``block_size`` items, plus a ``"labels"`` entry that is a copy of
        the ``"input_ids"`` chunk list. Items at the end of the batch that do not
        fill a whole block are dropped.
    """
    # Flatten each feature in linear time; sum(list_of_lists, []) re-copies the
    # accumulated list on every step and is quadratic in the batch size.
    concatenated_examples = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split every feature into consecutive chunks of block_size items.
    result = {
        key: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for key, tokens in concatenated_examples.items()
    }
    # The labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Pack the tokenized examples into fixed-size blocks. group_texts is applied to
# batches of 100 examples at a time, spread over 4 worker processes.
grouping_options = dict(batched=True, batch_size=100, num_proc=4)
lm_datasets = tokenized_dataset.map(group_texts, **grouping_options)

### Configure Model

In [None]:
from transformers import DataCollatorWithPadding
# Collator that batches examples using the tokenizer's padding.
# NOTE(review): max_length=1024 is larger than both block_size (128) used when
# grouping the data and n_positions (512) in the model config — confirm whether
# this value has any effect and whether it should match one of those.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length = 1024)

In [None]:
# The hyperparameters here were chosen to get an architecture with roughly the
# same number of parameters as FolkRNN. Many values were changed from the
# defaults and the trade-offs have not been evaluated yet.
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Config
config = GPT2Config(
    # len(tokenizer) also counts any tokens registered through
    # add_special_tokens, so the embedding matrix always covers every id the
    # tokenizer can emit.
    vocab_size = len(tokenizer),
    n_positions = 512,
    n_embd = 384,
    n_ctx = 512,
    n_layer = 6,
    # NOTE(review): n_inner is the width of the feed-forward layers; 6 is
    # extremely small next to n_embd=384. Possibly n_head=6 was intended —
    # confirm before changing, since it alters the parameter count.
    n_inner = 6,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Pass the padding id alongside the other special-token ids so the model
    # config agrees with the tokenizer.
    pad_token_id=tokenizer.pad_token_id,
)
model = AutoModelForCausalLM.from_config(config)

In [None]:
model.num_parameters()

### Configure Training

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir='./results',          # output directory
  num_train_epochs=10,             # total # of training epochs
  per_device_train_batch_size=16,  # batch size per device during training
  per_device_eval_batch_size=16,   # batch size for evaluation
  evaluation_strategy = "epoch",   # evaluate at the end of every epoch
  save_strategy = "epoch",         # must match evaluation_strategy for load_best_model_at_end
  load_best_model_at_end = True,   # reload the best checkpoint once training finishes
  logging_dir='./logs',            # directory for storing logs
)

In [None]:
from transformers import Trainer

# Wire the model, training arguments, grouped train/test splits, collator and
# tokenizer into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_datasets['train'],
    eval_dataset=lm_datasets['test'],
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
def generate(start_text = "A ", max_length = 128, temperature = 0.7):
    """Generate one continuation of ``start_text`` with the trained model.

    Args:
        start_text: Prompt to encode and feed to the model.
        max_length: Maximum total length, in tokens, of the returned sequence.
        temperature: Sampling temperature applied to the next-token distribution.

    Returns:
        A 1-D tensor of token ids (the prompt followed by the generated tokens);
        pass it to ``tokenizer.decode`` to obtain text.
    """
    # Make sure dropout is disabled while generating.
    model.eval()
    # Encode the prompt and place it on the same device as the model; after
    # training the model may live on a GPU while freshly encoded ids are on CPU.
    input_ids = tokenizer.encode(start_text, return_tensors='pt').to(model.device)
    output_ids = model.generate(
      input_ids,
      max_length = max_length,
      # temperature only takes effect when sampling is enabled.
      do_sample = True,
      temperature = temperature,
      no_repeat_ngram_size=4,  # never repeat any 4-gram within the output
      num_return_sequences=1,
      pad_token_id=tokenizer.pad_token_id,
    )
    return output_ids[0]

In [None]:
tokenizer.decode(generate())

### TODO / things to try:
#### Pre-processing:
    - regenerate dataset with the proper key, mode, and type
    - fix tokenization issues
    - possibly shift to the same scale 
#### Tokenization:
    - try byte-pair-encoding with key / mode / type
    - try getting simple space-based tokenization to work
    - look at other architectures for domain-specific special tokens
    - if we can't get good results, data process + train manually to use our own function 
#### Data-processing:
    - find a better way to create training labels
    - tune batch_size and block_size
    - figure out how padding works
#### Model
    - tune architecture parameters
    - try alternative architectures (ProphetNet?)
#### Training
    - tune training parameters
#### Generation
    - figure out what all of the parameters in generate do 
    - write our own generation code?
#### Evaluation
    - look into quality metrics from existing research
    - write some code to detect training data plagiarism (generation spitting back an input)
    - generate audio from our output for subjective evaluation
    - write some code to check for structural integrity ?
#### Misc
    - figure out a good workflow to train and share model versions 
    