In [None]:
import os
import sys
import requests
import re
import pickle
import json

import numpy as np
import torch
torch.cuda.empty_cache()
from tqdm.notebook import tqdm as bar
import pathlib

In [None]:
# !pip install transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [75]:
COLAB = True

USE_CUDA = False
if COLAB:
    from google.colab import drive 
    drive.mount('/content/gdrive')
    PATH = 'gdrive/MyDrive/lyricGenerator/'
    sys.path.append('gdrive/MyDrive/lyricGenerator/')

    USE_CUDA = torch.cuda.is_available()

    if USE_CUDA:
        DEVICE = torch.device('cuda')
        print("Using cuda.")
    else:
        DEVICE = torch.device('cpu')
        print("Using cpu.")
        
    os.chdir(os.path.join(os.getcwd(),'gdrive/MyDrive/lyricGenerator'))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Using cpu.


FileNotFoundError: ignored

In [100]:
artist_name = input("Please enter the name of an artist...")

Please enter the name of an artist...The National


In [101]:
# Point to the model
# Then generate the text
def get_model_path(artist_name):
    if artist_name.find(' ') > -1:
        artist_name = '-'.join(artist_name.lower().split(' '))
    model_path = os.getcwd() + f'/models/{artist_name}'
    if os.path.exists(model_path):
        return model_path
    else:
        print("Model hasn't been generated yet for this artist!")
        return None

model_path = get_model_path(artist_name)
output_path = f'{model_path}/output'

In [102]:
model_path

'/content/gdrive/MyDrive/lyricGenerator/models/the-national'

In [103]:
model = AutoModelForCausalLM.from_pretrained(f'{model_path}/output')

In [104]:
start = 'I'
num_sequences = 10
min_length = 100
max_length = 150
temperature = 1
top_p = 0.95
top_k = 50
repetition_penalty = 1.005

tokenizer = AutoTokenizer.from_pretrained("gpt2")
encoded_prompt = tokenizer(start, add_special_tokens = False, return_tensors = 'pt').input_ids
encoded_prompt = encoded_prompt.to(model.device)

# prediction
output_sequences = model.generate(
                        input_ids=encoded_prompt,
                        max_length=max_length,
                        min_length=min_length,
                        temperature=float(temperature),
                        top_p=float(top_p),
                        top_k=int(top_k),
                        do_sample=True,
                        repetition_penalty=repetition_penalty,
                        num_return_sequences=num_sequences
                        )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [105]:
print(tokenizer.decode(output_sequences[1]))

I is a liar and I'm an asshole
Why are you still there, isn't that the truth?
Why are you still here?
And I guess what makes you think so much about me
You know the man is getting better each month
What does he want?
The good feeling you get
From your wife in the pool
To the new house
You have nothing to lose but your heart
What makes you think so much about me
Why are you still here, isn't that the truth?
Don’t blame yourself for it
For me
Because I didn’t ask, didn’t make my choice
When I was young, I thought I was lost
When I am young


In [106]:
def save_generated(output_sequences, model_path = model_path, start = start):
    gen_path = model_path.replace('models','generated')
    if not os.path.exists(gen_path):
        os.mkdir(gen_path)
    
    prompt = '-'.join(start.split(' '))
    prompt_file = f'{gen_path}/{prompt}.txt'

    with open(prompt_file, 'w') as f:
        f.write('')
    for num, output_sequence in enumerate(output_sequences):
        with open(prompt_file, 'a') as f:
            f.write('\nSong number: ' + str(num +1) + '\n')
            f.write(tokenizer.decode(output_sequence) + '\n\n')
            

In [107]:
save_generated(output_sequences)