## Install Dependencies

In [1]:
# Hugging Face transformers: provides the GPT-2 tokenizer and model, AdamW and
# the warm-up LR scheduler imported in the next cell.
!pip install transformers

# openpyxl: presumably the engine pandas.read_excel needs to open the .xlsx
# workbook read below — confirm against the installed pandas version.
!pip install openpyxl

## Import Libraries

In [2]:
# Transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

# Torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Numpy and pandas
import numpy as np
import pandas as pd

# Helper libraries
import random,os
from tqdm import tqdm, trange
import csv

## Read Data (lyrics) from gloom_index.xlsx file

In [3]:
# Input workbook; the notebook reads its 'lyrics' column as training data.
lyrics_file_path = '/content/lyrics/gloom_index.xlsx'
# Output text file that receives the generated lyrics at the end of the notebook.
generated_lyrics_file_path = "/content/generated_lyrics.txt"

In [4]:
# Load the workbook and keep only the 'lyrics' column (as a one-column DataFrame).
data = pd.read_excel(lyrics_file_path)[['lyrics']]

## Dataset: tokenize the lyrics for GPT-2

In [5]:
class GetDataset(Dataset):
    """Lyrics tokenized for GPT-2 and exposed as a torch ``Dataset``.

    Every lyric is rendered as ``<|code|>lyric<|endoftext|>``, encoded with the
    tokenizer named by ``gpt2_type`` and stored as a 1-D tensor of token ids.

    Args:
        code: Control code written between ``<|`` and ``|>`` in front of every
            lyric. It is interpolated with ``str()``, so pass a short string.
        gpt2_type: Name of the pretrained tokenizer to load.
        maximum_len: Maximum number of *characters* (not tokens) kept from each
            lyric; the text is sliced before it is tokenized.
        lyrics: Optional iterable of lyric strings. When omitted, the
            notebook-level ``data['lyrics']`` column is used.
    """

    def __init__(self, code, gpt2_type="gpt2", maximum_len=1024, lyrics=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        rows = data['lyrics'] if lyrics is None else lyrics
        for row in rows:
            # Empty spreadsheet cells arrive as NaN (a float) and cannot be
            # sliced; skip anything that is not text instead of hiding every
            # possible error behind a bare ``except``.
            if not isinstance(row, str):
                continue
            text = f"<|{code}|>{row[:maximum_len]}<|endoftext|>"
            self.lyrics.append(torch.tensor(self.tokenizer.encode(text)))
        self.lyrics_count = len(self.lyrics)

    def __len__(self):
        """Return the number of lyrics that were successfully tokenized."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.lyrics[item]

## Pack Tensor

In [6]:
def pack_tensor(first_tensor, compressed_tensor, maximum_sequence_length):
    """Try to merge ``first_tensor`` into the running ``compressed_tensor``.

    Both tensors are indexed along dimension 1 (the sequence axis).

    Returns:
        A ``(tensor, packed, leftover)`` triple:

        * nothing packed yet -> ``(first_tensor, True, None)``
        * the two fit together -> ``(first_tensor + compressed_tensor[:, 1:], True, None)``
        * they do not fit -> ``(compressed_tensor, False, first_tensor)``
    """
    nothing_packed_yet = compressed_tensor is None
    if nothing_packed_yet:
        return first_tensor, True, None

    combined_length = first_tensor.shape[1] + compressed_tensor.shape[1]
    if combined_length <= maximum_sequence_length:
        # The first column of the already-packed tensor is dropped on merge.
        tail = compressed_tensor[:, 1:]
        return torch.cat((first_tensor, tail), dim=1), True, None

    return compressed_tensor, False, first_tensor

## Train using dataset and model

In [7]:
def TrainModel(dataset, model, tokenizer, batch_size=32, epochs=10, lr=1e-4, max_seq_len=400, warmup_steps=200, gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar", test_mode=False):
    """Fine-tune ``model`` on ``dataset`` and return the trained model.

    Samples are drawn one at a time in shuffled order and packed with
    ``pack_tensor`` into sequences of at most ``max_seq_len`` tokens. Gradients
    are accumulated over ``batch_size`` packed sequences per optimizer step.

    Args:
        dataset: Dataset yielding 1-D tensors of token ids.
        model: Language model called as ``model(ids, labels=ids)`` whose first
            output is the loss.
        tokenizer: Accepted for interface compatibility; not used here.
        batch_size: Number of packed sequences accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate.
        max_seq_len: Upper bound handed to ``pack_tensor`` when merging samples.
        warmup_steps: Number of warm-up steps for the linear schedule.
        gpt2_type, output_dir, output_prefix, test_mode: Accepted for interface
            compatibility; not used here.

    Returns:
        The trained model, left on the device used for training (CUDA when
        available, otherwise CPU).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    dataLoaderOfTrain = DataLoader(dataset, batch_size=1, shuffle=True)
    accumulationSteps = max(1, int(batch_size))

    # Packing can only lower the number of forward passes, so samples * epochs
    # is an upper bound on them. The schedule needs a positive step count: with
    # a non-positive one the learning rate falls to zero as soon as warm-up ends.
    forwardPassUpperBound = len(dataLoaderOfTrain) * epochs
    totalOptimizerSteps = max(1, (forwardPassUpperBound + accumulationSteps - 1) // accumulationSteps)

    # NOTE(review): transformers' AdamW is reported as deprecated upstream in
    # favour of torch.optim.AdamW — confirm against the installed version.
    opti = AdamW(model.parameters(), lr=lr)
    schedulerWithWarmUp = get_linear_schedule_with_warmup(opti, num_warmup_steps=warmup_steps, num_training_steps=totalOptimizerSteps)
    opti.zero_grad()

    trainingLoss = 0
    batchCountSoFar = 0
    pendingTensor = None

    def applyUpdate():
        # One optimizer step, one scheduler step, then clear the gradients.
        opti.step()
        schedulerWithWarmUp.step()
        opti.zero_grad()

    def trainOn(packedTensor):
        # Forward/backward on one packed sequence; step every accumulationSteps calls.
        nonlocal batchCountSoFar
        packedTensor = packedTensor.to(device)
        loss = model(packedTensor, labels=packedTensor)[0]
        loss.backward()
        batchCountSoFar += 1
        if batchCountSoFar % accumulationSteps == 0:
            applyUpdate()
        return loss.item()

    for epoch in range(epochs):
        print("Training epoch: {} with loss: {}".format(epoch, trainingLoss))
        for entry in tqdm(dataLoaderOfTrain):
            packedTensor, carry_on, overflow = pack_tensor(entry, pendingTensor, max_seq_len)
            if carry_on:
                # Still room left: keep collecting samples.
                pendingTensor = packedTensor
                continue
            trainingLoss = trainOn(packedTensor)
            # The sample that did not fit starts the next pack instead of being dropped.
            pendingTensor = overflow

        # Flush whatever is still waiting so the tail of the epoch is trained on.
        if pendingTensor is not None:
            trainingLoss = trainOn(pendingTensor)
            pendingTensor = None

    # Apply gradients left over from an incomplete accumulation window.
    if batchCountSoFar % accumulationSteps != 0:
        applyUpdate()

    return model

## Generate lyrics from a prompt with the `generate` function

In [8]:
def generate(model, tokenizer, prompt, entry_count=10, entry_length=20, top_p=0.8, temperature=1):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Language model; called with a ``(1, seq)`` tensor of token ids
            and expected to return logits of shape ``(1, seq, vocab)``, either
            as ``.logits`` or as the first element of its output.
        tokenizer: Object providing ``encode(text)`` and ``decode(ids)``.
        prompt: Text every generated entry starts with.
        entry_count: Number of entries to generate.
        entry_length: Maximum number of tokens sampled per entry.
        top_p: Cumulative-probability threshold of the nucleus filter.
        temperature: Divisor applied to the logits; non-positive values are
            treated as 1.0.

    Returns:
        A list of ``entry_count`` decoded strings. Each contains the prompt
        followed by the sampled tokens, and ends early if an end-of-text token
        is sampled (that token is kept in the decoded text).
    """
    negativeFilterValue = -float("Inf")
    lyricsList = []

    # Build the inputs on the device the model lives on so the function works
    # for CPU and GPU models alike; fall back to the CPU for parameter-less models.
    parametersOf = getattr(model, "parameters", None)
    firstParameter = next(iter(parametersOf()), None) if callable(parametersOf) else None
    device = firstParameter.device if firstParameter is not None else torch.device("cpu")

    # Loop-invariant values: encode the prompt and the end marker only once.
    promptIds = tokenizer.encode(prompt)
    endOfTextIds = set(tokenizer.encode("<|endoftext|>"))
    safeTemperature = temperature if temperature > 0 else 1.0

    model.eval()

    with torch.no_grad():
        for _ in trange(entry_count):
            tensorGenerated = torch.tensor(promptIds, dtype=torch.long, device=device).unsqueeze(0)

            for _step in range(entry_length):
                # No labels: only the logits are needed, so skip the loss computation.
                outputFromModel = model(tensorGenerated)
                logitValues = outputFromModel.logits if hasattr(outputFromModel, "logits") else outputFromModel[0]
                logitValues = logitValues[:, -1, :] / safeTemperature

                # Nucleus filter: drop every token after the cumulative probability
                # passes top_p; the shift by one keeps the token that crosses it.
                sortedLogitValues, sortedIndexes = torch.sort(logitValues, descending=True)
                cumProbabilities = torch.cumsum(F.softmax(sortedLogitValues, dim=-1), dim=-1)
                sortedIndexesToBeRemoved = cumProbabilities > top_p
                sortedIndexesToBeRemoved[..., 1:] = sortedIndexesToBeRemoved[..., :-1].clone()
                sortedIndexesToBeRemoved[..., 0] = 0
                indices_to_remove = sortedIndexes[sortedIndexesToBeRemoved]
                logitValues[:, indices_to_remove] = negativeFilterValue

                upcomingToken = torch.multinomial(F.softmax(logitValues, dim=-1), num_samples=1)
                tensorGenerated = torch.cat((tensorGenerated, upcomingToken), dim=1)
                if upcomingToken.item() in endOfTextIds:
                    break

            # squeeze(0) + tolist() stays a list even for a single token and
            # works for tensors on any device.
            lyricsList.append(tokenizer.decode(tensorGenerated.squeeze(0).tolist()))

    return lyricsList

## Train the model, generate lyrics, and save them

In [9]:
# Prompt every generated lyric starts with, and how many lyrics to sample.
TEXT_FOR_GENERATING_LYRICS = "I love Deep Learning"
NUMBER_OF_LYRICS = 20

# Build the dataset. The first argument is the control code interpolated into
# the "<|code|>" prefix of every sample, so it must be a short string; passing
# the lyrics column itself would prepend its whole repr to every sample.
dataset = GetDataset("lyrics", gpt2_type="gpt2")

# Get the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Fine-tune the pretrained GPT-2 model on the dataset
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = TrainModel(dataset, model, tokenizer)

# Sampling runs on the CPU; move the model once instead of on every iteration.
model = model.to('cpu')
lyrics = [
    generate(model, tokenizer, TEXT_FOR_GENERATING_LYRICS, entry_count=1)
    for _ in range(NUMBER_OF_LYRICS)
]

# Each generate() call returns a one-element list; write and echo that element.
with open(generated_lyrics_file_path, 'w', encoding='utf-8') as f:
    for one_lyric in lyrics:
        lyric_text = one_lyric[0].strip()
        f.write('{} '.format(lyric_text))
        print(lyric_text)