## Install Dependencies

In [34]:
# Install transformers
!pip install transformers

# Install openpyxl
!pip install openpyxl

## Import Libraries

In [35]:
# Transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup

# Torch
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Numpy and pandas
import numpy as np
import pandas as pd

# Helper libraries
import random,os
from tqdm import tqdm, trange
import csv

## Read Data (lyrics) from gloom_index.xlsx file

In [36]:
# Where the generated lyrics are written, and the spreadsheet the training lyrics are read from
generated_lyrics_file_path = "/content/generated_lyrics.txt"
lyrics_file_path = '/content/gloom_index.xlsx'

In [37]:
# Load the spreadsheet and keep only the 'lyrics' column (still as a one-column DataFrame)
data = pd.read_excel(lyrics_file_path)[['lyrics']]

## Build the training dataset from the lyrics

In [38]:
class Generate_data(Dataset):
    """Dataset of GPT-2 token-id tensors, one tensor per lyric.

    Every lyric is wrapped as ``<|{control_code}|>{lyric}<|endoftext|>`` and
    encoded with the tokenizer loaded from ``gpt2_type``.

    Args:
        control_code: Value interpolated verbatim (via an f-string) into the
            ``<|...|>`` prefix of every sample. Pass a short string; any other
            object contributes its full ``str()`` form to each sample.
        gpt2_type: Name or path given to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of *characters* kept from each lyric before
            it is tokenised.
        lyrics: Optional iterable of lyric strings. When omitted, the
            notebook-level ``data['lyrics']`` column is used.

    Attributes:
        tokenizer: The loaded GPT-2 tokenizer.
        lyrics: List of 1-D token-id tensors.
        lyrics_count: Number of encoded lyrics.
        skipped_count: Number of rows ignored because they were not text.
    """

    def __init__(self, control_code, gpt2_type="gpt2", max_length=1024, lyrics=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []
        self.skipped_count = 0

        # Fall back to the notebook-global DataFrame so existing callers keep working.
        rows = data['lyrics'] if lyrics is None else lyrics

        for row in rows:
            # Empty spreadsheet cells arrive as NaN (a float) and cannot be sliced;
            # skip them explicitly instead of hiding every error behind a bare except.
            if not isinstance(row, str):
                self.skipped_count += 1
                continue
            sample_text = f"<|{control_code}|>{row[:max_length]}<|endoftext|>"
            self.lyrics.append(torch.tensor(self.tokenizer.encode(sample_text)))

        self.lyrics_count = len(self.lyrics)
        if self.skipped_count:
            print("Skipped {} non-text lyric rows".format(self.skipped_count))

    def __len__(self):
        """Return the number of encoded lyrics."""
        return self.lyrics_count

    def __getitem__(self, item):
        """Return the token-id tensor stored at position ``item``."""
        return self.lyrics[item]

## Pack Tensor

In [39]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the running ``packed_tensor``.

    Both tensors are indexed along dimension 1 (sequence length).

    Returns a ``(tensor, merged, leftover)`` triple:
      * nothing packed yet   -> ``(new_tensor, True, None)``
      * combined length over ``max_seq_len`` -> ``(packed_tensor, False, new_tensor)``
      * otherwise            -> ``(new_tensor ++ packed_tensor[:, 1:], True, None)``
    """
    # First tensor of a pack: it simply becomes the pack.
    if packed_tensor is None:
        return new_tensor, True, None

    combined_length = new_tensor.size(1) + packed_tensor.size(1)
    fits = combined_length <= max_seq_len
    if not fits:
        # Hand the pack back untouched, together with the tensor that did not fit.
        return packed_tensor, False, new_tensor

    # Prepend the new tensor; the first column of the existing pack is dropped.
    merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
    return merged, True, None

## Train using dataset and model

In [40]:
def train(dataset, model, tokenizer, batch_size=32, epochs=10, lr=1e-4, max_seq_len=400, warmup_steps=200, gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar", test_mode=False):
    """Fine-tune ``model`` on ``dataset`` using packed sequences and gradient accumulation.

    Samples are read one at a time, packed together with ``pack_tensor`` until
    the next one would exceed ``max_seq_len`` tokens, and each pack is run as a
    single forward/backward pass. The optimizer steps once every ``batch_size``
    packs; gradients still pending when training ends are applied in a final step.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Language model called as ``model(input_ids, labels=input_ids)``
            whose first output is the loss.
        tokenizer: Not used by this function; kept for interface compatibility.
        batch_size: Number of packs whose gradients are accumulated per
            optimizer step (values below 1 are treated as 1).
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate.
        max_seq_len: Maximum packed sequence length, in tokens.
        warmup_steps: Number of optimizer steps of linear warm-up.
        gpt2_type, output_dir, output_prefix, test_mode: Not used by this
            function; kept for interface compatibility.

    Returns:
        The trained model, left on the device it was trained on.
    """
    # Use the GPU when there is one, but do not crash on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    accumulation_steps = max(1, batch_size)
    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    last_idx = len(train_dataloader) - 1

    # Upper bound on optimizer steps (packing can only reduce the number of passes).
    # The scheduler needs a positive total: with a negative value the linear
    # schedule clamps the learning rate to 0 as soon as warm-up is over.
    total_steps = max(1, -(-(epochs * len(train_dataloader)) // accumulation_steps))

    # NOTE(review): newer transformers releases deprecate their own AdamW in favour of
    # torch.optim.AdamW - confirm against the installed version.
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    last_loss = 0  # plain float for logging, so no autograd graph is kept alive
    batch_count_so_far = 0
    optimizer.zero_grad()

    for epoch in range(epochs):
        print("Training epoch: {} with loss: {}".format(epoch, last_loss))
        input_tensor = None

        for idx, entry in enumerate(tqdm(train_dataloader)):
            # Pack the tensor; `leftover` is the sample that did not fit, if any.
            input_tensor, carry_on, leftover = pack_tensor(entry, input_tensor, max_seq_len)

            # Keep packing until the pack is full or the data runs out.
            if carry_on and idx != last_idx:
                continue

            packs_to_run = [input_tensor]
            if leftover is not None and idx == last_idx:
                # Last sample of the epoch did not fit: train on it as its own pack.
                packs_to_run.append(leftover)
                leftover = None

            for pack in packs_to_run:
                pack = pack.to(device)
                outputs = model(pack, labels=pack)
                loss = outputs[0]
                loss.backward()
                last_loss = loss.item()

                batch_count_so_far += 1
                if batch_count_so_far % accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            # The sample that did not fit starts the next pack instead of being dropped.
            input_tensor = leftover

    # Apply gradients accumulated since the last optimizer step.
    if batch_count_so_far % accumulation_steps != 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return model

## Generate lyrics: the `generate` function returns lyrics for an input prompt

In [41]:
def generate(model, tokenizer, prompt, entry_count=10, entry_length=20, top_p=0.8, temperature=1):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Language model whose first output, when called without labels,
            is the logits tensor indexed as ``[batch, position, vocab]``.
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        prompt: Text every generated entry starts with.
        entry_count: Number of entries to generate.
        entry_length: Maximum number of tokens sampled per entry.
        top_p: Cumulative-probability threshold for nucleus sampling.
        temperature: Logits are divided by this value; non-positive values are
            treated as 1.0.

    Returns:
        A list of ``entry_count`` decoded strings (prompt included). An entry
        ends early once ``<|endoftext|>`` is sampled.

    Raises:
        ValueError: If ``prompt`` encodes to no tokens.
    """
    model.eval()
    lyrics_list = []
    filter_value = -float("Inf")

    # Build inputs on the model's own device so generation works on CPU and GPU alike.
    device = next(model.parameters()).device
    prompt_ids = torch.tensor(tokenizer.encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
    if prompt_ids.numel() == 0:
        raise ValueError("prompt must encode to at least one token")

    # Loop invariants: the end-of-text id(s) and the effective temperature.
    end_of_text_ids = set(tokenizer.encode("<|endoftext|>"))
    effective_temperature = temperature if temperature > 0 else 1.0

    with torch.no_grad():
        for _ in trange(entry_count):
            generated = prompt_ids.clone()

            for _step in range(entry_length):
                # No labels are passed: only the logits are needed, not a loss.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / effective_temperature

                # Nucleus filtering: sort by score and mask tokens once the cumulative
                # probability exceeds top_p.
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                # Shift the mask right so the token that crosses the threshold is kept,
                # and never remove the single most likely token.
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)
                if next_token.item() in end_of_text_ids:
                    break

            # Decode whatever was produced, whether or not end-of-text was reached.
            lyrics_list.append(tokenizer.decode(generated[0].tolist()))

    return lyrics_list

## Train the model, invoke `generate` and get the lyrics

In [42]:
# Invoke generate method along with model to get different lyrics on string 'I love deep Learning'
TEXT_FOR_GENERATING_LYRICS = "I love Deep Learning"

# Short tag placed inside the <|...|> prefix of every training sample.
# (Passing the whole lyrics column here would embed its printed form in each sample.)
CONTROL_CODE = "lyrics"
NUMBER_OF_LYRICS = 20
SEED = 42

# Seed every random number generator in use so a re-run reproduces the same output
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Get the dataset
dataset = Generate_data(CONTROL_CODE, gpt2_type="gpt2")

# Get the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Get the model using the dataset and tokenizer from pretrained model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = train(dataset, model, tokenizer)

# Move the model to the CPU once, then sample all lyrics in a single call
model = model.to('cpu')
generated_texts = generate(model, tokenizer, TEXT_FOR_GENERATING_LYRICS, entry_count=NUMBER_OF_LYRICS)

# Keep `lyrics` as a list of one-element lists, the shape this cell has always produced
lyrics = [[text] for text in generated_texts]

# Write with an explicit encoding so non-ASCII lyrics are saved the same way everywhere
with open(generated_lyrics_file_path, 'w', encoding='utf-8') as f:
    for one_lyric in lyrics:
        f.write('{} '.format(one_lyric[0].strip()))
        print(one_lyric[0].strip())