In [None]:
!pip install transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm, trange

def loadData():
    df = pd.read_csv('/content/drive/MyDrive/Hackerland/lyrics-data.csv', engine = 'python', error_bad_lines = False)
    df = df[df['ALink'] == '/queen/']
    df = df.drop(columns=['language', 'ALink', 'SLink'])
    return df

df = loadData() 

In [5]:
class SongLyrics(Dataset):  
    def __init__(self, control, truncate = False, gpt2_type = "gpt2", max_length = 1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        for row in control:
          self.lyrics.append(torch.tensor(self.tokenizer.encode(f"<|{control}|>{row[:max_length]}<|endoftext|>")))               
        if truncate:
            self.lyrics = self.lyrics[:20000]
        self.lyrics_count = len(self.lyrics)
        
    def __len__(self):
        return self.lyrics_count

    def __getitem__(self, item):
        return self.lyrics[item]

In [6]:
dataset = SongLyrics(df['Lyric'], truncate=True, gpt2_type="gpt2")  

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [9]:
def train(dataset, model, tokenizer, batch_size=15, epochs=5, lr=2e-5, max_seq_len=400, warmupSteps=200, saveModel=False):
  
    device = torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmupSteps, num_training_steps=-1)

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss = 0
    accumulating_batch_count = 0
    inputTensor = None

    for epoch in range(epochs):
        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (inputTensor, carry_on, remainder) = pack_tensor(entry, inputTensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            inputTensor = inputTensor.to(device)
            outputs = model(inputTensor, labels=inputTensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            inputTensor = None

    if saveModel:
      torch.save(model.state_dict(), '/content/drive/MyDrive/Hackerland/modelWeights.pt')
            
    return model

In [11]:
model = train(dataset, model, tokenizer, epochs=50, saveModel=True)



Training epoch 0
0


253it [00:22, 11.29it/s]


Training epoch 1
tensor(1.4301, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:23, 10.97it/s]


Training epoch 2
tensor(1.2992, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.13it/s]


Training epoch 3
tensor(1.0317, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.17it/s]


Training epoch 4
tensor(1.2558, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.22it/s]


Training epoch 5
tensor(1.0312, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.09it/s]


Training epoch 6
tensor(0.0880, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.14it/s]


Training epoch 7
tensor(0.0516, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.42it/s]


Training epoch 8
tensor(1.3586, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.07it/s]


Training epoch 9
tensor(1.4163, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.24it/s]


Training epoch 10
tensor(0.8806, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.44it/s]


Training epoch 11
tensor(1.0319, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.07it/s]


Training epoch 12
tensor(1.4554, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.32it/s]


Training epoch 13
tensor(0.4170, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.33it/s]


Training epoch 14
tensor(0.6301, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.24it/s]


Training epoch 15
tensor(0.0573, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.46it/s]


Training epoch 16
tensor(0.7908, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.42it/s]


Training epoch 17
tensor(1.2079, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.31it/s]


Training epoch 18
tensor(0.4708, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.36it/s]


Training epoch 19
tensor(1.2126, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.26it/s]


Training epoch 20
tensor(0.3938, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.32it/s]


Training epoch 21
tensor(1.0436, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.41it/s]


Training epoch 22
tensor(0.0352, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:21, 11.59it/s]


Training epoch 23
tensor(0.7889, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.36it/s]


Training epoch 24
tensor(1.2401, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.30it/s]


Training epoch 25
tensor(1.0642, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.33it/s]


Training epoch 26
tensor(1.2627, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.32it/s]


Training epoch 27
tensor(1.2980, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.19it/s]


Training epoch 28
tensor(0.0515, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.19it/s]


Training epoch 29
tensor(1.0076, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.49it/s]


Training epoch 30
tensor(0.9389, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:23, 10.94it/s]


Training epoch 31
tensor(0.0573, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.18it/s]


Training epoch 32
tensor(0.9993, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.31it/s]


Training epoch 33
tensor(1.0173, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.29it/s]


Training epoch 34
tensor(0.9159, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.38it/s]


Training epoch 35
tensor(1.5416, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.08it/s]


Training epoch 36
tensor(1.3342, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.19it/s]


Training epoch 37
tensor(1.3106, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.07it/s]


Training epoch 38
tensor(1.1424, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.44it/s]


Training epoch 39
tensor(0.9841, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.36it/s]


Training epoch 40
tensor(1.2003, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.17it/s]


Training epoch 41
tensor(1.1629, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.43it/s]


Training epoch 42
tensor(1.6885, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.14it/s]


Training epoch 43
tensor(0.0578, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.23it/s]


Training epoch 44
tensor(0.1653, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.21it/s]


Training epoch 45
tensor(1.2531, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.31it/s]


Training epoch 46
tensor(0.7071, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.21it/s]


Training epoch 47
tensor(1.1306, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.18it/s]


Training epoch 48
tensor(0.8905, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.26it/s]


Training epoch 49
tensor(0.0423, device='cuda:0', grad_fn=<NllLossBackward0>)


253it [00:22, 11.25it/s]


In [21]:
def generate(model, tokenizer, prompt, entryCount=1, entryLength=75, topP=0.8, temp=1.0):

    model.eval()
    generatedList = []
    filterVal = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entryCount):

            entryFin = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entryLength):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temp if temp > 0 else 1.0)

                sortedLogits, sortedIndices = torch.sort(logits, descending=True)
                cumulativeProbs = torch.cumsum(torch.nn.functional.softmax(sortedLogits, dim=-1), dim=-1)

                sortedIndicesToRemove = cumulativeProbs > topP
                sortedIndicesToRemove[..., 1:] = sortedIndicesToRemove[
                    ..., :-1
                ].clone()
                sortedIndicesToRemove[..., 0] = 0

                indices_to_remove = sortedIndices[sortedIndicesToRemove]
                logits[:, indices_to_remove] = filterVal

                next_token = torch.multinomial(torch.nn.functional.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entryFin = True

                if entryFin:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generatedList.append(output_text)
                    break
            
            if not entryFin:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}" 
              generatedList.append(output_text)
                
    return generatedList[0]

In [25]:
generate(model.to('cpu'), tokenizer, "I miss you").replace("\n\n", " ")

100%|██████████| 1/1 [00:28<00:00, 28.51s/it]


'I miss you my friend!" To be honest I was the only one who really believed this was the real story, but the truth was I was in a rush and couldn\'t seem to stop it. When my mom sent me on a Monday to see me my sense of purpose was turning to gloom and sadness. I had just been crowned queen of the country and everything had just been thrown away. I couldn\'t take it anymore and I didn\'t want to go back to my'