In [5]:
%%capture
!pip install tokenizers

In [6]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from pathlib import Path
import random
import math
from tqdm import tqdm

from tokenizers import Tokenizer
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
config = {
    'data_file_path': '/content/drive/My Drive/Notebooks/lyricalGPT/df_eng_pop_2015.csv',
    'context_size': 64, # must be < max_token_length
    'min_token_length': 100,
    'max_token_length': 500,
    'batch_size': 256,
    'epochs': 100,
    'n_heads': 8,
    'n_layers': 6,
    'embed_size': 512,
    'ff_size': 2048,
    'dropout': 0.1,
    'lr': 3e-4,
    'preload': 0,
    'model_folder': 'lyricalGPT',
    'model_basename': 'lyricalGPT',
    'model_filename': 'lyricalGPT20.pt',
    'tokenizer_file': 'tokenizer_{0}.json',
}

In [9]:
def prepare_data(file_path):
    df = pd.read_csv(file_path, header=0)
    data = [df['lyrics'][i] for i in range(len(df))]
    return data

In [10]:
#text = prepare_data(config['data_file_path'])

In [11]:
#len(text)

In [12]:
#text[0]

In [13]:
def retrieve_lyric(data):
    for i in range(len(data)):
        yield data[i]

def build_tokenizer(config, raw_data):
    tokenizer_path = Path(config["tokenizer_file"])
    if not Path.exists(tokenizer_path):
        tokenizer = Tokenizer(BPE(unk_token="<UNK>"))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = BpeTrainer(special_tokens=["<UNK>", "<PAD>", "<SOS>", "<EOS>"])
        tokenizer.train_from_iterator(
            retrieve_lyric(raw_data), trainer=trainer
        )
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))

    return tokenizer

In [14]:
#tokenizer = build_tokenizer(config, text)

In [15]:
#print("Number of tokens:", tokenizer.get_vocab_size())

In [16]:
#z = tokenizer.encode(text[0])
#print(z.ids)

In [17]:
#print(tokenizer.decode(z.ids))

In [18]:
"""
max_seq_len = 0
min_seq_len = 10000
for t in text:
    seq_len = len(tokenizer.encode(t).ids)
    if seq_len > max_seq_len:
        max_seq_len = seq_len

    if seq_len < min_seq_len:
        min_seq_len = seq_len
print("Max sequence length: ", max_seq_len)
print("Min sequence length: ", min_seq_len)
"""

'\nmax_seq_len = 0\nmin_seq_len = 10000\nfor t in text:\n    seq_len = len(tokenizer.encode(t).ids)\n    if seq_len > max_seq_len:\n        max_seq_len = seq_len\n\n    if seq_len < min_seq_len:\n        min_seq_len = seq_len\nprint("Max sequence length: ", max_seq_len)\nprint("Min sequence length: ", min_seq_len)\n'

In [19]:
#print("original length: ", len(text))
#new_text = [t for t in text if len(tokenizer.encode(t).ids) <= 500 and len(tokenizer.encode(t).ids) > 100]
#print("new length: ", len(new_text))

In [20]:
#tokenizer.token_to_id("<SOS>")

In [21]:
class LyricsDataset(Dataset):
    def __init__(self, data, tokenizer, context_size):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.context_size = context_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        lyric = self.data[idx]
        input_ids = self.tokenizer.encode(lyric).ids

        size = len(input_ids)-self.context_size
        random_number = random.randint(0, size - 1)
        x = torch.tensor(input_ids[random_number:random_number+self.context_size],dtype=torch.long)
        y = torch.tensor(input_ids[random_number+1:random_number+self.context_size+1],dtype=torch.long)

        return x, y

In [22]:
#dataset = LyricsDataset(new_text, tokenizer, config['context_size'])

In [23]:
#len(dataset)

In [24]:
#x, y = dataset[0]

In [25]:
#x

In [26]:
#y

In [27]:
#print(tokenizer.decode(x.tolist()))

In [28]:
#print(tokenizer.decode(y.tolist()))

In [29]:
def build_dataloader_and_tokenizers(config):
    text = prepare_data(config['data_file_path'])
    tokenizer = build_tokenizer(config, text)
    new_text = [t for t in text if len(tokenizer.encode(t).ids) > config['min_token_length']]
    train_size = int(0.9 * len(new_text))
    val_size = len(new_text) - train_size
    raw_train, raw_val = random_split(new_text, [train_size, val_size])

    train = LyricsDataset(
        raw_train,
        tokenizer,
        config["context_size"],
    )
    val = LyricsDataset(
        raw_val,
        tokenizer,
        config["context_size"],
    )

    train_dataloader = DataLoader(train, batch_size=config["batch_size"], shuffle=True)
    val_dataloader = DataLoader(val, batch_size=config["batch_size"], shuffle=True)

    return train_dataloader, val_dataloader, tokenizer

In [30]:
#train, val, tokenizer = build_dataloader_and_tokenizers(config)

In [31]:
#train.dataset[0]

In [32]:
#x,y = train.dataset[355]

In [33]:
#print(len(x), len(y))

In [34]:
#x.shape

In [35]:
class Head(nn.Module):
  def __init__(self, embed_size: int, head_size: int, context_size: int, dropout: float):
    super().__init__()
    self.query = nn.Linear(embed_size, head_size)
    self.key = nn.Linear(embed_size, head_size)
    self.value = nn.Linear(embed_size, head_size)
    self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    # (batch_size, context_size, embed_size) --> (batch_size, context_size, head_size)

    # (batch_size, context_size, head_size)
    q, k, v = self.query(x), self.key(x), self.value(x)

    # (batch_size, context_size, head_size) @ (batch_size, head_size, context_size) --> (batch_size, context_size, context_size)
    attention_scores = (q @ k.transpose(-2,-1)) / math.sqrt(k.shape[-1])
    attention_scores = attention_scores.masked_fill(self.tril[:, :] == 0, float('-inf'))
    attention_scores = F.softmax(attention_scores, dim=-1)
    attention_scores = self.dropout(attention_scores)

    # (batch_size, context_size, context_size) @ (batch_size, context_size, head_size) --> (batch_size, context_size, head_size)
    out = attention_scores @ v
    return out

In [36]:
class MultiHeadAttention(nn.Module):
  def __init__(self, embed_size: int, head_size: int, n_heads: int, context_size: int, dropout: float):
    super().__init__()
    self.heads = nn.ModuleList([Head(embed_size, head_size, context_size, dropout) for _ in range(n_heads)])
    self.linear = nn.Linear(head_size * n_heads, embed_size)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(out)
    return out

In [37]:
class FeedForwardBlock(nn.Module):
  def __init__(self, embed_size: int, ff_size: int, dropout: float) -> None:
    super().__init__()
    self.linear_1 = nn.Linear(embed_size, ff_size)
    self.dropout = nn.Dropout(dropout)
    self.linear_2 = nn.Linear(ff_size, embed_size)

  def forward(self, x):
    # (batch_size, context_size, embed_size) --> (batch_size, context_size, ff_size) --> (batch_size, context_size, embed_size)
    return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [38]:
class DecoderBlock(nn.Module):
  def __init__(self, embed_size: int, n_heads: int, context_size: int, ff_size: int, dropout: float) -> None:
    super().__init__()
    assert embed_size % n_heads == 0, "embed_size is not divisible by n_heads"
    head_size = embed_size // n_heads
    self.multi_head_attention = MultiHeadAttention(embed_size, head_size, n_heads, context_size, dropout)
    self.feed_forward = FeedForwardBlock(embed_size, ff_size, dropout)
    self.lnorm = nn.ModuleList([nn.LayerNorm(embed_size) for _ in range(2)])

  def forward(self, x):
    x = x + self.multi_head_attention(self.lnorm[0](x))
    x = x + self.feed_forward(self.lnorm[1](x))
    return x

In [39]:
class GPTModel(nn.Module):
  def __init__(self, vocab_size: int, embed_size: int, n_heads: int, context_size: int, ff_size: int, n_layers: int, dropout: float) -> None:
    super().__init__()
    self.embeds = nn.Embedding(vocab_size, embed_size)
    self.pos_embeds = nn.Embedding(context_size, embed_size)
    self.decoder = nn.Sequential(*[DecoderBlock(embed_size, n_heads, context_size, ff_size, dropout) for _ in range(n_layers)])
    self.fnorm = nn.LayerNorm(embed_size)
    self.linear = nn.Linear(embed_size, vocab_size)

  def forward(self, inputs, targets=None):
    batch_size, context_size = inputs.shape

    embeds = self.embeds(inputs)
    pos_embeds = self.pos_embeds(torch.arange(context_size).to(inputs.device))
    x = embeds + pos_embeds
    x = self.decoder(x)
    x = self.fnorm(x)
    logits = self.linear(x)

    if targets is not None:
      batch_size, context_size, embed_size = logits.shape
      logits = logits.view(batch_size*context_size, embed_size)
      targets = targets.view(batch_size*context_size)
      loss = F.cross_entropy(logits, targets)
    else:
      loss = None

    return logits, loss

In [40]:
def get_weights_file_path(config, epochs: str):
  model_basename = config['model_basename']
  model_filename = f"{model_basename}{epochs}.pt"
  return str(Path('.') / model_filename)

In [41]:
def estimate_val_loss(model, val, tokenizer, device):
  model.eval()
  running_vloss = 0

  with torch.no_grad():
    for i,(xb, yb) in enumerate(val):
      logits, loss = model(xb.to(device), yb.to(device))
      running_vloss += loss

  avg_vloss = running_vloss / (i + 1)
  print('LOSS valid {}'.format(avg_vloss))

In [42]:
def train_model(config):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f'Using device {device}')

  train, val, tokenizer = build_dataloader_and_tokenizers(config)
  model = GPTModel(tokenizer.get_vocab_size(), config['embed_size'], config['n_heads'], config['context_size'], config['embed_size']*4, config['n_layers'], config['dropout'])
  model.to(device)

  optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])

  for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

  initial_epoch = 0
  global_step = 0
  if config['preload']:
    model_filename = get_weights_file_path(config, config['preload'])
    print(f'Preloading model: {model_filename}')
    state = torch.load(model_filename)
    initial_epoch = state['epoch'] + 1
    optimizer.load_state_dict(state['optimizer_state_dict'])
    model.load_state_dict(state['model_state_dict'])
    global_step = state['global_step']

  for epoch in range(initial_epoch, config['epochs']):
    model.train()
    batch_iterator = tqdm(train, desc=f'Processing epoch {epoch:02d}')
    for (xb, yb) in batch_iterator:
      logits, loss = model(xb.to(device), yb.to(device))

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      global_step += 1

    estimate_val_loss(model, val, tokenizer, device)
    print("LOSS train: ", loss)

    # Save the model
    model_filename = get_weights_file_path(config, f'{epoch:02d}')
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'global_step': global_step,
    }, model_filename)

In [43]:
#train_model(config)

In [44]:
def generate(config, max_new_tokens: int = 500):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print("Using device:", device)

  tokenizer = Tokenizer.from_file(str(Path('./'+config['tokenizer_file'])))

  model = GPTModel(tokenizer.get_vocab_size(), config['embed_size'], config['n_heads'], config['context_size'], config['embed_size']*4, config['n_layers'], config['dropout'])
  model.to(device)

  state = torch.load('./'+config['model_filename'])
  model.load_state_dict(state['model_state_dict'])

  model.eval()

  inputs = torch.randint(100,1000,(config['context_size'], config['context_size']), dtype=torch.long, device=device)
  with torch.no_grad():
    for _ in range(max_new_tokens):
      inputs_cropped = inputs[:, -config['context_size']:]
      logits, _ = model(inputs_cropped.to(device), None)
      logits = logits[:, -1, :]
      probs = F.softmax(logits, dim=-1)

      inputs_next = torch.multinomial(probs, 1)
      inputs = torch.cat((inputs, inputs_next), dim=1)

  return tokenizer.decode(inputs.squeeze()[0].tolist())

In [47]:
x = generate(config)

Using device: cuda


In [48]:
len(x)

2562

In [49]:
x[0:600]

'babe feet wear hell coming hold head Of ocean sorry play Santa somewhere Show hands kids Get Without Mr green longer pass alive comin won d again hundred against learned mad past minute against walked floor played alive y believe second wasn hair wish ago bring break His goes beat used cool until Black cause doin Before funny Its dance took men fast pretty keep The down So by Him what arms strong me cheese my gentle or hard Speaking blurred now window or down . Mix now real me thinking talking unafraid my dies " window As jammed old hurts on it weep into take turns all dusk in hear on My Alway'

In [50]:
x[700:1000]

'ear kind hard skull to hard rain hear end or With loved Is home on it hard WOULD His you mass You hard Going he home on it were to As [ hard leavin yellow Wings to Gives at black some me when she ever not need on My equal aches hear around Sweeter hear city you hurts Oh now hear monkey hard instead '