In [None]:
# Mount Google Drive so that the files under /content/drive are reachable
# from this notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import sys

# Make the packages installed in the Drive-hosted virtualenv importable.
# The membership test keeps the cell idempotent: re-running it does not add
# the same entry to sys.path a second time.
VENV_SITE_PACKAGES = '/content/drive/MyDrive/SloganGenerator/venv/lib/python3.10/site-packages'
if VENV_SITE_PACKAGES not in sys.path:
  sys.path.append(VENV_SITE_PACKAGES)

# Load the model

In [None]:
from transformers import BertTokenizer, BertLMHeadModel

# Checkpoint that both the tokenizer and the model weights are loaded from.
MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# is_decoder=True makes BERT use a causal (left-to-right) attention mask.
# Without it every position can attend to the very tokens it is trained to
# predict, which defeats language-model fine-tuning; the library prints a
# warning asking for this flag when it is missing.
model = BertLMHeadModel.from_pretrained(MODEL_NAME, is_decoder=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Create Dataset

In [None]:
import csv

import torch
from torch.utils.data import Dataset


class SloganDataset(Dataset):
  """Dataset of (context, slogan) pairs read from a two-column CSV file.

  Each example is stored as three lists of ``seq_length`` integers:

  * ``input_ids``: the pair encoded by ``tokenizer(context, slogan, ...)``,
    padded or truncated to ``seq_length``.
  * ``segments``: 0 for the context part (its special tokens included) and 1
    for everything after it, padding included.
  * ``labels``: the input id at slogan positions (closing separator included)
    and ``IGNORE_INDEX`` everywhere else, so only the slogan contributes to
    the loss. Labels are aligned with ``input_ids`` rather than pre-shifted,
    because ``BertLMHeadModel`` shifts them by one position itself.

  Segments and labels are derived from the tokenizer output (not from
  separately computed lengths), so they stay consistent with ``input_ids``
  even when a long pair is truncated.

  Args:
    filename: path of the CSV file; column 0 is the context, column 1 the
      slogan. Rows with fewer than two columns (e.g. blank lines) are skipped.
    tokenizer: callable that encodes a text pair and returns ``input_ids``,
      ``token_type_ids`` and ``attention_mask`` (e.g. ``BertTokenizer``).
    seq_length: fixed length every example is padded / truncated to.
  """

  # Label value skipped by the loss (the default ignore_index of PyTorch's
  # cross-entropy loss).
  IGNORE_INDEX = -100

  def __init__(self, filename, tokenizer, seq_length=64):
    self.examples = []
    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(filename, newline='') as csvfile:
      for row in csv.reader(csvfile):
        if len(row) < 2:
          # No (context, slogan) pair on this line.
          continue
        self.examples.append(self._encode_pair(tokenizer, row[0], row[1], seq_length))

  @staticmethod
  def _encode_pair(tokenizer, context, slogan, seq_length):
    """Encode one pair and return its ``(input_ids, segments, labels)`` lists."""
    encoded = tokenizer(context, slogan, truncation=True, padding='max_length', max_length=seq_length)
    input_ids = list(encoded['input_ids'])

    segments, labels = [], []
    for token_id, type_id, is_real in zip(input_ids, encoded['token_type_ids'], encoded['attention_mask']):
      in_slogan = bool(is_real) and type_id == 1
      # Padding is assigned to segment 1, like the slogan it follows.
      segments.append(type_id if is_real else 1)
      labels.append(token_id if in_slogan else SloganDataset.IGNORE_INDEX)
    return input_ids, segments, labels

  def __len__(self):
    return len(self.examples)

  def __getitem__(self, item):
    """Return example ``item`` as one integer tensor of shape (3, seq_length)."""
    return torch.tensor(self.examples[item])

# Build the dataset, then sanity-check it by printing the size of the first
# example (its three rows: input ids, segments, labels) followed by the number
# of examples in the dataset.
SLOGANS_CSV = '/content/drive/My Drive/SloganGenerator/dataset/slogans.csv'
slogan_dataset = SloganDataset(SLOGANS_CSV, tokenizer)

first_example = next(iter(slogan_dataset))
for size in (len(first_example), len(slogan_dataset)):
  print(size)

3
9520


In [None]:
import math, random

import torch
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

SEED = 42
VAL_FRACTION = 0.1  # share of the examples held out for validation
BATCH_SIZE = 16

# Create data indices for training and validation splits:
indices = list(range(len(slogan_dataset)))

random.seed(SEED)
random.shuffle(indices)

split = math.floor(VAL_FRACTION * len(slogan_dataset))
train_indices, val_indices = indices[split:], indices[:split]

# Build the PyTorch data loaders.
# SubsetRandomSampler draws its permutations from a torch generator, which
# random.seed() above does not affect. Giving each sampler its own seeded
# generator makes the batch order repeatable from one run to the next.
train_sampler = SubsetRandomSampler(train_indices, generator=torch.Generator().manual_seed(SEED))
val_sampler = SubsetRandomSampler(val_indices, generator=torch.Generator().manual_seed(SEED))

train_loader = DataLoader(slogan_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
val_loader = DataLoader(slogan_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)

# Training

In [None]:
import numpy as np
from tqdm import tqdm

def _run_epoch(model, batches, device, desc, optimizer=None):
  """Run one pass over ``batches`` and return the example-weighted mean loss.

  The model is updated after every batch when ``optimizer`` is given;
  otherwise the pass only measures the loss. Returns NaN when ``batches``
  yields nothing, instead of dividing by zero.
  """
  # When the model exposes a padding id, padded positions are masked out of
  # the attention; models without `config.pad_token_id` are called unmasked.
  pad_id = getattr(getattr(model, 'config', None), 'pad_token_id', None)

  weighted_loss = 0.0
  n_examples = 0

  for xb in tqdm(batches, desc=desc):
    # Move the batch to the training device:
    inputs = xb.to(device)
    input_ids = inputs[:, 0, :]

    extra = {}
    if pad_id is not None:
      extra['attention_mask'] = (input_ids != pad_id).long()

    if optimizer is not None:
      # Clear gradients before the backward pass so nothing left over from an
      # earlier run leaks into this step.
      model.zero_grad()

    # Call the model with the token ids, segment ids, and the ground truth (labels)
    outputs = model(input_ids, token_type_ids=inputs[:, 1, :], labels=inputs[:, 2, :], **extra)
    loss = outputs[0]

    if optimizer is not None:
      loss.backward()
      optimizer.step()

    weighted_loss += loss.item() * len(xb)
    n_examples += len(xb)

  return weighted_loss / n_examples if n_examples else float('nan')


def fit(model, optimizer, train_dl, val_dl, epochs=1, device=torch.device('cpu')):
  """Train ``model`` for ``epochs`` epochs, validating after each one.

  Args:
    model: called as ``model(input_ids, token_type_ids=..., labels=...)``
      (plus ``attention_mask`` when ``model.config.pad_token_id`` is set);
      element 0 of its output must be the loss.
    optimizer: optimizer stepped once per training batch.
    train_dl: iterable of training batches, each a tensor indexed as
      ``[:, 0, :]`` token ids, ``[:, 1, :]`` segment ids, ``[:, 2, :]`` labels.
    val_dl: iterable of validation batches with the same layout.
    epochs: number of passes over ``train_dl``.
    device: device every batch is moved to before the forward pass.

  Returns:
    A list with one ``(training cost, validation cost)`` tuple per epoch; each
    cost is the batch loss averaged with batch sizes as weights.
  """
  history = []

  for i in range(epochs):
    print('\n--- Starting epoch #{} ---'.format(i))

    model.train()
    train_cost = _run_epoch(model, train_dl, device, "Training", optimizer)

    # Now do the same thing for validation, without updates or gradients:
    model.eval()
    with torch.no_grad():
      val_cost = _run_epoch(model, val_dl, device, "Validation")

    print('\n--- Epoch #{} finished --- Training cost: {} / Validation cost: {}'.format(i, train_cost, val_cost))
    history.append((train_cost, val_cost))

  return history


In [None]:
import os

# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation.
from torch.optim import AdamW

# Move the model to the GPU:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune for two epochs.
# The learning rate is set explicitly: AdamW's default of 1e-3 is far above the
# 2e-5 to 5e-5 range normally used when fine-tuning BERT.
LEARNING_RATE = 5e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

fit(model, optimizer, train_loader, val_loader, epochs=2, device=device)


--- Starting epoch #0 ---


Training: 100%|██████████| 536/536 [02:02<00:00,  4.37it/s]
Validation: 100%|██████████| 60/60 [00:03<00:00, 15.09it/s]



--- Epoch #0 finished --- Training cost: 5.875601913637363 / Validation cost: 5.870087126723859

--- Starting epoch #1 ---


Training: 100%|██████████| 536/536 [02:00<00:00,  4.44it/s]
Validation: 100%|██████████| 60/60 [00:04<00:00, 14.99it/s]


--- Epoch #1 finished --- Training cost: 5.776770967061446 / Validation cost: 5.901931325928504





In [None]:
import os

# Save the model and its tokenizer to Drive so that they can be reloaded later.
dir_name = "/content/drive/MyDrive/SloganGenerator/models/bert"
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, with no separate existence check.
os.makedirs(dir_name, exist_ok=True)
model.save_pretrained(dir_name)
tokenizer.save_pretrained(dir_name)

('/content/drive/MyDrive/SloganGenerator/models/bert/tokenizer_config.json',
 '/content/drive/MyDrive/SloganGenerator/models/bert/special_tokens_map.json',
 '/content/drive/MyDrive/SloganGenerator/models/bert/vocab.txt',
 '/content/drive/MyDrive/SloganGenerator/models/bert/added_tokens.json')

# Generation

In [None]:
from transformers import BertTokenizer, BertLMHeadModel

# Reload the saved tokenizer and model from Drive.
dir_name = "/content/drive/MyDrive/SloganGenerator/models/bert"
tokenizer = BertTokenizer.from_pretrained(dir_name)
# is_decoder=True selects causal (left-to-right) attention, which is what
# token-by-token generation needs; the library warns when the flag is missing.
model = BertLMHeadModel.from_pretrained(dir_name, is_decoder=True)

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [None]:
# Sampling functions with top k and top p from HuggingFace:

import torch.nn.functional as F
from tqdm import trange


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        Filtering runs along the last dimension, so `logits` can be a single
        vocabulary-sized vector or have any number of leading batch dimensions.
        `logits` is modified in place and the same tensor is returned.

        Args:
            logits: logits distribution, shape (..., vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the smallest set of most probable tokens whose cumulative
                probability exceeds top_p (nucleus filtering); the most probable token
                is always kept.
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            filter_value: value written over every removed logit.
        Returns:
            The filtered `logits` tensor.
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a logit below the k-th largest one
        kth_largest = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_largest] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False

        # Scatter the mask back to the original (unsorted) token positions.
        # dim=-1 rather than dim=1 so that un-batched 1-D logits work as well.
        indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


# From HuggingFace, adapted to work with the context/slogan separation:
def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,
                    device='cpu'):
    """Extend `context` by `length` tokens, one token at a time, for `num_samples` sequences.

    Args:
        model: called as `model(input_ids=..., token_type_ids=...)`; element 0 of
            its output must hold the logits, indexed (sample, position, vocabulary).
        length: number of tokens to append. Exactly this many are generated;
            there is no early stop.
        context: list of token ids used as the prompt of every sample.
        segments_tokens: optional sequence of segment ids. Its first
            `len(context)` entries describe the prompt and the following ones the
            generated positions, so at least `len(context) + length - 1` entries
            are required.
        num_samples: number of sequences generated in parallel.
        temperature: divides the logits before sampling; 0 selects greedy decoding.
        top_k, top_p: forwarded to `top_k_top_p_filtering`.
        repetition_penalty: values above 1 make tokens already present in a
            sequence less likely to be chosen again; 1.0 disables the penalty.
        device: device on which the input tensors are created.

    Returns:
        Tensor of token ids with shape (num_samples, len(context) + length),
        prompt included.

    Raises:
        ValueError: if `segments_tokens` is too short for the requested length.
    """
    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    # Build the segment tensor once, on the same device as the token ids.
    segment_ids = None
    if segments_tokens is not None:
        longest_input = generated.shape[1] + length - 1
        if len(segments_tokens) < longest_input:
            raise ValueError(
                'segments_tokens has {} entries but {} are needed to generate {} tokens'.format(
                    len(segments_tokens), longest_input, length))
        segment_ids = torch.tensor(segments_tokens, dtype=torch.long, device=device)

    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            if segment_ids is not None:
                current_length = generated.shape[1]
                inputs['token_type_ids'] = segment_ids[:current_length].unsqueeze(0).repeat(num_samples, 1)

            outputs = model(**inputs)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            if repetition_penalty != 1.0:
                for i in range(num_samples):
                    for token_id in set(generated[i].tolist()):
                        # A penalty must always lower the score: positive logits
                        # are divided, negative ones multiplied (dividing a
                        # negative logit would raise it instead).
                        if next_token_logits[i, token_id] < 0:
                            next_token_logits[i, token_id] *= repetition_penalty
                        else:
                            next_token_logits[i, token_id] /= repetition_penalty

            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0: # greedy sampling:
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


In [None]:
import torch
context = "Olivetti S.p.A. is an Italian manufacturer of computers, tablets, smartphones, printers and other such business products as calculators and fax machines that designed the first personal computer."

MAX_NEW_TOKENS = 20  # tokens generated after the context
NUM_SLOGANS = 20     # slogans sampled for the context

input_ids = tokenizer.encode(context, padding=False)

# Annotate each token with its corresponding segment, using the same layout as
# the training examples: 0 for the whole encoded context (special tokens
# included) and 1 for every position that will hold a generated token.
segments = [0] * len(input_ids) + [1] * MAX_NEW_TOKENS

# Move the model back to the CPU for inference:
device = torch.device('cpu')
model.to(device)
model.eval()

# Generate NUM_SLOGANS samples of MAX_NEW_TOKENS tokens each
generated = sample_sequence(model, length=MAX_NEW_TOKENS, context=input_ids, segments_tokens=segments, num_samples=NUM_SLOGANS, device=device)

print('\n\n--- Generated Slogans ---\n')

for g in generated:
  # Keep only what was generated after the context ...
  new_tokens = g[len(input_ids):].tolist()
  # ... and stop at the first generated [SEP], which closes the slogan.
  if tokenizer.sep_token_id in new_tokens:
    new_tokens = new_tokens[:new_tokens.index(tokenizer.sep_token_id)]
  slogan = tokenizer.decode(new_tokens, skip_special_tokens=True)
  print(slogan)

100%|██████████| 20/20 [00:16<00:00,  1.23it/s]



--- Generated Slogans ---

 
 romeo information. home see healthy you to s fan 
 
 
. exceptional humans.. life america into 
 us of renault, 
 a todd 
 and on 
... the with the since mel care verbal 
s 
. em ba day advantage be your. 
. is alleviate sox happy have a victor serious donerit perfection se you gladly fitness 
 mind design treats you possible best. 
! as your feel work! demand growth and. it. dull in customers we your 
. official taylor a uniquely powerful. line. best 
 the count like 
 are 
. 
 li. medical to arehot down shine 
 you your the, made straight. 





In [None]:
# Write the model currently in memory back to the Drive folder.
dir_name = "/content/drive/MyDrive/SloganGenerator/models/bert"
# exist_ok=True creates the directory when it is missing and does nothing when
# it is already there, with no separate existence check.
os.makedirs(dir_name, exist_ok=True)
model.save_pretrained(dir_name)