In [1]:
from dataclasses import dataclass
import collections
from tqdm.auto import tqdm

import numpy as np
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import default_data_collator
from transformers import pipeline
from transformers import get_scheduler
import transformers
from datasets import load_dataset
import evaluate
from accelerate import Accelerator
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW


print('transformers', transformers.__version__)
print('pytorch', torch.version.__version__)

2023-01-26 20:01:28.673771: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


transformers 4.26.0
pytorch 1.13.0a0+git49444c3


In [2]:
@dataclass
class HyperParameters:
    chunk_size = 128
    model_name = 'gpt2'
    batch_size = 8
    learning_rate = 5e-5
    epochs = 6


params = HyperParameters()

In [3]:
model = AutoModelForCausalLM.from_pretrained(params.model_name)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(params.model_name)

In [5]:
text = 'This is an example text'

inputs = tokenizer(text, return_tensors='pt')
token_logits = model(**inputs).logits
print(token_logits.shape)

torch.Size([1, 5, 50257])


In [6]:
imdb_dataset = load_dataset('imdb')
imdb_dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [7]:
sample = imdb_dataset['train'].shuffle().select(range(3))

for row in sample:
    print(f"\n>>> Review: {row['text']}")
    print(f">>> Label: {row['label']}")


>>> Review: A wonderfully quirky film with enough twists for a sack of pretzels. Parker Posey plays Fay Grim as a sexy, vulnerable, loving mother who may or may not be what she seems. The story is very tongue in cheek, and the dialog skillfully understated. Hints of humor and intrigue, neither of which overpower the characterization Posey pulls off so well. The supporting cast is stellar. The downside? This film needs your full attention, almost to the point of stopping the film and taking notes. Posey has more sex appeal in her lifting of an eyebrow than most actresses have in their entire body. She's worth your time, even if you don't understand the denouement.
>>> Label: 1

>>> Review: The script for "Scary Movie 2" just wasn't ready to go. This is a problem with the film that is blatantly evident, to the actors and the audience alike. Director Keenan Ivory Wayans, and many of the actors are funny people; and so the movie isn't completely humorless. To their credit, the film has se

In [8]:
def tokenize_function(examples):
    return tokenizer(examples['text'])


tokenized_datasets = imdb_dataset.map(
    tokenize_function, batched=True, remove_columns=['text', 'label']
)
tokenized_datasets



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [9]:
tokenizer.model_max_length

1024

In [10]:
min([len(sample) for sample in tokenized_datasets['train']])

2

In [11]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

'>>> Review 0 length: 358'
'>>> Review 1 length: 284'
'>>> Review 2 length: 119'


In [12]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

'>>> Concatenated reviews length: 761'


In [13]:
print(concatenated_examples['input_ids'])

[40, 26399, 314, 3001, 327, 47269, 20958, 12, 56, 23304, 3913, 422, 616, 2008, 3650, 780, 286, 477, 262, 10386, 326, 11191, 340, 618, 340, 373, 717, 2716, 287, 15904, 13, 314, 635, 2982, 326, 379, 717, 340, 373, 12000, 416, 471, 13, 50, 13, 17112, 611, 340, 1683, 3088, 284, 3802, 428, 1499, 11, 4361, 852, 257, 4336, 286, 7328, 3177, 366, 3642, 46927, 1, 314, 1107, 550, 284, 766, 428, 329, 3589, 29847, 1671, 1220, 6927, 1671, 11037, 464, 7110, 318, 19254, 1088, 257, 1862, 14023, 10512, 3710, 3706, 44131, 508, 3382, 284, 2193, 2279, 673, 460, 546, 1204, 13, 554, 1948, 673, 3382, 284, 2962, 607, 32649, 507, 284, 1642, 617, 3297, 286, 11648, 319, 644, 262, 2811, 7289, 68, 1807, 546, 1728, 1964, 2428, 884, 355, 262, 10836, 1810, 290, 3234, 2428, 287, 262, 1578, 1829, 13, 554, 1022, 4737, 7602, 290, 8850, 2853, 44908, 286, 29679, 546, 511, 9317, 319, 4819, 11, 673, 468, 1714, 351, 607, 10512, 4701, 11, 28999, 11, 290, 6405, 1450, 29847, 1671, 1220, 6927, 1671, 11037, 2061, 12847, 502, 546, 3

In [14]:
total_length = ((total_length + params.chunk_size -1) // params.chunk_size) * params.chunk_size

chunks = {}
for k, t in concatenated_examples.items():
    t = np.pad(t, (0, total_length - len(t)))
    chunks[k] = [t[i : i + params.chunk_size] for i in range(0, total_length, params.chunk_size)]


print('input ids')
for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

input ids
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 128'


In [15]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = ((total_length + params.chunk_size -1) // params.chunk_size) * params.chunk_size
    result = {}
    for k, t in concatenated_examples.items():
        t = np.pad(t, (0, total_length - len(t)))
        result[k] = [t[i : i + params.chunk_size] for i in range(0, total_length, params.chunk_size)]
    result['labels'] = result['input_ids'].copy()
    return result


lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets



DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 58542
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 57203
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 117415
    })
})

In [16]:
tokenizer.decode(lm_datasets['train'][0]['input_ids'])

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues'

In [17]:
tokenizer.decode(lm_datasets['train'][0]['labels'])

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues'

In [18]:
print(lm_datasets['train'][0]['input_ids'])
print(lm_datasets['train'][0]['labels'])
print(lm_datasets['train'][0]['attention_mask'])

[40, 26399, 314, 3001, 327, 47269, 20958, 12, 56, 23304, 3913, 422, 616, 2008, 3650, 780, 286, 477, 262, 10386, 326, 11191, 340, 618, 340, 373, 717, 2716, 287, 15904, 13, 314, 635, 2982, 326, 379, 717, 340, 373, 12000, 416, 471, 13, 50, 13, 17112, 611, 340, 1683, 3088, 284, 3802, 428, 1499, 11, 4361, 852, 257, 4336, 286, 7328, 3177, 366, 3642, 46927, 1, 314, 1107, 550, 284, 766, 428, 329, 3589, 29847, 1671, 1220, 6927, 1671, 11037, 464, 7110, 318, 19254, 1088, 257, 1862, 14023, 10512, 3710, 3706, 44131, 508, 3382, 284, 2193, 2279, 673, 460, 546, 1204, 13, 554, 1948, 673, 3382, 284, 2962, 607, 32649, 507, 284, 1642, 617, 3297, 286, 11648, 319, 644, 262, 2811, 7289, 68, 1807, 546, 1728, 1964, 2428]
[40, 26399, 314, 3001, 327, 47269, 20958, 12, 56, 23304, 3913, 422, 616, 2008, 3650, 780, 286, 477, 262, 10386, 326, 11191, 340, 618, 340, 373, 717, 2716, 287, 15904, 13, 314, 635, 2982, 326, 379, 717, 340, 373, 12000, 416, 471, 13, 50, 13, 17112, 611, 340, 1683, 3088, 284, 3802, 428, 1499, 11

In [19]:
data_collator = default_data_collator

In [20]:
data_size = len(lm_datasets['train'])
train_size = int(0.9 * data_size)
test_size = data_size - train_size

downsampled_dataset = lm_datasets['train'].train_test_split(
    train_size=train_size, test_size=test_size,
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 52687
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5855
    })
})

In [21]:
train_dataloader = DataLoader(
    downsampled_dataset['train'],
    shuffle=True,
    batch_size=params.batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    downsampled_dataset['test'], batch_size=params.batch_size, collate_fn=default_data_collator
)

In [22]:
accelerator = Accelerator()
optimizer = AdamW(model.parameters(), lr=params.learning_rate)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [23]:
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = params.epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [24]:
def evaluation(model):
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(params.batch_size)))
        
    losses = torch.cat(losses)
    losses = losses[: len(downsampled_dataset['test'])]
    try:
        perplexity = torch.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")
    return perplexity


evaluation(model)

tensor(49.6785, device='cuda:0')

In [25]:
progress_bar = tqdm(range(num_training_steps))
output_dir = f'{params.model_name}-imdb'

for epoch in range(params.epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    perplexity = evaluation(model)
    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)

  0%|          | 0/39516 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 37.515811920166016
>>> Epoch 1: Perplexity: 36.330135345458984
>>> Epoch 2: Perplexity: 35.84846115112305
>>> Epoch 3: Perplexity: 35.65235900878906
>>> Epoch 4: Perplexity: 35.824527740478516
>>> Epoch 5: Perplexity: 35.863651275634766


In [26]:
model.to('cpu')
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [27]:
generator('I enjoy', max_length=128)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "I enjoy when movies are made so they can appeal to a wide audience, and this movie certainly delivers.<br /><br />But, this film goes beyond what most films can deliver by simply putting on a little bit of personality on top of the material. I think that it becomes realistic to the human nature of human beings, especially at a time when you are dealing with 9/11. <br /><br />As well as the message it tries to communicate, I appreciate the effort that they put into making this so convincing to the audience (I will say no more about the characters - they've left out a few of the"}]

In [28]:
generator('This movie', max_length=128)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "This movie was good for sure I hope it works as well for other people like me. This is a must see movie for all children.What can i say about an old, sweet movie? This one has it all: it has an excellent cast, all of them are wonderful. I have always thought of it as a movie that everyone should have had in their life, just because it's a classic. I don't think that I have much to say about Mr. Spielberg (himself at least) other than his amazing story of adventure. The characters are so different and believable and he takes this journey so far that even though it"}]