# Train models

In [4]:
import os
import random

from datasets import load_dataset, DatasetDict, load_from_disk
from tqdm.notebook import tqdm
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

## Load data

In [5]:
def load_data_in_splits(data_dir, train=0.8, val=0.1, test=0.1, seed=None):
    """Load a dataset from ``data_dir`` and split it into train/val/test.

    The 'train' split returned by ``load_dataset`` is first divided into a
    training part and a held-out part of relative size ``1 - train``. The
    held-out part is then divided between validation and test in the ratio
    ``val : test``. Only that ratio matters: ``val + test`` is not required
    to equal ``1 - train``.

    Args:
        data_dir: Location passed straight to ``load_dataset``.
        train: Fraction of all examples kept for training; must be in (0, 1).
        val: Relative weight of the validation split; must be > 0.
        test: Relative weight of the test split; must be > 0.
        seed: Optional seed forwarded to both ``train_test_split`` calls so
            the split can be reproduced after a kernel restart. ``None``
            (the default) keeps the previous, unseeded behaviour.

    Returns:
        A ``DatasetDict`` with the keys 'train', 'val' and 'test'.

    Raises:
        ValueError: If the fractions cannot describe a three-way split.
    """
    # Validate before touching the disk so that bad arguments fail fast and
    # with a readable message (val == test == 0 used to surface as a
    # ZeroDivisionError only after the dataset had been loaded).
    if not 0 < train < 1:
        raise ValueError(f"train must be in the open interval (0, 1), got {train!r}")
    if val <= 0 or test <= 0:
        raise ValueError(
            f"val and test must both be positive, got val={val!r}, test={test!r}"
        )

    data = load_dataset(data_dir)
    # First cut: training data vs. everything held out.
    train_valtest = data['train'].train_test_split(test_size=1 - train, seed=seed)
    # Second cut: share the held-out part between validation and test.
    test_valid = train_valtest['test'].train_test_split(
        test_size=test / (val + test), seed=seed
    )
    return DatasetDict({
        'train': train_valtest['train'],
        'val': test_valid['train'],
        'test': test_valid['test'],
    })

# Directory holding the cleaned, chunked COCA spoken-text files.
coca_dir = "../data/coca_spoken/text_chunk_cleaned/"

# 80 % train, the remaining 20 % shared equally between validation and test.
coca_dsdict = load_data_in_splits(coca_dir, train=0.8, val=0.1, test=0.1)
coca_dsdict

Resolving data files:   0%|          | 0/23 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 470771
    })
    val: Dataset({
        features: ['text'],
        num_rows: 58846
    })
    test: Dataset({
        features: ['text'],
        num_rows: 58847
    })
})

In [6]:
# Sanity check: show the column names and one randomly drawn example per split.
print(coca_dsdict['train'].column_names)
for split_name in ('train', 'val', 'test'):
    random_example = random.choice(coca_dsdict[split_name])
    print(split_name, random_example)

['text']
train {'text': 'that . SOUNDBITE OF TELEPHONE RINGING DAVID SIMON , AUTHOR , " HOMICIDE , " WRITER AND PRODUCER FOR " HOMICIDE : LIFE ON THE STREET " : My name \'s David Simon . I wrote the book " Homicide , " non-fictional account that the TV show \'s based on , and I \'m also a writer and producer with the show @!HANSEN Why is this TV series more like an epic novel ? SIMON : Oh , I never really thought of it in those terms . I never really thought of it in epic novel terms @!HANSEN In 1988 , former " Baltimore Sun " reporter , David Simon , wrote the book , " Homicide : A Year on the Killing Streets . " Movie director Barry Levinson , screenwriter Paul Attanasio , and television producer Tom Fontana adapted his real-life stories of Baltimore detectives to the small screen @!SIMON I think one of the things that appeals to me about the show is it \'s almost a demythification of police in the sense that all the cop shows prior , in some sense , have been'}
val {'text': 'bring i

## Tokenize (or load tokenized data)

In [13]:
# Reuse the tokenizer published with the "gpt2" checkpoint.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Maximum number of tokens per encoded example; passed as max_length when tokenizing.
context_length = 1024

In [14]:
# Quick overview of the tokenizer's key properties.
tokenizer_facts = {
    "Vocabulary size:": tokenizer.vocab_size,
    "Max Model Input Sizes:": tokenizer.model_max_length,
    "Padding token:": tokenizer.pad_token,
    "Special tokens:": tokenizer.all_special_tokens,
}
for label, value in tokenizer_facts.items():
    print(label, value)

Vocabulary size: 50257
Max Model Input Sizes: 1024
Padding token: None
Special tokens: ['<|endoftext|>']


In [15]:
# Directory the tokenized DatasetDict is saved to and later reloaded from.
tokenized_data_path = '../data/coca_spoken/tokens_chunk/'

In [17]:
# Tokenize every split and persist the result so later sessions can skip this step.
def tokenize_chunk(chunk):
    """Tokenize a batch of text chunks, capping each at ``context_length`` tokens."""
    return tokenizer(
        chunk['text'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )


# NOTE(review): in the recorded run the row counts before and after mapping are
# identical, i.e. no chunk overflowed context_length. If longer chunks are ever
# added, return_overflowing_tokens may yield more rows than inputs while the
# original 'text' column is kept -- confirm that map() copes with that.
encoded_datasets = coca_dsdict.map(tokenize_chunk, batched=True)
encoded_datasets.save_to_disk(tokenized_data_path)

Map:   0%|          | 0/470771 [00:00<?, ? examples/s]

Map:   0%|          | 0/58846 [00:00<?, ? examples/s]

Map:   0%|          | 0/58847 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/470771 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/58846 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/58847 [00:00<?, ? examples/s]

In [18]:
# Load pretokenized data: reads back the DatasetDict stored under
# tokenized_data_path, so the tokenization cell does not have to be re-run.
encoded_datasets = load_from_disk(tokenized_data_path)

In [19]:
# Show the overall structure followed by the first encoded training example.
print(encoded_datasets, encoded_datasets['train'][0], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 470771
    })
    val: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 58846
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 58847
    })
})
{'text': 'risk worth taking . So , I think , in two years , let everybodys taxes go up , and so we can have some sort of fiscal future . @!MARK-SHIELDS : Who are we kidding -- two years , let them go up ? They were going to go up after 10 years . Thats -- the Bush tax cuts were supposed to expire , have to expire , by law , at 10 years . All right ? Now , were on the cusp by every indication -- David agrees ... @!JIM-LEHRER : Sort of . He sort of agrees . @!MARK-SHIELDS : ... of a Republican tsunami -- no , a Republican tsunam

In [21]:
# Drop the raw 'text' column so only the tokenizer-produced columns remain.
tokenized_datasets = encoded_datasets.remove_columns(['text'])
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 470771
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 58846
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 58847
    })
})

## Initialize model

In [None]:
# Build GPT-2 from a default configuration. The model is constructed from the
# config rather than via from_pretrained, so no pretrained weights are loaded.
model = GPT2LMHeadModel(GPT2Config())
# Keep a handle on the configuration the model actually carries and display it.
configuration = model.config
configuration

In [None]:
# Total number of scalar parameters, reported in millions.
model_size = sum(param.numel() for param in model.parameters())
size_in_millions = model_size / 1000**2
print(f"GPT-2 size: {size_in_millions:.1f}M parameters")

In [None]:
# The GPT-2 tokenizer has no padding token (it printed "Padding token: None"
# earlier in this notebook), so the end-of-text token is reused as the pad token.
# NOTE(review): presumably the collator needs a pad token to batch sequences of
# different lengths -- confirm.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal (next-token) language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Collate the first 40 training examples and report the shape of every field
# in the resulting batch.
first_examples = [tokenized_datasets['train'][idx] for idx in range(40)]
example_data_collation = data_collator(first_examples)
for key, batch_field in example_data_collation.items():
    print(f"{key} shape: {batch_field.shape}")

## Train

In [None]:
# Display the installed transformers / accelerate versions so the training
# environment is recorded alongside the results.
import accelerate
import transformers

transformers.__version__, accelerate.__version__

In [None]:
import torch

# Report whether a CUDA device is usable and, if so, which one.
# Only the last expression of a cell is displayed, so both facts are shown
# together; the device name is queried only when CUDA is present because
# get_device_name(0) raises on a machine without a CUDA device.
cuda_available = torch.cuda.is_available()
device_name = torch.cuda.get_device_name(0) if cuda_available else "cpu (CUDA not available)"
cuda_available, device_name

In [None]:
# Output directory for this training run; the checkpoint loaded in the
# "Test model" section lives under it.
training_output_dir = '../models/test2/'
args = TrainingArguments(
    output_dir=training_output_dir,
    per_device_train_batch_size=128,  # change to fit GPU specs
    per_device_eval_batch_size=128,
    group_by_length=True,  # bucketing
)

In [None]:
# Display the device attribute of the training arguments.
args.device

In [None]:
# Wire model, arguments, data and collator together; the 'val' split is
# passed as the evaluation set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Resume from the newest checkpoint in the output directory when one exists;
# otherwise start training from scratch. Passing resume_from_checkpoint=True
# unconditionally fails on a first run, before any checkpoint has been saved.
last_checkpoint = (
    get_last_checkpoint(args.output_dir) if os.path.isdir(args.output_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

## Test model

In [None]:
# Prompt for the generation checks below, encoded as a PyTorch tensor
# (return_tensors="pt").
input_text = "Did you know that the first person to"
inputs = tokenizer.encode(input_text, return_tensors="pt")

In [None]:
# Load the model from a saved checkpoint directory.
# NOTE(review): the checkpoint step (37500) is hard-coded -- it must match a
# checkpoint that actually exists under the training output directory.
trained_model = GPT2LMHeadModel.from_pretrained("../models/test2/checkpoint-37500")

In [None]:
# Generate one continuation of the prompt with a length limit of 50. No
# sampling options are passed, so generate() uses its default decoding strategy.
outputs = trained_model.generate(inputs, max_length=50, num_return_sequences=1)
# Convert the first (only) returned sequence back to text, dropping special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text

In [None]:
from transformers import set_seed

In [None]:
# Fix the random seed so the sampled generation in the next cell is repeatable.
set_seed(2)

In [None]:
# Sampled continuation: up to 40 new tokens with top-k sampling (k=50).
# A temperature (e.g. 0.6) could additionally be supplied here; it is
# currently left at the library default.
sampling_options = dict(max_new_tokens=40, do_sample=True, top_k=50)
outputs = trained_model.generate(inputs, **sampling_options)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text