<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [6]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Config
from transformers import TrainingArguments, Trainer
from datasets import Dataset

import pandas as pd
import numpy as np

import torch as T

import spacy

In [7]:
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.unk_token

In [8]:
df = pd.read_csv('data/ecb_intro_text_only.csv', index_col=0)
df.rename(columns={'intro_text':'text'}, inplace=True)
df.reset_index(drop=True, inplace=True)
df.index.name = 'index'

In [9]:
dataset = Dataset.from_pandas(df, preserve_index=False)

In [10]:
block_size = 256

def group_texts(examples):
    total_length = len(examples[list(examples.keys())[0]])
    
    result = {k:[] for k in examples.keys()}
    for k, v in examples.items():
        if not isinstance(v, list):
            v = [v]
        for l in v:
            for i in range(0, len(l), block_size):
                block = l[i:i+block_size]
                pad_len = block_size - len(block)
                
                if k == 'input_ids':
                    block += ([tokenizer.pad_token_id] * pad_len)
                elif k == 'attention_mask':
                    block += ([0] * pad_len)
                
                assert len(block) == block_size
                result[k].append(block)
            
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length")

tokenized_datasets = dataset.map(tokenize_function, batched=False, remove_columns=['text'])
chunked_dataset = tokenized_datasets.map(group_texts, batched=True)

  0%|          | 0/251 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
chunked_dataset.set_format('pt', columns=['input_ids', 'attention_mask'])
chunked_dataset = chunked_dataset.train_test_split(test_size=0.1)

In [15]:
train_data = chunked_dataset['train']
test_data  = chunked_dataset['test']

In [16]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments("gpt2_ecb_finetune",
                                  evaluation_strategy='epoch',
                                  num_train_epochs=3,
                                  per_device_train_batch_size=5,
                                  per_device_eval_batch_size=5,
                                  gradient_accumulation_steps=100,
                                  save_strategy='steps',
                                  save_steps=10000,
                                  save_total_limit=10,
                                  learning_rate=3e-4)

In [17]:
def collect_data(features):
    batch = {}
    
    batch['input_ids'] = T.stack([f['input_ids'] for f in features])
    batch['attention_mask'] = T.stack([f['attention_mask'] for f in features])
    batch['labels'] = T.stack([f['input_ids'].clone() for f in features])
        
    return batch

In [18]:
trainer = Trainer(model,
                  args=training_args,
                  train_dataset=train_data,
                  eval_dataset=test_data,
                  data_collator=collect_data,
                  tokenizer=tokenizer)

In [19]:
initial_eval = trainer.evaluate()
print(f'Eval loss before fine tuning: {initial_eval["eval_loss"]:0.3f}')

***** Running Evaluation *****
  Num examples = 185
  Batch size = 5


Eval loss before fine tuning: 3.845


In [20]:
trainer.train()

***** Running training *****
  Num examples = 1662
  Num Epochs = 3
  Instantaneous batch size per device = 5
  Total train batch size (w. parallel, distributed & accumulation) = 500
  Gradient Accumulation steps = 100
  Total optimization steps = 9


Epoch,Training Loss,Validation Loss
0,No log,2.81546
1,No log,2.563984
2,No log,2.485929


***** Running Evaluation *****
  Num examples = 185
  Batch size = 5
***** Running Evaluation *****
  Num examples = 185
  Batch size = 5
***** Running Evaluation *****
  Num examples = 185
  Batch size = 5


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=9, training_loss=3.1564848158094616, metrics={'train_runtime': 719.6196, 'train_samples_per_second': 6.929, 'train_steps_per_second': 0.013, 'total_flos': 630236381184000.0, 'train_loss': 3.1564848158094616, 'epoch': 2.9})

In [21]:
model.save_pretrained('gpt2_ecb_finetune')

Configuration saved in gpt2_ecb_finetune\config.json
Model weights saved in gpt2_ecb_finetune\pytorch_model.bin
