In [7]:
import sys

module_path = "../src"

if module_path not in sys.path:
    sys.path.append(module_path)

In [8]:
# Load dataset
from dataset import get_dataset
dataset = get_dataset()

In [15]:
# Load libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig
import torch

model_name = 'gpt2'
epochs = 3
num_labels = 4
batch_size = 32

In [16]:
# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=256)

# check which one is required
tokenizer.pad_token = tokenizer.eos_token

def tokenize(input):
    return tokenizer(input['text'], padding="max_length", truncation=True)

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize, batched=True)

# Shuffle and pick subset from dataset
train_dataset = tokenized_dataset['train'].shuffle(seed=442333+424714).select(range(5000))
eval_dataset = tokenized_dataset['test'].shuffle(seed=442333+424714).select(range(1000))

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [17]:
import evaluate
import numpy as np
metric = evaluate.load("accuracy")

# Prepare evaluation callback, metric = accuracy
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [18]:
# Define custom classification head, 2 layers deep

class CustomClassificationHead(torch.nn.Module):
    def __init__(self, input_dim, num_labels):
        super().__init__()
        self.dense = torch.nn.Linear(input_dim, 512)
        self.dense2nd = torch.nn.Linear(512, 256)
        self.dropout = torch.nn.Dropout(0.2)
        self.out_proj = torch.nn.Linear(256, num_labels)

    def forward(self, x):
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dense2nd(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

In [19]:
# Create model
config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
model =  AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

model.config.pad_token_id = model.config.eos_token_id
# Connect new classfier to model
model.transformer.add_module("classifier", CustomClassificationHead(config.n_embd, config.num_labels))

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [21]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkpierzynski[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.299594,0.91
2,No log,0.280762,0.916
3,No log,0.280154,0.915


TrainOutput(global_step=471, training_loss=0.2876964010250796, metrics={'train_runtime': 176.7944, 'train_samples_per_second': 84.844, 'train_steps_per_second': 2.664, 'total_flos': 1971881994240000.0, 'train_loss': 0.2876964010250796, 'epoch': 3.0})

In [22]:
print(model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (classifier): CustomClassificationHead(
      (dense): Linear(in_features=768, out_features=512, bias=True)
      (dense2nd): Linear(in_features=512, out_features=256, b