In this notebook, we'll take the processed data and train a language model. We want to try the following models:

- GPT-2
- BERT
- RoBERTa
- GPT-Neo
- GPT-J
- XLNet
- T5

The training steps are as follows:

1. Load the data
2. Tokenize the data
3. Create a PyTorch dataset
4. Create a PyTorch dataloader
5. Create a PyTorch model
6. Create a PyTorch optimizer
7. Create a PyTorch scheduler
8. Train the model
9. Save the model

In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
# from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import torch

ModuleNotFoundError: No module named 'tensorflow'

In [17]:
# load the data
df = pd.read_csv('data/penny_arcade_processed.csv')
df.head()

Unnamed: 0,title,text
0,"Exhilarating,",We only know two people who don't live within ...
1,"Penny Arcade is A Comedy Bistro,","Over at Gabe's yesterday, I mentioned that Gui..."
2,"Minus The Pope And A Rabbi,","I thought that I had uploaded the comic, but I..."
3,"Jealousy,","It's fine and everything, but there's all thes..."
4,"I Forgot,",My man Pork mentioned this Star Wars fan-film ...


In [25]:
# tokenize the data
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, pad_to_max_length=True)))

loading configuration file config.json from cache at /Users/odai/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/odai/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /Users/odai/.cache/huggingface/hub/models--distilbert-b

In [28]:
# pad the data

# padded = pad_sequences(tokenized, maxlen=max_length, dtype="long", 
#                         value=0, truncating="post", padding="post")

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [29]:
# create the attention mask
attention_mask = np.where(padded != 0, 1, 0)

In [31]:
# split the data
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, df['title'].values, random_state=2018, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_mask, padded, random_state=2018, test_size=0.1)

In [33]:
# convert the data to tensors

# train_inputs = torch.tensor(train_inputs)
# validation_inputs = torch.tensor(validation_inputs)
# train_labels = torch.tensor(train_labels)
# validation_labels = torch.tensor(validation_labels)
# train_masks = torch.tensor(train_masks)
# validation_masks = torch.tensor(validation_masks)

train_inputs = torch.tensor(train_inputs.astype(np.int64))
validation_inputs = torch.tensor(validation_inputs.astype(np.int64))
train_labels = torch.tensor(train_labels.astype(np.int64))
validation_labels = torch.tensor(validation_labels.astype(np.int64))
train_masks = torch.tensor(train_masks.astype(np.int64))
validation_masks = torch.tensor(validation_masks.astype(np.int64))


AttributeError: 'Tensor' object has no attribute 'astype'

In [None]:


# create the dataloaders
batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# load the model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# freeze the model
for param in model.base_model.parameters():
    param.requires_grad = False

# move the model to the GPU
model.cuda()

# define the optimizer
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)

# define the loss function
from torch.nn import CrossEntropyLoss

loss_fn = CrossEntropyLoss()

# define the training loop
def train(epoch):
    model.train()
    total_loss, total_accuracy = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_accuracy += flat_accuracy(logits, b_labels)
    print(f'Epoch {epoch}')
    print(f'Training loss: {total_loss/len(train_dataloader)}')
    print(f'Training accuracy: {total_accuracy/len(train_dataloader)}')

# define the validation loop
def evaluate(epoch):
    print("")
    print("Running Validation...")
    model.eval()
    total_loss, total_accuracy = 0, 0
    for step, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]
        total_loss += loss.item()
        total_accuracy += flat_accuracy(logits, b_labels)
    print(f'Validation loss: {total_loss/len(validation_dataloader)}')
    print(f'Validation accuracy: {total_accuracy/len(validation_dataloader)}')
    print("")

# define the accuracy function
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# train the model
from tqdm import tqdm, trange

epochs = 4
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for _ in trange(epochs, desc="Epoch"):
    train(_)
    evaluate(_)

# save the model
model.save_pretrained('models/distilbert-base-uncased')

# save the tokenizer
tokenizer.save_pretrained('models/distilbert-base-uncased')

# # Path: 4_model_evaluation.ipynb
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import pandas as pd
# import torch

# # load the data
# df = pd.read_csv('data/penny_arcade_processed.csv')
# df.head()

# # tokenize the data
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenized = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))


