In [13]:
from pathlib import Path

def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir is "neg" else 1)

    return texts, labels

train_texts, train_labels = read_imdb_split('../data/imdb/aclImdb/train')
test_texts, test_labels = read_imdb_split('../data/imdb/aclImdb/test')

In [2]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [12]:
train_labels[0]

NameError: name 'train_labels' is not defined

In [3]:
from transformers import BertTokenizer
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [4]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_token_type_ids=False, return_tensors='pt')
val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_token_type_ids=False, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_token_type_ids=False, return_tensors='pt')

In [5]:
import torch
torch.cuda.empty_cache()

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [6]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
torch.cuda.empty_cache()

t = torch.cuda.get_device_properties(0).total_memory
c = torch.cuda.memory_cached(0)
a = torch.cuda.memory_allocated(0)
f = c-a  # free inside cache

#GiB, GiB, MiB
print(c*9.31323e-10, a*9.31323e-10, f*9.53674e-7)

0.0 0.0 0.0


In [8]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

t = torch.cuda.get_device_properties(0).total_memory
c = torch.cuda.memory_cached(0)
a = torch.cuda.memory_allocated(0)
f = c-a  # free inside cache

#GiB, GiB, MiB
print(c*9.31323e-10, a*9.31323e-10, f*9.53674e-7)

0.460937710534656 0.408918090674688 53.268048733184


In [9]:
from transformers import AdamW
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

  # Remove the CWD from sys.path while we load stuff.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [16]:
model.save_pretrained('../data/imdb/saved_model/imdb_bert_base_uncased_finetuned_normal')

In [None]:
from transformers import AdamW
from torch.utils.data import DataLoader

EPOCHS = 10

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

#hyperparameters from BERT authors' recommendations
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * EPOCHS

from transformers import get_linear_schedule_with_warmup
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

#loss_fn = nn.CrossEntropyLoss().to(device)

## Test

In [3]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('../data/imdb/saved_model/imdb_bert_base_uncased_finetuned_normal')

In [5]:
from transformers import BertTokenizer
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [7]:
text_batch = ["this movie is boring."]
encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

outputs = model(input_ids, attention_mask=attention_mask)

In [10]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.5830, -2.3512]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)

In [11]:
outputs[0]

tensor([[ 2.5830, -2.3512]], grad_fn=<AddmmBackward>)

In [30]:
_, preds = torch.max(outputs.logits, dim=1)

In [31]:
preds

tensor([0])