In [3]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.optim import Adam
import torch.nn.functional as F

train_df = pd.read_json (r'../data/imdb/train.json')
test_df = pd.read_json (r'../data/imdb/test.json')

In [4]:
for c in train_df:
    if train_df[c].dtype == 'object':
        print('Max length of column %s: %s\n' %  (c, train_df[c].map(len).max()))

Max length of column text_a: 13704

Max length of column label: 3



In [5]:
class NsmcDataset(Dataset):
    ''' Naver Sentiment Movie Corpus Dataset '''
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx, 0]
        label = self.df.iloc[idx, 2]
        if (label == 'pos'):
            label = 1
        else:
            label = 0
        return text, label

In [6]:
nsmc_train_dataset = NsmcDataset(train_df)
train_loader = DataLoader(nsmc_train_dataset, batch_size=2, shuffle=True, num_workers=2)

In [7]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')


In [8]:
device = torch.device("cuda")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [12]:
optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
epochs = 1
total_loss = 0
total_len = 0
total_correct = 0



model.train()
for epoch in range(epochs):
    
    for text, label in train_loader:
        optimizer.zero_grad()
        
        # encoding and zero padding
        encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
        padded_list =  [e[:511] + [0] * (512-len(e[:511])) for e in encoded_list]
        
        sample = torch.tensor(padded_list)
        sample = sample.to(device)
        label = label.to(device)
        labels = torch.tensor(label)
        outputs = model(sample, labels=labels)
        loss, logits = outputs

        pred = torch.argmax(F.softmax(logits), dim=1)
        correct = pred.eq(labels)
        total_correct += correct.sum().item()
        total_len += len(labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        if itr % p_itr == 0:
            print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'
                  .format(epoch+1, epochs, itr, total_loss/p_itr, total_correct/total_len))
            total_loss = 0
            total_len = 0
            total_correct = 0

        itr+=1



tensor([[-0.0096, -0.0519],
        [-0.0466,  0.1544]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0302, -0.0831],
        [-0.0817, -0.1281]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (641 > 512). Running this sequence through the model will result in indexing errors


tensor([[-0.1815, -0.3093],
        [-0.0492, -0.0725]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[0.0649, 0.0915],
        [0.0608, 0.1773]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors


tensor([[0.0233, 0.0972],
        [0.1188, 0.0499]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[-0.1512, -0.0317],
        [-0.0357, -0.0593]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[-0.1535, -0.1727],
        [-0.0693, -0.0969]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0704,  0.1212],
        [ 0.0484, -0.1383]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (719 > 512). Running this sequence through the model will result in indexing errors


tensor([[ 0.0520, -0.0221],
        [-0.0571, -0.1030]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[0.0668, 0.1669],
        [0.0214, 0.0923]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0292,  0.1217],
        [ 0.0326, -0.0157]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0254,  0.0836],
        [-0.0751, -0.1190]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (1032 > 512). Running this sequence through the model will result in indexing errors


tensor([[ 0.0562,  0.0550],
        [-0.0584, -0.1652]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[0.0767, 0.3059],
        [0.0456, 0.0570]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (1241 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors


tensor([[-0.1346, -0.1989],
        [-0.0109, -0.0859]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[-0.1254, -0.1954],
        [-0.1568, -0.2424]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (574 > 512). Running this sequence through the model will result in indexing errors


tensor([[-0.0002,  0.0451],
        [ 0.1168,  0.0230]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0744, -0.1200],
        [ 0.1544,  0.1802]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0293, -0.0264],
        [ 0.1881, -0.0424]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.0461, -0.0210],
        [-0.2350, -0.1560]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (692 > 512). Running this sequence through the model will result in indexing errors


tensor([[ 0.1492,  0.1338],
        [ 0.0027, -0.1099]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.1365,  0.0191],
        [-0.0660,  0.0345]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (1123 > 512). Running this sequence through the model will result in indexing errors


tensor([[ 0.0481, -0.0730],
        [-0.0024,  0.1444]], device='cuda:0', grad_fn=<AddmmBackward>)
tensor([[ 0.1347, -0.3828],
        [ 0.0246, -0.0798]], device='cuda:0', grad_fn=<AddmmBackward>)


Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors


tensor([[ 0.0260,  0.0611],
        [ 0.1738, -0.0979]], device='cuda:0', grad_fn=<AddmmBackward>)


KeyboardInterrupt: 

In [None]:
# evaluation
model.eval()

nsmc_eval_dataset = NsmcDataset(test_df)
eval_loader = DataLoader(nsmc_eval_dataset, batch_size=2, shuffle=False, num_workers=2)

total_loss = 0
total_len = 0
total_correct = 0

for text, label in eval_loader:
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e[:511] + [0] * (512-len(e[:511])) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample, label = sample.to(device), label.to(device)
    labels = torch.tensor(label)
    outputs = model(sample, labels=labels)
    _, logits = outputs

    pred = torch.argmax(F.softmax(logits), dim=1)
    correct = pred.eq(labels)
    total_correct += correct.sum().item()
    total_len += len(labels)

print('Test accuracy: ', total_correct / total_len)
 