In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from sklearn.model_selection import train_test_split
from torch.optim import Adam
import torch.nn.functional as F
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
train_df = pd.read_json(r'../data/imdb/train.json').drop('text_b', axis=1)
test_df = pd.read_json(r'../data/imdb/test.json').drop('text_b', axis=1)

In [3]:
train_df = train_df[:100]
test_df = test_df[:100]
train_text = train_df.text_a.values
train_label = [1 if i=='pos' else 0 for i in train_df.label.values]
test_text = test_df.text_a.values
test_label = [1 if i=='pos' else 0 for i in test_df.label.values]

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [5]:
def tokenize_data(data):
    ids = []
    for d in data:
        encoded = tokenizer.encode(d, add_special_tokens=True)
        ids.append(encoded)
        
    print('Max sentence length: ', max([len(sen) for sen in ids]))
    ids = pad_sequences(ids, maxlen=512, dtype="long", 
                        value=0, truncating="post", padding="post") 
    
    attention_masks = []
    n = 0
    for i in ids:
        temp = [float(t>0) for t in i]
        attention_masks.append(temp)
        n += 1
    print('number:'+str(n))
    return ids, attention_masks

In [6]:
train_ids, train_masks = tokenize_data(train_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1135 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (886 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (803 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Max sentence length:  1361
number:100


In [8]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print('Using GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU')
    device = torch.device("cpu")

# Using 'BertForSequenceClassification'
model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2,
        output_attentions = False, output_hidden_states = False).to(device)
optimizer = Adam(model.parameters(), lr=1e-6)

Using GPU: GeForce RTX 2080 Ti


In [11]:
SEED = 2020
train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_ids, train_label, 
                                                            random_state=SEED, test_size=0.1)
train_masks, val_masks, _, _ = train_test_split(train_masks, train_ids,
                                             random_state=SEED, test_size=0.1)

train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

In [12]:
BATCH_SIZE = 8

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)

In [13]:
def train(model, optimizer, dataloader):
    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    
    model.train()
    
    for step, batch in enumerate(dataloader):
        # batch: [ids, mask, label]
        b_temp = tuple(b.to(device) for b in batch)
        b_ids, b_mask, b_labels = b_temp
        
        optimizer.zero_grad()
        
        outputs = model(b_ids, token_type_ids=None,
                      attention_mask=b_mask, labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]
        
        pred = torch.argmax(F.softmax(logits), dim=1)
        acc = pred.eq(b_labels).sum().item()
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc
        
        total_len += len(b_labels)
    return epoch_loss/total_len, epoch_acc/total_len

In [14]:
def evaluate(model, dataloader):
    model.eval()
    
    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    
    for step, batch in enumerate(dataloader):
        # batch: [ids, mask, label]
        b_temp = tuple(b.to(device) for b in batch)
        b_ids, b_mask, b_labels = b_temp
        
        with torch.no_grad():
            outputs = model(b_ids, token_type_ids=None,
                          attention_mask=b_mask)

        logits = outputs[0]#.detach.cpu().numpy()
        pred = torch.argmax(F.softmax(logits), dim=1)
        acc = pred.eq(b_labels).sum().item()
        
        epoch_acc += acc
        
        total_len += len(b_labels)
    return epoch_acc / total_len

In [15]:
import time
EPOCHES = 5

train_loss = []
for epoch in range(EPOCHES):
    start_time = time.time()
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, EPOCHES))
    
    train_loss, train_acc = train(model, optimizer, train_dataloader)
    val_accuracy = evaluate(model, val_dataloader)
    
    print("Train loss: %.3f | Train acc: %.2f | Val accuracy : %5.2f | Time: %f" 
          %(train_loss,train_acc,val_accuracy, (time.time() - start_time)/60))
    





Train loss: 0.045 | Train acc: 0.85 | Val accuracy :  0.91 | Time: 17.768819
Train loss: 0.023 | Train acc: 0.93 | Val accuracy :  0.92 | Time: 17.684740
Train loss: 0.020 | Train acc: 0.94 | Val accuracy :  0.92 | Time: 17.676667
Train loss: 0.018 | Train acc: 0.95 | Val accuracy :  0.92 | Time: 17.720844


In [9]:
test_ids, test_masks = tokenize_data(test_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (1324 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (940 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (640 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Max sentence length:  1326
number:100


In [13]:
test_inputs = torch.tensor(test_ids)
test_labels = torch.tensor(test_label)
test_masks = torch.tensor(test_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
test_acc = evaluate(model, test_dataloader)
print("Test acc: %.3f" %(test_acc))



Test acc: 0.930
