In [1]:
import torch
import torch.nn as nn
from transformers import BertTokenizer,BertModel

import os
import re
from itertools import chain
import numpy as np

In [None]:
# download data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [2]:
TAG_RE = re.compile(r'<[^>]+>')
def preprocess_text(sen):
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

def readIMDB(path, mode):
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, mode, label))
        for file in files:
            with open(os.path.join(path, mode, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
                if label == 'pos':
                    data.append([preprocess_text(review), 1])
                elif label == 'neg':
                    data.append([preprocess_text(review), 0])
    return data

train_data = readIMDB('aclImdb', 'train')
test_data = readIMDB('aclImdb', 'test')


In [3]:
print(train_data[1])

['Bizarre horror movie filled with famous faces but stolen by Cristina Raines later of TV Flamingo Road as pretty but somewhat unstable model with gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell The scenes with Raines modeling are very well captured the mood music is perfect Deborah Raffin is charming as Cristina pal but when Raines moves into creepy Brooklyn Heights brownstone inhabited by blind priest on the top floor things really start cooking The neighbors including fantastically wicked Burgess Meredith and kinky couple Sylvia Miles Beverly Angelo are diabolical lot and Eli Wallach is great fun as wily police detective The movie is nearly cross pollination of Rosemary Baby and The Exorcist but what combination Based on the best seller by Jeffrey Konvitz The Sentinel is entertainingly spooky full of shocks brought off well by director Michael Winner who mounts thoughtfully downbeat ending with skill from ', 1]


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 
bert = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
vocab = tokenizer.vocab
print("dict size", len(vocab))

dict size 30522


In [5]:
train_sentences, y_train = [], []
for sentence, y in train_data:
    train_sentences.append(sentence)
    y_train.append(y)

train_input = tokenizer(train_sentences, max_length=200, add_special_tokens=True, truncation=True, 
                        padding='max_length', return_tensors="pt")

In [6]:
test_sentences, y_test = [], []
for sentence, y in test_data:
    test_sentences.append(sentence)
    y_test.append(y)

test_input = tokenizer(test_sentences, max_length=200, add_special_tokens=True, 
                       truncation=True, padding='max_length', return_tensors="pt")

In [7]:
from torch.utils.data import Dataset, random_split, DataLoader

#create Dataset and dataloader
class MyDataset(Dataset):
    def __init__(self, wrapped_input, labels):
        self.wrapped_input = wrapped_input
        self.labels = labels
    def __getitem__(self, idx):
        input_dict = {}
        for k in self.wrapped_input.keys():
            input_dict[k] = self.wrapped_input[k][idx]
        return input_dict, self.labels[idx]
    
    def __len__(self):
        return len(self.labels)

trainset = MyDataset(train_input, y_train)
testset = MyDataset(test_input, y_test)


#split val from trainset
val_size = int(trainset.__len__()  *0.04)  # 切出1000筆當validation
trainset, valset = random_split(trainset, [trainset.__len__() - val_size, val_size])
print('trainset size:', trainset.__len__())
print('valset size:', valset.__len__())
print('testset size:', testset.__len__())

BATCH_SIZE = 8
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

trainset size: 24000
valset size: 1000
testset size: 25000


In [8]:
# define model architecture
class BERT_classifier(nn.Module):
    def __init__(self, bertmodel, num_label):
        super(BERT_classifier, self).__init__()
        self.bertmodel = bertmodel
        self.dropout = nn.Dropout(p=bertmodel.config.hidden_dropout_prob)
        self.classifier = nn.Linear(bertmodel.config.hidden_size, num_label)

    def forward(self, wrapped_input):
        hidden = self.bertmodel(**wrapped_input)
        last_hidden_state, pooler_output = hidden[0], hidden[1]
        logits = self.classifier(pooler_output)

        return logits

In [9]:
model = BERT_classifier(bert, 2)
model

BERT_classifier(
  (bertmodel): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [7]:
# from transformers import BertForSequenceClassification
# model2 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# # which will be same as model above
# model2

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [33]:
# model_parameters = filter(lambda p: p.requires_grad, model.parameters())
# params = sum([np.prod(p.size()) for p in model_parameters])
# print(params)

# model_parameters = filter(lambda p: p.requires_grad, model2.parameters())
# params2 = sum([np.prod(p.size()) for p in model_parameters])
# print(params2)

109483778
109483778


In [10]:
# forward one batch to test model and dataloader
batch_data, batch_label = next(iter(train_loader))
batch_logits = model(batch_data)
print(batch_logits.shape)
batch_logits

torch.Size([8, 2])


tensor([[-0.0004, -0.1572],
        [-0.0011, -0.0235],
        [ 0.0362, -0.1831],
        [ 0.1604, -0.0216],
        [-0.2426, -0.2314],
        [ 0.2011,  0.3028],
        [ 0.1825,  0.0548],
        [ 0.0307, -0.2264]], grad_fn=<AddmmBackward0>)

In [12]:
from nn_factory import nn_factory
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
nn_obj = nn_factory(model, device, tokenizer)

cpu


In [13]:
import torch.nn.functional as F
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
nn_obj.fit(5, optimizer, train_loader, val_loader, './')

[epoch 1]train on 24000 data......


  0%|                                                                | 2/3000 [01:03<26:29:36, 31.81s/it]


KeyboardInterrupt: 