In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 12.2 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 36.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

In [3]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import time
import os
import re
from itertools import chain
from transformers import BertTokenizer
PRETRAINED_MODEL_NAME = "bert-base-uncased" #英文pretrain模型
print(torch.__version__)

1.9.0+cu102


In [5]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

dict size 30522


In [6]:
from torch.utils.data import Dataset,random_split

In [7]:


TAG_RE = re.compile(r'<[^>]+>')
def preprocess_text(sen):
    # Removing html tags
    sentence = TAG_RE.sub('', sen)
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence


def readIMDB(path, seg):
    classes = ['pos', 'neg']
    data = []
    for label in classes:
        files = os.listdir(os.path.join(path, seg, label))
        for file in files:
            with open(os.path.join(path, seg, label, file), 'r', encoding='utf8') as rf:
                review = rf.read().replace('\n', '')
                if label == 'pos':
                    data.append([preprocess_text(review), 1])
                elif label == 'neg':
                    data.append([preprocess_text(review), 0])
    return data


In [8]:
label_map = {0: 'neg', 1: 'pos'}

In [9]:
#create Dataset
import tensorflow as tf
dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)



import tensorflow as tf
class MyDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]  
        self.mode = mode
        self.df = readIMDB(os.path.join(os.path.dirname(dataset),  "aclImdb"),mode) #its list [['text1',label],['text2',label],...]
        self.len = len(self.df)
        self.maxlen = 200 
        self.tokenizer = tokenizer  # we will use BERT tokenizer
    
    def __getitem__(self, idx):
        origin_text = self.df[idx][0]
        if self.mode == "test":
            text_a = self.df[idx][0]
            text_b = None  #for natural language inference
            #label_tensor = None #in our case, we have label
            label_id = self.df[idx][1]
            label_tensor = torch.tensor(label_id)
        else:     
            text_a = self.df[idx][0]
            text_b = None  #for natural language inference
            label_id = self.df[idx][1]
            label_tensor = torch.tensor(label_id)
            
        
        # 建立 BERT tokens
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a[:self.maxlen] + ["[SEP]"]
        len_a = len(word_pieces)
        
        if text_b is not None:
            tokens_b = self.tokenizer.tokenize(text_b)
            word_pieces += tokens_b + ["[SEP]"]
            len_b = len(word_pieces) - len_a
               
        # 將 token 序列转换成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 0 表示第一句，1 表示第二句，此数据集全部为第一句
        if text_b is None:
            segments_tensor = torch.tensor([1] * len_a,dtype=torch.long)
        elif text_b is not None:
            segments_tensor = torch.tensor([0] * len_a + [1] * len_b,dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor, origin_text)
    
    def __len__(self):
        return self.len
    
# initialize Dataset
trainset = MyDataset("train", tokenizer=tokenizer)
testset = MyDataset("test", tokenizer=tokenizer)


#split val from trainset
val_size = int(trainset.__len__()*0.04) 
trainset, valset = random_split(trainset,[trainset.__len__()-val_size,val_size])
print('trainset size:' ,trainset.__len__())
print('valset size:',valset.__len__())
print('testset size: ',testset.__len__())

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
trainset size: 24000
valset size: 1000
testset size:  25000


In [10]:
# 可视化样本
sample_idx = 10

# 查看转换后的 id tensors
tokens_tensor, segments_tensor, label_tensor,origin_text = trainset[sample_idx]

# 此函数将 tokens_tensor 还原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())

print('token:\n',tokens,'\n')
print('origin_text:\n',origin_text,'\n')
print('label:',label_map[int(label_tensor.numpy())],'\n')
print('tokens_tensor:\n',tokens_tensor,'\n')
print('segments_tensor:\n',segments_tensor,'\n')

token:
 ['[CLS]', 'i', 'saw', 'the', 'film', 'twice', 'in', 'the', 'space', 'of', 'one', 'week', 'both', 'times', 'the', 'at', 'cinema', 'in', 'or', '##ping', '##ton', 'kent', 'uk', 'the', 'place', 'was', 'packed', 'both', 'times', 'and', 'people', 'had', 'to', 'be', 'turned', 'away', 'from', 'the', 'start', 'of', 'the', 'film', 'with', 'henry', 'winkler', 'getting', 'injured', 'on', 'the', 'football', 'field', 'the', 'whole', 'audience', 'was', 'in', 'up', '##ro', '##ar', 'with', 'laughter', 'laughter', 'that', 'lasted', 'until', 'the', 'credits', 'for', 'those', 'who', 'love', 'american', 'wrestling', 'this', 'film', 'is', 'must', 'but', 'be', 'ready', 'to', 'see', 'henry', 'winkler', 'as', 'you', 'have', 'never', 'seen', 'him', 'before', 'also', 'look', 'out', 'for', 'very', 'well', 'known', 'actor', 'whose', 'trademark', 'wrestling', 'move', 'is', 'head', 'but', 'if', 'you', 'get', 'chance', 'watch', 'this', 'movie', 'and', 'it', 'is', 'family', 'comedy', 'entertainment', 'at', 'it

In [11]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

""""
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

#截短补长后要限制attention只注意非pad的部分
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    #  labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,batch_first=True)
    
    # attention masks，只关注有词的部分，padding部分设为0
    masks_tensors = torch.zeros(tokens_tensors.shape,dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids



# 初始化Dataloader：每次回传 batch size 个训练样本
# 利用 'collate_fn' 将 list of samples 合并成一个 mini-batch
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=True)
valloader = DataLoader(valset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)
testloader = DataLoader(testset, batch_size=BATCH_SIZE,collate_fn=create_mini_batch,shuffle=False)

data = next(iter(trainloader))

In [12]:
tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")
tokens_tensors.shape  


tokens_tensors.shape   = torch.Size([16, 202]) 
tensor([[  101,  3383,  2028,  ...,     0,     0,     0],
        [  101,  1999,  2019,  ...,     0,     0,     0],
        [  101,  2057,  4149,  ...,     0,     0,     0],
        ...,
        [  101,  1045,  2089,  ...,     0,     0,     0],
        [  101, 10166,  2023,  ...,     0,     0,     0],
        [  101,  1998,  2672,  ...,  1996, 15620,   102]])
------------------------
segments_tensors.shape = torch.Size([16, 202])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])
------------------------
masks_tensors.shape    = torch.Size([16, 202])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])
---------

torch.Size([16, 202])

In [13]:
from transformers import BertForSequenceClassification


In [14]:
PRETRAINED_MODEL_NAME = "bert-base-uncased"

NUM_LABELS = 2

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [15]:
print("""
name      module
--------------------""")

for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print("{:10}{}".format(name,n) )
    else:
        print("{:15} {}".format(name, module))


name      module
--------------------
bert      embeddings
bert      encoder
bert      pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=2, bias=True)


In [19]:
%%time
from sklearn.metrics import accuracy_score

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:",device)
model = model.to(device)


optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 6

for epoch in range(EPOCHS):
    correct = 0
    #total = 0
    train_loss , val_loss = 0.0 , 0.0
    train_acc, val_acc = 0, 0
    n, m = 0, 0
    model.train()
    for data in trainloader:
        n += 1
        tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]

       
        optimizer.zero_grad()
        
        # forward propagation
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
        # outputs --> "(loss), logits, (hidden_states), (attentions)"
        
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        
        #get prediction and calulate acc
        logits = outputs[1]
        _, pred = torch.max(logits.data, 1)
        train_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())

        #  batch loss
        train_loss += loss.item()
    
    #validation
    with torch.no_grad():
        model.eval()
        for data in valloader:
            m += 1
            tokens_tensors, segments_tensors,masks_tensors, labels = [t.to(device) for t in data]
            val_outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)
            
            logits = val_outputs[1]
            _, pred = torch.max(logits.data, 1)
            val_acc += accuracy_score(pred.cpu().tolist() , labels.cpu().tolist())
            val_loss += val_outputs[0].item()

    print('[epoch %d] loss: %.4f, acc: %.4f, val loss: %4f, val acc: %4f' %
          (epoch+1, train_loss/n, train_acc/n, val_loss/m,  val_acc/m  ))

print('Done')


device: cuda:0
[epoch 1] loss: 0.0237, acc: 0.9930, val loss: 0.371544, val acc: 0.912698
[epoch 2] loss: 0.0179, acc: 0.9945, val loss: 0.551721, val acc: 0.902778
[epoch 3] loss: 0.0177, acc: 0.9945, val loss: 0.410391, val acc: 0.907738
[epoch 4] loss: 0.0132, acc: 0.9961, val loss: 0.506520, val acc: 0.904762
[epoch 5] loss: 0.0126, acc: 0.9957, val loss: 0.405872, val acc: 0.907738
[epoch 6] loss: 0.0093, acc: 0.9966, val loss: 0.545460, val acc: 0.900794
Done
CPU times: user 2h 43min 40s, sys: 7min 55s, total: 2h 51min 36s
Wall time: 2h 51min 23s


In [4]:
torch.cuda.is_available()

True

In [27]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：109483778
線性分類器的參數量：1538

