In [1]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import BertTokenizer
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import random
import collections
import tokenization


In [2]:
# setting some value
max_seq_length = 512
masked_lm_prob = 0.15
max_predictions_per_seq = 20
rng = random.Random()

pretrain_model_path = './chinese_wwm_pytorch/'
tokenizer = BertTokenizer.from_pretrained(pretrain_model_path)
vocab_words = list(tokenizer.vocab.keys())

MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])

OSError: Model name './chinese_wwm_pytorch/' was not found in tokenizers model name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased, TurkuNLP/bert-base-finnish-cased-v1, TurkuNLP/bert-base-finnish-uncased-v1, wietsedv/bert-base-dutch-cased). We assumed './chinese_wwm_pytorch/' was a path, a model identifier, or url to a directory containing vocabulary files named ['vocab.txt'] but couldn't find such vocabulary files at this path or url.

In [5]:
column_names = ['article_id', 'start_position', 'end_position', 'entity_text', 'entity_type']
df = pd.read_csv('./data/train_1.csv')


In [7]:
import re
from zhon.hanzi import stops
def cut_sent(para):
    para = re.sub("([。！？\?])([^”’])", r"\1\n\2", para)
    para = re.sub("(\.{6})([^”’])", r"\1\n\2", para)
    para = re.sub("(\…{2})([^”’])", r"\1\n\2", para)
    para = re.sub("([。！？\?][”’])([^，。！？\?])", r"\1\n\2", para)
    return para.split("\n")

def clean_string(content):
    content = content.replace('\n','。').replace('\t','，').replace('!', '！').replace('?', '？').replace('.','。')
    content = re.sub(r"[%s]+" %stops, "。",content)
    return content

class TrainingInstance(object):
    # """A single training instance (sentence pair)."""
    def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,is_random_next,attention_mask,original_tokens):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels
        self.attention_mask = attention_mask
        self.original_tokens = original_tokens

    def __str__(self):
        s = ""
        s += "tokens: %s\n" % (" ".join(
            [(x) for x in self.tokens]))
        s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
        s += "is_random_next: %s\n" % self.is_random_next
        s += "masked_lm_positions: %s\n" % (" ".join(
            [str(x) for x in self.masked_lm_positions]))
        s += "masked_lm_labels: %s\n" % (" ".join(
            [(x) for x in self.masked_lm_labels]))
        s += "\n"
        return s

    def __repr__(self):
        return self.__str__()
    
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
# """Truncates a pair of sequences to a maximum sequence length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1
        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if rng.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()   
#!/usr/bin/python
# -*- coding: utf-8 -*-


def create_masked_lm_predictions(
    tokens,
    masked_lm_prob,
    max_predictions_per_seq,
    vocab_words,
    rng,
    ):
    """Creates the predictions for the masked LM objective."""

    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == '[CLS]' or token == '[SEP]' or token == '[PAD]':
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)

    output_tokens = list(tokens)

    num_to_predict = min(max_predictions_per_seq, max(1,
                         int(round(len(tokens) * masked_lm_prob))))

    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token = None

    # 80% of the time, replace with [MASK]

        if rng.random() < 0.8:
            masked_token = '[MASK]'
        else:

      # 10% of the time, keep original

            if rng.random() < 0.5:
                masked_token = tokens[index]
            else:

      # 10% of the time, replace with random word

                masked_token = vocab_words[rng.randint(0,
                        len(vocab_words) - 1)]

        output_tokens[index] = masked_token

        masked_lms.append(MaskedLmInstance(index=index,
                          label=tokens[index]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    masked_lm_positions = []
    masked_lm_labels = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)

    return (output_tokens, masked_lm_positions, masked_lm_labels)



In [8]:
all_texts = df['article'].tolist()


In [10]:
all_texts

['醫師：啊回去還好嗎？民眾：欸，還是虛虛的，但。醫師：欸，真的。民眾：好險好險。坦白講我剛回去的時候晚上還是有盜汗。醫師：盜汗。民眾：阿只是前天好很多。前天就算沒盜，可是一覺到天明這樣。醫師：一覺到天明齁。我給你看電腦斷層齁。民眾：嘿。還有那個病毒報告不知道出來沒。醫師：病毒齁。民眾：對對對。醫師：它有幫你驗了不少，還有自體免疫呢。民眾：喔。醫師：我相信你之前都驗過。民眾：對阿。醫師：哇，不過你工作壓力很大欸，得潰瘍。民眾：欸，真的阿。醫師：自體免疫，還好。民眾：negative。醫師：negative，阿這個是？民眾：皰疹。醫師：皰疹病毒的抗體也是negative。民眾：嗯嗯嗯。醫師：啊Q熱陰性，然後第一次檢查綜合結果也是陰性。民眾：所謂Q熱就是那種蚊蟲叮咬的。醫師：嘿，跟動物阿、跳蚤那些比較有相關。民眾：了解。醫師：然後恙蟲病，需再採檢。沒關係我們到時候要追蹤再一起送疾管區。民眾：可以啊，可以啊。醫師：恙蟲病我覺得機會少。民眾：對。醫師：因為恙蟲病要有恙蟲。民眾：對阿。醫師：恙蟲是在離島比較多，山區。民眾：喔，了解了解。醫師：然後，斑疹傷寒就是跟跳蚤比較有關係，外病啦。民眾：嗯，對對對。醫師：比較少見的。民眾：了解了解。醫師：然後EBV，喔，還有CMV病毒也都OK。民眾：這是什麼東西啊。醫師：就是，病毒會引起你肝功能異常的。EBV還有CMV，巨細胞病毒。民眾：嗯。醫師：這是良性的。民眾：喔，這是良性的。醫師：嘿，這都是良性的，自己會好。阿但是會讓你發燒。民眾：嗯。醫師：一陣子這樣子，嗯。民眾：阿這也是都沒有？醫師：都沒有的樣子。我們還驗了些什麼，直接調出來看。民眾：嗯。醫師：阿不過你來的時候發炎指數比較高。發炎指數比較高，CRP。然後肝功能比較異常，85，102。民眾：這個可能是沒睡好。醫師：也有可能。你有沒有喝酒應酬？民眾：喝酒，其實一陣子沒有了欸。醫師：一陣子沒有。民眾：對。你說假如是前年，那有可能，但是去年可能。醫師：就沒有。民眾：對阿。因為老實講，我前年都在對岸比較多。醫師：喔，前年。民眾：對。醫師：它，Google有設廠喔？民眾：欸，客戶在那邊。醫師：客戶在那邊。民眾：對對，阿也有設廠，也有設廠。醫師：你是做，跑業務這樣子。民眾：對對對。醫師：Google不是，那做什麼，ABS？民眾：ABS。ABS阿，跟那個壓克力。醫師：壓克力這樣子。所以我，我們，

In [6]:
all_documents = [[]]
for text in all_texts:
    text = clean_string(text)
    sentences = cut_sent(text)
    all_documents.append(sentences)

# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
print(len(all_documents))

35546


In [7]:
pretrain_instance = []

In [8]:
for document_index in range(len(all_documents)):
    document = all_documents[document_index]
    max_num_tokens = max_seq_length - 3
    target_seq_length = max_num_tokens
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)
                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []

        # Random next

                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.

                    for _ in range(10):
                        random_document_index = rng.randint(0,
                                len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = \
                        all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document)
                            - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break

          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.

                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                else:

        # Actual next

                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens,
                                  rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                tokens = []
                segment_ids = []
                tokens.append('[CLS]')
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append('[SEP]')
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append('[SEP]')
                segment_ids.append(1)
                
                attention_mask = [1] * len(tokens)
                
                while len(tokens) < max_seq_length:
                    tokens.append('[PAD]')
                    segment_ids.append(0)
                    attention_mask.append(0)
                
                
                
                original_tokens = tokens
                (tokens, masked_lm_positions, masked_lm_labels) = \
                    create_masked_lm_predictions(tokens,
                        masked_lm_prob, max_predictions_per_seq,
                        vocab_words, rng)
                
                instance = TrainingInstance(tokens=tokens,
                        segment_ids=segment_ids,
                        is_random_next=is_random_next,
                        masked_lm_positions=masked_lm_positions,
                        masked_lm_labels=masked_lm_labels,
                        attention_mask = attention_mask,
                        original_tokens = original_tokens)
                
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

#     print(instances)
    pretrain_instance.extend(instances)

print(len(pretrain_instance))
    

76608


In [9]:
print(pretrain_instance[0].tokens)
print(pretrain_instance[0].original_tokens)
# print(pretrain_instance[0].attention_mask)
# print(pretrain_instance[0].segment_ids)
# print(pretrain_instance[0].is_random_next)
# print(pretrain_instance[0].masked_lm_labels)


['[CLS]', '人', '團', '聚', '，', '讓', '她', '笑', '說', '：', '「', '回', '去', '當', '然', '是', '好', '好', '當', '公', '主', '呀', '。', '媽', '媽', '、', '阿', '姨', '們', '輪', '流', '煮', '飯', '給', '我', '吃', '，', '還', '得', '與', '大', '家', '夜', '聊', '到', '天', '明', '，', '真', '的', '比', '工', '[MASK]', '還', '累', '。', '」', '孝', '順', '的', '她', '更', '樂', '當', '財', '神', '爺', '，', '發', '出', '超', '過', '3', '0', '包', '紅', '包', '給', '親', '戚', '家', '人', '。', '除', '了', '回', '鄉', '[MASK]', '啖', '媽', '媽', '傳', '家', '菜', '、', '原', '住', '民', '傳', '統', '料', '理', '，', '戴', '愛', '玲', '還', '揪', '團', '到', '部', '落', '投', '幣', '式', '[MASK]', 'T', 'V', '歡', '唱', '。', '5', '月', '2', '3', '日', '將', '首', '度', '站', '上', '台', '北', '國', '際', '會', '議', '中', '心', '舉', '辦', '《', '愛', '戴', '2', '0', '2', '0', '》', '演', '唱', '會', '，', '戴', '愛', '玲', '這', '次', '回', '鄉', '也', '趁', '機', '會', '向', '族', '人', '持', '續', '學', '習', '排', '灣', '[MASK]', '古', '調', '，', '希', '望', '有', '機', '會', '能', '在', '餮', '來', '音', '樂', '演', '出', '融', '合', '。', '至', '於', 

In [10]:
class PreTrainDataset(Dataset):
    def __init__(self, pretrain_instance,tokenizer):
        self.tokenizer = tokenizer 
        self.pretrain_instance = pretrain_instance

    def __getitem__(self, idx):
        input_ids = self.tokenizer.encode(self.pretrain_instance[idx].tokens,
                                          add_special_tokens=False,
                                         return_tensors = 'pt')
        
        original_input_ids = self.tokenizer.encode(self.pretrain_instance[idx].original_tokens,
                                          add_special_tokens=False,
                                         return_tensors = 'pt')
        
        masked_lm_labels_ids = self.tokenizer.encode(self.pretrain_instance[idx].masked_lm_labels,
                                          add_special_tokens=False,
                                         return_tensors = 'pt')
        

        token_type_ids = torch.tensor(self.pretrain_instance[idx].segment_ids)
        attention_mask = torch.tensor(self.pretrain_instance[idx].attention_mask)
        is_random_next = torch.tensor(self.pretrain_instance[idx].is_random_next)
        return input_ids, token_type_ids, attention_mask,  is_random_next , original_input_ids
    def __len__(self):
        return len(self.pretrain_instance)
    


In [11]:
BATCH_SIZE = 5
trainset = PreTrainDataset(pretrain_instance,tokenizer)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)


In [12]:
# for data in trainloader:
#     input_ids, token_type_ids, attention_mask,  is_random_next , masked_lm_labels = [t.to(device) for t in data]

#     input_ids = torch.reshape(input_ids,(input_ids.size()[0],input_ids.size()[2]))
#     masked_lm_labels = torch.reshape(masked_lm_labels,(masked_lm_labels.size()[0],masked_lm_labels.size()[2]))
#     print(masked_lm_labels.size())
#     print(masked_lm_labels)
# #     masked_lm_labels_ids = torch.reshape(masked_lm_labels_ids,(masked_lm_labels_ids.size()[0],masked_lm_labels_ids.size()[2]))
#     break

In [13]:
from transformers import BertForPreTraining
model = BertForPreTraining.from_pretrained(pretrain_model_path)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 2
for epoch in range(EPOCHS):
    running_loss = 0.0
    i = 0
    for data in trainloader:

        input_ids, token_type_ids, attention_mask,  is_random_next , masked_lm_labels = [t.to(device) for t in data]

        input_ids = torch.reshape(input_ids,(input_ids.size()[0],input_ids.size()[2]))
        masked_lm_labels = torch.reshape(masked_lm_labels,(masked_lm_labels.size()[0],masked_lm_labels.size()[2]))
    

        optimizer.zero_grad()
#         print(input_ids.size())
#         print(token_type_ids.size())
#         print(attention_mask.size())
#         print(is_random_next.size())
#         print(masked_lm_labels.size())
        i += (input_ids.size()[0])
        outputs = model(input_ids=input_ids, 
                        token_type_ids=token_type_ids, 
                        attention_mask=attention_mask,
                        labels = masked_lm_labels,
                        next_sentence_label = is_random_next.long())

        loss = outputs[0]
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
        print(f'\rEpoch [{epoch+1}/{EPOCHS}] {i}/{len(trainloader)} Loss: {loss.item():.4f} totaloss: {running_loss:.4f}', end='')
    model.save_pretrained('./bert_pretrain_news/')



Some weights of BertForPreTraining were not initialized from the model checkpoint at ./chinese_wwm_pytorch/ and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [2/2] 11060/15322 Loss: 0.3172 totaloss: 206.94249

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

