In [22]:
import torch
import torch.nn.functional as F
import pandas as pd
from transformers import BertTokenizer
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import random
import collections
import tokenization


In [23]:
# setting some value
max_seq_length = 512
masked_lm_prob = 0.15
max_predictions_per_seq = 20
rng = random.Random()

pretrain_model_path = '../chinese_wwm_pytorch/'
tokenizer = BertTokenizer.from_pretrained(pretrain_model_path)
vocab_words = list(tokenizer.vocab.keys())

MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])

In [24]:
column_names = ['article_id', 'start_position', 'end_position', 'entity_text', 'entity_type']
df = pd.read_csv('./data/train_1.csv')


In [25]:
import re
from zhon.hanzi import stops
def cut_sent(para):
    para = re.sub("([。！？\?])([^”’])", r"\1\n\2", para)
    para = re.sub("(\.{6})([^”’])", r"\1\n\2", para)
    para = re.sub("(\…{2})([^”’])", r"\1\n\2", para)
    para = re.sub("([。！？\?][”’])([^，。！？\?])", r"\1\n\2", para)
    return para.split("\n")

def clean_string(content):
    content = content.replace('\n','。').replace('\t','，').replace('!', '！').replace('?', '？').replace('.','。')
    content = re.sub(r"[%s]+" %stops, "。",content)
    return content

class TrainingInstance(object):
    # """A single training instance (sentence pair)."""
    def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,is_random_next,attention_mask,original_tokens):
        self.tokens = tokens
        self.segment_ids = segment_ids
        self.is_random_next = is_random_next
        self.masked_lm_positions = masked_lm_positions
        self.masked_lm_labels = masked_lm_labels
        self.attention_mask = attention_mask
        self.original_tokens = original_tokens

    def __str__(self):
        s = ""
        s += "tokens: %s\n" % (" ".join(
            [(x) for x in self.tokens]))
        s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
        s += "is_random_next: %s\n" % self.is_random_next
        s += "masked_lm_positions: %s\n" % (" ".join(
            [str(x) for x in self.masked_lm_positions]))
        s += "masked_lm_labels: %s\n" % (" ".join(
            [(x) for x in self.masked_lm_labels]))
        s += "\n"
        return s

    def __repr__(self):
        return self.__str__()
    
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
# """Truncates a pair of sequences to a maximum sequence length."""
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_num_tokens:
            break

        trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(trunc_tokens) >= 1
        # We want to sometimes truncate from the front and sometimes from the
        # back to add more randomness and avoid biases.
        if rng.random() < 0.5:
            del trunc_tokens[0]
        else:
            trunc_tokens.pop()   
#!/usr/bin/python
# -*- coding: utf-8 -*-


def create_masked_lm_predictions(
    tokens,
    masked_lm_prob,
    max_predictions_per_seq,
    vocab_words,
    rng,
    ):
    """Creates the predictions for the masked LM objective."""

    cand_indexes = []
    for (i, token) in enumerate(tokens):
        if token == '[CLS]' or token == '[SEP]' or token == '[PAD]':
            continue
        cand_indexes.append(i)

    rng.shuffle(cand_indexes)

    output_tokens = list(tokens)

    num_to_predict = min(max_predictions_per_seq, max(1,
                         int(round(len(tokens) * masked_lm_prob))))

    masked_lms = []
    covered_indexes = set()
    for index in cand_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if index in covered_indexes:
            continue
        covered_indexes.add(index)

        masked_token = None

    # 80% of the time, replace with [MASK]

        if rng.random() < 0.8:
            masked_token = '[MASK]'
        else:

      # 10% of the time, keep original

            if rng.random() < 0.5:
                masked_token = tokens[index]
            else:

      # 10% of the time, replace with random word

                masked_token = vocab_words[rng.randint(0,
                        len(vocab_words) - 1)]

        output_tokens[index] = masked_token

        masked_lms.append(MaskedLmInstance(index=index,
                          label=tokens[index]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)

    masked_lm_positions = []
    masked_lm_labels = []
    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)

    return (output_tokens, masked_lm_positions, masked_lm_labels)



In [26]:
all_texts = df['article'].tolist()
print(len(all_texts))
all_texts = list(dict.fromkeys(all_texts))
print(len(all_texts))


2184
120


In [27]:
all_documents = [[]]
for text in all_texts:
    text = clean_string(text)
    sentences = cut_sent(text)
    all_documents.append(sentences)

# Remove empty documents
all_documents = [x for x in all_documents if x]
rng.shuffle(all_documents)
print(len(all_documents))

120


In [28]:
pretrain_instance = []

In [29]:
for document_index in range(len(all_documents)):
    document = all_documents[document_index]
    max_num_tokens = max_seq_length - 3
    target_seq_length = max_num_tokens
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        if i == len(document) - 1 or current_length >= target_seq_length:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = rng.randint(1, len(current_chunk) - 1)
                tokens_a = []
                for j in range(a_end):
                    tokens_a.extend(current_chunk[j])

                tokens_b = []

        # Random next

                is_random_next = False
                if len(current_chunk) == 1 or rng.random() < 0.5:
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

          # This should rarely go for more than one iteration for large
          # corpora. However, just to be careful, we try to make sure that
          # the random document is not the same as the document
          # we're processing.

                    for _ in range(10):
                        random_document_index = rng.randint(0,
                                len(all_documents) - 1)
                        if random_document_index != document_index:
                            break

                    random_document = \
                        all_documents[random_document_index]
                    random_start = rng.randint(0, len(random_document)
                            - 1)
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break

          # We didn't actually use these segments so we "put them back" so
          # they don't go to waste.

                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                else:

        # Actual next

                    is_random_next = False
                    for j in range(a_end, len(current_chunk)):
                        tokens_b.extend(current_chunk[j])
                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens,
                                  rng)

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                tokens = []
                segment_ids = []
                tokens.append('[CLS]')
                segment_ids.append(0)
                for token in tokens_a:
                    tokens.append(token)
                    segment_ids.append(0)

                tokens.append('[SEP]')
                segment_ids.append(0)

                for token in tokens_b:
                    tokens.append(token)
                    segment_ids.append(1)
                tokens.append('[SEP]')
                segment_ids.append(1)
                
                attention_mask = [1] * len(tokens)
                
                while len(tokens) < max_seq_length:
                    tokens.append('[PAD]')
                    segment_ids.append(0)
                    attention_mask.append(0)
                
                
                
                original_tokens = tokens
                (tokens, masked_lm_positions, masked_lm_labels) = \
                    create_masked_lm_predictions(tokens,
                        masked_lm_prob, max_predictions_per_seq,
                        vocab_words, rng)
                
                instance = TrainingInstance(tokens=tokens,
                        segment_ids=segment_ids,
                        is_random_next=is_random_next,
                        masked_lm_positions=masked_lm_positions,
                        masked_lm_labels=masked_lm_labels,
                        attention_mask = attention_mask,
                        original_tokens = original_tokens)
                
                instances.append(instance)
            current_chunk = []
            current_length = 0
        i += 1

#     print(instances)
    pretrain_instance.extend(instances)

print(len(pretrain_instance))
    

765


In [30]:
print(pretrain_instance[0].tokens)
print(pretrain_instance[0].original_tokens)
# print(pretrain_instance[0].attention_mask)
# print(pretrain_instance[0].segment_ids)
# print(pretrain_instance[0].is_random_next)
# print(pretrain_instance[0].masked_lm_labels)


['[CLS]', '個', '管', '師', '：', '其', '實', '你', '這', '樣', '子', '的', '吃', '法', '是', '吃', '每', '天', '嗎', '。', '民', '眾', '：', '對', '我', '是', '吃', '每', '天', '。', '個', '管', '師', '：', '亲', '你', '有', '固', '定', '的', '時', '間', '嗎', '。', '[MASK]', '眾', '：', '每', '天', '大', '概', '都', '九', '、', '十', '點', '。', '個', '管', '師', '：', '九', '、', '十', '點', '晚', '上', '。', '民', '眾', '：', '晚', '上', '。', '個', '管', '師', '：', '晚', '上', '，', '就', '是', '飯', '前', '飯', '後', '，', '那', '你', '吃', '這', '樣', '[MASK]', 'r', 'E', '[MASK]', '有', '沒', '有', '[MASK]', '舒', '服', '。', '民', '眾', '：', '不', '會', '。', '[SEP]', '師', '：', '[MASK]', '，', '很', '可', '愛', '[MASK]', '，', '一', '歲', '。', '民', '眾', '：', '問', '題', '是', '她', '那', '個', '骨', '頭', '發', '育', '不', '全', '，', '所', '以', '…', '…', '醫', '師', '：', '先', '天', '的', '。', '民', '眾', '：', '對', '，', '就', '是', '基', '因', '突', '變', '。', '醫', '師', '：', '基', '因', '突', '變', '，', '玻', '璃', '娃', '娃', '嗎', '，', '還', '是', '。', '民', '眾', '：', '誒', '，', '類', '似', '那', '種', '侏', '儒', '症', '。', '

In [31]:
class PreTrainDataset(Dataset):
    def __init__(self, pretrain_instance,tokenizer):
        self.tokenizer = tokenizer 
        self.pretrain_instance = pretrain_instance

    def __getitem__(self, idx):
        input_ids = self.tokenizer.encode(self.pretrain_instance[idx].tokens,
                                          add_special_tokens=False,
                                         return_tensors = 'pt')
        
        original_input_ids = self.tokenizer.encode(self.pretrain_instance[idx].original_tokens,
                                          add_special_tokens=False,
                                         return_tensors = 'pt')
        
        masked_lm_labels_ids = self.tokenizer.encode(self.pretrain_instance[idx].masked_lm_labels,
                                          add_special_tokens=False,
                                         return_tensors = 'pt')
        

        token_type_ids = torch.tensor(self.pretrain_instance[idx].segment_ids)
        attention_mask = torch.tensor(self.pretrain_instance[idx].attention_mask)
        is_random_next = torch.tensor(self.pretrain_instance[idx].is_random_next)
        return input_ids, token_type_ids, attention_mask,  is_random_next , original_input_ids
    def __len__(self):
        return len(self.pretrain_instance)
    


In [32]:
BATCH_SIZE = 5
trainset = PreTrainDataset(pretrain_instance,tokenizer)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE)


In [12]:
# for data in trainloader:
#     input_ids, token_type_ids, attention_mask,  is_random_next , masked_lm_labels = [t.to(device) for t in data]

#     input_ids = torch.reshape(input_ids,(input_ids.size()[0],input_ids.size()[2]))
#     masked_lm_labels = torch.reshape(masked_lm_labels,(masked_lm_labels.size()[0],masked_lm_labels.size()[2]))
#     print(masked_lm_labels.size())
#     print(masked_lm_labels)
# #     masked_lm_labels_ids = torch.reshape(masked_lm_labels_ids,(masked_lm_labels_ids.size()[0],masked_lm_labels_ids.size()[2]))
#     break

In [33]:
from transformers import BertForPreTraining
model = BertForPreTraining.from_pretrained(pretrain_model_path)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 2
for epoch in range(EPOCHS):
    running_loss = 0.0
    i = 0
    for data in trainloader:

        input_ids, token_type_ids, attention_mask,  is_random_next , masked_lm_labels = [t.to(device) for t in data]

        input_ids = torch.reshape(input_ids,(input_ids.size()[0],input_ids.size()[2]))
        masked_lm_labels = torch.reshape(masked_lm_labels,(masked_lm_labels.size()[0],masked_lm_labels.size()[2]))
    

        optimizer.zero_grad()
#         print(input_ids.size())
#         print(token_type_ids.size())
#         print(attention_mask.size())
#         print(is_random_next.size())
#         print(masked_lm_labels.size())
        i += (input_ids.size()[0])
        outputs = model(input_ids=input_ids, 
                        token_type_ids=token_type_ids, 
                        attention_mask=attention_mask,
                        labels = masked_lm_labels,
                        next_sentence_label = is_random_next.long())

        loss = outputs[0]
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
        print(f'\rEpoch [{epoch+1}/{EPOCHS}] {i}/{len(trainloader)} Loss: {loss.item():.4f} totaloss: {running_loss:.4f}', end='')
    model.save_pretrained('./bertwwm_pretrain_aicup/')



Some weights of BertForPreTraining were not initialized from the model checkpoint at ../chinese_wwm_pytorch/ and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch [2/2] 765/153 Loss: 0.6941 totaloss: 56.48549