# Load BERT

In [1]:
import json
import torch
import pandas as pd
from pandas import read_parquet
from transformers import BertModel
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer BertTokenizerFast where possible; it avoids obscure module problems.
raw_model_path = r"C:\Users\monkeydc\544\PROJECT\data\mBERT\of"
# Directory holding the mBERT parameter files; an absolute path is used directly.
# tokenizer = AutoTokenizer.from_pretrained(raw_model_path,is_split_into_words=True)
tokenizer = BertTokenizerFast.from_pretrained(raw_model_path,is_split_into_words=True)
# model = BertModel.from_pretrained(raw_model_path)

# Load Dataset

In [3]:
# Locations of the stored data: one parquet file for training, one for testing
# (absolute local paths).
data_train = read_parquet(r"C:\Users\monkeydc\544\PROJECT\data\merge\raw.parquet")
data_test = read_parquet(r"C:\Users\monkeydc\544\PROJECT\data\merge\test.parquet")

In [4]:
data_train.head()

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[#, #, ユ, リ, ウ, ス, ・, ベ, ー, リ, ッ, ク, #, 1, 9, ...","[0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[PER: ユ リ ウ ス ・ ベ ー リ ッ ク]
1,"[#, ル, ノ, ー, 、, 日, 産, 自, 動, 車, に, 資, 本, 参, 加, 。]","[0, 3, 4, 4, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: ル ノ ー, ORG: 日 産 自 動 車]"
2,"[ソ, マ, リ, ラ, ン, ド, （, 事, 実, 上, 独, 立, し, た, 地, ...","[5, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[LOC: ソ マ リ ラ ン ド]
3,"[R, E, D, I, R, E, C, T, #, ス, レ, イ, マ, ニ, エ, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: ス レ イ マ ニ エ ・ モ, ORG: ス ク]"
4,"[#, ', ', E, l, e, c, t, r, i, c, #, C, o, u, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[PER: S t e v e, PER: R e i c h]"


In [5]:
data_test.head()

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[#, ヌ, ン, チ, ャ, ク, バ, ン, キ, ：, 吉, 水, 孝, 宏]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[PER: 吉 水 孝 宏]
1,"[:, #, テ, レ, ビ, 東, 京, 系, ア, ニ, メ, 『, ジ, ュ, エ, ...","[0, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 3, 4, 4, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: テ レ ビ 東 京, ORG: ジ ュ エ ル ペ ッ ト, ORG: ハ ッ ..."
2,"[#, ゆ, う, ち, ょ, 銀, 行, A, T, M]","[0, 3, 4, 4, 4, 4, 4, 3, 4, 4]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja]","[ORG: ゆ う ち ょ 銀 行, ORG: A T M]"
3,"[', ', ', ノ, ー, ネ, ', ', ', (, ', ', ', N, o, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[LOC: イ タ リ ア 共 和 国, LOC: ピ エ モ ン テ 州, LOC: ト ..."
4,"[#, ', ', ', バ, ー, リ, 県, ', ', ']","[0, 0, 0, 0, 5, 6, 6, 6, 0, 0, 0]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja]",[LOC: バ ー リ 県]


# prepare training dataset

In [6]:
# Load the tag-name -> tag-id mapping from the JSON tag file.
# The `with` block closes the file on exit, so no explicit close() is needed;
# the encoding is pinned so the read does not depend on the OS locale.
with open(r"C:\Users\monkeydc\544\PROJECT\data\merge\tags_2_idx.json", "r", encoding="utf-8") as file:
    tags_2_idx = json.load(file)


In [7]:
# Reverse lookup built from tags_2_idx: tag id -> tag name.
idx_2_tags = {index: name for name, index in tags_2_idx.items()}

In [8]:
# dumped codes
def merge_token_into_sentence(tokens:list):
    """Glue tokens into one string, writing a single space after every token.

    A non-empty input therefore yields a string that ends with a trailing
    space; an empty input yields the empty string.
    """
    return "".join(token + " " for token in tokens)

In [9]:
# Pull the training 'tokens' and 'ner_tags' columns out of the frame as plain
# Python lists with one entry per row.
sentences_train = data_train['tokens'].values.tolist()
tags_train = data_train['ner_tags'].values.tolist()

In [10]:
# Same extraction for the test split: one entry per row for tokens and tags.
sentences_test = data_test['tokens'].values.tolist()
tags_test = data_test['ner_tags'].values.tolist()

In [11]:
# sample_sentences_train = sentences_train[0:2]
# sample_tags_train = tags_train[0:2]

In [12]:
# sample_sentences_test = sentences_test[0:2]
# sample_tags_test = tags_test[0:2]

In [13]:
def align_label(tokenized_input, tags, tags_2_idx, idx_2_tags, label_all_tokens=True):
    """Expand word-level tag ids to one label per entry of ``tokenized_input.word_ids()``.

    Args:
        tokenized_input: tokenizer output exposing ``word_ids()``; each entry is
            the index of the source word, or ``None`` for positions that belong
            to no word.
        tags: word-level tag ids from the dataset, indexed by word position.
        tags_2_idx: mapping tag name -> tag id.
        idx_2_tags: mapping tag id -> tag name.
        label_all_tokens: if True, every sub-token of a word receives the word's
            label; if False, only the first sub-token does and the remaining
            sub-tokens receive -100.

    Returns:
        A list of ints, one per word id. -100 marks positions without a usable
        label: ``None`` word ids, word indices beyond ``tags``, and tag ids that
        are missing from the mappings.
    """
    def resolve(word_idx):
        # Round-trip through both mappings so that only tag ids known to the
        # tag vocabulary survive. Unresolvable words are masked instead of
        # aborting the sentence, on first and continuation sub-tokens alike.
        try:
            return tags_2_idx[idx_2_tags[tags[word_idx]]]
        except (IndexError, KeyError):
            return -100

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_input.word_ids():
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx or label_all_tokens:
            # First sub-token of a word, or a continuation that should be labelled.
            label_ids.append(resolve(word_idx))
        else:
            # Continuation sub-token while only first sub-tokens are labelled.
            label_ids.append(-100)
        previous_word_idx = word_idx
    return label_ids

In [14]:
def align_label_by_case(tokenizer , sentence, tags, tags_2_idx, idx_2_tags, label_all_tokens=True):
    """Tokenize one sentence and align its word-level tags to the resulting sub-tokens.

    Args:
        tokenizer: callable applied as ``tokenizer(sentence)``; its result must
            expose ``word_ids()``.
        sentence: the input handed to the tokenizer unchanged.
        tags: word-level tag ids from the dataset, indexed by word position.
        tags_2_idx: mapping tag name -> tag id.
        idx_2_tags: mapping tag id -> tag name.
        label_all_tokens: if True, every sub-token of a word receives the word's
            label; if False, only the first sub-token does.

    Returns:
        ``(tokenized_input, label_ids)``: the tokenizer output and the aligned
        label list.
    """
    tokenized_input = tokenizer(sentence)
    # The alignment rules live in align_label; reuse them rather than keeping a
    # second copy of the same loop in sync.
    label_ids = align_label(tokenized_input, tags, tags_2_idx, idx_2_tags,
                            label_all_tokens=label_all_tokens)
    return tokenized_input, label_ids

In [15]:
# sample dataset
# train_sentences = []
# train_tags = []
# for i in range(len(sample_sentences_train)):
#     try:
#         text_tokenized = tokenizer(sample_sentences_train[i].tolist(), padding='max_length',max_length=512, truncation=True,return_tensors="pt", is_split_into_words=True)
#         extend_tags = align_label(text_tokenized, sample_tags_train[i], tags_2_idx, idx_2_tags)
#         train_sentences.append(text_tokenized)
#         train_tags.append(extend_tags)
#     except:
#         print(merge_token_into_sentence(sentences_train[i]))
# test_sentences = []
# test_tags = []
# for i in range(len(sample_sentences_test)):
#     try:
#         text_tokenized = tokenizer(sample_sentences_test[i].tolist(), padding='max_length',max_length=512, truncation=True,return_tensors="pt", is_split_into_words=True)
#         extend_tags = align_label(text_tokenized, sample_tags_test[i], tags_2_idx, idx_2_tags)
#         test_sentences.append(text_tokenized)
#         test_tags.append(extend_tags)
#     except:
#         print(merge_token_into_sentence(sentences_test[i]))

In [16]:
def _tokenize_and_align(sentences, tags, max_length=512):
    """Tokenize every sentence and align its tags; return ``(encodings, label_lists)``.

    Each sentence is padded/truncated to ``max_length`` sub-tokens and encoded
    as PyTorch tensors. A sentence that fails to tokenize or align is reported
    together with the error and left out of both returned lists, so the two
    lists always stay the same length.
    """
    encodings, label_lists = [], []
    for tokens, sentence_tags in zip(sentences, tags):
        try:
            text_tokenized = tokenizer(tokens.tolist(), padding='max_length', max_length=max_length,
                                       truncation=True, return_tensors="pt", is_split_into_words=True)
            extend_tags = align_label(text_tokenized, sentence_tags, tags_2_idx, idx_2_tags)
        except Exception as error:
            # Best effort: skip the sentence but say why. Catching Exception
            # rather than everything keeps Ctrl-C able to stop the loop.
            print(f"skipped ({error!r}): {merge_token_into_sentence(tokens)}")
            continue
        encodings.append(text_tokenized)
        label_lists.append(extend_tags)
    return encodings, label_lists


train_sentences, train_tags = _tokenize_and_align(sentences_train, tags_train)
test_sentences, test_tags = _tokenize_and_align(sentences_test, tags_test)

In [17]:
class DataSequence(torch.utils.data.Dataset):
    """Dataset that pairs stored tokenized sentences with their label id lists."""

    def __init__(self, sentences, labels):
        # Both containers are kept as given and indexed by position on access.
        self.sentences, self.labels = sentences, labels

    def __len__(self):
        # The number of label sequences defines the dataset size.
        return len(self.labels)

    def get_text_tokenized(self, idx):
        """Return the stored tokenized sentence at position ``idx``."""
        return self.sentences[idx]

    def get_labels(self, idx):
        """Return the labels at position ``idx`` wrapped in a LongTensor."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return the ``(tokenized sentence, label tensor)`` pair at position ``idx``."""
        return self.get_text_tokenized(idx), self.get_labels(idx)

In [18]:
# Wrap the tokenized sentences and their aligned labels for each split.
train_dataset = DataSequence(train_sentences, train_tags)
test_dataset = DataSequence(test_sentences, test_tags)

# load official .bin

In [19]:
from transformers import BertForTokenClassification
class BertModel_self(torch.nn.Module):
    """Wrapper around ``BertForTokenClassification`` with one output label per tag."""

    def __init__(self, raw_model_path, tags_2_idx):
        """Load the checkpoint at ``raw_model_path`` with ``len(tags_2_idx)`` labels."""
        # Zero-argument super() resolves to this class. Naming BertModel here
        # raises TypeError because this class does not derive from it.
        super().__init__()
        self.bert = BertForTokenClassification.from_pretrained(raw_model_path, num_labels=len(tags_2_idx))

    def forward(self, input_id, mask, label=None):
        """Run the wrapped model and return its tuple output (``return_dict=False``).

        Args:
            input_id: passed to the wrapped model as ``input_ids``.
            mask: passed to the wrapped model as ``attention_mask``.
            label: passed to the wrapped model as ``labels``; may be omitted.

        Returns:
            The wrapped model's output tuple. The training loop in this
            notebook passes labels and unpacks it as ``(loss, logits)``.
        """
        return self.bert(input_ids=input_id, attention_mask=mask,
                         labels=label, return_dict=False)

In [20]:
# Build the token-classification wrapper from the original mBERT checkpoint,
# with one output label per entry of tags_2_idx.
model = BertModel_self(raw_model_path, tags_2_idx)

Some weights of the model checkpoint at C:\Users\monkeydc\544\PROJECT\data\mBERT\of were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model

# fine-tuning

In [34]:
from torch.optim import SGD
from tqdm import tqdm
def _run_batch(model, batch, labels, device):
    """Run one single-sentence batch through ``model``; return ``(loss, accuracy)``.

    ``loss`` is the tensor returned by the model (usable for ``backward()``);
    ``accuracy`` is a Python float computed over positions whose label is not -100.
    """
    labels = labels[0].to(device)
    mask = batch['attention_mask'][0].to(device)
    input_id = batch['input_ids'][0].to(device)
    loss, logits = model(input_id, mask, labels)
    labelled = labels != -100  # -100 marks positions without a label
    predictions = logits[0][labelled].argmax(dim=1)
    acc = (predictions == labels[labelled]).float().mean().item()
    return loss, acc


def train_loop(model, train, val):
    """Fine-tune ``model`` with SGD and save the weights of the best validation epoch.

    Args:
        model: module called as ``model(input_id, mask, label)`` that returns
            ``(loss, logits)`` and exposes a ``.bert`` attribute whose
            ``state_dict()`` is saved.
        train: dataset of ``(tokenized sentence, label tensor)`` items used for training.
        val: dataset of the same form used for validation.

    Reads the module-level ``LEARNING_RATE`` and ``EPOCHS`` at call time. Batches
    hold exactly one sentence (``batch_size=1``); the indexing in ``_run_batch``
    relies on that. After each epoch a summary is printed, and the weights are
    written to disk whenever validation accuracy is at least the best seen so far.
    """
    # Build the loaders from the arguments, not from notebook globals, so the
    # data that is iterated matches the len(train)/len(val) used for averaging.
    train_dataloader = DataLoader(train, num_workers=0, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(val, num_workers=0, batch_size=1)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
    best_acc = 0
    for epoch_num in range(EPOCHS):
        total_acc_train = 0.0
        total_loss_train = 0.0
        model.train()
        for train_data, train_label in tqdm(train_dataloader):
            optimizer.zero_grad()
            loss, acc = _run_batch(model, train_data, train_label, device)
            total_acc_train += acc
            total_loss_train += loss.item()
            loss.backward()
            optimizer.step()

        model.eval()
        total_acc_val = 0.0
        total_loss_val = 0.0
        # No gradients are needed for validation; skipping them saves memory and time.
        with torch.no_grad():
            for val_data, val_label in val_dataloader:
                loss, acc = _run_batch(model, val_data, val_label, device)
                total_acc_val += acc
                total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(val)
        if val_accuracy >= best_acc:
            best_acc = val_accuracy
            # model.bert is whatever the wrapper stores under that name; its
            # full state dict is written.
            torch.save(model.bert.state_dict(), r'C:\Users\monkeydc\544\PROJECT\data\mBERT\fine\pytorch_model.bin')
        print(
            f'''Epochs: {epoch_num + 1} | 
                Loss: {total_loss_train / len(train): .3f} | 
                Accuracy: {total_acc_train / len(train): .3f} |
                Val_Loss: {total_loss_val / len(val): .3f} | 
                Accuracy: {total_acc_val / len(val): .3f}''')

# Hyperparameters read by train_loop when it is called.
# NOTE(review): in the recorded run below, epoch 5 degrades sharply (validation
# accuracy 0.632 after 0.821) — possibly this learning rate is too high for
# plain SGD here; confirm.
LEARNING_RATE = 1e-2  # learning rate handed to SGD
EPOCHS = 5  # number of passes over the training data

In [35]:
# Fine-tune the model; the test split is passed as the validation set.
train_loop(model, train_dataset, test_dataset)

100%|████████████████████████████████████████████████████████████████████████| 120200/120200 [3:03:00<00:00, 10.95it/s]


Epochs: 1 | 
                Loss:  0.773 | 
                Accuracy:  0.732 |
                Val_Loss:  0.638 | 
                Accuracy:  0.787


100%|████████████████████████████████████████████████████████████████████████| 120200/120200 [3:37:43<00:00,  9.20it/s]


Epochs: 2 | 
                Loss:  0.591 | 
                Accuracy:  0.802 |
                Val_Loss:  0.581 | 
                Accuracy:  0.808


100%|████████████████████████████████████████████████████████████████████████| 120200/120200 [2:58:58<00:00, 11.19it/s]


Epochs: 3 | 
                Loss:  0.551 | 
                Accuracy:  0.816 |
                Val_Loss:  0.518 | 
                Accuracy:  0.829


100%|████████████████████████████████████████████████████████████████████████| 120200/120200 [2:59:57<00:00, 11.13it/s]


Epochs: 4 | 
                Loss:  0.516 | 
                Accuracy:  0.828 |
                Val_Loss:  0.543 | 
                Accuracy:  0.821


100%|████████████████████████████████████████████████████████████████████████| 120200/120200 [3:18:14<00:00, 10.11it/s]


Epochs: 5 | 
                Loss:  0.876 | 
                Accuracy:  0.700 |
                Val_Loss:  1.154 | 
                Accuracy:  0.632


# model .bin save

In [36]:
# bert_model = model.bert # only save bert part
# torch.save(bert_model.state_dict(), r'C:\Users\monkeydc\544\PROJECT\data\mBERT\fine\pytorch_model.bin')

# reload model sample

In [39]:
# Reload the fine-tuned weights into a plain BertModel, together with a
# tokenizer loaded from the same directory.
from transformers import BertModel
fine_model_path = r'C:\Users\monkeydc\544\PROJECT\data\mBERT\fine'
tokenizer = BertTokenizerFast.from_pretrained(fine_model_path,is_split_into_words=True)
# Per the recorded log below: the checkpoint's classifier.* weights are not used
# by BertModel, and the pooler weights are newly initialized rather than loaded.
model_reload = BertModel.from_pretrained(fine_model_path)

Some weights of the model checkpoint at C:\Users\monkeydc\544\PROJECT\data\mBERT\fine were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\monkeydc\544\PROJECT\data\mBERT\fine and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
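# marks
# [Errno 32] Broken pipe
# Multiprocessing issue: memory usage scales with the number of worker processes, so it ends up far higher than the same batch_size needs in a single process.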

# Dumped code (unused experiments)

In [18]:
class mBertModel(torch.nn.Module):
    """Wrapper around a pretrained ``BertModel`` that returns its first output."""

    def __init__(self, raw_model_path):
        """Load the encoder from the checkpoint at ``raw_model_path``."""
        super(mBertModel, self).__init__()
        self.bert = BertModel.from_pretrained(raw_model_path)

    def forward(self, input_id, mask, label=None):
        """Encode the inputs and return the first element of the encoder output.

        Args:
            input_id: passed to the encoder as ``input_ids``.
            mask: passed to the encoder as ``attention_mask``.
            label: accepted for signature compatibility and ignored; the bare
                encoder computes no loss and takes no ``labels`` argument.

        Returns:
            ``output[0]`` of the encoder call (presumably the per-token hidden
            states — confirm against the transformers documentation).
        """
        output = self.bert(input_ids=input_id, attention_mask=mask)
        sequence_output = output[0]
        return sequence_output

In [None]:
from transformers import BertPreTrainedModel
class mBertModel(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear per-token classifier (9 labels).

    NOTE(review): this class reuses the name ``mBertModel`` of an earlier class
    in this notebook, so running this cell shadows that definition.
    NOTE(review): ``raw_model_path`` is forwarded to ``BertPreTrainedModel.__init__``
    and to ``BertModel(...)``; presumably both expect a config object, not a
    path string — confirm before using this class.
    """

    def __init__(self, raw_model_path):
        # Zero-argument super(): the previously named class BertSoftmaxForNer is
        # not defined in this notebook.
        super().__init__(raw_model_path)
        self.num_labels = 9
        self.bert = BertModel(raw_model_path)
        # `nn` is never imported in this notebook; go through `torch.nn`.
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 9)
        self.loss_type = "ce"
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None,labels=None):
        """Classify every token; prepend the loss when ``labels`` is given.

        Returns:
            ``(logits,) + outputs[2:]`` of the encoder call, with the loss
            prepended as the first element when ``labels`` is not None.
        """
        outputs = self.bert(input_ids = input_ids,attention_mask=attention_mask,token_type_ids=token_type_ids)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
        if labels is not None:
            assert self.loss_type in ['lsr', 'focal', 'ce']
            # NOTE(review): ignore_index=0 skips tag id 0, while the label
            # alignment in this notebook marks unlabeled positions with -100 —
            # confirm which value is intended before training with this class.
            if self.loss_type == 'lsr':
                # NOTE(review): LabelSmoothingCrossEntropy is not defined in this
                # notebook; this branch is only reached if loss_type is changed.
                loss_fct = LabelSmoothingCrossEntropy(ignore_index=0)
            elif self.loss_type == 'focal':
                # NOTE(review): FocalLoss is not defined in this notebook either.
                loss_fct = FocalLoss(ignore_index=0)
            else:
                loss_fct = torch.nn.CrossEntropyLoss(ignore_index=0)
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs
        return outputs

In [19]:
# Instantiate the dumped wrapper from the original mBERT checkpoint directory.
# NOTE(review): mBertModel is defined twice in this notebook; which class this
# builds depends on which definition cell ran last.
model = mBertModel(raw_model_path)

Some weights of the model checkpoint at C:\Users\monkeydc\544\PROJECT\data\mBERT\of were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Same dataset construction as in the main section above (dumped duplicate).
train_dataset = DataSequence(train_sentences, train_tags)
test_dataset = DataSequence(test_sentences, test_tags)

In [21]:
# Load batches in the main process (num_workers=0), matching the loaders built
# inside train_loop. Worker processes were linked to the "[Errno 32] Broken pipe"
# failure recorded in this notebook's notes.
train_dataloader = DataLoader(train_dataset, num_workers=0, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, num_workers=0, batch_size=1)

# Fine-tuning (dumped)

In [1]:
# Dumped optimizer experiment.
# NOTE(review): `model_parameters` and `lr` are not defined anywhere in the
# visible notebook, and the recorded output of this cell is a
# ModuleNotFoundError — this cell does not run as written.
import transformers
optimizer = transformers.AdamW(model_parameters, lr=lr, correct_bias=True)

ModuleNotFoundError: No module named 'pytorch_pretrained_bert'