# Load BERT

In [1]:
import json
import torch
import pandas as pd
from pandas import read_parquet
from transformers import BertModel
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 一定要用BertTokenizerFast
raw_model_path = r"C:\Users\monkeydc\544\PROJECT\data\mBERT\of"
# 储存mBERT的参数文件地址，我直接用的是绝对地址
# tokenizer = AutoTokenizer.from_pretrained(raw_model_path,is_split_into_words=True)
tokenizer = BertTokenizerFast.from_pretrained(raw_model_path,is_split_into_words=True)
# model = BertModel.from_pretrained(raw_model_path)

# Load Dataset

In [3]:
# 储存数据的地址
data_train = read_parquet(r"C:\Users\monkeydc\544\PROJECT\data\merge\raw.parquet")
data_test = read_parquet(r"C:\Users\monkeydc\544\PROJECT\data\merge\test.parquet")

In [4]:
data_train.head()

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[#, #, ユ, リ, ウ, ス, ・, ベ, ー, リ, ッ, ク, #, 1, 9, ...","[0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[PER: ユ リ ウ ス ・ ベ ー リ ッ ク]
1,"[#, ル, ノ, ー, 、, 日, 産, 自, 動, 車, に, 資, 本, 参, 加, 。]","[0, 3, 4, 4, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: ル ノ ー, ORG: 日 産 自 動 車]"
2,"[ソ, マ, リ, ラ, ン, ド, （, 事, 実, 上, 独, 立, し, た, 地, ...","[5, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[LOC: ソ マ リ ラ ン ド]
3,"[R, E, D, I, R, E, C, T, #, ス, レ, イ, マ, ニ, エ, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 4, 4, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: ス レ イ マ ニ エ ・ モ, ORG: ス ク]"
4,"[#, ', ', E, l, e, c, t, r, i, c, #, C, o, u, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[PER: S t e v e, PER: R e i c h]"


In [5]:
data_test.head()

Unnamed: 0,tokens,ner_tags,langs,spans
0,"[#, ヌ, ン, チ, ャ, ク, バ, ン, キ, ：, 吉, 水, 孝, 宏]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...",[PER: 吉 水 孝 宏]
1,"[:, #, テ, レ, ビ, 東, 京, 系, ア, ニ, メ, 『, ジ, ュ, エ, ...","[0, 0, 3, 4, 4, 4, 4, 0, 0, 0, 0, 0, 3, 4, 4, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[ORG: テ レ ビ 東 京, ORG: ジ ュ エ ル ペ ッ ト, ORG: ハ ッ ..."
2,"[#, ゆ, う, ち, ょ, 銀, 行, A, T, M]","[0, 3, 4, 4, 4, 4, 4, 3, 4, 4]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja]","[ORG: ゆ う ち ょ 銀 行, ORG: A T M]"
3,"[', ', ', ノ, ー, ネ, ', ', ', (, ', ', ', N, o, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, j...","[LOC: イ タ リ ア 共 和 国, LOC: ピ エ モ ン テ 州, LOC: ト ..."
4,"[#, ', ', ', バ, ー, リ, 県, ', ', ']","[0, 0, 0, 0, 5, 6, 6, 6, 0, 0, 0]","[ja, ja, ja, ja, ja, ja, ja, ja, ja, ja, ja]",[LOC: バ ー リ 県]


# prepare training dataset

In [6]:
# 获得tag文件
with open(r"C:\Users\monkeydc\544\PROJECT\data\merge\tags_2_idx.json","r") as file:
    tags_2_idx = json.load(file)
file.close()


In [7]:
idx_2_tags = {tags_2_idx[tag]:tag for tag in tags_2_idx.keys()}

In [8]:
# dumped codes
# def merge_token_into_sentence(tokens:list):
#     sentence = ''
#     for token in tokens:
#         sentence += token + " "
#     return sentence

In [9]:
sentences_train = data_train['tokens'].values.tolist()
tags_train = data_train['ner_tags'].values.tolist()

In [10]:
sentences_test = data_test['tokens'].values.tolist()
tags_test = data_test['ner_tags'].values.tolist()

In [11]:
def align_label(tokenized_input, tags, tags_2_idx, idx_2_tags, label_all_tokens=True): 
        # tokenized_input refers to the sequences after tokenized
        # tags refers to the original tags from dataset
        # False:只为每个拆分token的第一个子词提供一个标签。
        # True:在属于同一 token 的所有子词中提供相同的标签。
        word_ids = tokenized_input.word_ids()
        previous_word_idx = None
        label_ids = []   
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)                
            elif word_idx != previous_word_idx:
                try:
                    label_ids.append(tags_2_idx[idx_2_tags[tags[word_idx]]])
                except:
                    label_ids.append(-100) 
            else:
                label_ids.append(tags_2_idx[idx_2_tags[tags[word_idx]]] if label_all_tokens else -100)
            previous_word_idx = word_idx      
        return label_ids

In [12]:
def align_label_by_case(tokenizer , sentence, tags, tags_2_idx, idx_2_tags, label_all_tokens=True): 
        tokenized_input = tokenizer(sentence)
        # tokenized_input refers to the sequences after tokenized
        # tags refers to the original tags from dataset
        # False:只为每个拆分token的第一个子词提供一个标签。
        # True:在属于同一 token 的所有子词中提供相同的标签。
        word_ids = tokenized_input.word_ids()
        previous_word_idx = None
        label_ids = []   
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)                
            elif word_idx != previous_word_idx:
                try:
                    label_ids.append(tags_2_idx[idx_2_tags[tags[word_idx]]])
                except:
                    label_ids.append(-100) 
            else:
                label_ids.append(tags_2_idx[idx_2_tags[tags[word_idx]]] if label_all_tokens else -100)
            previous_word_idx = word_idx      
        return tokenized_input, label_ids

In [13]:
train_sentences = []
train_tags = []
for i in range(len(sentences_train)):
    try:
        text_tokenized = tokenizer(sentences_train[i].tolist(), padding='max_length',max_length=512, truncation=True,return_tensors="pt", is_split_into_words=True)
        extend_tags = align_label(text_tokenized, tags_train[i], tags_2_idx, idx_2_tags)
        train_sentences.append(text_tokenized)
        train_tags.append(extend_tags)
    except:
        print(merge_token_into_sentence(sentences_train[i]))

In [None]:
print(train_sentences)

In [14]:
test_sentences = []
test_tags = []
for i in range(len(sentences_test)):
    try:
        text_tokenized = tokenizer(sentences_test[i].tolist(), padding='max_length',max_length=512, truncation=True,return_tensors="pt", is_split_into_words=True)
        extend_tags = align_label(text_tokenized, tags_test[i], tags_2_idx, idx_2_tags)
        train_sentences.append(text_tokenized)
        train_tags.append(extend_tags)
    except:
        print(merge_token_into_sentence(sentences_test[i]))

In [35]:
print(len(test_tags))

0


In [15]:
class DataSequence(torch.utils.data.Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.sentences[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_data, batch_labels

In [16]:
class mBertModel(torch.nn.Module):
    def __init__(self, raw_model_path):
        super(mBertModel, self).__init__()
        self.bert = BertModel.from_pretrained(raw_model_path)

    def forward(self, input_id, mask, label):
        output = self.bert(input_ids=input_id, attention_mask=mask,labels=label, return_dict=False)
        return output

In [17]:
model = mBertModel(raw_model_path)

Some weights of the model checkpoint at C:\Users\monkeydc\544\PROJECT\data\mBERT\of were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
train_dataset = DataSequence(train_sentences, train_tags)
test_dataset = DataSequence(test_sentences, test_tags)

In [28]:
train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

# fine-tuning

In [29]:
LEARNING_RATE = 1e-2
EPOCHS = 5

In [33]:
for train_data, train_label in train_dataloader:
    print(train_data)

MemoryError: 

In [30]:
def train_evaluate(model, train_dataloader, val_dataloader):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model.to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    best_acc = 0
    best_loss = 1000
    for epoch_num in range(EPOCHS):
        total_acc_train = 0
        total_loss_train = 0
        # 训练模型
        model.train()
        # 按批量循环训练模型
        for train_data, train_label in train_dataloader:
            train_label = train_label.to(device)
            mask = train_data['attention_mask'][0].to(device)
            input_id = train_data['input_ids'][0].to(device)
            # 梯度清零！！
            optimizer.zero_grad()
            # 输入模型训练结果：损失及分类概率
            loss, logits = model(input_id, mask, train_label)
            # 过滤掉特殊token及padding的token
            logits_clean = logits[0][train_label != -100]
            label_clean = train_label[train_label != -100]
            # 获取最大概率值
            predictions = logits_clean.argmax(dim=1)
      # 计算准确率
            acc = (predictions == label_clean).float().mean()
            total_acc_train += acc
            total_loss_train += loss.item()
      # 反向传递
            loss.backward()
            # 参数更新
            optimizer.step()
        # 模型评估
        model.eval()

        total_acc_val = 0
        total_loss_val = 0
        for val_data, val_label in val_dataloader:
      # 批量获取验证数据
            val_label = val_label[0].to(device)
            mask = val_data['attention_mask'].to(device)
            input_id = val_data['input_ids'][0].to(device)
      # 输出模型预测结果
            loss, logits = model(input_id, mask, val_label)
      # 清楚无效token对应的结果
            logits_clean = logits[0][val_label != -100]
            label_clean = val_label[val_label != -100]
            # 获取概率值最大的预测
            predictions = logits_clean.argmax(dim=1)          
            # 计算精度
            acc = (predictions == label_clean).float().mean()
            total_acc_val += acc
            total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'''Epochs: {epoch_num + 1} | 
                Loss: {total_loss_train / len(df_train): .3f} | 
                Accuracy: {total_acc_train / len(df_train): .3f} |
                Val_Loss: {total_loss_val / len(df_val): .3f} | 
                Accuracy: {total_acc_val / len(df_val): .3f}''')

In [31]:
train_evaluate(model, train_dataloader, test_dataloader)

MemoryError: 