In [1]:
from transformers import BertConfig, BertModel, BertTokenizer
import json
import os
import glob
from collections import Counter
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "oracle_bert_sub"

# 数据处理


## token

In [3]:
with open("/workspace/READ_SEQUENCE/DXDatav4.json", "r", encoding = "utf-8") as f:
    data = json.load(f)


sub_labels_buffer = []

for item in data:
    for sub_item in item["RecordUtilSentenceGroupVoList"]:
        for sub_sub_item in sub_item["RecordUtilOracleCharVoList"]:
            sub_labels_buffer.append(sub_sub_item["SubLabel"])



In [4]:
pd.DataFrame([[item, vlaue] for item, vlaue in Counter(sub_labels_buffer).items()]).to_csv("./sub_labels.csv", index = False)

In [6]:
# speicial_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# 对sublabels进行去重
labels = sorted([item for item in list(set(sub_labels_buffer)) if item is not None])

labels = labels

with open(f"./{model_path}/vocab.txt", "w", encoding = "utf-8") as f:
    f.write("\n".join(labels)
)

In [7]:
bert_token = BertTokenizer(
    vocab_file = f"./{model_path}/vocab.txt", 
    do_lower_case = True, )

bert_token.save_pretrained(f"./{model_path}")

('./oracle_bert_sub/tokenizer_config.json',
 './oracle_bert_sub/special_tokens_map.json',
 './oracle_bert_sub/vocab.txt',
 './oracle_bert_sub/added_tokens.json')

In [8]:
input_2 = bert_token(["1", "r47d5160c7"], is_split_into_words = True)

In [9]:
bert_token.decode(input_2["input_ids"])

'[CLS] [UNK] r47d5160c7 [SEP]'

In [10]:
# 输入一个BATCH的数据
input_1 = bert_token(
    [["dvfc5aavmn", "r47d5160c7"],["dvfc5aavmn", "[MASK]","r47d5160c7"]], 
    is_split_into_words = True,
    return_tensors = "pt",
    padding = True)

# 输入一条数据
input_2 = bert_token(["dvfc5aavmn", "r47d5160c7"], is_split_into_words = True)
print(input_1)

batch_data = bert_token.batch_decode([input_2["input_ids"], input_2["input_ids"]])
print(batch_data)

batch_data = bert_token.batch_decode(input_1["input_ids"])
print(batch_data)
# bert_token.convert_ids_to_tokens(outputs['input_ids'])

{'input_ids': tensor([[2750, 1072, 2064, 2748, 2749],
        [2750, 1072, 2751, 2064, 2748]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}
['[CLS] dvfc5aavmn r47d5160c7 [SEP]', '[CLS] dvfc5aavmn r47d5160c7 [SEP]']
['[CLS] dvfc5aavmn r47d5160c7 [SEP] [PAD]', '[CLS] dvfc5aavmn [MASK] r47d5160c7 [SEP]']


## 构建数据集

In [13]:
with open("/workspace/READ_SEQUENCE/DXDatav4.json", "r", encoding = "utf-8") as f:
    data = json.load(f)

datasets = []
max_length = 0

for item in data[2:]:
    lines = []
    lengths = []
    
    for sub_item in item["RecordUtilSentenceGroupVoList"]:
        blocks = []
        for sub_sub_item in sub_item["RecordUtilOracleCharVoList"]:
            blocks.append([sub_sub_item["SubLabel"], sub_sub_item["OrderNumber"]])
            
        blocks = [item[0] for item in sorted(blocks, key = lambda x : x[-1])]
        lengths.append(len(blocks))
        lines.append(blocks)
    if sum(lengths) > max_length:
        max_length = sum(lengths)
    datasets.append({"name": item['Facsimile'].split("/")[-1], "line": lines})

print("最大序列长度:", max_length)
print("数据集长度:", len(datasets))

with open(os.path.join("datasets_sub", "oracle_lines.json"), "w", encoding = "utf-8") as f:
    json.dump(datasets, f, ensure_ascii = False, indent = 4)

最大序列长度: 278
数据集长度: 10075


## 测试token

In [14]:
def trans_seqenceNone2UNK(sequences):
    return [[item if item is not None else "[UNK]" for item in sequence] for sequence in sequences]

In [15]:
seq1 = trans_seqenceNone2UNK(datasets[7])
# input_sequences = "[SEP]".join([item for item in input_sequences])

seq1 = " [SEP] ".join([" ".join(item) for item in seq1]).split(" ")

input_sequences = []
input_sequences.append(seq1)

seq2 = trans_seqenceNone2UNK(datasets[2])
seq2 = " [SEP] ".join([" ".join(item) for item in seq2]).split(" ")
input_sequences.append(seq2)
seq2[3] = "[MASK]"
print(seq2[3])

print(len(input_sequences))

input_sequences_ = bert_token.__call__(input_sequences, 
           is_split_into_words = True, 
           return_tensors = "pt", 
           padding = True)

print(bert_token.batch_decode(input_sequences_["input_ids"]))


print(input_sequences_["input_ids"])
print(input_sequences_['attention_mask'])
print(input_sequences_)

[MASK]
2
['[CLS] [UNK] [UNK] [UNK] [UNK] [SEP] [UNK] [UNK] [UNK] [UNK] [SEP]', '[CLS] [UNK] [UNK] [UNK] [MASK] [SEP] [UNK] [UNK] [UNK] [UNK] [SEP]']
tensor([[2750, 2747, 2747, 2747, 2747, 2748, 2747, 2747, 2747, 2747, 2748],
        [2750, 2747, 2747, 2747, 2751, 2748, 2747, 2747, 2747, 2747, 2748]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
{'input_ids': tensor([[2750, 2747, 2747, 2747, 2747, 2748, 2747, 2747, 2747, 2747, 2748],
        [2750, 2747, 2747, 2747, 2751, 2748, 2747, 2747, 2747, 2747, 2748]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


# 构建模型

In [16]:
bert_config = BertConfig(
    vocab_size = len(labels) + len(bert_token.get_added_vocab()),
    position_embedding_type = "absolute",
    pad_token_id = bert_token.get_added_vocab()["[PAD]"],
    num_hidden_layers = 6)

bert_config.save_pretrained(model_path)

In [17]:
bert_config.hidden_size

768

In [18]:
model = BertModel(bert_config)

In [19]:
len(labels)

2747

In [20]:
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(2752, 768, padding_idx=2749)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [21]:
import pprint

In [22]:
inputs = bert_token(["dvfc5aavmn", "r47d5160c7"], return_tensors = "pt",is_split_into_words = True)

pprint.pprint(model(**inputs)['last_hidden_state'].shape)

torch.Size([1, 4, 768])


In [23]:
outputs = model(**inputs)

In [24]:
outputs["last_hidden_state"]

tensor([[[-0.7226, -0.9055,  0.3861,  ...,  1.1700,  0.2557,  1.0943],
         [-0.4769, -0.2815,  0.5293,  ..., -0.2412,  0.7810, -0.8341],
         [ 0.2178, -1.1193,  1.4128,  ...,  0.3036, -0.3821, -0.3806],
         [-1.1064,  0.4937, -0.4036,  ..., -0.0415, -0.2664, -0.2219]]],
       grad_fn=<NativeLayerNormBackward0>)