In [1]:
from transformers import BertConfig, BertModel, BertTokenizer
import json
import os
import glob
from collections import Counter
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
model_path_sub = "oracle_bert_sub"
model_path = "oracle_bert"

# 数据处理


## token

In [11]:
with open("/workspace/READ_SEQUENCE/DXDatav4.json", "r", encoding = "utf-8") as f:
    data = json.load(f)


sub_labels_buffer = []

for item in data:
    for sub_item in item["RecordUtilSentenceGroupVoList"]:
        for sub_sub_item in sub_item["RecordUtilOracleCharVoList"]:
            sub_labels_buffer.append(sub_sub_item["SubLabel"])



In [5]:
pd.DataFrame([[item, vlaue] for item, vlaue in Counter(sub_labels_buffer).items()]).to_csv("./sub_labels.csv", index = False)

In [13]:
with open("/workspace/READ_SEQUENCE/DXDatav4.json", "r", encoding = "utf-8") as f:
    data = json.load(f)


labels_buffer = []

for item in data:
    for sub_item in item["RecordUtilSentenceGroupVoList"]:
        for sub_sub_item in sub_item["RecordUtilOracleCharVoList"]:
            labels_buffer.append(sub_sub_item["Label"])

In [14]:
pd.DataFrame([[item, vlaue] for item, vlaue in Counter(labels_buffer).items()]).to_csv("./labels.csv", index = False)

In [15]:
# speicial_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# 对sublabels进行去重
labels = sorted([item for item in list(set(labels_buffer)) if item is not None])

labels = labels

with open(f"./{model_path}/vocab.txt", "w", encoding = "utf-8") as f:
    f.write("\n".join(labels)
)

In [8]:
bert_token = BertTokenizer(
    vocab_file = f"./{model_path}/vocab.txt", 
    do_lower_case = True, )

bert_token.save_pretrained(f"./{model_path}")

('./oracle_bert/tokenizer_config.json',
 './oracle_bert/special_tokens_map.json',
 './oracle_bert/vocab.txt',
 './oracle_bert/added_tokens.json')

In [9]:
input_2 = bert_token(["1", "r47d5160c7"], is_split_into_words = True)

In [10]:
bert_token.decode(input_2["input_ids"])

'[CLS] [UNK] r47d5160c7 [SEP]'

In [11]:
# 输入一个BATCH的数据
input_1 = bert_token(
    [["dvfc5aavmn", "r47d5160c7"],["dvfc5aavmn", "[MASK]","r47d5160c7"]], 
    is_split_into_words = True,
    return_tensors = "pt",
    padding = True)

# 输入一条数据
input_2 = bert_token(["dvfc5aavmn", "r47d5160c7"], is_split_into_words = True)
print(input_1)

batch_data = bert_token.batch_decode([input_2["input_ids"], input_2["input_ids"]])
print(batch_data)

batch_data = bert_token.batch_decode(input_1["input_ids"])
print(batch_data)
# bert_token.convert_ids_to_tokens(outputs['input_ids'])

{'input_ids': tensor([[1733, 1730, 1310, 1731, 1732],
        [1733, 1730, 1734, 1310, 1731]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1]])}
['[CLS] [UNK] r47d5160c7 [SEP]', '[CLS] [UNK] r47d5160c7 [SEP]']
['[CLS] [UNK] r47d5160c7 [SEP] [PAD]', '[CLS] [UNK] [MASK] r47d5160c7 [SEP]']


## 构建数据集

In [12]:
with open("./DXDatav4.json", "r", encoding = "utf-8") as f:
    data = json.load(f)

datasets = []
max_length = 0

for item in data[2:]:
    lines = []
    lengths = []
    
    for sub_item in item["RecordUtilSentenceGroupVoList"]:
        blocks = []
        for sub_sub_item in sub_item["RecordUtilOracleCharVoList"]:
            blocks.append([sub_sub_item["Label"], sub_sub_item["OrderNumber"]])
            
        blocks = [item[0] for item in sorted(blocks, key = lambda x : x[-1])]
        lengths.append(len(blocks))
        lines.append(blocks)
    if sum(lengths) > max_length:
        max_length = sum(lengths)
    datasets.append({"name": item['Facsimile'].split("/")[-1], "line": lines})

print("最大序列长度:", max_length)
print("数据集长度:", len(datasets))

with open(os.path.join("datasets", "oracle_lines.json"), "w", encoding = "utf-8") as f:
    json.dump(datasets, f, ensure_ascii = False, indent = 4)

最大序列长度: 278
数据集长度: 10075


## 测试token

In [14]:
def trans_seqenceNone2UNK(sequences):
    return [[item if item is not None else "[UNK]" for item in sequence] for sequence in sequences]

In [15]:
seq1 = trans_seqenceNone2UNK(datasets[7])
# input_sequences = "[SEP]".join([item for item in input_sequences])

seq1 = " [SEP] ".join([" ".join(item) for item in seq1]).split(" ")

input_sequences = []
input_sequences.append(seq1)

seq2 = trans_seqenceNone2UNK(datasets[2])
seq2 = " [SEP] ".join([" ".join(item) for item in seq2]).split(" ")
input_sequences.append(seq2)
seq2[3] = "[MASK]"
print(seq2[3])

print(len(input_sequences))

input_sequences_ = bert_token.__call__(input_sequences, 
           is_split_into_words = True, 
           return_tensors = "pt", 
           padding = True)

print(bert_token.batch_decode(input_sequences_["input_ids"]))


print(input_sequences_["input_ids"])
print(input_sequences_['attention_mask'])
print(input_sequences_)

[MASK]
2
['[CLS] [UNK] [UNK] [UNK] [UNK] [SEP] [UNK] [UNK] [UNK] [UNK] [SEP]', '[CLS] [UNK] [UNK] [UNK] [MASK] [SEP] [UNK] [UNK] [UNK] [UNK] [SEP]']
tensor([[1733, 1730, 1730, 1730, 1730, 1731, 1730, 1730, 1730, 1730, 1731],
        [1733, 1730, 1730, 1730, 1734, 1731, 1730, 1730, 1730, 1730, 1731]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
{'input_ids': tensor([[1733, 1730, 1730, 1730, 1730, 1731, 1730, 1730, 1730, 1730, 1731],
        [1733, 1730, 1730, 1730, 1734, 1731, 1730, 1730, 1730, 1730, 1731]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


# 构建模型

In [21]:
bert_config = BertConfig(
    vocab_size = len(labels) + len(bert_token.get_added_vocab()),
    position_embedding_type = "absolute",
    pad_token_id = bert_token.get_added_vocab()["[PAD]"],
    num_hidden_layers = 6)

bert_config.save_pretrained(model_path)

In [22]:
bert_config.hidden_size

768

In [23]:
model = BertModel(bert_config)

In [24]:
len(labels)

1730

In [25]:
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(1735, 768, padding_idx=1732)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [26]:
import pprint

In [27]:
inputs = bert_token(["dvfc5aavmn", "r47d5160c7"], return_tensors = "pt",is_split_into_words = True)

pprint.pprint(model(**inputs)['last_hidden_state'].shape)

torch.Size([1, 4, 768])


In [28]:
outputs = model(**inputs)

In [29]:
outputs["last_hidden_state"]

tensor([[[ 1.3125,  0.9100, -0.6813,  ...,  1.2145, -1.9278,  1.3722],
         [-0.4977,  1.2471,  0.9930,  ..., -0.4758, -1.0121,  0.7602],
         [-0.9555,  0.9318,  1.3791,  ...,  1.0507,  0.4240,  2.0551],
         [ 0.0435,  1.3658,  0.5790,  ..., -0.5127, -2.1826,  0.7507]]],
       grad_fn=<NativeLayerNormBackward0>)