In [5]:
import pickle
import json
import os
import numpy as np
import torch
import torch.nn as nn
from torchcrf import CRF

In [3]:
# hyperparameters
label_path = '../input/labels.txt'
vocab_path = '../input/vocab.pk'
embedding_path = '../input/embedding.npy'
train_path = '../input/train_5k.json'
eval_path = '../input/eval.json'
test_path = '../input/test.json'

padding_value = 0
batch_size = 1024
sequence_length = 50
embedding_dim = 300
gru_unit_num = 200
dropout_rate = 0.4
epochs = 2


In [6]:
# ============================Function=============================
# 读取数据相关
# =================================================================
def load_labels(label_path):
  # 读取标签文件内容
  with open(label_path, "r", encoding="utf-8") as file:
      lines = file.readlines()
  labels = [line.strip() for line in lines]
  label_id_map = {label: i for i, label in enumerate(labels)}
  unique_category = [line.strip().split('-')[-1] for line in lines]
  return labels, label_id_map, list(set(unique_category))

# 读取词汇表
def load_vocabs(vocab_path):
  with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)
  return vocab

# 读取词向量
def load_embeddings(embedding_path):
  embeddings = torch.tensor(np.load(embedding_path), dtype=torch.float32)
  embedding = nn.Embedding.from_pretrained(embeddings)
  return embedding


# =================================================================
# 数据处理相关
# =================================================================
# 读json文件
def read_json(json_file_path):
  with open(json_file_path, 'r', encoding='utf-8') as f:
    return json.load(f)

# 读取训练数据, 测试数据等
def load_data(json_datas, sequence_length):
  datas = []
  labels = []
  for d in json_datas:
    text = d.get('text')
    if len(text) <= sequence_length:
      datas.append(text)
      label = [tokens[1] for tokens in d.get('tokens')]
      if label:
        labels.append(label)
      else:
        print(f'error label: {label}')
  return datas, labels

# 将标注的label文本转成label tensor
def tensor_label(targets, label_map, padding_value, sequence_length):
  target_ids = []
  for target in targets:
    ids = []
    for label in target:
      id = label_map.get(label)
      if id:
        ids.append(label_map.get(label))
      else:
        print(f'error label: {label}')
    if len(ids) <= sequence_length:
      ids.extend([padding_value] * (sequence_length - len(ids)))
    else:
      print(f'error length: {len(ids)} {ids}')
    target_ids.append(ids)
  return torch.tensor(target_ids, dtype=torch.int64)

# 将target id转成one hot编码
# def one_hot_target_id(target_ids, label_num):
#   target_ids_tensor = torch.tensor(target_ids, dtype=torch.int64)
#   one_hot = F.one_hot(target_ids_tensor, num_classes=label_num)
#   return one_hot

# 将中文文本(训练/测试)转成词向量
# 在使用预训练的词向量进行嵌入后，张量的维度将发生变化。假设输入数据的形状为 (2, 50)，其中 2 是批量大小（batch size），50 是序列长度（sequence length）。
# 预训练的词向量维度为 (18109, 300)，其中 18109 是词汇表中的词语数量，300 是每个词语的嵌入维度。
# 在进行嵌入后，输出张量的形状将是 (2, 50, 300)。这是因为每个词语都被嵌入为一个长度为 300 的向量，而输入序列中有 50 个词语，所以输出张量的形状将是 (2, 50, 300)。
# 其中 2 是批量大小，50 是序列长度，300 是每个词语的嵌入维度。
def tensor_data(vocabulary, embeddings, datas, padding_value, sequence_length):
  tokens = []
  # masks = []
  for text in datas:
    ids = [vocabulary.get(token, 0) for token in text]
    # mask = [1] * len(ids) + [padding_value] * (sequence_length - len(ids))
    # 使用填充来确保序列具有相同的长度
    ids += [padding_value] * (sequence_length - len(ids))
    # masks.append(mask)
    tokens.append(ids)
  return torch.tensor(tokens, dtype=torch.int64)

def embedding_data(tensors, embeddings):
  return embeddings(tensors)

In [7]:
_, label_map, unique_category = load_labels(label_path)
vocab_map = load_vocabs(vocab_path)
embeddings = load_embeddings(embedding_path)

print(f'labels_map = {label_map}')
print(f'vocabulary = {vocab_map}')
print(f'embeddings = {embeddings}')


train_data, train_label = load_data(read_json(train_path), sequence_length)
train_data_tensor = tensor_data(vocab_map, embeddings, train_data, padding_value, sequence_length)
train_data_embedding = embedding_data(train_data_tensor, embeddings)
train_label = tensor_label(train_label, label_map, padding_value, sequence_length)
print(f'train_data_tensor.shape = {train_data_tensor.shape}')
print(f'train_data_embedding.shape = {train_data_embedding.shape}')
print(f'train_labels.shape = {train_label.shape}')

print('-' * 10)
eval_data, eval_label = load_data(read_json(eval_path), sequence_length)
eval_data_tensor = tensor_data(vocab_map, embeddings, eval_data, padding_value, sequence_length)
eval_data_embedding = embedding_data(eval_data_tensor, embeddings)
eval_label = tensor_label(eval_label, label_map, padding_value, sequence_length)
print(f'eval_data_tensor.shape = {eval_data_tensor.shape}')
print(f'eval_data_embedding.shape = {eval_data_embedding.shape}')
print(f'eval_label.shape = {eval_label.shape}')

print('-' * 10)
test_data, test_label = load_data(read_json(test_path), sequence_length)
test_data_tensor = tensor_data(vocab_map, embeddings, test_data, padding_value, sequence_length)
test_data_embedding = embedding_data(test_data_tensor, embeddings)
test_label = tensor_label(test_label, label_map, padding_value, sequence_length)
print(f'test_data_tensor.shape = {test_data_tensor.shape}')
print(f'test_data_embedding.shape = {test_data_embedding.shape}')
print(f'test_label.shape = {test_label.shape}')

labels_map = {'_PAD': 0, 'B-cls': 1, 'E-sep': 2, 'E-village': 3, 'E-district': 4, 'B-room': 5, 'I-province': 6, 'I-redundant': 7, 'B-redundant': 8, 'S-road': 9, 'E-devZone': 10, 'E-floor': 11, 'S-room': 12, 'B-city': 13, 'E-room': 14, 'B-neighborhood': 15, 'S-otherinfo': 16, 'S-redundant': 17, 'B-road': 18, 'I-township': 19, 'I-poi': 20, 'S-building': 21, 'I-road': 22, 'E-road': 23, 'E-city': 24, 'I-community': 25, 'I-assist': 26, 'B-devZone': 27, 'I-otherinfo': 28, 'I-neighborhood': 29, 'E-assist': 30, 'I-floor': 31, 'I-village': 32, 'B-building': 33, 'B-assist': 34, 'B-country': 35, 'B-village': 36, 'B-houseNumber': 37, 'E-houseNumber': 38, 'E-township': 39, 'B-district': 40, 'S-houseNumber': 41, 'I-district': 42, 'I-building': 43, 'E-country': 44, 'E-poi': 45, 'B-poi': 46, 'E-province': 47, 'S-floor': 48, 'S-village': 49, 'B-township': 50, 'B-province': 51, 'B-floor': 52, 'E-redundant': 53, 'E-building': 54, 'S-community': 55, 'S-assist': 56, 'I-devZone': 57, 'B-community': 58, 'B-o

In [8]:
class MyModel(nn.Module):
  def __init__(self, embedding_dim, gru_unit_num, dropout_rate, labels_size):
    super(MyModel, self).__init__()
    
    # 定义层
    self.gru_1 = nn.GRU(embedding_dim, gru_unit_num, batch_first=True, bidirectional=True) # 双向GRU层
    self.gru_2 = nn.GRU(gru_unit_num * 2, gru_unit_num, batch_first=True, bidirectional=True) # 双向GRU层
    self.dropout = nn.Dropout(dropout_rate) # Dropout层
    self.dense = nn.Linear(gru_unit_num * 2, labels_size)

  def forward(self, x):
    x, _ = self.gru_1(x) # 第一层GRU
    x, _ = self.gru_2(x) # 第二层GRU
    x = self.dropout(x) # Dropout
    x = self.dense(x) # 全连接层
    return x

In [9]:
crf = CRF(num_tags=len(label_map), batch_first=True)
train_dataset = torch.utils.data.TensorDataset(train_data_embedding, train_label, train_data_tensor)
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

eval_dataset = torch.utils.data.TensorDataset(eval_data_embedding, eval_label, eval_data_tensor)
eval_data_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(test_data_embedding, test_label, test_data_tensor)
test_data_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
def group_tokens(tokens, category):
  stack = []
  group = []
  for i, token in enumerate(tokens):
    if token not in [f'B-{category}', f'I-{category}', f'E-{category}', f'S-{category}']:
      continue      

    if token == f'S-{category}':
      group.append([(token, i)])
      continue

    if len(stack) == 0:
      stack.append((token, i))
      continue

    last_token, _ = stack[-1]
    if last_token == f'B-{category}' and token == f'I-{category}' or last_token == f'B-{category}' and token == f'E-{category}' or last_token == f'I-{category}' and token == f'E-{category}':
      stack.append((token, i))
    else:
      group.append(stack)
      stack = [(token, i)]

  if len(stack) > 0:
    group.append(stack)

  return group

def trans_group_token_to_text(groups, raw):
  texts = []
  for group in groups:
    start = group[0][1]
    end = group[-1][1]
    texts.append(''.join(raw[start:end+1]))
  return texts


def statistics_category(predict, y, r, category):
  all_predict_text = []
  all_y_text = []
  for i, _ in enumerate(predict):
   predict_group = group_tokens(predict[i], category)
   text = trans_group_token_to_text(predict_group, r[i])
   all_predict_text.append(text)

   y_group = group_tokens(y[i], category)
   text = trans_group_token_to_text(y_group, r[i])
   all_y_text.append(text)

  print('asd')

  pass



def decode(predict, y, r, vocab_map, label_map, padding_value, sequence_length):
  """
  predict shape: (batch, actual_length)
  y shape: (batch, sequence_length)
  r shape: (batch, sequence_length)
  """

  inverse_labels_map = {v: k for k, v in label_map.items()}
  inverse_vocab_map = {v: k for k, v in vocab_map.items()}

  batch_predict_labels = []
  for items in predict:
    labels = []
    for j in items:
      if j == 0:
        continue
      label = inverse_labels_map[j]
      labels.append(label)
    batch_predict_labels.append(labels)

  y = y.numpy()
  batch_y_labels = []
  for items in y:
    labels = []
    for j in items:
      if j == 0:
        continue
      label = inverse_labels_map[j]
      labels.append(label)
    batch_y_labels.append(labels)

  r = r.numpy()
  batch_r_tokens = []
  for items in r:
    tokens = []
    for j in items:
      if j == 0:
        continue
      token = inverse_vocab_map[j]
      tokens.append(token)
    batch_r_tokens.append(tokens)

  # 分析每个字段的得分
  # 省/市/区...等
  statistics_category(batch_predict_labels, batch_y_labels, batch_r_tokens, 'city')
  print('eval done')


def eval(eval_data_loader, model):
  model.eval()
  # 初始化一个空列表来存储每个序列的F1分数
  f1_scores = []
  for X, y, r in eval_data_loader:
    predict = model(X)
    mask = (y != padding_value)
    predict = crf.decode(predict, mask=mask)
    # predict = [np.pad(ids, (0, sequence_length - len(ids)), constant_values=padding_value) for ids in predict]
    # predict = np.array(predict)
    # y = y.numpy()
    # r = r.numpy()

    decode(predict, y, r, vocab_map, label_map, padding_value, sequence_length)
    # # 对于每个序列，计算F1分数
    # for i in range(predict.shape[0]):
    #   # 计算准确率
    #   f1 = f1_score(y[i], predict[i], average='macro')
    #   f1_scores.append(f1)
  model.train()
  return sum(f1_scores) / len(f1_scores)


# 定义比较完善的训练函数
def train(train_data_loader, eval_data_loader, model, optimizer, num_epoch, save_step_inteval, eval_step_interval, model_save_path, resume):
  start_epoch = 0
  start_step = 0

  if resume:
    checkpoint = torch.load(resume)
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint['epoch']
    start_step = checkpoint['step']
    print(f'resume training from epoch {start_epoch} step {start_step} by {resume}')

  for epoch in range(start_epoch, num_epoch):
    num_batches = len(train_data_loader)
    for batch_index, (X, y, _) in enumerate(train_data_loader):
      step = num_batches * epoch + batch_index + 1
      if step <= start_step:
        continue
      
      optimizer.zero_grad()
      predict = model(X)
      mask = (y != padding_value)
      loss = -1 * crf(predict, y, mask=mask, reduction='mean')
      loss.backward()
      optimizer.step()

      print(f'Epoch {epoch+1}/{num_epoch} Step {step}/{len(train_data_loader)} Loss: {loss.item()}')

      # save model by epoch end
      if step % save_step_inteval == 0:
        os.makedirs(model_save_path, exist_ok=True)
        save_file = os.path.join(model_save_path, f'model_step_{step}.pt')
        torch.save({
          'model': model.state_dict(),
          'optimizer': optimizer.state_dict(),
          'epoch': epoch,
          'step': step
        }, save_file)

      # eval by epoch end
      if step % eval_step_interval == 0:
        f1 = eval(eval_data_loader, model)
        print(f'Epoch {epoch+1}/{num_epoch} Step {step}/{len(train_data_loader)} F1: {f1}')

In [15]:
# 训练
model = MyModel(embedding_dim, gru_unit_num, dropout_rate, len(label_map))
optimizer = torch.optim.Adam(model.parameters())
# train(train_data_loader, eval_data_loader, model, optimizer, epochs, save_step_inteval=2, eval_step_interval=100, 
#       model_save_path='../models', resume='')

train(train_data_loader, eval_data_loader, model, optimizer, epochs, save_step_inteval=2, eval_step_interval=10, 
      model_save_path='../models', resume='../models/model_step_8.pt')

resume training from epoch 0 step 8 by ../models/model_step_8.pt
Epoch 2/2 Step 9/5 Loss: 53.752960205078125
Epoch 2/2 Step 10/5 Loss: 53.6715087890625


KeyboardInterrupt: 

In [2]:
from collections import defaultdict
arr = defaultdict(list)
arr['cate'].append('hello')
arr['cate'].append('world')
arr['cate'].append('haha')


defaultdict(list, {'cate': ['hello', 'world', 'haha']})