In [1]:
import time
import torch
import numpy as np
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer, BertForTokenClassification

In [2]:
# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a host without CUDA, so only query it when a GPU
# is actually present; otherwise report that the CPU is being used.
print(n_gpu, torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU only")

1 GeForce RTX 3060


In [3]:
# Load the tokenizer and the saved model checkpoint, and report how long it took.
start = time.time()
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location places the weights on `device`, so a checkpoint that was saved
# on a GPU can still be loaded on a CPU-only machine.
model = torch.load("model_ner_gpu_epoch_2_batch_64", map_location=device)
print("load model time:", time.time() - start)

load model time: 3.2227115631103516


In [4]:
# Label vocabulary: the position of a tag in this list is its class index.
tags_vals = [
    'TransFr_B', 'TransFr_I',
    'TransTo_B', 'TransTo_I',
    'AMOUNT_B',
    'BANK_B', 'BANK_I',
    'O',
]
# Reverse lookup from tag name to class index.
tag2idx = dict(zip(tags_vals, range(len(tags_vals))))
tag2idx

{'TransFr_B': 0,
 'TransFr_I': 1,
 'TransTo_B': 2,
 'TransTo_I': 3,
 'AMOUNT_B': 4,
 'BANK_B': 5,
 'BANK_I': 6,
 'O': 7}

In [5]:
# MAX_LEN: number of token ids every sentence is padded / truncated to.
# batch_size: number of sentences handed to the model per batch.
MAX_LEN, batch_size = 30, 64

In [6]:
# Start of the timing window that is later reported as "predict time".
# NOTE(review): the timer starts in its own cell, so the reported figure also
# includes any idle time between running this cell and the cell that prints it.
start = time.time()

In [16]:
# Sentence to tag; swap in one of the commented alternatives to try other inputs.
# test_sentence = ["我想要給我的房東台新100元"]
# test_sentence = ["我想要給我老婆100塊從我台幣帳戶"]
test_sentence = ["轉給我老闆的上海商銀2873元"]

divider = "-" * 50

# Split every sentence into tokenizer word pieces.
tokenized_test_texts = list(map(tokenizer.tokenize, test_sentence))
print("Tokenized Test Sentence:")
print(tokenized_test_texts[0], len(tokenized_test_texts[0]))
print(divider)

# Map the pieces to vocabulary ids, then pad / truncate at the end of each
# sequence so that every row holds exactly MAX_LEN ids.
piece_ids = [tokenizer.convert_tokens_to_ids(pieces) for pieces in tokenized_test_texts]
input_ids = pad_sequences(
    piece_ids,
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

# Attention mask: 1.0 where the id is positive, 0.0 elsewhere (the padded slots).
test_attention_masks = [
    [float(token_id > 0) for token_id in id_row]
    for id_row in input_ids
]
print("Padding Test Sequence:")
print(input_ids[0])
print(test_attention_masks[0])
print(divider)

Tokenized Test Sentence:
['轉', '給', '我', '老', '闆', '的', '上', '海', '商', '銀', '287', '##3', '元'] 13
--------------------------------------------------
Padding Test Sequence:
[ 6752  5183  2769  5439  7293  4638   677  3862  1555  7065 11525  8152
  1039     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
--------------------------------------------------


In [17]:
# Wrap the padded ids and their attention masks as tensors so they can be batched.
test_inputs = torch.tensor(input_ids)
test_masks = torch.tensor(test_attention_masks)

test_sentence_data = TensorDataset(test_inputs, test_masks)
# Inference must preserve input order so that predictions[i] lines up with
# test_sentence[i]; a RandomSampler would shuffle multi-sentence inputs.
# (The variable keeps its original name so any other cell using it still works.)
train_sentence_sampler = SequentialSampler(test_sentence_data)
test_sentence_dataloader = DataLoader(
    test_sentence_data,
    sampler=train_sentence_sampler,
    batch_size=batch_size,
)

In [18]:
# Switch the model to evaluation mode before running inference.
model.eval()
predictions = []

for batch in test_sentence_dataloader:
    # Move the batch tensors to the same device the model is evaluated on.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch

    # A single forward pass with gradient tracking disabled. The model is
    # called without labels and its output is used as per-token logits.
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = logits.detach().cpu().numpy()
    # argmax over the last axis picks the highest-scoring class index for
    # every token position of every sentence in the batch.
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])

# Translate class indices back into tag names.
pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]

In [19]:
print(tokenized_test_texts[0], len(tokenized_test_texts[0]))
print()

# Merge '##' continuation pieces back into whole tokens while keeping the tags
# aligned. Tags are predicted per word piece, so each merged token takes the tag
# of its FIRST piece and the tags of its continuation pieces are dropped.
# (Slicing the tag list by the merged length instead would shift every tag that
# follows a merged token.)
token = list()
token_tag = list()
# zip stops at the shorter sequence, so pieces beyond the predicted length
# (sentences longer than MAX_LEN pieces) are left out together with their tags.
for tok, tag in zip(tokenized_test_texts[0], pred_tags[0]):
    if tok.startswith('##') and token:
        # Continuation piece: glue it onto the previous token without the marker.
        token[-1] = token[-1] + tok[2:]
    else:
        token.append(tok)
        token_tag.append(tag)

print(token, len(token))
print(token_tag, len(token_tag))

['轉', '給', '我', '老', '闆', '的', '上', '海', '商', '銀', '287', '##3', '元'] 13

['轉', '給', '我', '老', '闆', '的', '上', '海', '商', '銀', '2873', '元'] 12
['O', 'O', 'O', 'TransTo_B', 'TransTo_I', 'O', 'BANK_B', 'BANK_I', 'BANK_I', 'BANK_I', 'AMOUNT_B', 'O'] 12


In [20]:
# Report the seconds elapsed since `start` was last set.
print ("predict time:", time.time()-start)

predict time: 30.33272933959961


In [21]:
def _collect_entities(tokens, tags, label):
    """Join consecutive tokens tagged ``<label>_B`` / ``<label>_I`` into entity strings.

    Args:
        tokens: token strings, aligned one-to-one with ``tags``.
        tags: predicted tag names for the tokens.
        label: tag prefix without the ``_B`` / ``_I`` suffix, e.g. ``"BANK"``.

    Returns:
        List of entity strings in order of appearance. A ``_B`` tag always
        starts a new entity; an ``_I`` tag extends the entity in progress
        (or starts one if none is open).
    """
    begin_tag, inside_tag = label + "_B", label + "_I"
    entities, current = [], ""
    for tok, tag in zip(tokens, tags):
        if tag == begin_tag and current:
            # A new entity starts directly after another one: close the old one first.
            entities.append(current)
            current = ""
        if tag == begin_tag or tag == inside_tag:
            current += tok
        elif current:
            # Any other tag ends the entity in progress.
            entities.append(current)
            current = ""
    if current:
        # The sentence ended while an entity was still open.
        entities.append(current)
    return entities


trans_to_list = _collect_entities(token, token_tag, "TransTo")
trans_from_list = _collect_entities(token, token_tag, "TransFr")
bank_list = _collect_entities(token, token_tag, "BANK")
# Amounts only have a _B tag, so every tagged token is its own entry.
amount_list = [tok for tok, tag in zip(token, token_tag) if tag == "AMOUNT_B"]

print("TransFr:", trans_from_list)
print("TransTo:", trans_to_list)
print("BANK:", bank_list)
print("AMOUNT:", amount_list)

TransFr: []
TransTo: ['老闆']
BANK: ['上海商銀']
AMOUNT: ['2873']


In [13]:
import jieba

In [14]:
# Load the custom dictionary into jieba before segmenting.
# The path is relative, so it is resolved against the current working directory.
jieba.load_userdict('./userdict.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.431 seconds.
Prefix dict has been built successfully.


In [15]:
# Segment a sample sentence into words with jieba and show the resulting list.
sentence = '請幫忙從我的薪轉轉3993元給我的父親帳戶'
seg_list = list(jieba.cut(sentence))
print(seg_list)

['請', '幫忙', '從', '我', '的', '薪轉', '轉', '3993', '元給', '我', '的', '父親', '帳戶']
