# Chinese NER Using Lattice LSTM   ACL 2018

## Main Idea

<img src="./model.png"  width="700" height="700" align="bottom" />

## Outline
* Data Processing
* Model
* Train
* Verify & Test

## 1 Data Processing

### data_initialization

In [39]:
from utils.alphabet import Alphabet
from utils.gazetteer import Gazetteer

In [64]:
word_alphabet = Alphabet('word')
biword_alphabet = Alphabet('biword')
char_alphabet = Alphabet('character')
label_alphabet = Alphabet('label', True)
gaz_alphabet = Alphabet('gaz')

In [20]:
START = "</s>"
UNKNOWN = "</unk>"
PADDING = "</pad>"
NULLKEY = "-null-"

#### Alphabet

In [16]:
# vscode

#### build_alphabet

In [1]:
input_file = './data/demo.train.char'   # train.char, dev.char, test.char

In [2]:
# Read all lines of the corpus file. The context manager closes the handle
# (the bare open(...).readlines() left it open), and the explicit encoding
# keeps the Chinese characters from depending on the platform default.
with open(input_file, 'r', encoding='utf-8') as f:
    in_lines = f.readlines()

In [3]:
idx = 0

In [4]:
line = in_lines[idx]

In [5]:
line

'陈 B-PER\n'

In [7]:
len(line)

8

`if len(line) > 2:`  排除空行的情况

In [8]:
pairs = line.strip().split()

In [9]:
pairs

['陈', 'B-PER']

In [10]:
word = pairs[0]

In [11]:
word

'陈'

In [13]:
label = pairs[-1]
label

'B-PER'

In [17]:
label_alphabet.add(label)
word_alphabet.add(word)

`if idx < len(in_lines) - 1 and len(in_lines[idx+1]) > 2:`   当前行不是 in_lines 的最后一行，并且下一行不是空行

In [18]:
biword = word + in_lines[idx+1].strip().split()[0] # bigram

In [19]:
in_lines[idx+1]

'元 E-PER\n'

`else:`

In [21]:
biword = word + NULLKEY

In [22]:
biword_alphabet.add(biword)

In [24]:
for char in word:
    char_alphabet.add(char)

In [26]:
word_alphabet_size = word_alphabet.size()
biword_alphabet_size = biword_alphabet.size()
char_alphabet_size = char_alphabet.size()
label_alphabet_size = label_alphabet.size()

In [27]:
startS = False
startB = False

选择标注方式（tagging scheme：BMES 或 BIO）

In [32]:
# Scan every label once to see which prefixes occur: an "S-" label sets
# startS, otherwise a "B-" label sets startB.
for label, _ in label_alphabet.iteritems():
    upper_label = label.upper()
    if "S-" in upper_label:
        startS = True
        continue
    if "B-" in upper_label:
        startB = True

# tagScheme is only assigned when at least one "B-" label was seen.
if startB:
    tagScheme = "BMES" if startS else "BIO"

#### build_gaz_file

`if gaz_file:`（此处 `gaz_file = 'data/ctb.50d.vec'`）

In [41]:
gaz_lower = False

In [42]:
gaz = Gazetteer(gaz_lower)

In [36]:
gaz_file = 'data/ctb.50d.vec'

In [37]:
# Read all lines of the gazetteer/embedding file. The context manager closes
# the handle instead of leaving it to the garbage collector.
# NOTE(review): assumes the file is UTF-8 like the corpus files; confirm.
with open(gaz_file, 'r', encoding='utf-8') as f:
    fins = f.readlines()

In [43]:
fin = fins[0]

In [44]:
fin

'</s> 0.008005 0.008839 -0.007661 -0.006556 0.002733 0.006042 0.001882 0.000423 -0.007207 0.004437 -0.008713 0.002499 -0.001503 -0.001914 -0.006631 -0.003764 0.005159 0.006051 0.005938 0.003195 0.003090 -0.007605 -0.008192 0.009939 0.007603 0.006180 -0.001208 0.008031 -0.000990 0.001469 -0.000298 -0.005966 0.002625 -0.002675 -0.007651 0.009508 0.008759 -0.002190 -0.000452 0.001018 -0.007275 -0.008014 0.009109 0.000126 -0.005165 -0.006084 -0.006153 0.003394 0.000403 0.002662 \n'

In [45]:
fin = fin.strip().split()[0]

if fin:

In [47]:
gaz.insert(fin, "one_source")  # vscode

#### build_gaz_alphabet

In [48]:
input_file = './data/demo.dev.char'

In [49]:
# Read all lines of the dev file. The context manager closes the handle, and
# the explicit encoding keeps the Chinese characters from depending on the
# platform default.
with open(input_file, 'r', encoding='utf-8') as f:
    in_lines = f.readlines()

In [50]:
word_list = []

for line in in_lines:

In [51]:
line = in_lines[0]

In [52]:
line

'上 B-GPE\n'

if len(line) > 3:

In [53]:
word = line.split()[0]

In [54]:
word

'上'

In [55]:
word_list.append(word)

In [56]:
word_list

['上']

else:

In [57]:
w_length = len(word_list)  # reaching this branch means a sentence has just ended

for idx in range(w_length):

In [58]:
idx = 0

In [62]:
word_list[idx:]

['上']

In [60]:
matched_entity = gaz.enumerateMatchList(word_list[idx:])   # check whether any entity matches

In [61]:
matched_entity

[]

#### fix_alphabet

In [65]:
word_alphabet.close()
biword_alphabet.close()
char_alphabet.close()
label_alphabet.close() 
gaz_alphabet.close()  

### generate_instance_with_gaz

In [66]:
input_file = './data/demo.train.char'

In [67]:
# Read all lines of the training file. The context manager closes the handle,
# and the explicit encoding keeps the Chinese characters from depending on
# the platform default.
with open(input_file, 'r', encoding='utf-8') as f:
    in_lines = f.readlines()

In [69]:
instence_texts = []
instence_Ids = []
words = []
biwords = []
chars = []
labels = []
word_Ids = []
biword_Ids = []
char_Ids = []
label_Ids = []

In [70]:
idx = 0

In [71]:
line = in_lines[idx]

In [72]:
line

'陈 B-PER\n'

if len(line) > 2:

In [74]:
pairs = line.strip().split()
pairs

['陈', 'B-PER']

In [75]:
word = pairs[0]
label = pairs[-1]

In [78]:
# Bigram for the current position: the current token joined with the first
# token of the next line, or with NULLKEY when there is no next line or the
# next line is at most 2 characters long.
biword = word + (
    in_lines[idx+1].strip().split()[0]
    if idx < len(in_lines) - 1 and len(in_lines[idx+1]) > 2
    else NULLKEY
)

In [79]:
biwords.append(biword)
words.append(word)
labels.append(label)

In [81]:
# word_Ids.append(word_alphabet.get_index(word)) vscode

`else:`  表示一句话结束

`if ((max_sent_length < 0) or (len(words) < max_sent_length)) and (len(words)>0):`  未限制最大长度（`max_sent_length < 0`）或句子长度小于最大长度，并且句子非空的情况下

In [82]:
## look up entities
# Fresh accumulators for the current sentence, plus the sentence length
# taken from the collected words.
gazs = []
gaz_Ids = []
w_length = len(words)

In [83]:
idx = 0

In [84]:
# Candidates the gazetteer returns for the suffix of the sentence starting at
# idx, together with the length of each candidate.
matched_list = gaz.enumerateMatchList(words[idx:])
matched_length = list(map(len, matched_list))

In [85]:
# vscode 

### build_word_pretrain_emb

#### build_pretrain_embedding

In [89]:
from utils.functions import load_pretrain_emb

In [92]:
import numpy as np

In [86]:
embedding_path = 'data/gigaword_chn.all.a2b.uni.ite50.vec'

In [87]:
embedd_dict = dict()

In [90]:
embedd_dict, embedd_dim = load_pretrain_emb(embedding_path)

In [93]:
scale = np.sqrt(3.0 / embedd_dim)

In [94]:
scale

0.2449489742783178

In [95]:
pretrain_emb = np.empty([word_alphabet.size(), embedd_dim])

In [96]:
pretrain_emb.shape

(2, 50)

In [97]:
perfect_match = 0
case_match = 0
not_match = 0

In [98]:
# vscode

## 2 Model

#### 定义模型 SeqModel

####  BiLSTM 
  * LatticeLSTM
      * MultiInputLSTMCell
      * WordLSTMCell
    

#### CRF
   * self.transitions 

In [101]:
## vscode 

## 3 Train

##### batchify_with_label 将处理好的数据转化为输入

In [None]:
## vscode 

## 4 Verify & Test

## Code https://github.com/hankniu01/Lattice_lstm 