# POS tagging

## Part-of-speech tagging
## 词性标注

### 例如，名词，动词，形容词等

## 实例：
输入：[The cat sat on the mat]
输出：[DT NN VB IN DT NN]

In [1]:
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import nltk
import numpy as np
import os

Using TensorFlow backend.


In [2]:
DATA_DIR = "./data"

# open(..., mode="w") does not create missing directories, so make sure the
# output directory exists before writing into it.
if not os.path.isdir(DATA_DIR):
    os.makedirs(DATA_DIR)

sents = nltk.corpus.treebank.tagged_sents()

# Write the corpus as two parallel text files, one sentence per line:
#   treebank_sents.txt -> space-separated words
#   treebank_poss.txt  -> space-separated POS tags, aligned with the words
# The context manager closes both files even if the loop raises.
with open(os.path.join(DATA_DIR, "treebank_sents.txt"), mode="w") as fedata, \
        open(os.path.join(DATA_DIR, "treebank_poss.txt"), mode="w") as ffdata:
    for sent in sents:
        words, poss = [], []
        for word, pos in sent:
            # Drop tokens tagged "-NONE-" from both files so that words and
            # tags stay aligned position by position.
            if pos == "-NONE-":
                continue
            words.append(word)
            poss.append(pos)
        fedata.write("{:s}\n".format(" ".join(words)))
        ffdata.write("{:s}\n".format(" ".join(poss)))

In [3]:
def parse_sentences(filename):
    """Collect vocabulary statistics from a whitespace-tokenised text file.

    Each line of the file is treated as one record. Tokens are lower-cased
    before counting.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(word_freqs, maxlen, num_recs)`` where ``word_freqs`` is a
        ``collections.Counter`` mapping lower-cased token -> occurrence count,
        ``maxlen`` is the largest number of tokens found on a single line and
        ``num_recs`` is the number of lines read (blank lines included).
    """
    word_freqs = collections.Counter()
    num_recs, maxlen = 0, 0
    # The context manager guarantees the handle is closed even when reading
    # or decoding a line raises.
    with open(filename, mode="r") as fin:
        for line in fin:
            words = line.strip().lower().split()
            word_freqs.update(words)
            maxlen = max(maxlen, len(words))
            num_recs += 1
    return word_freqs, maxlen, num_recs

In [4]:
# Gather vocabulary statistics for the sentence file (source) and for the
# POS-tag file (target), then report them.
s_wordfreqs, s_maxlen, s_numrecs = parse_sentences(
    os.path.join(DATA_DIR, "treebank_sents.txt")
)
t_wordfreqs, t_maxlen, t_numrecs = parse_sentences(
    os.path.join(DATA_DIR, "treebank_poss.txt")
)

source_stats = (len(s_wordfreqs), s_maxlen, s_numrecs)
target_stats = (len(t_wordfreqs), t_maxlen, t_numrecs)
print("source word freqs length: %d; max length: %d; num recs: %d" % source_stats)
print("target word freqs length: %d; max length: %d; num recs: %d" % target_stats)

source word freqs length: 10947; max length: 249; num recs: 3914
target word freqs length: 45; max length: 249; num recs: 3914


## 语料库中，有10947个不同单词，最长的句子有249个词，共有3914个句子

## 语料库中，有45种不同的词性标签

In [5]:
# Fixed sequence length: passed as `maxlen` when the tensors are built and
# used as the model's input length / repeat count.
MAX_SEQLEN = 250
# Upper bound on how many of the most frequent source words get their own index.
S_MAX_FEATURES = 5000
# Upper bound on how many of the most frequent target tags get their own index.
T_MAX_FEATURES = 45

In [6]:
# Source vocabulary: keep only the S_MAX_FEATURES most frequent words and
# reserve two pseudo-entries, PAD (index 0) and UNK (index 1), so real words
# are numbered from 2 in order of decreasing frequency.
s_vocabsize = 2 + min(S_MAX_FEATURES, len(s_wordfreqs))

s_word2index = {
    word: rank
    for rank, (word, _) in enumerate(s_wordfreqs.most_common(S_MAX_FEATURES), start=2)
}
s_word2index.update([("PAD", 0), ("UNK", 1)])
s_index2word = dict((index, word) for word, index in s_word2index.items())


# Target vocabulary: every tag gets an index starting at 1; index 0 is PAD.
t_vocabsize = 1 + len(t_wordfreqs)
t_word2index = {
    tag: rank
    for rank, (tag, _) in enumerate(t_wordfreqs.most_common(T_MAX_FEATURES), start=1)
}
t_word2index.update([("PAD", 0)])
t_index2word = dict((index, tag) for tag, index in t_word2index.items())

In [7]:
# Inspect the source mapping from lower-cased word to integer index.
s_word2index

{',': 2,
 'the': 3,
 '.': 4,
 'of': 5,
 'to': 6,
 'a': 7,
 'in': 8,
 'and': 9,
 "'s": 10,
 'for': 11,
 'that': 12,
 '$': 13,
 '``': 14,
 "''": 15,
 'is': 16,
 'said': 17,
 'it': 18,
 'on': 19,
 '%': 20,
 'by': 21,
 'at': 22,
 'as': 23,
 'with': 24,
 'from': 25,
 'million': 26,
 'mr.': 27,
 'are': 28,
 'was': 29,
 'be': 30,
 'its': 31,
 'has': 32,
 'an': 33,
 'new': 34,
 'have': 35,
 "n't": 36,
 'but': 37,
 'he': 38,
 'or': 39,
 'will': 40,
 'they': 41,
 'company': 42,
 '--': 43,
 'which': 44,
 'this': 45,
 'u.s.': 46,
 'says': 47,
 'year': 48,
 'about': 49,
 'would': 50,
 'more': 51,
 'were': 52,
 'market': 53,
 'their': 54,
 'than': 55,
 'stock': 56,
 ';': 57,
 'who': 58,
 'trading': 59,
 'had': 60,
 'also': 61,
 'president': 62,
 'billion': 63,
 'up': 64,
 'one': 65,
 'been': 66,
 'some': 67,
 ':': 68,
 'other': 69,
 'not': 70,
 'program': 71,
 'his': 72,
 'because': 73,
 'if': 74,
 'could': 75,
 'share': 76,
 'corp.': 77,
 'all': 78,
 'years': 79,
 'i': 80,
 'first': 81,
 'shares': 

In [8]:
# Inspect the target mapping from lower-cased POS tag to integer index.
t_word2index

{'#': 40,
 '$': 23,
 "''": 25,
 ',': 7,
 '-lrb-': 36,
 '-rrb-': 35,
 '.': 8,
 ':': 26,
 'PAD': 0,
 '``': 24,
 'cc': 13,
 'cd': 9,
 'dt': 4,
 'ex': 37,
 'fw': 43,
 'in': 2,
 'jj': 6,
 'jjr': 28,
 'jjs': 32,
 'ls': 42,
 'md': 20,
 'nn': 1,
 'nnp': 3,
 'nnps': 29,
 'nns': 5,
 'pdt': 39,
 'pos': 21,
 'prp': 17,
 'prp$': 22,
 'rb': 11,
 'rbr': 34,
 'rbs': 38,
 'rp': 31,
 'sym': 45,
 'to': 14,
 'uh': 44,
 'vb': 12,
 'vbd': 10,
 'vbg': 18,
 'vbn': 15,
 'vbp': 19,
 'vbz': 16,
 'wdt': 27,
 'wp': 30,
 'wp$': 41,
 'wrb': 33}

In [9]:
def build_tensor(filename, numrecs, word2index, maxlen, make_categorical=False, num_classes=0):
    """Encode a one-record-per-line text file as a padded integer tensor.

    Every line is stripped, lower-cased and split on whitespace; each token is
    replaced by ``word2index[token]``, or by ``word2index["UNK"]`` when the
    token is not a key of the mapping.

    Args:
        filename: Path of the text file to encode.
        numrecs: Number of records (lines) in the file; sizes the buffer that
            holds the per-line encodings.
        word2index: Mapping from token to integer id.
        maxlen: Sequence length handed to ``sequence.pad_sequences``.
        make_categorical: When true, each line's id list is converted with
            ``np_utils.to_categorical`` before padding.
        num_classes: Number of classes passed to ``to_categorical``; only
            used when ``make_categorical`` is true.

    Returns:
        The result of ``sequence.pad_sequences`` (called with only ``maxlen``
        specified) applied to the encoded lines.
    """
    encoded = np.empty((numrecs,), dtype=list)
    with open(filename, "r") as handle:
        for row, raw_line in enumerate(handle):
            tokens = raw_line.strip().lower().split()
            ids = [
                word2index[token] if token in word2index else word2index["UNK"]
                for token in tokens
            ]
            if not make_categorical:
                encoded[row] = ids
            else:
                encoded[row] = np_utils.to_categorical(ids, num_classes=num_classes)
    return sequence.pad_sequences(encoded, maxlen=maxlen)

In [10]:
# Inputs: padded sequences of word ids.
X = build_tensor(
    os.path.join(DATA_DIR, "treebank_sents.txt"),
    numrecs=s_numrecs,
    word2index=s_word2index,
    maxlen=MAX_SEQLEN,
)
# Targets: padded sequences of categorical (one-hot) tag vectors.
y = build_tensor(
    os.path.join(DATA_DIR, "treebank_poss.txt"),
    numrecs=t_numrecs,
    word2index=t_word2index,
    maxlen=MAX_SEQLEN,
    make_categorical=True,
    num_classes=t_vocabsize,
)

In [11]:
# Hold out 20% of the sentences for testing; the fixed random_state makes the
# split reproducible across runs.
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y, test_size=0.2, random_state=666)

In [12]:
# Shape of the training inputs: (number of training sentences, MAX_SEQLEN).
Xtrain.shape

(3131, 250)

In [13]:
# Shape of the test inputs: (number of test sentences, MAX_SEQLEN).
Xtest.shape

(783, 250)

In [14]:
# First encoded training sentence: a row of word ids padded with 0 (PAD).
Xtrain[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [15]:
# Target for the first training sentence: one categorical tag vector per time step.
Ytrain[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [16]:
# Tag vector at the first time step of the first training target
# (all zeros in the output shown here, presumably a padding position).
Ytrain[0][0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

![image.png](attachment:image.png)

## 网络结构

1. 输入：每个单词的 id 组成的 list，形状为 (None, MAX_SEQLEN)
2. 经过embedding层，输出，(None, MAX_SEQLEN,EMBEDDING_SIZE)
3. 经过RNN encoder后，设置return_sequences=False, 只需要返回最后的上下文向量，看完MAX_SEQLEN后，输出(None, HIDDEN_SIZE)
4. 使用RepeatVector层，输出（None, MAX_SEQLEN, HIDDEN_SIZE）
5. 输入到RNN decoder，输出（None, MAX_SEQLEN, HIDDEN_SIZE）
6. 输入到全连接层，使用softmax激活函数，输出（None, MAX_SEQLEN, t_vocabsize）。每个时间步上概率最大的类别，即该位置单词的词性标注。

In [17]:
# Print the docstring of the GRU layer used for the encoder and decoder.
help(GRU)

Help on class GRU in module keras.layers.recurrent:

class GRU(RNN)
 |  Gated Recurrent Unit - Cho et al. 2014.
 |  
 |  # Arguments
 |      units: Positive integer, dimensionality of the output space.
 |      activation: Activation function to use
 |          (see [activations](../activations.md)).
 |          If you pass None, no activation is applied
 |          (ie. "linear" activation: `a(x) = x`).
 |      recurrent_activation: Activation function to use
 |          for the recurrent step
 |          (see [activations](../activations.md)).
 |      use_bias: Boolean, whether the layer uses a bias vector.
 |      kernel_initializer: Initializer for the `kernel` weights matrix,
 |          used for the linear transformation of the inputs.
 |          (see [initializers](../initializers.md)).
 |      recurrent_initializer: Initializer for the `recurrent_kernel`
 |          weights matrix,
 |          used for the linear transformation of the recurrent state.
 |          (see [initiali

In [22]:
# Print the docstring of the TimeDistributed wrapper applied to the output Dense layer.
help(TimeDistributed)

Help on class TimeDistributed in module keras.layers.wrappers:

class TimeDistributed(Wrapper)
 |  This wrapper applies a layer to every temporal slice of an input.
 |  
 |  The input should be at least 3D, and the dimension of index one
 |  will be considered to be the temporal dimension.
 |  
 |  Consider a batch of 32 samples,
 |  where each sample is a sequence of 10 vectors of 16 dimensions.
 |  The batch input shape of the layer is then `(32, 10, 16)`,
 |  and the `input_shape`, not including the samples dimension, is `(10, 16)`.
 |  
 |  You can then use `TimeDistributed` to apply a `Dense` layer
 |  to each of the 10 timesteps, independently:
 |  
 |  ```python
 |      # as the first layer in a model
 |      model = Sequential()
 |      model.add(TimeDistributed(Dense(8), input_shape=(10, 16)))
 |      # now model.output_shape == (None, 10, 8)
 |  ```
 |  
 |  The output will then have shape `(32, 10, 8)`.
 |  
 |  In subsequent layers, there is no need for the `input_shape`:
 

In [18]:
# Hyper-parameters for the network and the training run.
EMBED_SIZE = 128
HIDDEN_SIZE = 64
BATCH_SIZE = 32
NUM_EPOCHS = 1

# Encoder-decoder tagger: the first GRU reduces the embedded sentence to one
# vector, RepeatVector copies that vector MAX_SEQLEN times, and the second GRU
# followed by a per-time-step Dense + softmax produces a distribution over
# tags at every position.
model = Sequential([
    Embedding(input_dim=s_vocabsize, output_dim=EMBED_SIZE, input_length=MAX_SEQLEN),
    Dropout(0.2),
    GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2),
    RepeatVector(MAX_SEQLEN),
    GRU(HIDDEN_SIZE, return_sequences=True),
    TimeDistributed(Dense(units=t_vocabsize)),
    Activation("softmax"),
])

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [21]:
# Train the tagger, monitoring loss/accuracy on the held-out split after each
# epoch. validation_data is given as a tuple (x_val, y_val), which is the form
# documented for Model.fit; a list is not accepted by every Keras release.
model.fit(
    Xtrain,
    Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(Xtest, Ytest),
)

# Final loss and accuracy on the held-out split.
score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy:%.3f" % (score, acc))

Train on 3131 samples, validate on 783 samples
Epoch 1/1
Test score: 0.281, accuracy:0.013
