In [1]:
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu0")

Using gpu device 0: GeForce GTX TITAN X


In [2]:
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

### Scan

Theano ではループのために For 文ではなく、Scan というものを使います　　
少しややこしいので、簡単な例を

In [3]:
##Suppose you have a sequence [1, 2, 3, 4, 5] let's define identity function with scan
x = T.fvector("x")

def step(x):
    return x

h, _ = theano.scan(
                       fn=step,
                       sequences=x, 
                       outputs_info=None # 初期値
                    )

f = theano.function([x], h)

print( f(numpy.array([1, 2, 3, 4, 5]).astype("float32")) )

[ 1.  2.  3.  4.  5.]


In [4]:
##Next we define accumulation function
x = T.fvector("x")

def step(x, h_tm1):
    return x + h_tm1

h, _ = theano.scan(
                       fn=step,
                       sequences=x, 
                       outputs_info=0.0, #Initial value for h
                       #go_backwards=True #you might use it for bi-directional RNNs
                    )

f = theano.function([x], h)

print( f(numpy.array([1, 2, 3, 4, 5]).astype("float32")) )

[  1.   3.   6.  10.  15.]


In [5]:
## Let's do the same thing with matrix, accumulation over column
x = T.fmatrix("x")

def step(x, h_tm1):
    return x + h_tm1

h, _ = theano.scan(
                       fn=step,
                       sequences=x, 
                       outputs_info=numpy.array([0., 0., 0., 0., 0.]) #Initial value for h, it's better to use T.alloc().
                    )

f = theano.function([x], h)

print( f(numpy.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).astype("float32")) )

[[  1.   2.   3.   4.   5.]
 [  2.   4.   6.   8.  10.]
 [  3.   6.   9.  12.  15.]]


In [6]:
## Advanced :: take previous inputs
x = T.fmatrix("x")

def step(x, h_tm1, h_tm2):
    return x + h_tm1 + h_tm2

h, _ = theano.scan(
                       fn=step,
                       sequences=[ dict(input= x, taps = [0, -1, -2])],
                       outputs_info=None #Initial value for h
                    )

f = theano.function([x], h)

print(
    f(numpy.array([[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]]).astype("float32"))
)

[[  3.   6.   9.  12.  15.]
 [  3.   6.   9.  12.  15.]
 [  3.   6.   9.  12.  15.]]


### [宿題] POS Tagging

文が与えられた時、その品詞を予測する RNN を学習します。

word2index は単語をIDに変換する辞書、tag2index は品詞をIDに変換する辞書です。  
train_data, dev_data には文と品詞タグのペアが入っています。  
文の長さと品詞タグの長さは必ず同じです。

encode_dataset を使うと単語と品詞をIDに変換することができます。

In [7]:
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

def load_data(file_path):
    dataset = []
    vocab, tag = set(), set()
    for line in open(file_path):
        # "a pen ||| " => [['a', 'pen'], ['DT', 'NN']]
        instance = [ l.strip().split() for l in line.split('|||') ]
        vocab.update(instance[0])
        tag.update(instance[1])
        dataset.append(instance)
    return dataset, vocab, tag

def encode_dataset(dataset, word2index, tag2index):
    X, y = [], []
    vocab = set(word2index.keys())
    for sentence, tags in dataset:
        X.append([ word2index[word] if word in vocab else word2index['<unk>'] for word in sentence])
        y.append([ tag2index[tag] for tag in tags])
    return X, y

train_data, train_vocab, train_tags = load_data('train.unk')
special_words = set(['<unk>']) # 未知の単語に使う

# {'a': 1, 'pen': 1}
word2index = dict(map(lambda x: (x[1], x[0]), enumerate(train_vocab | special_words)))
tag2index  = dict(map(lambda x: (x[1], x[0]), enumerate(train_tags)))

In [8]:
train_size = len(train_data)
train_data, dev_data = train_data[:train_size//10 * 8], train_data[train_size//10 * 8:]

In [9]:
# train_data[0] = train.unk の1行目を表示
for word, tag in zip(train_data[0][0], train_data[0][1]):
    print(word, tag)

In IN
an DT
Oct. NNP
19 CD
review NN
of IN
`` ``
The DT
Misanthrope NN
'' ''
at IN
Chicago NNP
's POS
Goodman NNP
Theatre NNP
`` ``
Revitalized VBN
Classics NNS
Take VBP
the DT
Stage NN
in IN
Windy NNP
City NNP
, ,
'' ''
Leisure NN
& CC
Arts NNS
, ,
the DT
role NN
of IN
Celimene NNP
, ,
played VBN
by IN
Kim NNP
Cattrall NNP
, ,
was VBD
mistakenly RB
attributed VBN
to TO
Christina NNP
Haag NNP
. .


次のセルを完成させて提出してください　　

今回の入力は単語のID列（ベクトル x）と品詞のID列 (ベクトル y)です。  
Projection レイヤーを使って、単語をベクトルに変換します。  
その後、RNN に入力し、その出力値をSotfmax関数を使って確率分布に変換します。  
予測は画像の時とおなじく、最大の確率を持つクラスを予測とします。

In [30]:
train_size = len(train_data)
train_data, dev_data = train_data[:train_size//10 * 8], train_data[train_size//10 * 8:]

train_X, train_y = encode_dataset(train_data, word2index, tag2index)
dev_X  , dev_y   = encode_dataset(dev_data,   word2index, tag2index)

rng = numpy.random.RandomState(42)
trng = RandomStreams(42)

def sharedX(X, dtype="float32"):
    return theano.shared(numpy.asarray(X, dtype=dtype))


class Activation:
    def __init__(self, func):
        self.func = func
        self.params = []

    def fprop(self, x):
        return self.func(x)


class Projection:
    def __init__(self, in_dim, out_dim, scale):
        self.W = sharedX(rng.randn(in_dim, out_dim) * scale)
        self.params = [ self.W ]

    def fprop(self, x):
        h = self.W[x]
        return h
    
    
class Linear:
    def __init__(self, in_dim, out_dim, scale):
        self.W = sharedX(rng.randn(in_dim, out_dim) * scale)
        self.b = sharedX(rng.randn(out_dim,) * scale)
        self.params = [ self.W, self.b ]

    def fprop(self, x):
        h = T.dot(x, self.W)+self.b
        return h

    
class RNN:
    def __init__(self, in_dim, out_dim, scale):
        self.scale = scale
        self.hid_dim = hid_dim

        ## 重みの次元を決める。
        self.Wx = sharedX(rng.randn(in_dim, out_dim) * scale)
        self.Wh = sharedX(rng.randn(out_dim, out_dim) * scale)
        self.bh = sharedX(rng.randn(out_dim,) * scale)
        ## Initial State をどのように初期化するか
        self.h0 = sharedX(numpy.zeros(out_dim,))

        self.output_info = [ self.h0 ]
        self.params = [ self.Wx, self.Wh, self.bh, self.h0 ]

    def fprop(self, x):
        def step(u_t, h_tm1):
            h = T.tanh(T.dot(u_t, self.Wx)  + T.dot(h_tm1, self.Wh) + self.bh)
            return h
        
        ## Scan の方法を考える 
        h, _ = theano.scan(
            fn=step,
            sequences=x,
            outputs_info=self.h0
        )
        return h
    

def sgd(cost, params, lr):
    gparams = T.grad(cost, params)
    updates = OrderedDict()
    for param, gparam in zip(params, gparams):
        ## Advanced Gradient Glip を実装する　（必須ではない）
        updates[param] = param - lr * gparam
    return updates


def prop(layers, x):
    for i, layer in enumerate(layers):
        if i == 0:
            layer_out = layer.fprop(x)
        else:
            layer_out = layer.fprop(layer_out)
    return layer_out


def get_params(layers):
    params = []
    for layer in layers:
        params += layer.params
    return params


### build Model + Train
vocab_size = len(word2index)
hid_dim    = 100
out_dim    = len(tag2index)

x, t = T.lvector("x"), T.lvector("t")

layers = [
    Projection(vocab_size, hid_dim, scale=0.09), # scale?
    RNN(hid_dim, out_dim, scale=0.5), # scale?
    Linear(out_dim, out_dim, scale=0.5), # scale?
    Activation(func=T.nnet.softmax)
]

prob = prop(layers, x)
cost = -T.mean(T.log(prob)[T.arange(t.shape[0]), t]) # WRITEME
pred = T.argmax(prob, axis=1) # WRITEME

## Collect Parameters
params = get_params(layers) 

## Define update graph
updates = sgd(cost, params, lr=numpy.float32(0.01)) 

## Compile Function
train = theano.function([x,t], cost, updates=updates)
valid = theano.function([x,t], [cost, pred])
test  = theano.function([x], pred)

epochs = 100
## Train
for epoch in range(epochs):
    train_X, train_y = shuffle(train_X, train_y)  # Shuffle Samples !!
    for i, (instance_x, instance_y) in enumerate(zip(train_X, train_y)):
        cost = train(instance_x, instance_y)
        #if i % 1000 == 0:
            #print("EPOCH:: %i, Iteration %i, cost: %.3f"%(epoch+1, i, cost))
    
    dev_true, dev_pred = [], []
    costs = []
    for i, (instance_x, instance_y) in enumerate(zip(dev_X, dev_y)):
        cost, pred = valid(instance_x, instance_y)
        dev_pred += list(pred) # 予測結果はベクトル
        dev_true += instance_y
        costs.append(cost)

    if True:
    #if ((epoch+1) % 10 == 0) or (epoch == 0):
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(
                        epoch+1,
                        numpy.mean(costs),
                        f1_score(dev_true, dev_pred, average="micro")
                    ))

EPOCH::   1, Validatioon Cost:: 2.674, Validation F1:: 0.323
EPOCH::   2, Validatioon Cost:: 2.285, Validation F1:: 0.408
EPOCH::   3, Validatioon Cost:: 2.043, Validation F1:: 0.464
EPOCH::   4, Validatioon Cost:: 1.874, Validation F1:: 0.496
EPOCH::   5, Validatioon Cost:: 1.759, Validation F1:: 0.529
EPOCH::   6, Validatioon Cost:: 1.668, Validation F1:: 0.543
EPOCH::   7, Validatioon Cost:: 1.594, Validation F1:: 0.567
EPOCH::   8, Validatioon Cost:: 1.523, Validation F1:: 0.576
EPOCH::   9, Validatioon Cost:: 1.481, Validation F1:: 0.589
EPOCH::  10, Validatioon Cost:: 1.432, Validation F1:: 0.611
EPOCH::  11, Validatioon Cost:: 1.387, Validation F1:: 0.615
EPOCH::  12, Validatioon Cost:: 1.352, Validation F1:: 0.627
EPOCH::  13, Validatioon Cost:: 1.321, Validation F1:: 0.650
EPOCH::  14, Validatioon Cost:: 1.291, Validation F1:: 0.653
EPOCH::  15, Validatioon Cost:: 1.245, Validation F1:: 0.664
EPOCH::  16, Validatioon Cost:: 1.218, Validation F1:: 0.671
EPOCH::  17, Validatioon