In [37]:
import getopt
import sys
import os
import math
import time
import argparse
import torch
import torch as T
from torch.autograd import Variable as var
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.nn.utils import clip_grad_norm_

#torch.manual_seed(1)

In [38]:
parser = argparse.ArgumentParser(description='PyTorch Differentiable Neural Computer')
parser.add_argument('-input_size', type=int, default=6, help='dimension of input feature')
parser.add_argument('-rnn_type', type=str, default='lstm', help='type of recurrent cells to use for the controller')
parser.add_argument('-nhid', type=int, default=64, help='number of hidden units of the inner nn')
parser.add_argument('-dropout', type=float, default=0, help='controller dropout')
parser.add_argument('-memory_type', type=str, default='dnc', help='dense or sparse memory: dnc | sdnc | sam')

parser.add_argument('-nlayer', type=int, default=1, help='number of layers')
parser.add_argument('-nhlayer', type=int, default=2, help='number of hidden layers')
parser.add_argument('-lr', type=float, default=1e-4, help='initial learning rate')
parser.add_argument('-optim', type=str, default='adam', help='learning rule, supports adam|rmsprop')
parser.add_argument('-clip', type=float, default=50, help='gradient clipping')

parser.add_argument('-batch_size', type=int, default=100, metavar='N', help='batch size')
parser.add_argument('-mem_size', type=int, default=20, help='memory dimension')
parser.add_argument('-mem_slot', type=int, default=16, help='number of memory slots')
parser.add_argument('-read_heads', type=int, default=4, help='number of read heads')
parser.add_argument('-sparse_reads', type=int, default=10, help='number of sparse reads per read head')
parser.add_argument('-temporal_reads', type=int, default=2, help='number of temporal reads')

parser.add_argument('-sequence_max_length', type=int, default=4, metavar='N', help='sequence_max_length')
parser.add_argument('-curriculum_increment', type=int, default=0, metavar='N', help='sequence_max_length incrementor per 1K iterations')
parser.add_argument('-curriculum_freq', type=int, default=1000, metavar='N', help='sequence_max_length incrementor per 1K iterations')
parser.add_argument('-cuda', type=int, default=-1, help='Cuda GPU ID, -1 for CPU')

parser.add_argument('-iterations', type=int, default=100000, metavar='N', help='total number of iteration')
parser.add_argument('-summarize_freq', type=int, default=100, metavar='N', help='summarize frequency')
parser.add_argument('-check_freq', type=int, default=100, metavar='N', help='check point frequency')
parser.add_argument('-visdom', action='store_true', help='plot memory content on visdom per -summarize_freq steps')
args = parser.parse_args(args=[])

インプレスの写経を元にしたcopy_lstm_first

In [39]:
#Data preparetion
def generate_copyfirst(batch_size=2,length=10,size=6,vocab_size=10):
    #return (batch, length+1, size+1)
    input_story= np.zeros((batch_size, length + 1, size+1), dtype=np.float32)
    #input_query=
    target_out = np.zeros((batch_size, length + 1, size+1), dtype=np.float32)

    sequence = np.random.rand(batch_size, length, size)

    input_story[:,:length,:size]=sequence
    input_story[:, length, -1] = 1  # QUERY
    target_out[:, -1, :size] = sequence[:,0,:]

    input_story=torch.from_numpy(input_story).cuda()
    target_out=torch.from_numpy(target_out).cuda()
    return var(input_story),var(target_out)

i,t =generate_copyfirst()
print("input:  ",i)
print("target:  ",t)


input:   tensor([[[0.0938, 0.6383, 0.2726, 0.2224, 0.3171, 0.5034, 0.0000],
         [0.4009, 0.3630, 0.9441, 0.3897, 0.7335, 0.9221, 0.0000],
         [0.4529, 0.9606, 0.9264, 0.3789, 0.6778, 0.7037, 0.0000],
         [0.8878, 0.6483, 0.9495, 0.0736, 0.0755, 0.4890, 0.0000],
         [0.9427, 0.0769, 0.7107, 0.1870, 0.1975, 0.1911, 0.0000],
         [0.3998, 0.1185, 0.5214, 0.4300, 0.7520, 0.2456, 0.0000],
         [0.1138, 0.7064, 0.0278, 0.0561, 0.1392, 0.3029, 0.0000],
         [0.2495, 0.8236, 0.6098, 0.4970, 0.7401, 0.6144, 0.0000],
         [0.0874, 0.6629, 0.9142, 0.2918, 0.9202, 0.1447, 0.0000],
         [0.9415, 0.6337, 0.1613, 0.7211, 0.5016, 0.6611, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000]],

        [[0.8632, 0.3089, 0.9365, 0.0693, 0.3590, 0.7515, 0.0000],
         [0.3358, 0.0983, 0.3881, 0.1498, 0.9716, 0.4647, 0.0000],
         [0.7356, 0.1386, 0.2871, 0.9466, 0.5107, 0.1444, 0.0000],
         [0.6548, 0.7686, 0.6997, 0.6490, 0.4573, 0

In [86]:
def discrete_copyfirst(batch_size=2,length=10,size=6,vocab_size=10):
    #return (batch, length+1, size+1)
    input_story= np.zeros((batch_size, length + 1, size+1), dtype=np.float32)
    #input_query=
    target_out = np.zeros((batch_size, length + 1, size+1), dtype=np.float32)

    sequence = np.random.binomial(1,0.5,(batch_size, length, size))

    input_story[:,:length,:size]=sequence
    input_story[:, length, -1] = 1  # QUERY
    target_out[:, -1, :size] = sequence[:,0,:]

    input_story=torch.from_numpy(input_story).cuda()
    target_out=torch.from_numpy(target_out).cuda()
    return var(input_story),var(target_out)

i,t =discrete_copyfirst()
print("input:  ",i)
print("target:  ",t)


input:   tensor([[[0., 1., 1., 0., 1., 0., 0.],
         [1., 1., 1., 1., 0., 1., 0.],
         [0., 0., 1., 0., 0., 1., 0.],
         [1., 1., 0., 1., 1., 1., 0.],
         [1., 1., 1., 0., 0., 1., 0.],
         [1., 0., 0., 0., 0., 1., 0.],
         [1., 1., 0., 0., 1., 1., 0.],
         [1., 0., 0., 0., 1., 1., 0.],
         [1., 0., 1., 0., 1., 1., 0.],
         [0., 0., 0., 1., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1.]],

        [[1., 0., 1., 0., 1., 1., 0.],
         [1., 0., 0., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0., 1., 0.],
         [1., 0., 0., 1., 1., 0., 0.],
         [0., 1., 0., 0., 1., 1., 0.],
         [1., 1., 1., 1., 0., 1., 0.],
         [0., 0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 1., 1., 0.],
         [1., 1., 0., 1., 0., 0., 0.],
         [1., 0., 1., 1., 0., 1., 0.],
         [0., 0., 0., 0., 0., 0., 1.]]], device='cuda:0')
target:   tensor([[[0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0.

In [145]:
class MyLSTMCopyFirst(nn.Module):
    # モデルで使う各ネットワークをコンストラクタで定義
    def __init__(self, input_dim, hidden_dim, tagset_size):
        # 親クラスのコンストラクタ。決まり文句
        super(MyLSTMCopyFirst, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。
        self.hidden_dim = hidden_dim
        # LSTMの隠れ層。これ１つでOK。超便利。
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        # LSTMの出力を受け取って全結合してsoftmaxに食わせるための１層のネットワーク
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.flag=True
    # 順伝播処理はforward関数に記載
    def forward(self, sentence,hidden=None):
        # 2次元テンソルをLSTMに食わせられる様にviewで３次元テンソルにした上でLSTMへ流す。
        # 上記で説明した様にmany to oneのタスクを解きたいので、第二戻り値＝hiddenだけ使う。　babiもmany to oneのはず

        if (hidden==None):
            _, lstm_out = self.lstm(sentence.view(1,len(sentence),  -1)) #(1,batch, size)
        else:
            _,lstm_out = self.lstm(sentence.view(1,len(sentence),  -1),hidden)
        
        #sentence = sentence.permute(1,0,2)
        # lstm_out[0]は３次元テンソルになってしまっているので2次元に調整して全結合。
        #if self.flag:
        #    print("lstm_out[0] reshape")
        #    print(lstm_out[0].view(-1, self.hidden_dim))
        tag_space = self.hidden2tag(lstm_out[0].view(-1, self.hidden_dim))
        if self.flag:
            print("linear out")
            print(tag_space)
            self.flag=False
        #tag_space=tag_space.view(len(sentence),1,-1)
        return tag_space,lstm_out

In [89]:
def my_criterion(predictions, targets):
  return T.mean(
      -1 * F.logsigmoid(predictions) * (targets) - T.log(1 - F.sigmoid(predictions) + 1e-9) * (1 - targets)
  )

In [125]:
# model generate, optimizer and criterion setting
batch_size=2
length=10
size=6
vocab_size=10

model= MyLSTMCopyFirst(input_dim=size+1, hidden_dim=16, tagset_size=size+1).cuda()
optimizer = optim.SGD(model.parameters(),lr=0.03)
criterion = nn.MSELoss()


#Learn
n = 2    ##  データのサイズ
bs = 2   ##  バッチのサイズ
iterations=300
summarize_freq=20
last_save_losses = []

model.train()
for epoch in range(iterations + 1):
    #print("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations))
    input_story, target_out = generate_copyfirst()


    for time in range(length+1): #DataLoader　ではないと思う。layer forwardにあたる
        sentence =input_story[:,time,:]
        target=target_out[:,time,:]
        output = model(sentence)
        
        optimizer.zero_grad()
        #最後のステップだけloss計算する
        if (time==length):
            loss = criterion(output,target)

            loss.backward()
            optimizer.step()
            loss_value = loss.item()
            last_save_losses.append(loss_value)

    summarize = (epoch % summarize_freq == 0)
    if summarize:
        loss = np.mean(last_save_losses)
        print("epoch:",epoch," loss:",loss)
        last_save_losses = []


tensor([[[-0.0715, -0.0785,  0.0323, -0.0032,  0.0967, -0.1989,  0.0276,
          -0.0375,  0.0949,  0.0933,  0.0562, -0.0011,  0.0219,  0.0183,
           0.0941, -0.0504],
         [-0.0273, -0.0067,  0.0576, -0.0294,  0.0972, -0.2272,  0.0247,
          -0.0352,  0.1432,  0.1039,  0.0438, -0.0338, -0.0094,  0.0822,
           0.0149, -0.0296]]], device='cuda:0', grad_fn=<CudnnRnnBackward>)
tensor([[[-0.1850,  0.2205, -0.0138,  0.0902,  0.1512, -0.0903, -0.1336],
         [-0.2128,  0.2361, -0.0318,  0.0566,  0.1380, -0.0578, -0.0900]]],
       device='cuda:0', grad_fn=<AddBackward0>)
epoch: 0  loss: 0.2993655204772949
epoch: 20  loss: 0.23127497173845768


  return F.mse_loss(input, target, reduction=self.reduction)


epoch: 40  loss: 0.1777334939688444
epoch: 60  loss: 0.12730709407478571
epoch: 80  loss: 0.11438267435878516
epoch: 100  loss: 0.09148032125085592
epoch: 120  loss: 0.08888919055461883
epoch: 140  loss: 0.08276105672121048
epoch: 160  loss: 0.0834938095882535
epoch: 180  loss: 0.07841408289968968
epoch: 200  loss: 0.0717847978696227
epoch: 220  loss: 0.06774993389844894
epoch: 240  loss: 0.07710188757628203
epoch: 260  loss: 0.07159148193895817
epoch: 280  loss: 0.08199394382536411
epoch: 300  loss: 0.0832165228202939


In [132]:
#Test
test_story, test_y = generate_copyfirst()

model.eval()
with torch.no_grad():
    for time in range(length+1): #DataLoader　ではないと思う。layer forwardにあたる
        tsentence =test_story[:,time,:]
        ttarget=test_y[:,time,:]
        toutput = model(tsentence)

        #最後のステップだけloss計算する
        if (time==length):
            tloss = criterion(toutput,ttarget)
            tloss_value = tloss.item()
            print("tloss:",tloss_value)

#連続地のコピーにしたけど本来↓
    #output1= model(xtest)
    #ans = torch.argmax(output1,1)
    #print(((ytest == ans).sum().float() / len(ans) ).item() )

tloss: 1.7892667055130005


    離散値

In [149]:
# model generate, optimizer and criterion setting
batch_size=2
length=10
size=6
vocab_size=10

model= MyLSTMCopyFirst(input_dim=size+1, hidden_dim=16, tagset_size=size+1).cuda()
optimizer = optim.SGD(model.parameters(),lr=0.1)

#Learn
n = 2    ##  データのサイズ
bs = 2   ##  バッチのサイズ
iterations=1500
summarize_freq=iterations/15
last_save_losses = []

model.train()
for epoch in range(iterations + 1):
    #print("\rIteration {ep}/{tot}".format(ep=epoch, tot=iterations))
    input_story, target_out = discrete_copyfirst()

    hidden=None
    for time in range(length+1): #DataLoader　ではないと思う。layer forwardにあたる
        sentence =input_story[:,time,:]
        target=target_out[:,time,:]

        if time==0:
            output,hidden = model(sentence)
        else:
            output,hidden = model(sentence,hidden)

        optimizer.zero_grad()
        #最後のステップだけloss計算する
        if (time==length):
            loss = criterion((output),target)
            loss.backward()
            optimizer.step()
            loss_value = loss.item()
            last_save_losses.append(loss_value)
    hidden=None
    summarize = (epoch % summarize_freq == 0)
    if summarize:
        loss = np.mean(last_save_losses)
        print("epoch:",epoch," loss:",loss)
        last_save_losses = []


linear out
tensor([[ 0.1772,  0.0809,  0.1295,  0.0918,  0.0759, -0.2118, -0.0059],
        [ 0.2065,  0.1008,  0.1121,  0.0846,  0.0574, -0.2166,  0.0189]],
       device='cuda:0', grad_fn=<AddmmBackward>)
epoch: 0  loss: 0.2822442650794983
epoch: 100  loss: 0.2409405355155468
epoch: 200  loss: 0.2161868679523468
epoch: 300  loss: 0.2165006157755852
epoch: 400  loss: 0.21738955289125442
epoch: 500  loss: 0.21823563307523727
epoch: 600  loss: 0.2162562595307827
epoch: 700  loss: 0.21665824010968207
epoch: 800  loss: 0.216469007730484
epoch: 900  loss: 0.21530019193887712
epoch: 1000  loss: 0.2175290195643902
epoch: 1100  loss: 0.21650463759899138
epoch: 1200  loss: 0.21649720922112464
epoch: 1300  loss: 0.21754909217357635
epoch: 1400  loss: 0.21618148297071457
epoch: 1500  loss: 0.21707578703761102


In [150]:
#Test
test_story, test_y = discrete_copyfirst()

model.eval()
with torch.no_grad():
    hidden=None
    for time in range(length+1): #DataLoader　ではないと思う。layer forwardにあたる
        tsentence =test_story[:,time,:]
        ttarget=test_y[:,time,:]
        if time==0:
            toutput,hidden = model(tsentence)
        else:
            toutput,hidden = model(tsentence,hidden)

        #最後のステップだけloss計算する
        if (time==length):
            print(ttarget)
            print(toutput)
            tloss = my_criterion(toutput,ttarget)
            tloss_value = tloss.item()
            print("tloss:",tloss_value)

            toutput = toutput.cpu().numpy()
            ttarget = ttarget.cpu().numpy()
            print("predict:",toutput)
            print("real:",ttarget)
#連続地のコピーにしたけど本来↓
    #output1= model(xtest)
    #ans = torch.argmax(output1,1)
    #print(((ytest == ans).sum().float() / len(ans) ).item() )

tensor([[1., 1., 0., 1., 1., 1., 0.],
        [1., 0., 0., 0., 1., 0., 0.]], device='cuda:0')
tensor([[0.4641, 0.5617, 0.4601, 0.5492, 0.4401, 0.5182, 0.0095],
        [0.4501, 0.5652, 0.4723, 0.5291, 0.4427, 0.5337, 0.0047]],
       device='cuda:0')
tloss: 0.6893427968025208
predict: [[0.46409285 0.561737   0.46010253 0.5492228  0.44012043 0.51824784
  0.00952974]
 [0.45011994 0.5651748  0.47226402 0.529108   0.44270962 0.53370535
  0.00471528]]
real: [[1. 1. 0. 1. 1. 1. 0.]
 [1. 0. 0. 0. 1. 0. 0.]]


hiddenとoutputのサイズが共通な以上、全結合層を挟まないとhidden_sizeが分類クラス数あるいはvocab_sizeに固定されてしまうよ　　
扱うのがコピーから文章になると、embedding層やvocab_size引数が必要になるよ

In [8]:
class MyLSTMClassifier(nn.Module):
    # モデルで使う各ネットワークをコンストラクタで定義
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        # 親クラスのコンストラクタ。決まり文句
        super(MyLSTMClassifier, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。
        self.hidden_dim = hidden_dim
        # LSTMの隠れ層。これ１つでOK。超便利。
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # LSTMの出力を受け取って全結合してsoftmaxに食わせるための１層のネットワーク
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # softmaxのLog版。dim=0で列、dim=1で行方向を確率変換。
        self.softmax = nn.Softmax(dim=1)

    # 順伝播処理はforward関数に記載
    def forward(self, sentence):
        # 2次元テンソルをLSTMに食わせられる様にviewで３次元テンソルにした上でLSTMへ流す。
        # 上記で説明した様にmany to oneのタスクを解きたいので、第二戻り値＝hiddenだけ使う。　babiもmany to oneのはず
        _, lstm_out = self.lstm(sentence.view(len(sentence), 1, -1))
        # lstm_out[0]は３次元テンソルになってしまっているので2次元に調整して全結合。
        tag_space = self.hidden2tag(lstm_out[0].view(-1, self.hidden_dim))
        # softmaxに食わせて、確率として表現
        tag_scores = self.softmax(tag_space)
        return tag_scores
        #hiddenは明示的にforwardの入力にしなくても大丈夫

In [9]:
# model generate, optimizer and criterion setting

model= MyLSTMClassifier().cuda()
optimizer = optim.SGD(model.parameters(),lr=0.1)
criterion = nn.CrossEntropyLoss()

TypeError: __init__() missing 4 required positional arguments: 'embedding_dim', 'hidden_dim', 'vocab_size', and 'tagset_size'

In [None]:
#Learn

n =     ##  データのサイズ
bs =    ##  バッチのサイズ
itr=5

model.train()
for i in range(itr):
    idx = np.random.permutation(n)
    for j in range(0,n,bs):
        xtm = xtrain[idx[j:(j+bs) if (j+bs)<n else n]]
        ytm = ytrain[idx[j:(j+bs) if (j+bs) < n else n]]
        output = model(xtm)
        loss = criterion(output,ytm)
        print(i,j,loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
#Test

model.eval()
with torch.no_grad():
    output1= model(xtest)
    ans = torch.argmax(output1,1)
    print(((ytest == ans).sum().float() / len(ans) ).item() )

↓は文章をあつかうやつ　コピーの次に

In [None]:
# Define model

class MyLSTMClassifier(nn.Module):
    def __init__(self):
        super(MyLSTM,self).__init__()
        self.l1=nn.Linear(4,6)
        self.l2=nn.Linear(6,3)
    def forward(self,x):
        h1=torch.sigmoid(self.l1(x))
        h2=self.l2(h1)
        return h2

class MyLSTMClassifier(nn.Module):
    # モデルで使う各ネットワークをコンストラクタで定義
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        # 親クラスのコンストラクタ。決まり文句
        super(MyLSTMClassifier, self).__init__()
        # 隠れ層の次元数。これは好きな値に設定しても行列計算の過程で出力には出てこないので。
        self.hidden_dim = hidden_dim
        # インプットの単語をベクトル化するために使う
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # LSTMの隠れ層。これ１つでOK。超便利。
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # LSTMの出力を受け取って全結合してsoftmaxに食わせるための１層のネットワーク
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        # softmaxのLog版。dim=0で列、dim=1で行方向を確率変換。
        self.softmax = nn.LogSoftmax(dim=1)

    # 順伝播処理はforward関数に記載
    def forward(self, sentence):
        # 文章内の各単語をベクトル化して出力。2次元のテンソル
        embeds = self.word_embeddings(sentence)
        # 2次元テンソルをLSTMに食わせられる様にviewで３次元テンソルにした上でLSTMへ流す。
        # 上記で説明した様にmany to oneのタスクを解きたいので、第二戻り値だけ使う。
        _, lstm_out = self.lstm(embeds.view(len(sentence), 1, -1))
        # lstm_out[0]は３次元テンソルになってしまっているので2次元に調整して全結合。
        tag_space = self.hidden2tag(lstm_out[0].view(-1, self.hidden_dim))
        # softmaxに食わせて、確率として表現
        tag_scores = self.softmax(tag_space)
        return tag_scores