In [1]:
from model import EncoderCNN, WordRNN, SentenceRNN
from dataLoader import build_vocab, get_loader
import dataLoader
import torch
import torch.nn as nn
import torch.utils.data as data
from torchvision import transforms
import pickle
import datetime
import os
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
%load_ext autoreload
%autoreload 2


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
torch.cuda.get_device_properties(device).total_memory

11996954624

In [6]:
torch.cuda.empty_cache()

In [7]:
torch.cuda.memory_allocated(device=None)

0

In [15]:
class Im2pGenerator(object):
    def __init__(self, img_embed_size, input_size, hidden_size, topic_size, embed_size, batch_size, learning_rate, lambda_word, lambda_sent):
        self.encoderCnn = self.__init_encoderCNN(img_embed_size)
        self.sentRnn = self.__init_sentenceRNN(input_size, hidden_size, topic_size)
        self.vocab = self.__init_vocab()
        vocab_size = len(self.vocab)
        self.wordRnn = self.__init_wordRNN(embed_size, hidden_size, vocab_size)
        self.transform = self.__init_transform()
        self.criterion = self.__init_criterion()
        params = list(self.encoderCnn.parameters()) + list(self.sentRnn.parameters()) + list(self.wordRnn.parameters())
        self.optimizer = self.__init_optimizer(learning_rate, params)
        self.train_dataLoader = self.__init_data_loader(self.vocab, self.transform, batch_size, "data/small_train.json")
        self.val_dataLoader = self.__init_data_loader(self.vocab, self.transform, batch_size, "data/val_split.json")
        self.lambda_word = lambda_word
        self.lambda_sent = lambda_sent
        self.scheduler = self.__init_scheduler()
        self.min_loss = 10000000
        self.model_path = ''
        
    
    def train(self, num_epochs):
        for epoch in range(1, num_epochs+1):
            train_loss = self.__epoch_train()
            val_loss = 0
            #val_loss = self.__epoch_val()
            self.scheduler.step(train_loss)
            print("[{}] Epoch-{} - train loss:{} - val loss:{} - lr:{}".format(datetime.datetime.now(),
                                                                               epoch + 1,
                                                                               train_loss,
                                                                               val_loss,
                                                                               self.optimizer.param_groups[0]['lr']))
            self.save_model(train_loss, "model1")
    
    def __epoch_train(self):
        self.encoderCnn.train()
        self.sentRnn.train()
        self.wordRnn.train()
        train_loss = 0
        
        for i,(images, image_id, target, prob_true) in enumerate(self.train_dataLoader):
            images = images.to(device)
            features = self.encoderCnn(images)
            prob_true = prob_true.long().to(device)
            self.optimizer.zero_grad()
            hiddens = None
            sentLoss = 0
            wordLoss = 0
            for sent_index in range(target.shape[1]):
                probs, topic, hiddens = self.sentRnn.forward(features = features, states = hiddens)
                target_cur = target[:,sent_index,:].to(device)
                print('prob true',prob_true[:, sent_index].view(-1,1))
                sentLoss += self.criterion(probs[:,0,:], prob_true[:, sent_index]).sum()
                for word_index in range(1, target_cur.shape[1]-1):
                    outputs = self.wordRnn.forward(topic, target_cur[:, :word_index])
                    captionMask = (target_cur[:, word_index] > 1).float() 
                    time_loss = self.criterion(outputs, target_cur[:, word_index].view(-1,))
                    time_loss = time_loss * captionMask
                    wordLoss += time_loss.sum()
            loss = self.lambda_word * wordLoss + self.lambda_sent * sentLoss
            loss.backward()
            self.optimizer.step()
            train_loss += loss.item()
                
        return train_loss
            
    
    def __epoch_val(self):
        self.encoderCnn.eval()
        self.sentRnn.eval()
        self.wordRnn.eval()
        val_loss = 0
        
        for i,(images, image_id, target, prob_true) in enumerate(self.val_dataLoader):
            images = images.to(device)
            features = self.encoderCnn(images)
            prob_true = prob_true.long().to(device)
            hiddens = None
            sentLoss = 0
            wordLoss = 0
            loss = 0
            for sent_index in range(target.shape[1]):
                probs, topic, hiddens = self.sentRnn.forward(features = features, hiddens = hiddens)
                target_cur = target[:,sent_index,:].to(device)
                print('prob true',prob_true[:, sent_index].view(-1,1))
                sentLoss += self.criterion(probs[:,0,:], prob_true[:, sent_index].view(-1,1)).sum()
                for word_index in range(1, target_cur.shape[1]-1):
                    outputs = self.wordRnn.forward(topic, target_cur[:, :word_index])
                    captionMask = (target_cur[:, word_index] > 1).float() 
                    time_loss = self.criterion(outputs, target_cur[:, word_index].view(-1,))
                    time_loss = time_loss * captionMask
                    wordLoss += time_loss.sum()
            loss = self.lambda_word * wordLoss + self.lambda_sent * sentLoss
            val_loss += loss.item()
                
        return val_loss
    
    
    
    def __init_data_loader(self, vocab, transform, batch_size, data_json):
        image_dir = 'data/images'
        caption_json = 'data/captions.json'
        data_json = data_json
        data_loader = get_loader(image_dir=image_dir,
                         caption_json=caption_json,
                         data_json=data_json,
                         vocabulary=vocab,
                         transform=transform,
                         batch_size=batch_size,
                         shuffle=True)
        return data_loader
    
    def __init_scheduler(self):
        scheduler = ReduceLROnPlateau(self.optimizer, 'min', patience=10)
        return scheduler
    
    def __init_transform(self):
        
        resize = 256
        crop_size = 224
        transform = transforms.Compose([
        transforms.Resize(resize),
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
        return transform
    
    def __init_vocab(self):
        with open('data/vocab.pkl', 'rb') as f:
            vocab = pickle.load(f)
        return vocab
    
    def __init_encoderCNN(self, img_embed_size):
        return EncoderCNN(img_embed_size).to(device)
    
    def __init_sentenceRNN(self, input_size, hidden_size, topic_size):
        return SentenceRNN(input_size, hidden_size, topic_size).to(device)
        
    def __init_wordRNN(self, embed_size, hidden_size, vocab_size):
        return WordRNN(embed_size, hidden_size, vocab_size).to(device)
    
    def __log(self, train_loss, val_loss, epoch):
        pass
    
    def save_model(self, loss, file_name):
        if loss < self.min_loss:
            print("Saved Model in {}".format(file_name))
            torch.save({'encoderCNN': self.encoderCnn.state_dict(),
                        'sentenceRNN': self.sentRnn.state_dict(),
                        'wordRNN': self.wordRnn.state_dict(),
                        'best_loss': loss,
                        'optimizer': self.optimizer.state_dict()},
                       os.path.join(self.model_path, "{}.npz".format(file_name)))
            self.min_loss = loss
    
    def __init_logger(self):
        pass
    
    def __init_criterion(self):
        return nn.CrossEntropyLoss(reduction = 'none').cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss(reduction = 'none')
    
    def __init_optimizer(self, learning_rate, params):
        return torch.optim.Adam(params, lr=learning_rate)

In [16]:
hidden_size = 512
img_embed_size = 512
input_size = 512
topic_size = 512
embed_size = 512
batch_size = 32
lambda_word = 1
lambda_sent = 5


im2p = Im2pGenerator(img_embed_size, input_size, hidden_size, topic_size, embed_size, batch_size, 1e-3, lambda_word,lambda_sent)

In [17]:
#im2p.encoderCnn.load_state_dict(torch.load("model1.npz")['encoderCNN'], strict=False)
#im2p.sentRnn.load_state_dict(torch.load("model1.npz")['sentenceRNN'],strict=False)
#im2p.wordRnn.load_state_dict(torch.load("model1.npz")['wordRNN'], strict=False)

In [18]:
len(im2p.train_dataLoader.dataset)

506

In [19]:
im2p.train(4)

hiddens tensor([[[-0.0222, -0.0039, -0.0974,  ...,  0.1221, -0.0751, -0.0871]],

        [[-0.0967,  0.0691, -0.0770,  ...,  0.0337, -0.1036,  0.0262]],

        [[ 0.0456,  0.1234,  0.0678,  ...,  0.0117, -0.0599,  0.0862]],

        ...,

        [[-0.2112, -0.0363,  0.0644,  ..., -0.0311,  0.0171,  0.0537]],

        [[-0.0697,  0.0929, -0.0454,  ...,  0.1071,  0.0889, -0.0617]],

        [[-0.0409,  0.0116, -0.1269,  ...,  0.1236, -0.0032, -0.0571]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0222, -0.0039, -0.0974,  ...,  0.1221, -0.0751, -0.0871],
         [-0.0967,  0.0691, -0.0770,  ...,  0.0337, -0.1036,  0.0262],
         [ 0.0456,  0.1234,  0.0678,  ...,  0.0117, -0.0599,  0.0862],
         ...,
         [-0.2112, -0.0363,  0.0644,  ..., -0.0311,  0.0171,  0.0537],
         [-0.0697,  0.0929, -0.0454,  ...,  0.1071,  0.0889, -0.0617],
         [-0.0409,  0.0116, -0.1269,  ...,  0.1236, -0.0032, -0.0571]]],
       device='cuda:0', grad_fn=<Cudnn

states (tensor([[[-0.0137,  0.0358, -0.2183,  ...,  0.2494, -0.1982, -0.1574],
         [-0.1581,  0.0902, -0.1250,  ...,  0.0268, -0.2037,  0.0227],
         [ 0.0888,  0.2448,  0.1403,  ..., -0.0260, -0.0891,  0.1445],
         ...,
         [-0.3294, -0.0835,  0.1357,  ..., -0.0582,  0.0516,  0.0931],
         [-0.1344,  0.1457, -0.1150,  ...,  0.2166,  0.1769, -0.0931],
         [-0.0491,  0.0442, -0.1979,  ...,  0.2254, -0.0191, -0.1313]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>), tensor([[[-0.0227,  0.0664, -0.7111,  ...,  0.4737, -0.5425, -0.4034],
         [-0.2865,  0.2149, -0.3237,  ...,  0.0851, -0.4408,  0.0405],
         [ 0.2067,  0.6917,  0.2508,  ..., -0.0512, -0.1980,  0.2733],
         ...,
         [-0.8308, -0.1993,  0.2642,  ..., -0.0992,  0.1133,  0.2440],
         [-0.3136,  0.3453, -0.1936,  ...,  0.4310,  0.4170, -0.1917],
         [-0.1321,  0.1214, -0.4130,  ...,  0.4546, -0.0410, -0.3269]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>))
pr

hiddens tensor([[[ 0.0174, -0.0900,  0.0282,  ...,  0.0453, -0.1857,  0.0259]],

        [[ 0.1164,  0.0507,  0.1260,  ...,  0.0510,  0.0286,  0.0857]],

        [[ 0.0851, -0.0232,  0.1233,  ..., -0.0172,  0.0108,  0.0857]],

        ...,

        [[-0.2297,  0.0081, -0.0839,  ..., -0.0566, -0.0851, -0.0986]],

        [[ 0.0993,  0.0967, -0.0753,  ...,  0.0061, -0.1013,  0.0111]],

        [[-0.0737,  0.0041,  0.1003,  ..., -0.0233,  0.0086,  0.1057]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0174, -0.0900,  0.0282,  ...,  0.0453, -0.1857,  0.0259],
         [ 0.1164,  0.0507,  0.1260,  ...,  0.0510,  0.0286,  0.0857],
         [ 0.0851, -0.0232,  0.1233,  ..., -0.0172,  0.0108,  0.0857],
         ...,
         [-0.2297,  0.0081, -0.0839,  ..., -0.0566, -0.0851, -0.0986],
         [ 0.0993,  0.0967, -0.0753,  ...,  0.0061, -0.1013,  0.0111],
         [-0.0737,  0.0041,  0.1003,  ..., -0.0233,  0.0086,  0.1057]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0760, -0.1186,  0.0356,  ...,  0.0455, -0.3464,  0.0811]],

        [[ 0.1841,  0.0615,  0.2421,  ...,  0.1101,  0.0339,  0.1492]],

        [[ 0.1479, -0.0610,  0.2134,  ..., -0.0192,  0.0293,  0.1783]],

        ...,

        [[-0.3756,  0.0324, -0.2052,  ..., -0.0489, -0.1666, -0.1461]],

        [[ 0.2210,  0.2671, -0.1328,  ...,  0.0246, -0.2359,  0.0453]],

        [[-0.1417, -0.0169,  0.1963,  ..., -0.0275,  0.0182,  0.1962]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0760, -0.1186,  0.0356,  ...,  0.0455, -0.3464,  0.0811],
         [ 0.1841,  0.0615,  0.2421,  ...,  0.1101,  0.0339,  0.1492],
         [ 0.1479, -0.0610,  0.2134,  ..., -0.0192,  0.0293,  0.1783],
         ...,
         [-0.3756,  0.0324, -0.2052,  ..., -0.0489, -0.1666, -0.1461],
         [ 0.2210,  0.2671, -0.1328,  ...,  0.0246, -0.2359,  0.0453],
         [-0.1417, -0.0169,  0.1963,  ..., -0.0275,  0.0182,  0.1962]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1072, -0.0095, -0.0754,  ..., -0.0641,  0.0041,  0.0779]],

        [[-0.0870,  0.0533,  0.1843,  ...,  0.0356,  0.0211, -0.0816]],

        [[-0.1481,  0.1023,  0.0935,  ...,  0.1034, -0.0009, -0.0353]],

        ...,

        [[-0.1195,  0.0019,  0.1128,  ...,  0.1271, -0.0183, -0.1092]],

        [[ 0.0683,  0.0372, -0.0407,  ..., -0.0140, -0.0885,  0.0808]],

        [[-0.1217, -0.0056, -0.0166,  ...,  0.0813, -0.0485, -0.0379]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1072, -0.0095, -0.0754,  ..., -0.0641,  0.0041,  0.0779],
         [-0.0870,  0.0533,  0.1843,  ...,  0.0356,  0.0211, -0.0816],
         [-0.1481,  0.1023,  0.0935,  ...,  0.1034, -0.0009, -0.0353],
         ...,
         [-0.1195,  0.0019,  0.1128,  ...,  0.1271, -0.0183, -0.1092],
         [ 0.0683,  0.0372, -0.0407,  ..., -0.0140, -0.0885,  0.0808],
         [-0.1217, -0.0056, -0.0166,  ...,  0.0813, -0.0485, -0.0379]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1657, -0.0075, -0.1394,  ..., -0.1247, -0.0067,  0.1532]],

        [[-0.2370,  0.0835,  0.2906,  ...,  0.0861,  0.0819, -0.1207]],

        [[-0.2863,  0.1793,  0.1687,  ...,  0.2156,  0.0066,  0.0137]],

        ...,

        [[-0.1995,  0.0208,  0.1967,  ...,  0.2483,  0.0079, -0.1894]],

        [[ 0.0764,  0.0733, -0.0803,  ..., -0.0550, -0.1728,  0.1291]],

        [[-0.1691,  0.0116, -0.0268,  ...,  0.1618, -0.1107, -0.0242]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1657, -0.0075, -0.1394,  ..., -0.1247, -0.0067,  0.1532],
         [-0.2370,  0.0835,  0.2906,  ...,  0.0861,  0.0819, -0.1207],
         [-0.2863,  0.1793,  0.1687,  ...,  0.2156,  0.0066,  0.0137],
         ...,
         [-0.1995,  0.0208,  0.1967,  ...,  0.2483,  0.0079, -0.1894],
         [ 0.0764,  0.0733, -0.0803,  ..., -0.0550, -0.1728,  0.1291],
         [-0.1691,  0.0116, -0.0268,  ...,  0.1618, -0.1107, -0.0242]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0180,  0.1103, -0.1124,  ..., -0.0237, -0.0627,  0.0451]],

        [[ 0.0515, -0.0639, -0.0735,  ..., -0.0808,  0.0032, -0.0389]],

        [[-0.0012,  0.0919, -0.1119,  ...,  0.0549,  0.0200,  0.0194]],

        ...,

        [[-0.0938,  0.0639,  0.0759,  ...,  0.0191,  0.0217,  0.0559]],

        [[-0.1444,  0.0107,  0.0256,  ...,  0.0729,  0.0035,  0.0014]],

        [[ 0.1004,  0.0012,  0.0578,  ..., -0.1374, -0.1200, -0.0294]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0180,  0.1103, -0.1124,  ..., -0.0237, -0.0627,  0.0451],
         [ 0.0515, -0.0639, -0.0735,  ..., -0.0808,  0.0032, -0.0389],
         [-0.0012,  0.0919, -0.1119,  ...,  0.0549,  0.0200,  0.0194],
         ...,
         [-0.0938,  0.0639,  0.0759,  ...,  0.0191,  0.0217,  0.0559],
         [-0.1444,  0.0107,  0.0256,  ...,  0.0729,  0.0035,  0.0014],
         [ 0.1004,  0.0012,  0.0578,  ..., -0.1374, -0.1200, -0.0294]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0099,  0.2288, -0.2546,  ..., -0.0481, -0.1719,  0.0543]],

        [[ 0.1113, -0.1283, -0.1437,  ..., -0.1623,  0.0010, -0.0293]],

        [[-0.0315,  0.1528, -0.2690,  ...,  0.1486,  0.0428,  0.0448]],

        ...,

        [[-0.2090,  0.1048,  0.1198,  ...,  0.0462,  0.0441,  0.1910]],

        [[-0.2120,  0.0517,  0.0771,  ...,  0.1008, -0.0331,  0.1171]],

        [[ 0.2266,  0.0120,  0.1265,  ..., -0.3076, -0.1998, -0.0262]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0099,  0.2288, -0.2546,  ..., -0.0481, -0.1719,  0.0543],
         [ 0.1113, -0.1283, -0.1437,  ..., -0.1623,  0.0010, -0.0293],
         [-0.0315,  0.1528, -0.2690,  ...,  0.1486,  0.0428,  0.0448],
         ...,
         [-0.2090,  0.1048,  0.1198,  ...,  0.0462,  0.0441,  0.1910],
         [-0.2120,  0.0517,  0.0771,  ...,  0.1008, -0.0331,  0.1171],
         [ 0.2266,  0.0120,  0.1265,  ..., -0.3076, -0.1998, -0.0262]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0805, -0.0247, -0.0870,  ...,  0.0498, -0.0955, -0.0362]],

        [[-0.0298,  0.0349, -0.0061,  ..., -0.0359,  0.0521, -0.0440]],

        [[ 0.0557, -0.0254,  0.1437,  ...,  0.0256, -0.0519,  0.1525]],

        ...,

        [[ 0.0047, -0.0736,  0.1176,  ..., -0.0307,  0.0499,  0.0991]],

        [[ 0.0120,  0.0338,  0.0153,  ..., -0.0206, -0.0679,  0.0236]],

        [[ 0.0404, -0.0627, -0.1147,  ..., -0.0736, -0.0957, -0.0256]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0805, -0.0247, -0.0870,  ...,  0.0498, -0.0955, -0.0362],
         [-0.0298,  0.0349, -0.0061,  ..., -0.0359,  0.0521, -0.0440],
         [ 0.0557, -0.0254,  0.1437,  ...,  0.0256, -0.0519,  0.1525],
         ...,
         [ 0.0047, -0.0736,  0.1176,  ..., -0.0307,  0.0499,  0.0991],
         [ 0.0120,  0.0338,  0.0153,  ..., -0.0206, -0.0679,  0.0236],
         [ 0.0404, -0.0627, -0.1147,  ..., -0.0736, -0.0957, -0.0256]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1274, -0.0224, -0.1470,  ...,  0.1113, -0.1969, -0.1506]],

        [[-0.0679,  0.0468,  0.0101,  ..., -0.0778,  0.0977, -0.0623]],

        [[ 0.1554, -0.0458,  0.2728,  ...,  0.0394, -0.1096,  0.2884]],

        ...,

        [[-0.0597, -0.0883,  0.2081,  ..., -0.0417,  0.0580,  0.1762]],

        [[ 0.0180,  0.0650,  0.0718,  ..., -0.0355, -0.1263,  0.0316]],

        [[ 0.0938, -0.0942, -0.2284,  ..., -0.1800, -0.1829, -0.0877]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1274, -0.0224, -0.1470,  ...,  0.1113, -0.1969, -0.1506],
         [-0.0679,  0.0468,  0.0101,  ..., -0.0778,  0.0977, -0.0623],
         [ 0.1554, -0.0458,  0.2728,  ...,  0.0394, -0.1096,  0.2884],
         ...,
         [-0.0597, -0.0883,  0.2081,  ..., -0.0417,  0.0580,  0.1762],
         [ 0.0180,  0.0650,  0.0718,  ..., -0.0355, -0.1263,  0.0316],
         [ 0.0938, -0.0942, -0.2284,  ..., -0.1800, -0.1829, -0.0877]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1038, -0.0805,  0.0944,  ...,  0.0124,  0.0081, -0.0505]],

        [[-0.0497,  0.1217, -0.0862,  ...,  0.0127,  0.0711, -0.0160]],

        [[ 0.0336,  0.1173, -0.0493,  ...,  0.0082,  0.0026,  0.0063]],

        ...,

        [[-0.0266, -0.0903, -0.0320,  ..., -0.0929, -0.0370, -0.0995]],

        [[-0.0854,  0.0729,  0.0151,  ...,  0.0440, -0.0467,  0.0682]],

        [[ 0.0094,  0.1083,  0.1375,  ..., -0.0179,  0.0202, -0.0247]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1038, -0.0805,  0.0944,  ...,  0.0124,  0.0081, -0.0505],
         [-0.0497,  0.1217, -0.0862,  ...,  0.0127,  0.0711, -0.0160],
         [ 0.0336,  0.1173, -0.0493,  ...,  0.0082,  0.0026,  0.0063],
         ...,
         [-0.0266, -0.0903, -0.0320,  ..., -0.0929, -0.0370, -0.0995],
         [-0.0854,  0.0729,  0.0151,  ...,  0.0440, -0.0467,  0.0682],
         [ 0.0094,  0.1083,  0.1375,  ..., -0.0179,  0.0202, -0.0247]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1996, -0.1652,  0.1817,  ..., -0.0081,  0.0440, -0.1113]],

        [[-0.0542,  0.1891, -0.1958,  ...,  0.0132,  0.1244,  0.0399]],

        [[ 0.0756,  0.1927, -0.1174,  ...,  0.0050, -0.0309,  0.0633]],

        ...,

        [[-0.0382, -0.1236, -0.0149,  ..., -0.1734, -0.1351, -0.1556]],

        [[-0.2211,  0.1302,  0.0524,  ...,  0.0989, -0.1218,  0.1782]],

        [[-0.0167,  0.1953,  0.2664,  ..., -0.0343, -0.0238, -0.0363]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1996, -0.1652,  0.1817,  ..., -0.0081,  0.0440, -0.1113],
         [-0.0542,  0.1891, -0.1958,  ...,  0.0132,  0.1244,  0.0399],
         [ 0.0756,  0.1927, -0.1174,  ...,  0.0050, -0.0309,  0.0633],
         ...,
         [-0.0382, -0.1236, -0.0149,  ..., -0.1734, -0.1351, -0.1556],
         [-0.2211,  0.1302,  0.0524,  ...,  0.0989, -0.1218,  0.1782],
         [-0.0167,  0.1953,  0.2664,  ..., -0.0343, -0.0238, -0.0363]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0708, -0.0252, -0.0449,  ..., -0.1440, -0.0985,  0.0168]],

        [[-0.2840,  0.1308, -0.0342,  ..., -0.0379, -0.0516, -0.0688]],

        [[-0.0212,  0.0592,  0.0553,  ..., -0.0462, -0.0729, -0.0202]],

        ...,

        [[-0.1599, -0.0321,  0.0824,  ...,  0.1426,  0.0273,  0.0098]],

        [[-0.0684, -0.0864, -0.0578,  ...,  0.0636, -0.0924, -0.0420]],

        [[-0.0044,  0.0495,  0.0310,  ...,  0.0841, -0.1389, -0.0665]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0708, -0.0252, -0.0449,  ..., -0.1440, -0.0985,  0.0168],
         [-0.2840,  0.1308, -0.0342,  ..., -0.0379, -0.0516, -0.0688],
         [-0.0212,  0.0592,  0.0553,  ..., -0.0462, -0.0729, -0.0202],
         ...,
         [-0.1599, -0.0321,  0.0824,  ...,  0.1426,  0.0273,  0.0098],
         [-0.0684, -0.0864, -0.0578,  ...,  0.0636, -0.0924, -0.0420],
         [-0.0044,  0.0495,  0.0310,  ...,  0.0841, -0.1389, -0.0665]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1462, -0.0287, -0.0899,  ..., -0.2481, -0.2696,  0.0597]],

        [[-0.5479,  0.2670, -0.0672,  ..., -0.1010, -0.1808, -0.0803]],

        [[-0.0570,  0.1345,  0.1741,  ..., -0.0900, -0.1639, -0.0176]],

        ...,

        [[-0.3036, -0.0756,  0.1662,  ...,  0.2788,  0.0523,  0.0811]],

        [[-0.1142, -0.0833, -0.0686,  ...,  0.1193, -0.2688, -0.0616]],

        [[-0.0107,  0.0821,  0.0492,  ...,  0.1886, -0.2250, -0.1531]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1462, -0.0287, -0.0899,  ..., -0.2481, -0.2696,  0.0597],
         [-0.5479,  0.2670, -0.0672,  ..., -0.1010, -0.1808, -0.0803],
         [-0.0570,  0.1345,  0.1741,  ..., -0.0900, -0.1639, -0.0176],
         ...,
         [-0.3036, -0.0756,  0.1662,  ...,  0.2788,  0.0523,  0.0811],
         [-0.1142, -0.0833, -0.0686,  ...,  0.1193, -0.2688, -0.0616],
         [-0.0107,  0.0821,  0.0492,  ...,  0.1886, -0.2250, -0.1531]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0306, -0.1200, -0.0014,  ...,  0.0737,  0.0208, -0.1474]],

        [[-0.1671,  0.0083,  0.0542,  ...,  0.0646, -0.0639, -0.0623]],

        [[-0.0964, -0.0461, -0.0155,  ..., -0.0508, -0.0424, -0.0878]],

        ...,

        [[-0.0873,  0.0194,  0.0230,  ..., -0.0452,  0.0012, -0.0453]],

        [[ 0.1119,  0.1150, -0.0267,  ..., -0.0364, -0.0254, -0.0021]],

        [[ 0.0079, -0.1044, -0.0502,  ..., -0.1560,  0.0537,  0.0658]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0306, -0.1200, -0.0014,  ...,  0.0737,  0.0208, -0.1474],
         [-0.1671,  0.0083,  0.0542,  ...,  0.0646, -0.0639, -0.0623],
         [-0.0964, -0.0461, -0.0155,  ..., -0.0508, -0.0424, -0.0878],
         ...,
         [-0.0873,  0.0194,  0.0230,  ..., -0.0452,  0.0012, -0.0453],
         [ 0.1119,  0.1150, -0.0267,  ..., -0.0364, -0.0254, -0.0021],
         [ 0.0079, -0.1044, -0.0502,  ..., -0.1560,  0.0537,  0.0658]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0572, -0.1919, -0.0118,  ...,  0.1127,  0.0340, -0.3013]],

        [[-0.3375,  0.0419,  0.1624,  ...,  0.1141, -0.1799, -0.1063]],

        [[-0.2136, -0.0385, -0.0159,  ..., -0.0912, -0.1253, -0.1490]],

        ...,

        [[-0.1946,  0.0254,  0.0511,  ..., -0.1170,  0.1008, -0.0854]],

        [[ 0.1693,  0.2085, -0.0745,  ..., -0.0262, -0.0719, -0.0303]],

        [[ 0.0330, -0.1690, -0.0921,  ..., -0.3094,  0.0526,  0.0734]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0572, -0.1919, -0.0118,  ...,  0.1127,  0.0340, -0.3013],
         [-0.3375,  0.0419,  0.1624,  ...,  0.1141, -0.1799, -0.1063],
         [-0.2136, -0.0385, -0.0159,  ..., -0.0912, -0.1253, -0.1490],
         ...,
         [-0.1946,  0.0254,  0.0511,  ..., -0.1170,  0.1008, -0.0854],
         [ 0.1693,  0.2085, -0.0745,  ..., -0.0262, -0.0719, -0.0303],
         [ 0.0330, -0.1690, -0.0921,  ..., -0.3094,  0.0526,  0.0734]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.2104, -0.0094,  0.0120,  ..., -0.0183, -0.0318, -0.0515]],

        [[ 0.0863, -0.0681, -0.0261,  ...,  0.0927, -0.1149, -0.0746]],

        [[-0.0154,  0.0568,  0.1356,  ..., -0.0654,  0.0759,  0.0860]],

        ...,

        [[-0.0768,  0.0417, -0.0171,  ...,  0.0717,  0.0460, -0.0161]],

        [[-0.1393,  0.0459, -0.0872,  ...,  0.0477, -0.1957,  0.0438]],

        [[ 0.1045, -0.0338, -0.0541,  ..., -0.0921, -0.0390,  0.1196]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.2104, -0.0094,  0.0120,  ..., -0.0183, -0.0318, -0.0515],
         [ 0.0863, -0.0681, -0.0261,  ...,  0.0927, -0.1149, -0.0746],
         [-0.0154,  0.0568,  0.1356,  ..., -0.0654,  0.0759,  0.0860],
         ...,
         [-0.0768,  0.0417, -0.0171,  ...,  0.0717,  0.0460, -0.0161],
         [-0.1393,  0.0459, -0.0872,  ...,  0.0477, -0.1957,  0.0438],
         [ 0.1045, -0.0338, -0.0541,  ..., -0.0921, -0.0390,  0.1196]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.4607, -0.0084,  0.0654,  ..., -0.0370, -0.1207, -0.0436]],

        [[ 0.1653, -0.1251, -0.0646,  ...,  0.1621, -0.2294, -0.1218]],

        [[-0.0411,  0.0809,  0.2755,  ..., -0.1318,  0.1270,  0.2092]],

        ...,

        [[-0.1117,  0.0824, -0.0132,  ...,  0.1660,  0.0623, -0.0016]],

        [[-0.3111,  0.1512, -0.1209,  ...,  0.0780, -0.4767,  0.0686]],

        [[ 0.1772, -0.0407, -0.1150,  ..., -0.1759, -0.1088,  0.3000]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.4607, -0.0084,  0.0654,  ..., -0.0370, -0.1207, -0.0436],
         [ 0.1653, -0.1251, -0.0646,  ...,  0.1621, -0.2294, -0.1218],
         [-0.0411,  0.0809,  0.2755,  ..., -0.1318,  0.1270,  0.2092],
         ...,
         [-0.1117,  0.0824, -0.0132,  ...,  0.1660,  0.0623, -0.0016],
         [-0.3111,  0.1512, -0.1209,  ...,  0.0780, -0.4767,  0.0686],
         [ 0.1772, -0.0407, -0.1150,  ..., -0.1759, -0.1088,  0.3000]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0035,  0.1265, -0.0880,  ...,  0.0658,  0.0329, -0.0722]],

        [[-0.2539,  0.0916, -0.0114,  ..., -0.0225,  0.1003, -0.0149]],

        [[-0.0721, -0.0245,  0.0033,  ...,  0.0206, -0.0995, -0.0572]],

        ...,

        [[ 0.0111,  0.1073,  0.1868,  ...,  0.1790, -0.0305, -0.0824]],

        [[ 0.0146,  0.0068,  0.0766,  ..., -0.0017, -0.0253, -0.0685]],

        [[ 0.1025, -0.0589,  0.1378,  ...,  0.0876, -0.0552,  0.0145]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0035,  0.1265, -0.0880,  ...,  0.0658,  0.0329, -0.0722],
         [-0.2539,  0.0916, -0.0114,  ..., -0.0225,  0.1003, -0.0149],
         [-0.0721, -0.0245,  0.0033,  ...,  0.0206, -0.0995, -0.0572],
         ...,
         [ 0.0111,  0.1073,  0.1868,  ...,  0.1790, -0.0305, -0.0824],
         [ 0.0146,  0.0068,  0.0766,  ..., -0.0017, -0.0253, -0.0685],
         [ 0.1025, -0.0589,  0.1378,  ...,  0.0876, -0.0552,  0.0145]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0837,  0.2950, -0.1742,  ...,  0.1194,  0.0113, -0.0211]],

        [[-0.6123,  0.1431,  0.0694,  ..., -0.1125,  0.1348,  0.0626]],

        [[-0.2011, -0.0517,  0.0668,  ...,  0.0281, -0.2476, -0.0685]],

        ...,

        [[-0.1221,  0.2481,  0.5591,  ...,  0.4077, -0.1061, -0.0456]],

        [[-0.0205,  0.0684,  0.2458,  ...,  0.0248, -0.1175, -0.1089]],

        [[ 0.1882, -0.1294,  0.3230,  ...,  0.1257, -0.0856,  0.0311]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0837,  0.2950, -0.1742,  ...,  0.1194,  0.0113, -0.0211],
         [-0.6123,  0.1431,  0.0694,  ..., -0.1125,  0.1348,  0.0626],
         [-0.2011, -0.0517,  0.0668,  ...,  0.0281, -0.2476, -0.0685],
         ...,
         [-0.1221,  0.2481,  0.5591,  ...,  0.4077, -0.1061, -0.0456],
         [-0.0205,  0.0684,  0.2458,  ...,  0.0248, -0.1175, -0.1089],
         [ 0.1882, -0.1294,  0.3230,  ...,  0.1257, -0.0856,  0.0311]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-8.4822e-02,  2.8789e-01,  2.6668e-01,  ...,  3.7138e-02,
           8.4534e-02,  6.9034e-02]],

        [[-1.5314e-01, -1.9300e-02, -7.6460e-02,  ...,  1.1186e-02,
          -1.0336e-02, -6.9938e-02]],

        [[ 2.8079e-02, -4.5781e-02, -6.8757e-02,  ...,  1.7546e-01,
          -5.8896e-02, -6.7524e-02]],

        ...,

        [[-2.5936e-01, -3.3564e-02, -2.0109e-01,  ..., -5.5274e-02,
          -9.5074e-02,  6.8940e-03]],

        [[ 3.8902e-02, -8.8500e-02, -9.4069e-03,  ...,  3.8361e-02,
          -1.8843e-04,  1.9344e-02]],

        [[ 1.2838e-01,  1.3382e-01,  8.8843e-02,  ...,  7.7404e-02,
           1.1149e-02, -1.0712e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-8.4822e-02,  2.8789e-01,  2.6668e-01,  ...,  3.7138e-02,
           8.4534e-02,  6.9034e-02],
         [-1.5314e-01, -1.9300e-02, -7.6460e-02,  ...,  1.1186e-02,
          -1.0336e-02, -6.9938e-02],
         [ 2.8079e-02, -4.5781e-02, -6.8757e-02,  ...,  1.7546e-01

hiddens tensor([[[-3.2606e-01,  6.4714e-01,  6.2714e-01,  ...,  8.2616e-03,
          -2.6653e-02,  2.6631e-01]],

        [[-3.8087e-01,  5.4263e-04, -6.2746e-02,  ...,  2.5170e-02,
          -1.0915e-01, -5.4713e-02]],

        [[ 3.2654e-02, -5.3085e-02, -1.1220e-01,  ...,  3.2210e-01,
          -2.1611e-01, -1.2757e-01]],

        ...,

        [[-6.0464e-01, -2.5854e-02, -3.0305e-01,  ..., -8.6393e-02,
          -3.1915e-01,  5.4339e-02]],

        [[ 8.9793e-02, -1.1935e-01, -1.3767e-02,  ...,  6.1573e-02,
          -3.6846e-02, -2.9972e-02]],

        [[ 2.1159e-01,  2.5017e-01,  3.4449e-01,  ...,  1.9437e-01,
          -1.9325e-03, -1.8711e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-3.2606e-01,  6.4714e-01,  6.2714e-01,  ...,  8.2616e-03,
          -2.6653e-02,  2.6631e-01],
         [-3.8087e-01,  5.4263e-04, -6.2746e-02,  ...,  2.5170e-02,
          -1.0915e-01, -5.4713e-02],
         [ 3.2654e-02, -5.3085e-02, -1.1220e-01,  ...,  3.2210e-01

hiddens tensor([[[ 0.0397, -0.1201, -0.0770,  ..., -0.1930,  0.0705,  0.0350]],

        [[-0.0616, -0.1160,  0.0212,  ..., -0.0143, -0.0725, -0.0394]],

        [[-0.1841, -0.0198, -0.0977,  ...,  0.0288, -0.0718, -0.0538]],

        ...,

        [[-0.2113,  0.2135,  0.3505,  ...,  0.0808,  0.0298, -0.1168]],

        [[ 0.0486, -0.0389,  0.0113,  ..., -0.0665, -0.0827,  0.0311]],

        [[ 0.0266, -0.0838,  0.0763,  ...,  0.1952, -0.2707, -0.0259]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0397, -0.1201, -0.0770,  ..., -0.1930,  0.0705,  0.0350],
         [-0.0616, -0.1160,  0.0212,  ..., -0.0143, -0.0725, -0.0394],
         [-0.1841, -0.0198, -0.0977,  ...,  0.0288, -0.0718, -0.0538],
         ...,
         [-0.2113,  0.2135,  0.3505,  ...,  0.0808,  0.0298, -0.1168],
         [ 0.0486, -0.0389,  0.0113,  ..., -0.0665, -0.0827,  0.0311],
         [ 0.0266, -0.0838,  0.0763,  ...,  0.1952, -0.2707, -0.0259]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1287, -0.1888, -0.0803,  ..., -0.3557,  0.0661,  0.0514]],

        [[-0.1752, -0.1802,  0.0389,  ..., -0.1103, -0.1232, -0.1099]],

        [[-0.4358, -0.0191, -0.1449,  ...,  0.0511, -0.2838, -0.0631]],

        ...,

        [[-0.5812,  0.4859,  0.8196,  ...,  0.2020,  0.0300, -0.1140]],

        [[ 0.0582, -0.0447,  0.0628,  ..., -0.0172, -0.2671,  0.0882]],

        [[ 0.0225, -0.0663,  0.2302,  ...,  0.3702, -0.6316, -0.0061]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1287, -0.1888, -0.0803,  ..., -0.3557,  0.0661,  0.0514],
         [-0.1752, -0.1802,  0.0389,  ..., -0.1103, -0.1232, -0.1099],
         [-0.4358, -0.0191, -0.1449,  ...,  0.0511, -0.2838, -0.0631],
         ...,
         [-0.5812,  0.4859,  0.8196,  ...,  0.2020,  0.0300, -0.1140],
         [ 0.0582, -0.0447,  0.0628,  ..., -0.0172, -0.2671,  0.0882],
         [ 0.0225, -0.0663,  0.2302,  ...,  0.3702, -0.6316, -0.0061]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0288,  0.2279,  0.3067,  ...,  0.0771,  0.0500,  0.0188]],

        [[ 0.0034, -0.0526,  0.0154,  ..., -0.0075, -0.1786, -0.0646]],

        [[ 0.0198,  0.0997,  0.2147,  ...,  0.0531, -0.0184,  0.0969]],

        ...,

        [[ 0.0225, -0.0190,  0.0889,  ...,  0.0314,  0.0161, -0.1064]],

        [[-0.1822,  0.2615,  0.1998,  ...,  0.0976,  0.0530, -0.1605]],

        [[ 0.0476, -0.0376, -0.0767,  ...,  0.2092, -0.0974, -0.0728]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0288,  0.2279,  0.3067,  ...,  0.0771,  0.0500,  0.0188],
         [ 0.0034, -0.0526,  0.0154,  ..., -0.0075, -0.1786, -0.0646],
         [ 0.0198,  0.0997,  0.2147,  ...,  0.0531, -0.0184,  0.0969],
         ...,
         [ 0.0225, -0.0190,  0.0889,  ...,  0.0314,  0.0161, -0.1064],
         [-0.1822,  0.2615,  0.1998,  ...,  0.0976,  0.0530, -0.1605],
         [ 0.0476, -0.0376, -0.0767,  ...,  0.2092, -0.0974, -0.0728]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-2.2562e-01,  5.8963e-01,  6.9523e-01,  ...,  1.2518e-01,
          -2.4678e-05,  2.0097e-01]],

        [[ 1.3077e-02, -4.9235e-02, -1.8750e-02,  ...,  3.0296e-02,
          -4.2704e-01, -1.7162e-01]],

        [[-5.0439e-02,  2.9704e-01,  5.5885e-01,  ...,  7.8976e-02,
          -3.9217e-02,  2.0769e-01]],

        ...,

        [[-3.7260e-02, -3.7540e-02,  2.6514e-01,  ...,  1.0265e-01,
           6.1545e-02, -1.6559e-01]],

        [[-4.5745e-01,  6.5305e-01,  6.2801e-01,  ...,  1.6031e-01,
          -7.3383e-02, -1.6048e-01]],

        [[ 4.4741e-02, -1.3752e-03, -1.1703e-01,  ...,  3.9444e-01,
          -4.3079e-01, -1.4059e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-2.2562e-01,  5.8963e-01,  6.9523e-01,  ...,  1.2518e-01,
          -2.4678e-05,  2.0097e-01],
         [ 1.3077e-02, -4.9235e-02, -1.8750e-02,  ...,  3.0296e-02,
          -4.2704e-01, -1.7162e-01],
         [-5.0439e-02,  2.9704e-01,  5.5885e-01,  ...,  7.8976e-02

hiddens tensor([[[ 0.0069, -0.0873, -0.0457,  ...,  0.0150, -0.1276, -0.0409]],

        [[ 0.0549,  0.0811, -0.0508,  ..., -0.0975, -0.0333, -0.0651]],

        [[ 0.1743,  0.0226,  0.1612,  ..., -0.0276, -0.0399,  0.1061]],

        ...,

        [[-0.2568,  0.0145, -0.0788,  ..., -0.0140, -0.0395, -0.1331]],

        [[ 0.0589,  0.1582,  0.2756,  ...,  0.0497,  0.0167, -0.0514]],

        [[-0.0040,  0.0241, -0.0717,  ..., -0.1209, -0.1184,  0.0590]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0069, -0.0873, -0.0457,  ...,  0.0150, -0.1276, -0.0409],
         [ 0.0549,  0.0811, -0.0508,  ..., -0.0975, -0.0333, -0.0651],
         [ 0.1743,  0.0226,  0.1612,  ..., -0.0276, -0.0399,  0.1061],
         ...,
         [-0.2568,  0.0145, -0.0788,  ..., -0.0140, -0.0395, -0.1331],
         [ 0.0589,  0.1582,  0.2756,  ...,  0.0497,  0.0167, -0.0514],
         [-0.0040,  0.0241, -0.0717,  ..., -0.1209, -0.1184,  0.0590]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0805, -0.0847, -0.0662,  ...,  0.0337, -0.4392, -0.0596]],

        [[ 0.0869,  0.2103, -0.0897,  ..., -0.1679, -0.1285, -0.1734]],

        [[ 0.2358,  0.1206,  0.4800,  ...,  0.0097, -0.0955,  0.1937]],

        ...,

        [[-0.6885,  0.2024,  0.0207,  ...,  0.0080, -0.2031, -0.1499]],

        [[-0.0489,  0.4951,  0.7135,  ...,  0.0918, -0.0139, -0.0259]],

        [[ 0.0215,  0.0366, -0.0254,  ..., -0.1852, -0.4613,  0.1252]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0805, -0.0847, -0.0662,  ...,  0.0337, -0.4392, -0.0596],
         [ 0.0869,  0.2103, -0.0897,  ..., -0.1679, -0.1285, -0.1734],
         [ 0.2358,  0.1206,  0.4800,  ...,  0.0097, -0.0955,  0.1937],
         ...,
         [-0.6885,  0.2024,  0.0207,  ...,  0.0080, -0.2031, -0.1499],
         [-0.0489,  0.4951,  0.7135,  ...,  0.0918, -0.0139, -0.0259],
         [ 0.0215,  0.0366, -0.0254,  ..., -0.1852, -0.4613,  0.1252]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0353, -0.0745, -0.0194,  ..., -0.0795,  0.0137, -0.0081]],

        [[ 0.0354,  0.0075, -0.0781,  ...,  0.1671, -0.2032, -0.0822]],

        [[-0.1536,  0.3985,  0.0011,  ..., -0.0114,  0.0400, -0.1106]],

        ...,

        [[-0.0735, -0.0212,  0.0642,  ..., -0.0851,  0.0615,  0.0894]],

        [[ 0.0389, -0.0472, -0.0495,  ...,  0.0821, -0.1356, -0.1436]],

        [[-0.0398, -0.0257,  0.1002,  ..., -0.0600,  0.0353, -0.0068]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0353, -0.0745, -0.0194,  ..., -0.0795,  0.0137, -0.0081],
         [ 0.0354,  0.0075, -0.0781,  ...,  0.1671, -0.2032, -0.0822],
         [-0.1536,  0.3985,  0.0011,  ..., -0.0114,  0.0400, -0.1106],
         ...,
         [-0.0735, -0.0212,  0.0642,  ..., -0.0851,  0.0615,  0.0894],
         [ 0.0389, -0.0472, -0.0495,  ...,  0.0821, -0.1356, -0.1436],
         [-0.0398, -0.0257,  0.1002,  ..., -0.0600,  0.0353, -0.0068]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0039, -0.1498, -0.0130,  ..., -0.1568, -0.0207, -0.0065]],

        [[ 0.0477,  0.1276, -0.0958,  ...,  0.3236, -0.5649, -0.1286]],

        [[-0.4017,  0.7521,  0.2322,  ..., -0.0157, -0.1498, -0.0859]],

        ...,

        [[-0.2724, -0.0388,  0.2250,  ..., -0.2129, -0.0268,  0.2973]],

        [[ 0.0824, -0.0160, -0.0650,  ...,  0.2983, -0.5233, -0.1426]],

        [[-0.1873,  0.0573,  0.3465,  ..., -0.1917,  0.0424,  0.0148]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0039, -0.1498, -0.0130,  ..., -0.1568, -0.0207, -0.0065],
         [ 0.0477,  0.1276, -0.0958,  ...,  0.3236, -0.5649, -0.1286],
         [-0.4017,  0.7521,  0.2322,  ..., -0.0157, -0.1498, -0.0859],
         ...,
         [-0.2724, -0.0388,  0.2250,  ..., -0.2129, -0.0268,  0.2973],
         [ 0.0824, -0.0160, -0.0650,  ...,  0.2983, -0.5233, -0.1426],
         [-0.1873,  0.0573,  0.3465,  ..., -0.1917,  0.0424,  0.0148]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0752, -0.0621, -0.0975,  ..., -0.1981, -0.0927,  0.0267]],

        [[-0.2793,  0.1261,  0.0016,  ...,  0.0861,  0.0109, -0.1052]],

        [[ 0.0513, -0.0793, -0.0978,  ..., -0.2040, -0.1951,  0.0742]],

        ...,

        [[ 0.0347,  0.0380, -0.0083,  ...,  0.1577, -0.1409, -0.0544]],

        [[ 0.0617,  0.1964,  0.2749,  ...,  0.1005,  0.0073,  0.1041]],

        [[-0.0948, -0.0202, -0.0820,  ...,  0.0776, -0.0230, -0.0442]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0752, -0.0621, -0.0975,  ..., -0.1981, -0.0927,  0.0267],
         [-0.2793,  0.1261,  0.0016,  ...,  0.0861,  0.0109, -0.1052],
         [ 0.0513, -0.0793, -0.0978,  ..., -0.2040, -0.1951,  0.0742],
         ...,
         [ 0.0347,  0.0380, -0.0083,  ...,  0.1577, -0.1409, -0.0544],
         [ 0.0617,  0.1964,  0.2749,  ...,  0.1005,  0.0073,  0.1041],
         [-0.0948, -0.0202, -0.0820,  ...,  0.0776, -0.0230, -0.0442]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.2257, -0.0798, -0.0392,  ..., -0.2607, -0.3401,  0.0272]],

        [[-0.8694,  0.5783,  0.5291,  ...,  0.1555, -0.1106, -0.0737]],

        [[ 0.2134, -0.0917, -0.0766,  ..., -0.2354, -0.5496,  0.1135]],

        ...,

        [[ 0.0374,  0.1610,  0.0787,  ...,  0.4369, -0.5498, -0.0329]],

        [[ 0.0594,  0.5795,  0.7519,  ...,  0.2693, -0.1707,  0.3080]],

        [[-0.2024, -0.0213, -0.0892,  ...,  0.1455, -0.0499, -0.0808]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.2257, -0.0798, -0.0392,  ..., -0.2607, -0.3401,  0.0272],
         [-0.8694,  0.5783,  0.5291,  ...,  0.1555, -0.1106, -0.0737],
         [ 0.2134, -0.0917, -0.0766,  ..., -0.2354, -0.5496,  0.1135],
         ...,
         [ 0.0374,  0.1610,  0.0787,  ...,  0.4369, -0.5498, -0.0329],
         [ 0.0594,  0.5795,  0.7519,  ...,  0.2693, -0.1707,  0.3080],
         [-0.2024, -0.0213, -0.0892,  ...,  0.1455, -0.0499, -0.0808]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1944, -0.0566, -0.0961,  ..., -0.0859, -0.4070, -0.0122]],

        [[-0.4027, -0.0558,  0.1777,  ..., -0.2392, -0.0862,  0.2805]],

        [[ 0.0222, -0.0530, -0.0604,  ...,  0.1006, -0.0535,  0.0215]],

        ...,

        [[ 0.0202,  0.3620,  0.2851,  ...,  0.0478, -0.0778,  0.3435]],

        [[-0.3149,  0.1098, -0.1105,  ..., -0.2250, -0.0087,  0.0027]],

        [[ 0.0646,  0.0170, -0.0598,  ...,  0.4454, -0.5575, -0.1341]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1944, -0.0566, -0.0961,  ..., -0.0859, -0.4070, -0.0122],
         [-0.4027, -0.0558,  0.1777,  ..., -0.2392, -0.0862,  0.2805],
         [ 0.0222, -0.0530, -0.0604,  ...,  0.1006, -0.0535,  0.0215],
         ...,
         [ 0.0202,  0.3620,  0.2851,  ...,  0.0478, -0.0778,  0.3435],
         [-0.3149,  0.1098, -0.1105,  ..., -0.2250, -0.0087,  0.0027],
         [ 0.0646,  0.0170, -0.0598,  ...,  0.4454, -0.5575, -0.1341]]],
       device='cuda:0', grad_fn=<Cudnn

prob true tensor([[1],
        [1],
        [0],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [0]], device='cuda:0')
hiddens tensor([[[-0.2633, -0.0490, -0.0643,  ..., -0.0765, -0.5741, -0.0139]],

        [[-0.6842, -0.0530,  0.3259,  ..., -0.1702, -0.1792,  0.2421]],

        [[ 0.0313, -0.0309, -0.0404,  ...,  0.0895, -0.2819,  0.0231]],

        ...,

        [[-0.0696,  0.5660,  0.5069,  ...,  0.0858, -0.2499,  0.5143]],

        [[-0.5614,  0.2348,  0.0196,  ..., -0.2118, -0.1121, -0.0120]],

        [[ 0.0795,  0.0739, -0.0420,  ...,  0.4993, -0.7432, -0.1225]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.2633, -0.0490, -0.0643,

hiddens tensor([[[-8.0330e-01, -1.7297e-01,  6.6793e-01,  ...,  8.6406e-02,
           1.7480e-02, -1.5508e-04]],

        [[-4.1866e-02,  7.7680e-01,  6.8660e-01,  ...,  2.0744e-01,
           8.9596e-03,  6.4518e-01]],

        [[-2.4650e-01,  2.3459e-01,  4.5134e-01,  ..., -1.4165e-01,
          -2.0723e-02, -2.5923e-02]],

        ...,

        [[-2.3933e-01,  6.7430e-03,  3.7552e-01,  ..., -2.1528e-01,
          -4.5615e-02,  5.4729e-01]],

        [[ 1.1656e-01, -6.0970e-02, -5.2382e-02,  ..., -1.7198e-01,
          -5.9900e-01,  2.5958e-02]],

        [[-6.5423e-01, -1.5531e-01, -1.1323e-01,  ..., -7.7411e-02,
           1.0602e-01, -2.7354e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-8.0330e-01, -1.7297e-01,  6.6793e-01,  ...,  8.6406e-02,
           1.7480e-02, -1.5508e-04],
         [-4.1866e-02,  7.7680e-01,  6.8660e-01,  ...,  2.0744e-01,
           8.9596e-03,  6.4518e-01],
         [-2.4650e-01,  2.3459e-01,  4.5134e-01,  ..., -1.4165e-01

hiddens tensor([[[-0.9367, -0.1238,  0.8415,  ...,  0.0788,  0.0124,  0.0427]],

        [[-0.1907,  0.8693,  0.8637,  ...,  0.2236, -0.0834,  0.7393]],

        [[-0.6005,  0.5608,  0.7354,  ..., -0.1728, -0.1028,  0.0219]],

        ...,

        [[-0.6142,  0.0775,  0.5355,  ..., -0.1586, -0.1354,  0.5611]],

        [[ 0.1713, -0.0563, -0.0188,  ..., -0.1224, -0.7095,  0.0151]],

        [[-0.8745, -0.1958, -0.0127,  ..., -0.0621, -0.0032, -0.0142]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.9367, -0.1238,  0.8415,  ...,  0.0788,  0.0124,  0.0427],
         [-0.1907,  0.8693,  0.8637,  ...,  0.2236, -0.0834,  0.7393],
         [-0.6005,  0.5608,  0.7354,  ..., -0.1728, -0.1028,  0.0219],
         ...,
         [-0.6142,  0.0775,  0.5355,  ..., -0.1586, -0.1354,  0.5611],
         [ 0.1713, -0.0563, -0.0188,  ..., -0.1224, -0.7095,  0.0151],
         [-0.8745, -0.1958, -0.0127,  ..., -0.0621, -0.0032, -0.0142]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.8359,  0.4561,  0.5812,  ..., -0.1394,  0.0120,  0.0843]],

        [[-0.3294,  0.2044,  0.5077,  ..., -0.1042,  0.0444,  0.1458]],

        [[-0.6175,  0.0692,  0.0274,  ..., -0.1206,  0.0388,  0.1581]],

        ...,

        [[-0.0512, -0.0888, -0.0096,  ...,  0.1582,  0.0275, -0.0715]],

        [[-0.0426,  0.2038,  0.2903,  ...,  0.2116, -0.0594,  0.0787]],

        [[ 0.1201, -0.0429, -0.0201,  ..., -0.1125, -0.6284,  0.0693]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.8359,  0.4561,  0.5812,  ..., -0.1394,  0.0120,  0.0843],
         [-0.3294,  0.2044,  0.5077,  ..., -0.1042,  0.0444,  0.1458],
         [-0.6175,  0.0692,  0.0274,  ..., -0.1206,  0.0388,  0.1581],
         ...,
         [-0.0512, -0.0888, -0.0096,  ...,  0.1582,  0.0275, -0.0715],
         [-0.0426,  0.2038,  0.2903,  ...,  0.2116, -0.0594,  0.0787],
         [ 0.1201, -0.0429, -0.0201,  ..., -0.1125, -0.6284,  0.0693]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.9592,  0.6893,  0.7847,  ..., -0.0861, -0.0886,  0.0933]],

        [[-0.7151,  0.4370,  0.7438,  ..., -0.1278, -0.0332,  0.2372]],

        [[-0.9012,  0.2997,  0.3976,  ..., -0.0793, -0.0252,  0.1881]],

        ...,

        [[-0.2283, -0.0182,  0.1084,  ...,  0.1964, -0.0419, -0.0500]],

        [[-0.3786,  0.4692,  0.7442,  ...,  0.2650, -0.1532,  0.1545]],

        [[ 0.2625, -0.0351, -0.0128,  ..., -0.0792, -0.7746,  0.0357]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.9592,  0.6893,  0.7847,  ..., -0.0861, -0.0886,  0.0933],
         [-0.7151,  0.4370,  0.7438,  ..., -0.1278, -0.0332,  0.2372],
         [-0.9012,  0.2997,  0.3976,  ..., -0.0793, -0.0252,  0.1881],
         ...,
         [-0.2283, -0.0182,  0.1084,  ...,  0.1964, -0.0419, -0.0500],
         [-0.3786,  0.4692,  0.7442,  ...,  0.2650, -0.1532,  0.1545],
         [ 0.2625, -0.0351, -0.0128,  ..., -0.0792, -0.7746,  0.0357]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.0887, -0.0666,  0.0991,  ...,  0.2730, -0.2151, -0.1284]],

        [[-0.1768, -0.0487, -0.0135,  ..., -0.1006, -0.6206, -0.0237]],

        [[-0.3261,  0.3204,  0.4482,  ..., -0.3340, -0.1110,  0.3393]],

        ...,

        [[-0.4956,  0.4058,  0.7185,  ..., -0.2171, -0.0235,  0.7330]],

        [[-0.0143,  0.7231,  0.8390,  ...,  0.4210, -0.0052,  0.3768]],

        [[-0.5067,  0.3142,  0.6014,  ...,  0.1297,  0.0489, -0.1056]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0887, -0.0666,  0.0991,  ...,  0.2730, -0.2151, -0.1284],
         [-0.1768, -0.0487, -0.0135,  ..., -0.1006, -0.6206, -0.0237],
         [-0.3261,  0.3204,  0.4482,  ..., -0.3340, -0.1110,  0.3393],
         ...,
         [-0.4956,  0.4058,  0.7185,  ..., -0.2171, -0.0235,  0.7330],
         [-0.0143,  0.7231,  0.8390,  ...,  0.4210, -0.0052,  0.3768],
         [-0.5067,  0.3142,  0.6014,  ...,  0.1297,  0.0489, -0.1056]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.3568, -0.0706,  0.0815,  ..., -0.1987,  0.0708,  0.4026]],

        [[ 0.0700,  0.0925,  0.0885,  ...,  0.1090, -0.0256, -0.0098]],

        [[-0.1048, -0.0093, -0.0612,  ..., -0.0308, -0.2815, -0.0464]],

        ...,

        [[-0.3192, -0.0096, -0.0820,  ..., -0.0792,  0.0050,  0.0197]],

        [[-0.0073, -0.0222, -0.0307,  ..., -0.1615,  0.0837, -0.0067]],

        [[ 0.0893, -0.0458, -0.0125,  ..., -0.0908,  0.0068,  0.0461]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.3568, -0.0706,  0.0815,  ..., -0.1987,  0.0708,  0.4026],
         [ 0.0700,  0.0925,  0.0885,  ...,  0.1090, -0.0256, -0.0098],
         [-0.1048, -0.0093, -0.0612,  ..., -0.0308, -0.2815, -0.0464],
         ...,
         [-0.3192, -0.0096, -0.0820,  ..., -0.0792,  0.0050,  0.0197],
         [-0.0073, -0.0222, -0.0307,  ..., -0.1615,  0.0837, -0.0067],
         [ 0.0893, -0.0458, -0.0125,  ..., -0.0908,  0.0068,  0.0461]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.8639, -0.1015,  0.4080,  ..., -0.1620, -0.0520,  0.7275]],

        [[ 0.0204,  0.3732,  0.4031,  ...,  0.2159, -0.1451,  0.0202]],

        [[-0.3260,  0.0174, -0.0355,  ..., -0.0521, -0.7780, -0.0278]],

        ...,

        [[-0.7255, -0.0708, -0.0799,  ..., -0.0623, -0.1096,  0.1012]],

        [[-0.2631,  0.0166,  0.0690,  ..., -0.1892, -0.0672,  0.0524]],

        [[ 0.3896, -0.0632,  0.0219,  ..., -0.0803, -0.1754,  0.0604]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.8639, -0.1015,  0.4080,  ..., -0.1620, -0.0520,  0.7275],
         [ 0.0204,  0.3732,  0.4031,  ...,  0.2159, -0.1451,  0.0202],
         [-0.3260,  0.0174, -0.0355,  ..., -0.0521, -0.7780, -0.0278],
         ...,
         [-0.7255, -0.0708, -0.0799,  ..., -0.0623, -0.1096,  0.1012],
         [-0.2631,  0.0166,  0.0690,  ..., -0.1892, -0.0672,  0.0524],
         [ 0.3896, -0.0632,  0.0219,  ..., -0.0803, -0.1754,  0.0604]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0485,  0.3419,  0.4938,  ...,  0.1274,  0.0379, -0.1028]],

        [[-0.0022, -0.0441, -0.0431,  ..., -0.0886,  0.0395,  0.1753]],

        [[-0.2917,  0.0292,  0.3470,  ...,  0.0315,  0.0516, -0.0635]],

        ...,

        [[ 0.0258, -0.0311, -0.0169,  ...,  0.2217, -0.4956, -0.0451]],

        [[-0.3191,  0.3867,  0.5347,  ...,  0.0588,  0.0515,  0.0128]],

        [[ 0.0727,  0.3223,  0.4095,  ...,  0.2102, -0.0163,  0.1649]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0485,  0.3419,  0.4938,  ...,  0.1274,  0.0379, -0.1028],
         [-0.0022, -0.0441, -0.0431,  ..., -0.0886,  0.0395,  0.1753],
         [-0.2917,  0.0292,  0.3470,  ...,  0.0315,  0.0516, -0.0635],
         ...,
         [ 0.0258, -0.0311, -0.0169,  ...,  0.2217, -0.4956, -0.0451],
         [-0.3191,  0.3867,  0.5347,  ...,  0.0588,  0.0515,  0.0128],
         [ 0.0727,  0.3223,  0.4095,  ...,  0.2102, -0.0163,  0.1649]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.6362,  0.8865,  0.9382,  ...,  0.3787, -0.0050, -0.0584]],

        [[-0.0660, -0.0431, -0.0052,  ..., -0.0814, -0.3393,  0.3298]],

        [[-0.8742,  0.4192,  0.8185,  ...,  0.0671,  0.0146, -0.0050]],

        ...,

        [[ 0.0681,  0.0024, -0.0062,  ...,  0.4000, -0.8929, -0.0325]],

        [[-0.9084,  0.9075,  0.9566,  ...,  0.1263, -0.0059,  0.3943]],

        [[-0.1491,  0.8563,  0.9218,  ...,  0.5703, -0.1171,  0.6057]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.6362,  0.8865,  0.9382,  ...,  0.3787, -0.0050, -0.0584],
         [-0.0660, -0.0431, -0.0052,  ..., -0.0814, -0.3393,  0.3298],
         [-0.8742,  0.4192,  0.8185,  ...,  0.0671,  0.0146, -0.0050],
         ...,
         [ 0.0681,  0.0024, -0.0062,  ...,  0.4000, -0.8929, -0.0325],
         [-0.9084,  0.9075,  0.9566,  ...,  0.1263, -0.0059,  0.3943],
         [-0.1491,  0.8563,  0.9218,  ...,  0.5703, -0.1171,  0.6057]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1640, -0.0571, -0.0668,  ..., -0.1164,  0.0597,  0.1263]],

        [[ 0.0637,  0.4655,  0.4529,  ...,  0.2993,  0.0332, -0.0266]],

        [[-0.1906, -0.0572, -0.0828,  ..., -0.1637, -0.2440, -0.0130]],

        ...,

        [[-0.1209,  0.0531,  0.1936,  ...,  0.1720,  0.0361, -0.0900]],

        [[-0.0313, -0.0251, -0.0157,  ..., -0.1561, -0.5205, -0.0486]],

        [[-0.0287,  0.0550, -0.0472,  ...,  0.0538, -0.3083, -0.0634]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1640, -0.0571, -0.0668,  ..., -0.1164,  0.0597,  0.1263],
         [ 0.0637,  0.4655,  0.4529,  ...,  0.2993,  0.0332, -0.0266],
         [-0.1906, -0.0572, -0.0828,  ..., -0.1637, -0.2440, -0.0130],
         ...,
         [-0.1209,  0.0531,  0.1936,  ...,  0.1720,  0.0361, -0.0900],
         [-0.0313, -0.0251, -0.0157,  ..., -0.1561, -0.5205, -0.0486],
         [-0.0287,  0.0550, -0.0472,  ...,  0.0538, -0.3083, -0.0634]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-6.8073e-01, -6.4161e-02,  4.3825e-02,  ..., -4.6615e-02,
          -2.2238e-02,  1.6151e-01]],

        [[-3.4364e-02,  8.9662e-01,  9.3693e-01,  ...,  6.5853e-01,
          -4.9519e-02,  3.4656e-01]],

        [[-7.1977e-01, -3.6936e-02, -2.5943e-02,  ..., -7.6377e-02,
          -7.4307e-01, -2.7564e-03]],

        ...,

        [[-5.9793e-01,  3.4534e-01,  7.2918e-01,  ...,  4.3009e-01,
           7.1142e-04, -8.6123e-02]],

        [[-6.6328e-02, -8.6311e-03, -6.5278e-03,  ..., -7.0208e-02,
          -9.1471e-01, -1.0857e-02]],

        [[ 9.7983e-02,  1.0853e-01, -2.7552e-02,  ...,  6.7518e-02,
          -8.0591e-01, -2.5825e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-6.8073e-01, -6.4161e-02,  4.3825e-02,  ..., -4.6615e-02,
          -2.2238e-02,  1.6151e-01],
         [-3.4364e-02,  8.9662e-01,  9.3693e-01,  ...,  6.5853e-01,
          -4.9519e-02,  3.4656e-01],
         [-7.1977e-01, -3.6936e-02, -2.5943e-02,  ..., -7.6377e-02

hiddens tensor([[[ 0.0292,  0.0116, -0.0293,  ...,  0.3431, -0.4245, -0.0590]],

        [[-0.1494, -0.0455, -0.0692,  ..., -0.1260, -0.1575,  0.0166]],

        [[ 0.0598, -0.0263, -0.0460,  ..., -0.1125, -0.2349, -0.0411]],

        ...,

        [[ 0.0065,  0.2965,  0.3222,  ...,  0.0232,  0.0351,  0.2955]],

        [[-0.0705, -0.0136,  0.0777,  ..., -0.1343, -0.1118, -0.0279]],

        [[ 0.0180, -0.0186, -0.0451,  ...,  0.4193, -0.4670, -0.1009]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0292,  0.0116, -0.0293,  ...,  0.3431, -0.4245, -0.0590],
         [-0.1494, -0.0455, -0.0692,  ..., -0.1260, -0.1575,  0.0166],
         [ 0.0598, -0.0263, -0.0460,  ..., -0.1125, -0.2349, -0.0411],
         ...,
         [ 0.0065,  0.2965,  0.3222,  ...,  0.0232,  0.0351,  0.2955],
         [-0.0705, -0.0136,  0.0777,  ..., -0.1343, -0.1118, -0.0279],
         [ 0.0180, -0.0186, -0.0451,  ...,  0.4193, -0.4670, -0.1009]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0694,  0.2522,  0.1343,  ...,  0.5880, -0.9095, -0.0532]],

        [[-0.5292, -0.0451, -0.0304,  ..., -0.0568, -0.5481,  0.0203]],

        [[ 0.3454, -0.0155, -0.0178,  ..., -0.0516, -0.7437, -0.0056]],

        ...,

        [[-0.2153,  0.8189,  0.7980,  ...,  0.2282, -0.1022,  0.8448]],

        [[-0.4912,  0.0458,  0.3496,  ..., -0.1587, -0.3793, -0.0158]],

        [[ 0.0233,  0.0434,  0.0033,  ...,  0.7396, -0.9034, -0.1157]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0694,  0.2522,  0.1343,  ...,  0.5880, -0.9095, -0.0532],
         [-0.5292, -0.0451, -0.0304,  ..., -0.0568, -0.5481,  0.0203],
         [ 0.3454, -0.0155, -0.0178,  ..., -0.0516, -0.7437, -0.0056],
         ...,
         [-0.2153,  0.8189,  0.7980,  ...,  0.2282, -0.1022,  0.8448],
         [-0.4912,  0.0458,  0.3496,  ..., -0.1587, -0.3793, -0.0158],
         [ 0.0233,  0.0434,  0.0033,  ...,  0.7396, -0.9034, -0.1157]]],
       device='cuda:0', grad_fn=<Cudnn

       device='cuda:0', grad_fn=<CudnnRnnBackward>))
pred prob tensor([[[2.4409e-04, 9.9976e-01]],

        [[1.8775e-05, 9.9998e-01]],

        [[1.1011e-05, 9.9999e-01]],

        [[7.6797e-06, 9.9999e-01]],

        [[2.1622e-04, 9.9978e-01]],

        [[8.0042e-04, 9.9920e-01]],

        [[3.8195e-06, 1.0000e+00]],

        [[2.7554e-03, 9.9724e-01]],

        [[5.0810e-06, 9.9999e-01]],

        [[1.4327e-06, 1.0000e+00]],

        [[6.3892e-04, 9.9936e-01]],

        [[3.8371e-02, 9.6163e-01]],

        [[1.1900e-06, 1.0000e+00]],

        [[1.6240e-05, 9.9998e-01]],

        [[9.9336e-06, 9.9999e-01]],

        [[1.9454e-06, 1.0000e+00]],

        [[8.2801e-02, 9.1720e-01]],

        [[1.2887e-06, 1.0000e+00]],

        [[5.1823e-05, 9.9995e-01]],

        [[5.3290e-05, 9.9995e-01]],

        [[1.1617e-06, 1.0000e+00]],

        [[7.1704e-06, 9.9999e-01]],

        [[4.6958e-05, 9.9995e-01]],

        [[2.2829e-06, 1.0000e+00]],

        [[5.5994e-06, 9.9999e-01]],

        [[2.

hiddens tensor([[[-1.6668e-01,  6.5506e-01,  8.7918e-01,  ...,  7.2752e-01,
          -1.8487e-01, -1.2889e-01]],

        [[-4.4312e-01,  8.2326e-01,  8.1194e-01,  ..., -1.2368e-01,
          -1.0849e-01,  6.4327e-01]],

        [[-5.4225e-01,  6.8342e-02,  3.6430e-01,  ..., -5.4541e-02,
          -1.3807e-01,  7.1607e-01]],

        ...,

        [[-9.3613e-01,  9.1697e-01,  9.4150e-01,  ...,  1.4799e-01,
          -2.7150e-03,  5.2887e-02]],

        [[ 6.1894e-04,  2.5715e-01,  1.8893e-02,  ...,  4.6022e-01,
          -8.6615e-01, -1.9305e-02]],

        [[-1.8200e-01,  9.1449e-01,  9.4053e-01,  ...,  5.7498e-01,
          -1.3236e-01,  7.0541e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-1.6668e-01,  6.5506e-01,  8.7918e-01,  ...,  7.2752e-01,
          -1.8487e-01, -1.2889e-01],
         [-4.4312e-01,  8.2326e-01,  8.1194e-01,  ..., -1.2368e-01,
          -1.0849e-01,  6.4327e-01],
         [-5.4225e-01,  6.8342e-02,  3.6430e-01,  ..., -5.4541e-02

hiddens tensor([[[-0.0087, -0.0377, -0.0464,  ...,  0.0451, -0.3515, -0.0467]],

        [[-0.2674, -0.0191, -0.0268,  ..., -0.0551, -0.1554,  0.0268]],

        [[ 0.0251,  0.5427,  0.4911,  ...,  0.3111,  0.0589,  0.3810]],

        ...,

        [[ 0.0320,  0.2397,  0.2975,  ...,  0.2760, -0.0198,  0.0455]],

        [[-0.3266,  0.2617,  0.3932,  ...,  0.0019,  0.0472, -0.0148]],

        [[-0.1504, -0.0281, -0.0269,  ..., -0.0860, -0.3676,  0.0009]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0087, -0.0377, -0.0464,  ...,  0.0451, -0.3515, -0.0467],
         [-0.2674, -0.0191, -0.0268,  ..., -0.0551, -0.1554,  0.0268],
         [ 0.0251,  0.5427,  0.4911,  ...,  0.3111,  0.0589,  0.3810],
         ...,
         [ 0.0320,  0.2397,  0.2975,  ...,  0.2760, -0.0198,  0.0455],
         [-0.3266,  0.2617,  0.3932,  ...,  0.0019,  0.0472, -0.0148],
         [-0.1504, -0.0281, -0.0269,  ..., -0.0860, -0.3676,  0.0009]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1484, -0.0182, -0.0217,  ...,  0.0573, -0.7981, -0.0419]],

        [[-0.8463, -0.0122, -0.0108,  ..., -0.0077, -0.7036,  0.0123]],

        [[-0.0748,  0.9438,  0.9374,  ...,  0.5574, -0.1049,  0.8490]],

        ...,

        [[ 0.0166,  0.7052,  0.7929,  ...,  0.6430, -0.3760,  0.4014]],

        [[-0.9646,  0.8294,  0.9299,  ...,  0.0223, -0.0105,  0.0693]],

        [[-0.5311, -0.0121, -0.0089,  ..., -0.0225, -0.8526, -0.0022]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1484, -0.0182, -0.0217,  ...,  0.0573, -0.7981, -0.0419],
         [-0.8463, -0.0122, -0.0108,  ..., -0.0077, -0.7036,  0.0123],
         [-0.0748,  0.9438,  0.9374,  ...,  0.5574, -0.1049,  0.8490],
         ...,
         [ 0.0166,  0.7052,  0.7929,  ...,  0.6430, -0.3760,  0.4014],
         [-0.9646,  0.8294,  0.9299,  ...,  0.0223, -0.0105,  0.0693],
         [-0.5311, -0.0121, -0.0089,  ..., -0.0225, -0.8526, -0.0022]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0214, -0.0467, -0.0363,  ...,  0.0214, -0.4429, -0.0351]],

        [[-0.2946, -0.0641, -0.0776,  ..., -0.0889,  0.0787,  0.3591]],

        [[-0.0198, -0.0610, -0.0548,  ..., -0.0586,  0.0061,  0.3489]],

        ...,

        [[-0.4511,  0.2810,  0.3367,  ...,  0.0210,  0.0514, -0.0368]],

        [[-0.5382,  0.4683,  0.6195,  ...,  0.1327,  0.0318, -0.1003]],

        [[ 0.1252,  0.1106,  0.1448,  ...,  0.3345, -0.1484, -0.0520]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0214, -0.0467, -0.0363,  ...,  0.0214, -0.4429, -0.0351],
         [-0.2946, -0.0641, -0.0776,  ..., -0.0889,  0.0787,  0.3591],
         [-0.0198, -0.0610, -0.0548,  ..., -0.0586,  0.0061,  0.3489],
         ...,
         [-0.4511,  0.2810,  0.3367,  ...,  0.0210,  0.0514, -0.0368],
         [-0.5382,  0.4683,  0.6195,  ...,  0.1327,  0.0318, -0.1003],
         [ 0.1252,  0.1106,  0.1448,  ...,  0.3345, -0.1484, -0.0520]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.3288, -0.0191, -0.0237,  ...,  0.0764, -0.8379, -0.0049]],

        [[-0.9039, -0.0550,  0.1514,  ..., -0.0153, -0.2758,  0.5258]],

        [[-0.2493, -0.0546, -0.0125,  ..., -0.0177, -0.3452,  0.5685]],

        ...,

        [[-0.9600,  0.8474,  0.9444,  ...,  0.0332,  0.0094,  0.0678]],

        [[-0.9780,  0.9250,  0.9805,  ...,  0.2812, -0.0085, -0.0626]],

        [[ 0.2375,  0.5547,  0.5847,  ...,  0.7115, -0.5073, -0.0695]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.3288, -0.0191, -0.0237,  ...,  0.0764, -0.8379, -0.0049],
         [-0.9039, -0.0550,  0.1514,  ..., -0.0153, -0.2758,  0.5258],
         [-0.2493, -0.0546, -0.0125,  ..., -0.0177, -0.3452,  0.5685],
         ...,
         [-0.9600,  0.8474,  0.9444,  ...,  0.0332,  0.0094,  0.0678],
         [-0.9780,  0.9250,  0.9805,  ...,  0.2812, -0.0085, -0.0626],
         [ 0.2375,  0.5547,  0.5847,  ...,  0.7115, -0.5073, -0.0695]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1552, -0.0642, -0.0425,  ..., -0.0331, -0.3469, -0.0304]],

        [[-0.0007, -0.0384, -0.0282,  ...,  0.0659, -0.5704, -0.0276]],

        [[ 0.0605, -0.0154, -0.0326,  ...,  0.0326, -0.2895,  0.1311]],

        ...,

        [[-0.1243,  0.0693,  0.1317,  ...,  0.0446, -0.0494, -0.0982]],

        [[ 0.0086, -0.0420,  0.0527,  ..., -0.2109, -0.1298,  0.2625]],

        [[ 0.0159, -0.0621, -0.0656,  ..., -0.0544, -0.0353,  0.3270]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1552, -0.0642, -0.0425,  ..., -0.0331, -0.3469, -0.0304],
         [-0.0007, -0.0384, -0.0282,  ...,  0.0659, -0.5704, -0.0276],
         [ 0.0605, -0.0154, -0.0326,  ...,  0.0326, -0.2895,  0.1311],
         ...,
         [-0.1243,  0.0693,  0.1317,  ...,  0.0446, -0.0494, -0.0982],
         [ 0.0086, -0.0420,  0.0527,  ..., -0.2109, -0.1298,  0.2625],
         [ 0.0159, -0.0621, -0.0656,  ..., -0.0544, -0.0353,  0.3270]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.6000, -0.0433, -0.0147,  ..., -0.0080, -0.8087, -0.0070]],

        [[ 0.3949, -0.0133, -0.0161,  ...,  0.1562, -0.9198, -0.0024]],

        [[ 0.1148,  0.0301, -0.0095,  ...,  0.1803, -0.7617,  0.3884]],

        ...,

        [[-0.7582,  0.6471,  0.6896,  ...,  0.1061, -0.1667, -0.0888]],

        [[-0.1447,  0.0350,  0.3354,  ..., -0.1983, -0.5943,  0.5874]],

        [[-0.1300, -0.0527, -0.0098,  ..., -0.0100, -0.4639,  0.5327]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.6000, -0.0433, -0.0147,  ..., -0.0080, -0.8087, -0.0070],
         [ 0.3949, -0.0133, -0.0161,  ...,  0.1562, -0.9198, -0.0024],
         [ 0.1148,  0.0301, -0.0095,  ...,  0.1803, -0.7617,  0.3884],
         ...,
         [-0.7582,  0.6471,  0.6896,  ...,  0.1061, -0.1667, -0.0888],
         [-0.1447,  0.0350,  0.3354,  ..., -0.1983, -0.5943,  0.5874],
         [-0.1300, -0.0527, -0.0098,  ..., -0.0100, -0.4639,  0.5327]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1267, -0.0099, -0.1008,  ..., -0.0223,  0.0318,  0.3265]],

        [[-0.0230,  0.1277, -0.0055,  ...,  0.0301,  0.0085, -0.0670]],

        [[-0.4370, -0.0025,  0.0239,  ..., -0.0527,  0.0373,  0.3595]],

        ...,

        [[ 0.0453,  0.0187, -0.0334,  ...,  0.3797, -0.2266, -0.0764]],

        [[-0.2451, -0.0542, -0.0428,  ..., -0.0651, -0.1389, -0.0393]],

        [[ 0.0537, -0.0757, -0.0613,  ..., -0.1214, -0.0404,  0.2338]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1267, -0.0099, -0.1008,  ..., -0.0223,  0.0318,  0.3265],
         [-0.0230,  0.1277, -0.0055,  ...,  0.0301,  0.0085, -0.0670],
         [-0.4370, -0.0025,  0.0239,  ..., -0.0527,  0.0373,  0.3595],
         ...,
         [ 0.0453,  0.0187, -0.0334,  ...,  0.3797, -0.2266, -0.0764],
         [-0.2451, -0.0542, -0.0428,  ..., -0.0651, -0.1389, -0.0393],
         [ 0.0537, -0.0757, -0.0613,  ..., -0.1214, -0.0404,  0.2338]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-6.0485e-01,  1.6959e-01,  1.5833e-01,  ..., -3.2585e-02,
          -4.8333e-02,  7.7917e-01]],

        [[-4.6810e-01,  4.8535e-01,  2.9709e-01,  ...,  1.2465e-01,
          -1.1048e-01, -4.3399e-02]],

        [[-9.5607e-01,  1.5028e-01,  6.1421e-01,  ..., -7.1932e-03,
          -4.6179e-03,  5.2798e-01]],

        ...,

        [[-1.2584e-01,  3.0143e-01,  1.3984e-01,  ...,  6.6983e-01,
          -7.2458e-01, -6.1915e-02]],

        [[-8.3986e-01, -3.9124e-02, -1.1337e-02,  ..., -8.4858e-03,
          -7.1081e-01, -6.8576e-04]],

        [[ 1.7073e-01, -6.6013e-02, -2.7072e-02,  ..., -3.9208e-02,
          -5.1099e-01,  4.1512e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-6.0485e-01,  1.6959e-01,  1.5833e-01,  ..., -3.2585e-02,
          -4.8333e-02,  7.7917e-01],
         [-4.6810e-01,  4.8535e-01,  2.9709e-01,  ...,  1.2465e-01,
          -1.1048e-01, -4.3399e-02],
         [-9.5607e-01,  1.5028e-01,  6.1421e-01,  ..., -7.1932e-03

hiddens tensor([[[ 0.0191, -0.0069, -0.0350,  ...,  0.4312, -0.4909, -0.0463]],

        [[-0.0272,  0.0211,  0.0515,  ...,  0.2951, -0.1700, -0.1000]],

        [[-0.0025, -0.0523, -0.0537,  ..., -0.1193, -0.1348,  0.2673]],

        ...,

        [[-0.0574,  0.4379,  0.5937,  ...,  0.1100,  0.0312,  0.3832]],

        [[-0.3311,  0.1603,  0.4140,  ..., -0.0788,  0.0391, -0.0203]],

        [[-0.0441,  0.4971,  0.6131,  ...,  0.1012,  0.0412,  0.1935]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0191, -0.0069, -0.0350,  ...,  0.4312, -0.4909, -0.0463],
         [-0.0272,  0.0211,  0.0515,  ...,  0.2951, -0.1700, -0.1000],
         [-0.0025, -0.0523, -0.0537,  ..., -0.1193, -0.1348,  0.2673],
         ...,
         [-0.0574,  0.4379,  0.5937,  ...,  0.1100,  0.0312,  0.3832],
         [-0.3311,  0.1603,  0.4140,  ..., -0.0788,  0.0391, -0.0203],
         [-0.0441,  0.4971,  0.6131,  ...,  0.1012,  0.0412,  0.1935]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 4.3531e-02,  6.4153e-02, -1.6746e-02,  ...,  7.4574e-01,
          -9.1333e-01, -2.6512e-02]],

        [[-4.8511e-01,  2.9042e-01,  3.5947e-01,  ...,  5.3233e-01,
          -5.0578e-01, -7.2730e-02]],

        [[-2.1883e-02, -4.9689e-02,  1.7360e-05,  ..., -4.2010e-02,
          -5.5141e-01,  4.6905e-01]],

        ...,

        [[-6.8050e-01,  9.2666e-01,  9.7182e-01,  ...,  4.3000e-01,
          -1.6441e-02,  8.4671e-01]],

        [[-9.6466e-01,  8.0402e-01,  9.2414e-01,  ..., -4.6800e-02,
          -2.0875e-02,  9.2319e-02]],

        [[-6.8292e-01,  9.5657e-01,  9.8178e-01,  ...,  4.5204e-01,
          -2.0466e-02,  6.9326e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 4.3531e-02,  6.4153e-02, -1.6746e-02,  ...,  7.4574e-01,
          -9.1333e-01, -2.6512e-02],
         [-4.8511e-01,  2.9042e-01,  3.5947e-01,  ...,  5.3233e-01,
          -5.0578e-01, -7.2730e-02],
         [-2.1883e-02, -4.9689e-02,  1.7360e-05,  ..., -4.2010e-02

hiddens tensor([[[-0.0056, -0.0292, -0.0544,  ..., -0.0753, -0.3534, -0.0312]],

        [[-0.6194,  0.2622,  0.5647,  ..., -0.0391,  0.0204,  0.1994]],

        [[-0.1888, -0.0170, -0.0127,  ..., -0.0349, -0.2317,  0.0261]],

        ...,

        [[-0.2611,  0.0704,  0.1203,  ..., -0.0811,  0.0066,  0.3526]],

        [[-0.1172, -0.0281, -0.0111,  ..., -0.0225, -0.4527, -0.0417]],

        [[-0.3143,  0.3739,  0.5992,  ..., -0.0038,  0.0203, -0.0378]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.0056, -0.0292, -0.0544,  ..., -0.0753, -0.3534, -0.0312],
         [-0.6194,  0.2622,  0.5647,  ..., -0.0391,  0.0204,  0.1994],
         [-0.1888, -0.0170, -0.0127,  ..., -0.0349, -0.2317,  0.0261],
         ...,
         [-0.2611,  0.0704,  0.1203,  ..., -0.0811,  0.0066,  0.3526],
         [-0.1172, -0.0281, -0.0111,  ..., -0.0225, -0.4527, -0.0417],
         [-0.3143,  0.3739,  0.5992,  ..., -0.0038,  0.0203, -0.0378]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.1785, -0.0070, -0.0199,  ..., -0.0283, -0.8560, -0.0081]],

        [[-0.9875,  0.8948,  0.9655,  ..., -0.0198,  0.0036,  0.4211]],

        [[-0.7453, -0.0080, -0.0049,  ..., -0.0034, -0.7896,  0.0269]],

        ...,

        [[-0.8911,  0.4734,  0.6076,  ..., -0.0335, -0.2375,  0.7039]],

        [[ 0.2006, -0.0084, -0.0025,  ..., -0.0019, -0.9154, -0.0057]],

        [[-0.9641,  0.9048,  0.9832,  ...,  0.0521, -0.0037,  0.2312]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.1785, -0.0070, -0.0199,  ..., -0.0283, -0.8560, -0.0081],
         [-0.9875,  0.8948,  0.9655,  ..., -0.0198,  0.0036,  0.4211],
         [-0.7453, -0.0080, -0.0049,  ..., -0.0034, -0.7896,  0.0269],
         ...,
         [-0.8911,  0.4734,  0.6076,  ..., -0.0335, -0.2375,  0.7039],
         [ 0.2006, -0.0084, -0.0025,  ..., -0.0019, -0.9154, -0.0057],
         [-0.9641,  0.9048,  0.9832,  ...,  0.0521, -0.0037,  0.2312]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.4293, -0.0427, -0.0554,  ..., -0.0449, -0.0719,  0.0860]],

        [[ 0.0387, -0.0322, -0.0215,  ..., -0.0391, -0.4676, -0.0176]],

        [[-0.1078,  0.4804,  0.6112,  ...,  0.1354,  0.0210,  0.1233]],

        ...,

        [[-0.2482, -0.0390, -0.0153,  ..., -0.0661, -0.5294, -0.0237]],

        [[ 0.0242, -0.0505, -0.0262,  ..., -0.0971, -0.4078,  0.1260]],

        [[-0.3825, -0.0251, -0.0604,  ...,  0.0589, -0.0321, -0.0497]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.4293, -0.0427, -0.0554,  ..., -0.0449, -0.0719,  0.0860],
         [ 0.0387, -0.0322, -0.0215,  ..., -0.0391, -0.4676, -0.0176],
         [-0.1078,  0.4804,  0.6112,  ...,  0.1354,  0.0210,  0.1233],
         ...,
         [-0.2482, -0.0390, -0.0153,  ..., -0.0661, -0.5294, -0.0237],
         [ 0.0242, -0.0505, -0.0262,  ..., -0.0971, -0.4078,  0.1260],
         [-0.3825, -0.0251, -0.0604,  ...,  0.0589, -0.0321, -0.0497]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.9316, -0.0389, -0.0401,  ..., -0.0028, -0.5959,  0.0508]],

        [[ 0.1589,  0.0039, -0.0034,  ...,  0.1264, -0.8884, -0.0085]],

        [[-0.7974,  0.9416,  0.9792,  ...,  0.4778, -0.0019,  0.5913]],

        ...,

        [[-0.8904,  0.0098, -0.0065,  ..., -0.0061, -0.9172, -0.0017]],

        [[ 0.1369, -0.0191, -0.0114,  ..., -0.0276, -0.8872,  0.2063]],

        [[-0.9691,  0.4396,  0.4021,  ...,  0.0643, -0.3916, -0.0087]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.9316, -0.0389, -0.0401,  ..., -0.0028, -0.5959,  0.0508],
         [ 0.1589,  0.0039, -0.0034,  ...,  0.1264, -0.8884, -0.0085],
         [-0.7974,  0.9416,  0.9792,  ...,  0.4778, -0.0019,  0.5913],
         ...,
         [-0.8904,  0.0098, -0.0065,  ..., -0.0061, -0.9172, -0.0017],
         [ 0.1369, -0.0191, -0.0114,  ..., -0.0276, -0.8872,  0.2063],
         [-0.9691,  0.4396,  0.4021,  ...,  0.0643, -0.3916, -0.0087]]],
       device='cuda:0', grad_fn=<Cudnn

[2019-06-04 04:58:07.445332] Epoch-3 - train loss:136957.43359375 - val loss:0 - lr:0.001
Saved Model in model1
hiddens tensor([[[ 0.0520,  0.5700,  0.6480,  ...,  0.3260,  0.0187,  0.1724]],

        [[-0.2624, -0.0520,  0.0972,  ..., -0.1126,  0.0211,  0.0669]],

        [[-0.2417,  0.5837,  0.6341,  ...,  0.1337,  0.0172,  0.0255]],

        ...,

        [[-0.2743, -0.0248, -0.0218,  ..., -0.0338, -0.4264, -0.0294]],

        [[-0.0269, -0.0611, -0.0440,  ...,  0.2709, -0.4236, -0.0455]],

        [[-0.3332, -0.0460,  0.0494,  ..., -0.0431,  0.0270,  0.3550]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0520,  0.5700,  0.6480,  ...,  0.3260,  0.0187,  0.1724],
         [-0.2624, -0.0520,  0.0972,  ..., -0.1126,  0.0211,  0.0669],
         [-0.2417,  0.5837,  0.6341,  ...,  0.1337,  0.0172,  0.0255],
         ...,
         [-0.2743, -0.0248, -0.0218,  ..., -0.0338, -0.4264, -0.0294],
         [-0.0269, -0.0611, -0.0440,  ...,  0.2709, -0.4236, -0.0455],

hiddens tensor([[[-0.6704,  0.9636,  0.9861,  ...,  0.7169, -0.0028,  0.7393]],

        [[-0.9139,  0.0849,  0.6491,  ..., -0.0391, -0.1525,  0.1712]],

        [[-0.9541,  0.9722,  0.9838,  ...,  0.4320, -0.0012,  0.3718]],

        ...,

        [[-0.7902, -0.0094, -0.0059,  ..., -0.0068, -0.8876, -0.0043]],

        [[-0.3921,  0.1447,  0.0558,  ...,  0.4551, -0.8798, -0.0082]],

        [[-0.9141, -0.1083,  0.4919,  ..., -0.0039, -0.0303,  0.5583]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.6704,  0.9636,  0.9861,  ...,  0.7169, -0.0028,  0.7393],
         [-0.9139,  0.0849,  0.6491,  ..., -0.0391, -0.1525,  0.1712],
         [-0.9541,  0.9722,  0.9838,  ...,  0.4320, -0.0012,  0.3718],
         ...,
         [-0.7902, -0.0094, -0.0059,  ..., -0.0068, -0.8876, -0.0043],
         [-0.3921,  0.1447,  0.0558,  ...,  0.4551, -0.8798, -0.0082],
         [-0.9141, -0.1083,  0.4919,  ..., -0.0039, -0.0303,  0.5583]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0059, -0.0333, -0.0474,  ...,  0.5434, -0.5625, -0.0736]],

        [[-0.3520,  0.3934,  0.5223,  ...,  0.2395,  0.0253, -0.0823]],

        [[-0.1837,  0.0838,  0.1662,  ...,  0.2505, -0.1181, -0.0569]],

        ...,

        [[-0.0444,  0.3318,  0.1897,  ...,  0.2177, -0.0327, -0.1106]],

        [[ 0.0094,  0.1063,  0.4631,  ...,  0.1677,  0.0221,  0.1434]],

        [[-0.2718, -0.0544,  0.0183,  ..., -0.0831,  0.0487,  0.3042]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0059, -0.0333, -0.0474,  ...,  0.5434, -0.5625, -0.0736],
         [-0.3520,  0.3934,  0.5223,  ...,  0.2395,  0.0253, -0.0823],
         [-0.1837,  0.0838,  0.1662,  ...,  0.2505, -0.1181, -0.0569],
         ...,
         [-0.0444,  0.3318,  0.1897,  ...,  0.2177, -0.0327, -0.1106],
         [ 0.0094,  0.1063,  0.4631,  ...,  0.1677,  0.0221,  0.1434],
         [-0.2718, -0.0544,  0.0183,  ..., -0.0831,  0.0487,  0.3042]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0123,  0.0991, -0.0086,  ...,  0.8280, -0.9548, -0.0735]],

        [[-0.9511,  0.8989,  0.9800,  ...,  0.4450,  0.0023, -0.0440]],

        [[-0.9105,  0.5883,  0.7466,  ...,  0.3738, -0.4807, -0.0100]],

        ...,

        [[-0.8104,  0.8372,  0.7690,  ...,  0.4677, -0.2471, -0.0622]],

        [[-0.5676,  0.8415,  0.9515,  ...,  0.5382, -0.0214,  0.7557]],

        [[-0.9214, -0.0165,  0.3357,  ..., -0.0120, -0.2169,  0.5923]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0123,  0.0991, -0.0086,  ...,  0.8280, -0.9548, -0.0735],
         [-0.9511,  0.8989,  0.9800,  ...,  0.4450,  0.0023, -0.0440],
         [-0.9105,  0.5883,  0.7466,  ...,  0.3738, -0.4807, -0.0100],
         ...,
         [-0.8104,  0.8372,  0.7690,  ...,  0.4677, -0.2471, -0.0622],
         [-0.5676,  0.8415,  0.9515,  ...,  0.5382, -0.0214,  0.7557],
         [-0.9214, -0.0165,  0.3357,  ..., -0.0120, -0.2169,  0.5923]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.4906,  0.1470,  0.3498,  ..., -0.0746,  0.0288,  0.1165]],

        [[ 0.0380, -0.0324, -0.0214,  ...,  0.1984, -0.5877, -0.0378]],

        [[-0.1767, -0.0257, -0.0453,  ..., -0.1025, -0.2635, -0.0013]],

        ...,

        [[-0.0761,  0.0497, -0.0724,  ...,  0.1858, -0.5060, -0.0397]],

        [[-0.3684, -0.0365, -0.0470,  ..., -0.0581, -0.0879,  0.0650]],

        [[ 0.0147, -0.0466, -0.0198,  ..., -0.0540, -0.4570,  0.0380]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.4906,  0.1470,  0.3498,  ..., -0.0746,  0.0288,  0.1165],
         [ 0.0380, -0.0324, -0.0214,  ...,  0.1984, -0.5877, -0.0378],
         [-0.1767, -0.0257, -0.0453,  ..., -0.1025, -0.2635, -0.0013],
         ...,
         [-0.0761,  0.0497, -0.0724,  ...,  0.1858, -0.5060, -0.0397],
         [-0.3684, -0.0365, -0.0470,  ..., -0.0581, -0.0879,  0.0650],
         [ 0.0147, -0.0466, -0.0198,  ..., -0.0540, -0.4570,  0.0380]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.8218e-01,  7.9933e-01,  9.2752e-01,  ..., -3.0492e-02,
           1.2729e-04,  3.9953e-01]],

        [[ 2.1193e-01, -5.3049e-03, -8.1034e-03,  ...,  2.8426e-01,
          -9.6278e-01, -1.1114e-02]],

        [[-8.0632e-01, -8.3117e-03, -1.5807e-02,  ..., -1.0209e-02,
          -8.2320e-01,  6.6868e-03]],

        ...,

        [[-4.5266e-01,  2.3613e-01, -6.5067e-02,  ...,  3.6206e-01,
          -9.1216e-01, -4.1003e-03]],

        [[-9.4531e-01, -3.6559e-02,  4.1134e-02,  ..., -4.4052e-03,
          -5.1426e-01,  6.4356e-02]],

        [[ 1.4019e-01, -1.9501e-02, -5.7375e-03,  ..., -5.8105e-03,
          -8.9984e-01,  2.7074e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.8218e-01,  7.9933e-01,  9.2752e-01,  ..., -3.0492e-02,
           1.2729e-04,  3.9953e-01],
         [ 2.1193e-01, -5.3049e-03, -8.1034e-03,  ...,  2.8426e-01,
          -9.6278e-01, -1.1114e-02],
         [-8.0632e-01, -8.3117e-03, -1.5807e-02,  ..., -1.0209e-02

hiddens tensor([[[-0.1988,  0.0887,  0.0737,  ...,  0.0180,  0.0215, -0.0441]],

        [[-0.2541, -0.0117, -0.0096,  ..., -0.0192, -0.6404, -0.0046]],

        [[ 0.0605,  0.6060,  0.7135,  ...,  0.6232,  0.0118, -0.1546]],

        ...,

        [[ 0.0239, -0.0245, -0.0560,  ...,  0.4779, -0.3309, -0.0721]],

        [[-0.0742, -0.0351, -0.0144,  ..., -0.0800, -0.0813,  0.3805]],

        [[-0.4850,  0.0416,  0.3909,  ..., -0.0388,  0.0196, -0.0329]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1988,  0.0887,  0.0737,  ...,  0.0180,  0.0215, -0.0441],
         [-0.2541, -0.0117, -0.0096,  ..., -0.0192, -0.6404, -0.0046],
         [ 0.0605,  0.6060,  0.7135,  ...,  0.6232,  0.0118, -0.1546],
         ...,
         [ 0.0239, -0.0245, -0.0560,  ...,  0.4779, -0.3309, -0.0721],
         [-0.0742, -0.0351, -0.0144,  ..., -0.0800, -0.0813,  0.3805],
         [-0.4850,  0.0416,  0.3909,  ..., -0.0388,  0.0196, -0.0329]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-8.9535e-01,  6.0504e-01,  6.7954e-01,  ...,  9.6887e-02,
          -1.3535e-01, -1.5072e-02]],

        [[-8.2049e-01, -4.5201e-03, -3.4641e-03,  ..., -1.1262e-04,
          -9.5599e-01, -2.4120e-04]],

        [[-4.1900e-01,  9.6475e-01,  9.9438e-01,  ...,  9.5293e-01,
          -1.0630e-02, -2.3270e-01]],

        ...,

        [[ 4.9481e-02,  8.5237e-02,  1.4861e-02,  ...,  8.2294e-01,
          -8.3327e-01, -7.0405e-02]],

        [[-7.3572e-01, -1.9142e-02,  1.3029e-02,  ..., -1.1171e-02,
          -7.3035e-01,  6.3913e-01]],

        [[-9.7987e-01,  7.1041e-01,  9.3607e-01,  ..., -1.2581e-02,
          -1.4463e-03,  1.1037e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-8.9535e-01,  6.0504e-01,  6.7954e-01,  ...,  9.6887e-02,
          -1.3535e-01, -1.5072e-02],
         [-8.2049e-01, -4.5201e-03, -3.4641e-03,  ..., -1.1262e-04,
          -9.5599e-01, -2.4120e-04],
         [-4.1900e-01,  9.6475e-01,  9.9438e-01,  ...,  9.5293e-01

hiddens tensor([[[-0.1665, -0.0384, -0.0524,  ..., -0.0240,  0.0681,  0.1126]],

        [[ 0.0265,  0.2714,  0.4374,  ...,  0.2788,  0.0366, -0.0911]],

        [[-0.1059, -0.0198, -0.0130,  ..., -0.0377, -0.5356, -0.0503]],

        ...,

        [[-0.0351, -0.0198, -0.0101,  ...,  0.0215, -0.5771, -0.0421]],

        [[ 0.0176, -0.0253, -0.0274,  ...,  0.2744, -0.6094, -0.0043]],

        [[-0.0032, -0.0138, -0.0544,  ..., -0.0933, -0.1117,  0.4314]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1665, -0.0384, -0.0524,  ..., -0.0240,  0.0681,  0.1126],
         [ 0.0265,  0.2714,  0.4374,  ...,  0.2788,  0.0366, -0.0911],
         [-0.1059, -0.0198, -0.0130,  ..., -0.0377, -0.5356, -0.0503],
         ...,
         [-0.0351, -0.0198, -0.0101,  ...,  0.0215, -0.5771, -0.0421],
         [ 0.0176, -0.0253, -0.0274,  ...,  0.2744, -0.6094, -0.0043],
         [-0.0032, -0.0138, -0.0544,  ..., -0.0933, -0.1117,  0.4314]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-7.8045e-01, -3.4001e-02, -1.9426e-02,  ..., -2.0698e-03,
          -1.8254e-01,  1.1926e-01]],

        [[-6.5023e-01,  7.9488e-01,  9.2400e-01,  ...,  7.0395e-01,
          -1.3009e-01, -7.7640e-02]],

        [[ 1.0696e-01, -6.7524e-03, -3.9184e-03,  ..., -1.4939e-03,
          -9.2709e-01, -6.2697e-03]],

        ...,

        [[ 5.8282e-01, -6.2912e-03, -2.2778e-03,  ...,  1.1443e-02,
          -9.3199e-01, -4.0423e-03]],

        [[-8.3247e-02,  8.9385e-02, -1.1641e-02,  ...,  3.3354e-01,
          -9.4586e-01, -1.2442e-04]],

        [[-3.6334e-01,  4.2533e-02,  3.2438e-02,  ..., -1.6067e-02,
          -6.4932e-01,  7.7038e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-7.8045e-01, -3.4001e-02, -1.9426e-02,  ..., -2.0698e-03,
          -1.8254e-01,  1.1926e-01],
         [-6.5023e-01,  7.9488e-01,  9.2400e-01,  ...,  7.0395e-01,
          -1.3009e-01, -7.7640e-02],
         [ 1.0696e-01, -6.7524e-03, -3.9184e-03,  ..., -1.4939e-03

hiddens tensor([[[ 0.0332,  0.6371,  0.4521,  ...,  0.4702,  0.0025,  0.1223]],

        [[-0.0309, -0.0501, -0.0370,  ..., -0.1084, -0.2407, -0.0082]],

        [[-0.0540,  0.1508,  0.0506,  ..., -0.1851,  0.0731,  0.4213]],

        ...,

        [[-0.0574,  0.0248,  0.0807,  ..., -0.0843,  0.0455,  0.4468]],

        [[-0.2832, -0.0374, -0.0190,  ...,  0.0497, -0.5414, -0.0077]],

        [[-0.2021, -0.0409, -0.0356,  ...,  0.0598, -0.4573, -0.0438]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0332,  0.6371,  0.4521,  ...,  0.4702,  0.0025,  0.1223],
         [-0.0309, -0.0501, -0.0370,  ..., -0.1084, -0.2407, -0.0082],
         [-0.0540,  0.1508,  0.0506,  ..., -0.1851,  0.0731,  0.4213],
         ...,
         [-0.0574,  0.0248,  0.0807,  ..., -0.0843,  0.0455,  0.4468],
         [-0.2832, -0.0374, -0.0190,  ...,  0.0497, -0.5414, -0.0077],
         [-0.2021, -0.0409, -0.0356,  ...,  0.0598, -0.4573, -0.0438]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-2.1889e-01,  9.7725e-01,  9.2904e-01,  ...,  8.6217e-01,
          -2.5225e-01,  7.0655e-01]],

        [[-3.8742e-01, -2.9647e-02,  4.2490e-02,  ..., -1.9965e-02,
          -7.7981e-01,  1.1045e-02]],

        [[-6.3097e-01,  7.0647e-01,  6.3054e-01,  ..., -1.5386e-01,
          -9.3081e-02,  8.8255e-01]],

        ...,

        [[-6.3870e-01,  4.1241e-01,  7.3532e-01,  ..., -4.9195e-02,
          -4.8903e-03,  8.9113e-01]],

        [[-8.9304e-01,  4.9245e-03, -8.0920e-03,  ...,  1.3353e-02,
          -9.3203e-01, -2.9444e-04]],

        [[-3.1536e-01, -2.1005e-02, -1.4608e-02,  ...,  2.0499e-02,
          -8.7879e-01, -4.5939e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-2.1889e-01,  9.7725e-01,  9.2904e-01,  ...,  8.6217e-01,
          -2.5225e-01,  7.0655e-01],
         [-3.8742e-01, -2.9647e-02,  4.2490e-02,  ..., -1.9965e-02,
          -7.7981e-01,  1.1045e-02],
         [-6.3097e-01,  7.0647e-01,  6.3054e-01,  ..., -1.5386e-01

hiddens tensor([[[-0.4117, -0.0357,  0.0633,  ..., -0.0594, -0.0277, -0.0591]],

        [[ 0.0044, -0.0267, -0.0096,  ...,  0.6203, -0.6057, -0.0477]],

        [[-0.0783, -0.0370,  0.0330,  ..., -0.1136, -0.0748,  0.2355]],

        ...,

        [[ 0.0578,  0.5001,  0.1985,  ...,  0.2572,  0.0581,  0.2615]],

        [[-0.4366,  0.1074,  0.2226,  ...,  0.0037,  0.0252,  0.0123]],

        [[ 0.0403, -0.0240, -0.0156,  ...,  0.3848, -0.7088, -0.0281]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.4117, -0.0357,  0.0633,  ..., -0.0594, -0.0277, -0.0591],
         [ 0.0044, -0.0267, -0.0096,  ...,  0.6203, -0.6057, -0.0477],
         [-0.0783, -0.0370,  0.0330,  ..., -0.1136, -0.0748,  0.2355],
         ...,
         [ 0.0578,  0.5001,  0.1985,  ...,  0.2572,  0.0581,  0.2615],
         [-0.4366,  0.1074,  0.2226,  ...,  0.0037,  0.0252,  0.0123],
         [ 0.0403, -0.0240, -0.0156,  ...,  0.3848, -0.7088, -0.0281]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.9611,  0.0098,  0.5356,  ..., -0.0063, -0.2344, -0.0023]],

        [[ 0.0135,  0.0390,  0.0482,  ...,  0.9245, -0.9655, -0.0446]],

        [[-0.7804,  0.0245,  0.2611,  ..., -0.0142, -0.4961,  0.3071]],

        ...,

        [[-0.2513,  0.9406,  0.8721,  ...,  0.6590, -0.1925,  0.7477]],

        [[-0.9638,  0.6991,  0.8525,  ...,  0.0090,  0.0035,  0.1703]],

        [[ 0.1998,  0.0334, -0.0045,  ...,  0.4744, -0.9918, -0.0045]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.9611,  0.0098,  0.5356,  ..., -0.0063, -0.2344, -0.0023],
         [ 0.0135,  0.0390,  0.0482,  ...,  0.9245, -0.9655, -0.0446],
         [-0.7804,  0.0245,  0.2611,  ..., -0.0142, -0.4961,  0.3071],
         ...,
         [-0.2513,  0.9406,  0.8721,  ...,  0.6590, -0.1925,  0.7477],
         [-0.9638,  0.6991,  0.8525,  ...,  0.0090,  0.0035,  0.1703],
         [ 0.1998,  0.0334, -0.0045,  ...,  0.4744, -0.9918, -0.0045]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-5.6683e-01,  3.6876e-04,  1.9809e-01,  ..., -3.7563e-02,
           5.1952e-02, -4.1702e-02]],

        [[-1.6868e-01,  1.3708e-01,  3.7186e-01,  ...,  8.4017e-02,
          -5.3360e-02, -3.2543e-02]],

        [[ 2.7823e-02, -1.6491e-02, -1.7812e-02,  ...,  1.0879e-01,
          -5.1406e-01, -3.0822e-02]],

        ...,

        [[ 3.1842e-02,  6.0738e-01,  5.6250e-01,  ...,  4.4978e-01,
           3.6342e-02,  4.2100e-01]],

        [[ 9.3992e-02,  1.9630e-01, -1.1338e-01,  ...,  2.6897e-01,
          -1.0826e-01, -1.7235e-02]],

        [[-3.6579e-01,  3.5959e-01,  4.7821e-01,  ..., -7.8983e-02,
           1.9436e-02,  2.3283e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-5.6683e-01,  3.6876e-04,  1.9809e-01,  ..., -3.7563e-02,
           5.1952e-02, -4.1702e-02],
         [-1.6868e-01,  1.3708e-01,  3.7186e-01,  ...,  8.4017e-02,
          -5.3360e-02, -3.2543e-02],
         [ 2.7823e-02, -1.6491e-02, -1.7812e-02,  ...,  1.0879e-01

hiddens tensor([[[-0.9915,  0.6708,  0.7884,  ...,  0.0064, -0.1176, -0.0066]],

        [[-0.9116,  0.7820,  0.9305,  ...,  0.3503, -0.2196,  0.0114]],

        [[ 0.3760, -0.0031, -0.0038,  ...,  0.1004, -0.9088, -0.0056]],

        ...,

        [[-0.2498,  0.9790,  0.9649,  ...,  0.7842, -0.0551,  0.9116]],

        [[-0.0450,  0.7051,  0.0350,  ...,  0.5055, -0.6084,  0.0962]],

        [[-0.9585,  0.9151,  0.9657,  ..., -0.0474, -0.0014,  0.6165]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.9915,  0.6708,  0.7884,  ...,  0.0064, -0.1176, -0.0066],
         [-0.9116,  0.7820,  0.9305,  ...,  0.3503, -0.2196,  0.0114],
         [ 0.3760, -0.0031, -0.0038,  ...,  0.1004, -0.9088, -0.0056],
         ...,
         [-0.2498,  0.9790,  0.9649,  ...,  0.7842, -0.0551,  0.9116],
         [-0.0450,  0.7051,  0.0350,  ...,  0.5055, -0.6084,  0.0962],
         [-0.9585,  0.9151,  0.9657,  ..., -0.0474, -0.0014,  0.6165]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 0.0484,  0.0707, -0.0368,  ..., -0.1395, -0.0399,  0.2309]],

        [[ 0.0114, -0.0110, -0.0188,  ...,  0.2186, -0.5746, -0.0269]],

        [[ 0.0602,  0.3753,  0.5704,  ...,  0.4062,  0.0360,  0.1415]],

        ...,

        [[ 0.0552,  0.5821,  0.3501,  ...,  0.5203,  0.0251, -0.1960]],

        [[ 0.0482, -0.0528, -0.0134,  ..., -0.0580,  0.0311,  0.1997]],

        [[-0.4375,  0.4043,  0.6185,  ...,  0.0369,  0.0141, -0.0145]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0484,  0.0707, -0.0368,  ..., -0.1395, -0.0399,  0.2309],
         [ 0.0114, -0.0110, -0.0188,  ...,  0.2186, -0.5746, -0.0269],
         [ 0.0602,  0.3753,  0.5704,  ...,  0.4062,  0.0360,  0.1415],
         ...,
         [ 0.0552,  0.5821,  0.3501,  ...,  0.5203,  0.0251, -0.1960],
         [ 0.0482, -0.0528, -0.0134,  ..., -0.0580,  0.0311,  0.1997],
         [-0.4375,  0.4043,  0.6185,  ...,  0.0369,  0.0141, -0.0145]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-0.1995,  0.4875,  0.2838,  ..., -0.0113, -0.6000,  0.8107]],

        [[ 0.5135,  0.0337, -0.0067,  ...,  0.1393, -0.9573, -0.0012]],

        [[-0.7253,  0.9503,  0.9748,  ...,  0.8409, -0.0863,  0.7406]],

        ...,

        [[-0.5471,  0.9639,  0.9301,  ...,  0.9150, -0.2843, -0.1135]],

        [[ 0.2749, -0.0449,  0.1706,  ..., -0.0125, -0.3279,  0.4476]],

        [[-0.9894,  0.9366,  0.9896,  ...,  0.0874, -0.0021,  0.3242]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1995,  0.4875,  0.2838,  ..., -0.0113, -0.6000,  0.8107],
         [ 0.5135,  0.0337, -0.0067,  ...,  0.1393, -0.9573, -0.0012],
         [-0.7253,  0.9503,  0.9748,  ...,  0.8409, -0.0863,  0.7406],
         ...,
         [-0.5471,  0.9639,  0.9301,  ...,  0.9150, -0.2843, -0.1135],
         [ 0.2749, -0.0449,  0.1706,  ..., -0.0125, -0.3279,  0.4476],
         [-0.9894,  0.9366,  0.9896,  ...,  0.0874, -0.0021,  0.3242]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 5.0288e-02, -7.7461e-02,  2.2987e-01,  ..., -1.0660e-02,
          -5.5987e-02,  8.8178e-01]],

        [[-2.1913e-01, -2.0768e-02, -1.8062e-02,  ..., -1.0952e-02,
          -8.2732e-01, -1.1180e-02]],

        [[-1.7094e-01,  9.7782e-01,  9.8589e-01,  ...,  8.0328e-01,
          -3.3389e-03,  9.4695e-01]],

        ...,

        [[-9.0280e-01,  9.4230e-01,  9.7781e-01,  ...,  7.2289e-02,
           6.0282e-04,  6.8349e-01]],

        [[-4.4710e-01,  5.3169e-01,  6.4591e-01,  ...,  5.3113e-01,
          -3.3318e-01, -1.5319e-02]],

        [[-9.5292e-01,  9.2996e-01,  9.7568e-01,  ...,  4.4086e-01,
           1.0694e-03, -6.6782e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 5.0288e-02, -7.7461e-02,  2.2987e-01,  ..., -1.0660e-02,
          -5.5987e-02,  8.8178e-01],
         [-2.1913e-01, -2.0768e-02, -1.8062e-02,  ..., -1.0952e-02,
          -8.2732e-01, -1.1180e-02],
         [-1.7094e-01,  9.7782e-01,  9.8589e-01,  ...,  8.0328e-01

hiddens tensor([[[-5.4726e-01,  6.4734e-02,  4.4340e-01,  ..., -3.5437e-03,
          -1.9390e-01,  9.0812e-01]],

        [[-2.8943e-01, -2.1925e-02, -1.2002e-02,  ..., -2.1544e-03,
          -8.8872e-01, -3.2084e-03]],

        [[-3.0868e-01,  9.9231e-01,  9.9558e-01,  ...,  8.5448e-01,
          -8.4594e-03,  9.6383e-01]],

        ...,

        [[-9.8965e-01,  9.6794e-01,  9.9328e-01,  ...,  4.9004e-02,
          -5.9040e-04,  6.7211e-01]],

        [[-9.3837e-01,  7.2927e-01,  8.3640e-01,  ...,  3.9447e-01,
          -4.0055e-01, -6.3117e-03]],

        [[-9.9504e-01,  9.6104e-01,  9.9146e-01,  ...,  2.8877e-01,
          -5.3290e-03, -1.0486e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-5.4726e-01,  6.4734e-02,  4.4340e-01,  ..., -3.5437e-03,
          -1.9390e-01,  9.0812e-01],
         [-2.8943e-01, -2.1925e-02, -1.2002e-02,  ..., -2.1544e-03,
          -8.8872e-01, -3.2084e-03],
         [-3.0868e-01,  9.9231e-01,  9.9558e-01,  ...,  8.5448e-01

hiddens tensor([[[-5.7138e-01, -3.8858e-02, -3.3349e-02,  ..., -3.2429e-03,
          -4.6017e-01,  4.2334e-01]],

        [[-1.0285e-02,  9.3735e-01,  9.6805e-01,  ...,  9.5364e-01,
          -1.3500e-01,  5.6999e-01]],

        [[-3.8822e-02,  8.0452e-01,  8.1498e-01,  ...,  8.8302e-01,
          -4.8964e-01, -4.2960e-02]],

        ...,

        [[ 4.5392e-01,  1.6589e-02, -1.8528e-02,  ...,  5.6359e-01,
          -9.6258e-01, -2.0858e-04]],

        [[ 3.2626e-01, -1.2133e-02, -9.7031e-03,  ..., -2.6608e-02,
          -8.0968e-01,  3.6384e-02]],

        [[-2.7203e-01,  9.8120e-01,  9.8562e-01,  ...,  8.6637e-01,
          -1.3406e-03,  8.6920e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-5.7138e-01, -3.8858e-02, -3.3349e-02,  ..., -3.2429e-03,
          -4.6017e-01,  4.2334e-01],
         [-1.0285e-02,  9.3735e-01,  9.6805e-01,  ...,  9.5364e-01,
          -1.3500e-01,  5.6999e-01],
         [-3.8822e-02,  8.0452e-01,  8.1498e-01,  ...,  8.8302e-01

hiddens tensor([[[-9.1300e-01, -1.1371e-02,  6.4029e-02,  ..., -7.4787e-04,
          -6.3569e-01,  2.3650e-01]],

        [[-7.6506e-02,  9.7174e-01,  9.8860e-01,  ...,  9.7387e-01,
          -1.6638e-01,  7.8103e-01]],

        [[-3.1339e-01,  9.0893e-01,  9.0471e-01,  ...,  9.1201e-01,
          -5.6427e-01,  1.3653e-02]],

        ...,

        [[ 6.0792e-01,  2.5979e-02, -2.1456e-02,  ...,  4.7600e-01,
          -9.7577e-01, -8.4953e-05]],

        [[ 5.4566e-01, -1.0891e-02, -6.9714e-03,  ..., -8.0179e-03,
          -8.9874e-01,  1.4932e-02]],

        [[-4.9945e-01,  9.9392e-01,  9.9640e-01,  ...,  8.9831e-01,
          -7.8822e-03,  9.0055e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.1300e-01, -1.1371e-02,  6.4029e-02,  ..., -7.4787e-04,
          -6.3569e-01,  2.3650e-01],
         [-7.6506e-02,  9.7174e-01,  9.8860e-01,  ...,  9.7387e-01,
          -1.6638e-01,  7.8103e-01],
         [-3.1339e-01,  9.0893e-01,  9.0471e-01,  ...,  9.1201e-01

pred prob tensor([[[3.6438e-05, 9.9996e-01]],

        [[1.6999e-05, 9.9998e-01]],

        [[1.8086e-04, 9.9982e-01]],

        [[1.3313e-04, 9.9987e-01]],

        [[1.0356e-04, 9.9990e-01]],

        [[8.5304e-06, 9.9999e-01]],

        [[5.7985e-04, 9.9942e-01]],

        [[3.3850e-05, 9.9997e-01]],

        [[1.0352e-05, 9.9999e-01]],

        [[8.3966e-01, 1.6034e-01]],

        [[2.0628e-03, 9.9794e-01]],

        [[1.1480e-05, 9.9999e-01]],

        [[5.0434e-04, 9.9950e-01]],

        [[2.9475e-04, 9.9971e-01]],

        [[6.6859e-06, 9.9999e-01]],

        [[3.9470e-05, 9.9996e-01]],

        [[9.9725e-06, 9.9999e-01]],

        [[6.2358e-06, 9.9999e-01]],

        [[1.5429e-03, 9.9846e-01]],

        [[3.5458e-05, 9.9996e-01]],

        [[1.2508e-04, 9.9987e-01]],

        [[2.4517e-04, 9.9975e-01]],

        [[1.2374e-05, 9.9999e-01]],

        [[2.3751e-04, 9.9976e-01]],

        [[3.9759e-04, 9.9960e-01]],

        [[3.6090e-05, 9.9996e-01]],

        [[8.5435e-05, 9.9991

hiddens tensor([[[-6.5590e-01,  9.8919e-01,  9.9522e-01,  ...,  6.3965e-01,
          -1.4645e-02,  9.2413e-01]],

        [[-9.4378e-01,  4.1692e-01,  3.4460e-01,  ..., -3.8584e-03,
          -1.6530e-01,  7.4790e-01]],

        [[ 5.5701e-02, -2.9974e-03,  2.0692e-02,  ...,  1.1549e-01,
          -9.4844e-01,  2.2243e-01]],

        ...,

        [[-9.8989e-01, -3.8361e-03, -2.1371e-03,  ..., -2.4111e-04,
          -8.9802e-01, -1.7454e-03]],

        [[-9.7408e-01,  4.2062e-01,  8.2294e-01,  ...,  3.1109e-02,
          -1.2767e-02, -1.0308e-02]],

        [[ 4.9329e-01, -1.1484e-02, -5.2399e-03,  ..., -1.2703e-03,
          -8.9823e-01, -4.3352e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-6.5590e-01,  9.8919e-01,  9.9522e-01,  ...,  6.3965e-01,
          -1.4645e-02,  9.2413e-01],
         [-9.4378e-01,  4.1692e-01,  3.4460e-01,  ..., -3.8584e-03,
          -1.6530e-01,  7.4790e-01],
         [ 5.5701e-02, -2.9974e-03,  2.0692e-02,  ...,  1.1549e-01

hiddens tensor([[[ 0.0783,  0.0064, -0.0383,  ...,  0.3481, -0.7887, -0.0336]],

        [[-0.6677, -0.0644, -0.0302,  ..., -0.0045, -0.2232,  0.2173]],

        [[-0.8899,  0.3881,  0.1795,  ...,  0.2792, -0.6494, -0.0027]],

        ...,

        [[ 0.0631, -0.0263, -0.0164,  ..., -0.0251, -0.3798,  0.8052]],

        [[-0.9420,  0.7966,  0.8317,  ..., -0.0633, -0.0013,  0.2173]],

        [[-0.9292,  0.0305, -0.0308,  ...,  0.0149, -0.5895, -0.0028]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 0.0783,  0.0064, -0.0383,  ...,  0.3481, -0.7887, -0.0336],
         [-0.6677, -0.0644, -0.0302,  ..., -0.0045, -0.2232,  0.2173],
         [-0.8899,  0.3881,  0.1795,  ...,  0.2792, -0.6494, -0.0027],
         ...,
         [ 0.0631, -0.0263, -0.0164,  ..., -0.0251, -0.3798,  0.8052],
         [-0.9420,  0.7966,  0.8317,  ..., -0.0633, -0.0013,  0.2173],
         [-0.9292,  0.0305, -0.0308,  ...,  0.0149, -0.5895, -0.0028]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[ 1.5072e-01,  2.5937e-02, -1.9439e-02,  ...,  2.8692e-01,
          -9.0242e-01, -1.1416e-02]],

        [[-9.3722e-01, -3.8063e-02,  3.7098e-02,  ..., -8.3591e-04,
          -6.1642e-01,  9.2031e-02]],

        [[-9.9002e-01,  5.6713e-01,  5.3264e-01,  ...,  1.5785e-01,
          -7.1313e-01, -1.4948e-03]],

        ...,

        [[-1.8061e-01,  1.3654e-02,  7.3423e-02,  ..., -7.6727e-03,
          -6.2938e-01,  8.2159e-01]],

        [[-9.9602e-01,  8.9330e-01,  9.2932e-01,  ..., -1.3818e-02,
          -4.1238e-02,  1.5745e-01]],

        [[-9.8987e-01,  1.5756e-01,  1.2726e-01,  ...,  8.8726e-03,
          -7.1108e-01, -1.3011e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 1.5072e-01,  2.5937e-02, -1.9439e-02,  ...,  2.8692e-01,
          -9.0242e-01, -1.1416e-02],
         [-9.3722e-01, -3.8063e-02,  3.7098e-02,  ..., -8.3591e-04,
          -6.1642e-01,  9.2031e-02],
         [-9.9002e-01,  5.6713e-01,  5.3264e-01,  ...,  1.5785e-01

hiddens tensor([[[-0.9664,  0.8298,  0.9520,  ...,  0.2436,  0.0011, -0.0522]],

        [[ 0.0532, -0.0395, -0.0219,  ..., -0.0057, -0.6984,  0.1243]],

        [[-0.6011,  0.5525,  0.5613,  ...,  0.4601, -0.0718, -0.0347]],

        ...,

        [[-0.9027,  0.0172,  0.5055,  ..., -0.0045,  0.0072,  0.7883]],

        [[-0.8379, -0.0405, -0.0416,  ..., -0.0011,  0.0047,  0.1421]],

        [[-0.0511,  0.0325,  0.2319,  ...,  0.0139, -0.4362,  0.5747]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.9664,  0.8298,  0.9520,  ...,  0.2436,  0.0011, -0.0522],
         [ 0.0532, -0.0395, -0.0219,  ..., -0.0057, -0.6984,  0.1243],
         [-0.6011,  0.5525,  0.5613,  ...,  0.4601, -0.0718, -0.0347],
         ...,
         [-0.9027,  0.0172,  0.5055,  ..., -0.0045,  0.0072,  0.7883],
         [-0.8379, -0.0405, -0.0416,  ..., -0.0011,  0.0047,  0.1421],
         [-0.0511,  0.0325,  0.2319,  ...,  0.0139, -0.4362,  0.5747]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.9623e-01,  8.9089e-01,  9.7919e-01,  ...,  1.0611e-01,
          -5.4677e-03,  1.1782e-03]],

        [[-5.0299e-01,  2.4902e-03,  2.5426e-03,  ..., -1.2672e-03,
          -8.3115e-01,  6.0241e-02]],

        [[-9.6405e-01,  7.8581e-01,  9.3133e-01,  ...,  2.9136e-01,
          -1.8931e-01, -6.3768e-03]],

        ...,

        [[-9.8860e-01,  5.5071e-01,  8.0321e-01,  ..., -8.9504e-04,
          -2.4568e-02,  7.3881e-01]],

        [[-9.6804e-01, -3.5650e-02,  3.3678e-02,  ..., -3.6185e-04,
          -2.6060e-01,  5.8049e-02]],

        [[-4.9751e-01,  1.9547e-01,  4.9286e-01,  ...,  6.5830e-02,
          -6.5898e-01,  6.6155e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.9623e-01,  8.9089e-01,  9.7919e-01,  ...,  1.0611e-01,
          -5.4677e-03,  1.1782e-03],
         [-5.0299e-01,  2.4902e-03,  2.5426e-03,  ..., -1.2672e-03,
          -8.3115e-01,  6.0241e-02],
         [-9.6405e-01,  7.8581e-01,  9.3133e-01,  ...,  2.9136e-01

hiddens tensor([[[-9.8184e-01,  7.3204e-01,  2.9923e-01,  ..., -2.2004e-02,
          -1.6419e-01, -1.0498e-02]],

        [[-9.5464e-01,  9.6015e-01,  9.8289e-01,  ..., -4.8211e-03,
           4.9400e-04,  7.4272e-01]],

        [[-4.0783e-01,  9.7510e-01,  9.8623e-01,  ...,  8.8642e-01,
          -3.9110e-03,  8.7222e-01]],

        ...,

        [[ 3.5499e-02, -1.5307e-02, -6.6749e-03,  ..., -1.3544e-02,
          -7.8856e-01, -2.1162e-02]],

        [[-3.3802e-01,  5.3910e-01,  3.1123e-01,  ...,  7.7067e-01,
          -8.0227e-01, -9.2887e-04]],

        [[-8.5653e-01,  3.0686e-01,  3.8300e-01,  ..., -3.5060e-02,
          -6.2831e-02,  2.7370e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.8184e-01,  7.3204e-01,  2.9923e-01,  ..., -2.2004e-02,
          -1.6419e-01, -1.0498e-02],
         [-9.5464e-01,  9.6015e-01,  9.8289e-01,  ..., -4.8211e-03,
           4.9400e-04,  7.4272e-01],
         [-4.0783e-01,  9.7510e-01,  9.8623e-01,  ...,  8.8642e-01

hiddens tensor([[[-9.9830e-01,  8.6686e-01,  7.5382e-01,  ..., -2.4077e-03,
          -2.3357e-01, -3.4228e-03]],

        [[-9.9530e-01,  9.7779e-01,  9.9520e-01,  ...,  1.4500e-02,
          -1.6241e-04,  7.0637e-01]],

        [[-7.1453e-01,  9.9145e-01,  9.9600e-01,  ...,  9.0829e-01,
          -1.1954e-02,  9.1970e-01]],

        ...,

        [[-4.6653e-01, -1.2994e-02,  6.1298e-03,  ..., -2.0866e-03,
          -8.7339e-01,  8.9633e-03]],

        [[-8.8614e-01,  6.8145e-01,  5.6117e-01,  ...,  6.9393e-01,
          -8.4409e-01, -3.6761e-04]],

        [[-9.9106e-01,  6.3240e-01,  7.0981e-01,  ..., -5.1484e-03,
          -1.9709e-01,  1.6300e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.9830e-01,  8.6686e-01,  7.5382e-01,  ..., -2.4077e-03,
          -2.3357e-01, -3.4228e-03],
         [-9.9530e-01,  9.7779e-01,  9.9520e-01,  ...,  1.4500e-02,
          -1.6241e-04,  7.0637e-01],
         [-7.1453e-01,  9.9145e-01,  9.9600e-01,  ...,  9.0829e-01

hiddens tensor([[[-2.1885e-01,  9.1483e-01,  8.5216e-01,  ...,  5.8645e-01,
          -7.2700e-02,  7.9168e-01]],

        [[-9.4483e-01,  8.1196e-01,  9.0541e-01,  ..., -5.9882e-02,
          -5.5119e-06,  3.1717e-01]],

        [[ 1.8400e-01,  1.3908e-01,  5.2693e-02,  ...,  8.5126e-01,
          -9.1103e-01, -1.1973e-02]],

        ...,

        [[-7.0817e-01,  8.6376e-01,  9.6656e-01,  ..., -4.0906e-02,
           5.6806e-04,  9.1345e-01]],

        [[-8.6588e-01,  2.7853e-02,  4.3207e-02,  ..., -1.8259e-02,
          -2.0130e-01,  9.3043e-02]],

        [[-3.4606e-01,  2.4432e-01,  5.4711e-01,  ...,  5.0507e-04,
          -1.9196e-01,  8.3689e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-2.1885e-01,  9.1483e-01,  8.5216e-01,  ...,  5.8645e-01,
          -7.2700e-02,  7.9168e-01],
         [-9.4483e-01,  8.1196e-01,  9.0541e-01,  ..., -5.9882e-02,
          -5.5119e-06,  3.1717e-01],
         [ 1.8400e-01,  1.3908e-01,  5.2693e-02,  ...,  8.5126e-01

hiddens tensor([[[-7.6935e-01,  9.7085e-01,  9.5202e-01,  ...,  6.2482e-01,
          -1.2275e-01,  8.6365e-01]],

        [[-9.9546e-01,  8.9495e-01,  9.6625e-01,  ..., -1.4769e-02,
          -1.8259e-02,  2.5538e-01]],

        [[ 2.6397e-01,  2.2725e-01,  1.4556e-01,  ...,  8.4725e-01,
          -9.4288e-01, -4.0559e-03]],

        ...,

        [[-9.4506e-01,  9.2999e-01,  9.9010e-01,  ..., -1.1935e-02,
          -4.3891e-03,  9.3494e-01]],

        [[-9.8625e-01,  2.7952e-01,  2.3655e-01,  ..., -2.7707e-03,
          -3.9521e-01,  4.7642e-02]],

        [[-8.6452e-01,  5.5311e-01,  7.7830e-01,  ...,  5.9079e-04,
          -2.2809e-01,  8.7888e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-7.6935e-01,  9.7085e-01,  9.5202e-01,  ...,  6.2482e-01,
          -1.2275e-01,  8.6365e-01],
         [-9.9546e-01,  8.9495e-01,  9.6625e-01,  ..., -1.4769e-02,
          -1.8259e-02,  2.5538e-01],
         [ 2.6397e-01,  2.2725e-01,  1.4556e-01,  ...,  8.4725e-01

hiddens tensor([[[-0.8024, -0.0157, -0.0083,  ..., -0.0024, -0.7067,  0.0129]],

        [[-0.8770,  0.1214,  0.1310,  ..., -0.0289, -0.2302, -0.0223]],

        [[ 0.0069,  0.2920,  0.4477,  ...,  0.9214, -0.8340,  0.2004]],

        ...,

        [[ 0.0120,  0.1232,  0.0285,  ...,  0.9202, -0.9331, -0.0337]],

        [[ 0.4311, -0.0028, -0.0012,  ...,  0.1013, -0.9561, -0.0025]],

        [[-0.6119,  0.5778,  0.2839,  ...,  0.8579, -0.9041, -0.0018]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.8024, -0.0157, -0.0083,  ..., -0.0024, -0.7067,  0.0129],
         [-0.8770,  0.1214,  0.1310,  ..., -0.0289, -0.2302, -0.0223],
         [ 0.0069,  0.2920,  0.4477,  ...,  0.9214, -0.8340,  0.2004],
         ...,
         [ 0.0120,  0.1232,  0.0285,  ...,  0.9202, -0.9331, -0.0337],
         [ 0.4311, -0.0028, -0.0012,  ...,  0.1013, -0.9561, -0.0025],
         [-0.6119,  0.5778,  0.2839,  ...,  0.8579, -0.9041, -0.0018]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.6032e-01, -1.9424e-02,  2.0330e-02,  ..., -6.2659e-04,
          -8.1553e-01,  8.3108e-03]],

        [[-9.9201e-01,  5.7293e-01,  6.0974e-01,  ..., -1.7045e-03,
          -3.9242e-01,  1.3028e-03]],

        [[ 4.8693e-03,  3.7603e-01,  5.7057e-01,  ...,  9.5541e-01,
          -9.1001e-01,  3.4041e-01]],

        ...,

        [[ 3.0473e-02,  1.5736e-01,  1.2713e-01,  ...,  9.4039e-01,
          -9.5858e-01, -2.2239e-02]],

        [[ 8.2222e-01, -8.9494e-04, -7.6018e-04,  ...,  4.4410e-02,
          -9.7375e-01, -9.9987e-04]],

        [[-9.3482e-01,  6.7021e-01,  5.3020e-01,  ...,  8.2440e-01,
          -9.3061e-01, -6.4182e-04]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.6032e-01, -1.9424e-02,  2.0330e-02,  ..., -6.2659e-04,
          -8.1553e-01,  8.3108e-03],
         [-9.9201e-01,  5.7293e-01,  6.0974e-01,  ..., -1.7045e-03,
          -3.9242e-01,  1.3028e-03],
         [ 4.8693e-03,  3.7603e-01,  5.7057e-01,  ...,  9.5541e-01

hiddens tensor([[[-1.3698e-02,  4.4892e-01,  5.3124e-01,  ...,  9.0087e-01,
          -7.4113e-01, -8.9444e-03]],

        [[-8.0088e-01,  1.7998e-02,  3.0399e-01,  ..., -1.2194e-02,
           3.0459e-03,  7.7051e-01]],

        [[ 1.1077e-01, -8.8287e-04, -8.0363e-04,  ...,  6.5356e-01,
          -9.8979e-01, -5.6604e-04]],

        ...,

        [[-9.3917e-01,  9.4108e-01,  9.7818e-01,  ...,  1.6448e-01,
           5.6311e-05,  4.0420e-01]],

        [[-8.7350e-01,  7.9219e-02,  3.8207e-01,  ..., -4.0378e-03,
          -6.5524e-02,  5.8549e-01]],

        [[ 3.0233e-02,  1.9867e-02, -1.5560e-02,  ...,  8.3940e-01,
          -9.2513e-01, -3.9550e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-1.3698e-02,  4.4892e-01,  5.3124e-01,  ...,  9.0087e-01,
          -7.4113e-01, -8.9444e-03],
         [-8.0088e-01,  1.7998e-02,  3.0399e-01,  ..., -1.2194e-02,
           3.0459e-03,  7.7051e-01],
         [ 1.1077e-01, -8.8287e-04, -8.0363e-04,  ...,  6.5356e-01

hiddens tensor([[[-5.4665e-01,  6.5838e-01,  7.5385e-01,  ...,  9.0359e-01,
          -7.7463e-01, -3.4340e-03]],

        [[-9.8279e-01,  5.7745e-01,  7.1270e-01,  ..., -1.8788e-03,
          -5.6423e-02,  7.3364e-01]],

        [[ 3.4633e-01, -3.7304e-04, -5.8880e-04,  ...,  5.3141e-01,
          -9.9722e-01, -2.0205e-04]],

        ...,

        [[-9.9314e-01,  9.6803e-01,  9.9286e-01,  ...,  1.3080e-01,
          -1.9788e-03,  4.0075e-01]],

        [[-9.8362e-01,  4.2594e-01,  6.5593e-01,  ..., -8.6263e-04,
          -1.8611e-01,  4.5894e-01]],

        [[ 7.4727e-02,  3.8687e-02,  9.0350e-03,  ...,  8.5314e-01,
          -9.5887e-01, -2.2526e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-5.4665e-01,  6.5838e-01,  7.5385e-01,  ...,  9.0359e-01,
          -7.7463e-01, -3.4340e-03],
         [-9.8279e-01,  5.7745e-01,  7.1270e-01,  ..., -1.8788e-03,
          -5.6423e-02,  7.3364e-01],
         [ 3.4633e-01, -3.7304e-04, -5.8880e-04,  ...,  5.3141e-01

hiddens tensor([[[-9.7452e-01,  9.2610e-01,  9.8530e-01,  ...,  1.2113e-01,
           3.8059e-04,  2.1868e-01]],

        [[-9.5803e-01,  5.9235e-02,  2.4207e-01,  ..., -4.0309e-03,
          -3.9044e-01,  4.1262e-01]],

        [[ 7.4831e-02, -3.6498e-03, -6.0331e-03,  ...,  5.6327e-01,
          -8.9698e-01, -4.1071e-02]],

        ...,

        [[-9.5738e-01,  9.7053e-01,  9.8571e-01,  ..., -1.3264e-02,
           3.4789e-04,  2.5483e-01]],

        [[ 1.5702e-01, -1.2129e-02, -8.7901e-03,  ..., -3.6165e-03,
          -8.3273e-01,  1.8045e-01]],

        [[-9.7397e-01,  1.4861e-02, -4.0548e-02,  ..., -1.7670e-03,
          -7.0218e-01, -2.7530e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.7452e-01,  9.2610e-01,  9.8530e-01,  ...,  1.2113e-01,
           3.8059e-04,  2.1868e-01],
         [-9.5803e-01,  5.9235e-02,  2.4207e-01,  ..., -4.0309e-03,
          -3.9044e-01,  4.1262e-01],
         [ 7.4831e-02, -3.6498e-03, -6.0331e-03,  ...,  5.6327e-01

hiddens tensor([[[-0.1309, -0.0280, -0.0170,  ..., -0.0172, -0.2646,  0.0466]],

        [[ 0.0178,  0.4682,  0.4006,  ...,  0.1863,  0.0156,  0.0308]],

        [[-0.4624,  0.0298,  0.1972,  ..., -0.0287,  0.0456, -0.0756]],

        ...,

        [[-0.5331,  0.5985,  0.7045,  ...,  0.3838,  0.0082, -0.1095]],

        [[-0.1344, -0.0278, -0.0433,  ..., -0.0062,  0.0798,  0.5400]],

        [[-0.2827, -0.0025, -0.0014,  ..., -0.0206, -0.6458, -0.0228]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.1309, -0.0280, -0.0170,  ..., -0.0172, -0.2646,  0.0466],
         [ 0.0178,  0.4682,  0.4006,  ...,  0.1863,  0.0156,  0.0308],
         [-0.4624,  0.0298,  0.1972,  ..., -0.0287,  0.0456, -0.0756],
         ...,
         [-0.5331,  0.5985,  0.7045,  ...,  0.3838,  0.0082, -0.1095],
         [-0.1344, -0.0278, -0.0433,  ..., -0.0062,  0.0798,  0.5400],
         [-0.2827, -0.0025, -0.0014,  ..., -0.0206, -0.6458, -0.0228]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-8.7644e-01, -8.3030e-03,  2.9066e-04,  ..., -4.0745e-04,
          -7.7574e-01,  2.8022e-02]],

        [[-7.5483e-01,  9.5036e-01,  9.3324e-01,  ...,  6.3321e-01,
          -2.2985e-01,  5.4259e-01]],

        [[-9.8788e-01,  7.6389e-01,  8.8725e-01,  ...,  1.6571e-02,
          -5.3044e-02, -1.3960e-02]],

        ...,

        [[-9.8557e-01,  9.7484e-01,  9.9477e-01,  ...,  6.5706e-01,
          -1.0929e-03, -6.7123e-02]],

        [[-8.5375e-01, -3.4512e-02,  1.6126e-02,  ..., -2.1828e-04,
          -2.6760e-01,  8.3032e-01]],

        [[-7.5065e-01, -7.9223e-04, -3.3878e-04,  ..., -5.7994e-04,
          -9.6392e-01, -1.2927e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-8.7644e-01, -8.3030e-03,  2.9066e-04,  ..., -4.0745e-04,
          -7.7574e-01,  2.8022e-02],
         [-7.5483e-01,  9.5036e-01,  9.3324e-01,  ...,  6.3321e-01,
          -2.2985e-01,  5.4259e-01],
         [-9.8788e-01,  7.6389e-01,  8.8725e-01,  ...,  1.6571e-02

hiddens tensor([[[-0.2332, -0.0236, -0.0304,  ...,  0.0286, -0.4323, -0.0168]],

        [[ 0.0424, -0.0465, -0.0360,  ...,  0.2446, -0.5203, -0.0436]],

        [[-0.1837, -0.0165, -0.0088,  ..., -0.0516, -0.4633, -0.0335]],

        ...,

        [[-0.2604, -0.0491, -0.0338,  ..., -0.0541, -0.0065,  0.0233]],

        [[ 0.0642, -0.0099, -0.0040,  ...,  0.0134, -0.6417, -0.0437]],

        [[-0.4792,  0.3665,  0.5290,  ..., -0.0532,  0.0139,  0.3206]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.2332, -0.0236, -0.0304,  ...,  0.0286, -0.4323, -0.0168],
         [ 0.0424, -0.0465, -0.0360,  ...,  0.2446, -0.5203, -0.0436],
         [-0.1837, -0.0165, -0.0088,  ..., -0.0516, -0.4633, -0.0335],
         ...,
         [-0.2604, -0.0491, -0.0338,  ..., -0.0541, -0.0065,  0.0233],
         [ 0.0642, -0.0099, -0.0040,  ...,  0.0134, -0.6417, -0.0437],
         [-0.4792,  0.3665,  0.5290,  ..., -0.0532,  0.0139,  0.3206]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.7807e-01,  3.0997e-01,  1.1640e-01,  ...,  9.7782e-02,
          -8.9311e-01, -7.4190e-04]],

        [[ 2.7245e-01,  4.2940e-03, -7.6818e-03,  ...,  3.2005e-01,
          -9.5756e-01, -1.2287e-02]],

        [[-9.4202e-01, -8.3683e-03, -5.4609e-04,  ..., -1.4036e-03,
          -9.1434e-01, -1.7380e-03]],

        ...,

        [[-9.8828e-01,  3.6516e-01,  4.9409e-01,  ..., -2.0911e-03,
          -2.0139e-01,  5.0780e-02]],

        [[ 7.6038e-01, -2.5864e-03, -7.3255e-04,  ...,  5.9927e-03,
          -9.6338e-01, -3.5430e-03]],

        [[-9.9330e-01,  9.6293e-01,  9.8543e-01,  ..., -1.2530e-02,
          -9.4956e-04,  7.7626e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.7807e-01,  3.0997e-01,  1.1640e-01,  ...,  9.7782e-02,
          -8.9311e-01, -7.4190e-04],
         [ 2.7245e-01,  4.2940e-03, -7.6818e-03,  ...,  3.2005e-01,
          -9.5756e-01, -1.2287e-02],
         [-9.4202e-01, -8.3683e-03, -5.4609e-04,  ..., -1.4036e-03

hiddens tensor([[[-0.8631,  0.3418,  0.7270,  ..., -0.0284,  0.0085,  0.1160]],

        [[-0.7951,  0.8294,  0.9092,  ..., -0.0813,  0.0015,  0.6993]],

        [[-0.3672, -0.0127, -0.0039,  ..., -0.0139, -0.7385, -0.0114]],

        ...,

        [[ 0.0907,  0.4718,  0.2649,  ...,  0.5387, -0.3781, -0.0676]],

        [[-0.3942,  0.9023,  0.8674,  ...,  0.2984,  0.0126,  0.7186]],

        [[-0.8972,  0.5174,  0.7353,  ...,  0.0072,  0.0060,  0.0222]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.8631,  0.3418,  0.7270,  ..., -0.0284,  0.0085,  0.1160],
         [-0.7951,  0.8294,  0.9092,  ..., -0.0813,  0.0015,  0.6993],
         [-0.3672, -0.0127, -0.0039,  ..., -0.0139, -0.7385, -0.0114],
         ...,
         [ 0.0907,  0.4718,  0.2649,  ...,  0.5387, -0.3781, -0.0676],
         [-0.3942,  0.9023,  0.8674,  ...,  0.2984,  0.0126,  0.7186],
         [-0.8972,  0.5174,  0.7353,  ...,  0.0072,  0.0060,  0.0222]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.9683e-01,  8.6617e-01,  9.5896e-01,  ...,  6.4870e-04,
          -1.1755e-02,  2.0242e-01]],

        [[-9.9216e-01,  9.6159e-01,  9.9223e-01,  ..., -2.0662e-02,
          -7.8524e-04,  8.0009e-01]],

        [[-9.1882e-01, -8.4057e-03, -1.8387e-03,  ..., -7.4186e-04,
          -9.1791e-01,  1.5378e-04]],

        ...,

        [[-6.3145e-01,  7.5889e-01,  6.3486e-01,  ...,  6.1934e-01,
          -6.9016e-01, -2.3316e-02]],

        [[-8.8410e-01,  9.9057e-01,  9.8132e-01,  ...,  4.4569e-01,
          -1.0317e-02,  9.1022e-01]],

        [[-9.9566e-01,  8.3320e-01,  9.4805e-01,  ...,  1.8190e-03,
          -2.7451e-03,  8.5962e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.9683e-01,  8.6617e-01,  9.5896e-01,  ...,  6.4870e-04,
          -1.1755e-02,  2.0242e-01],
         [-9.9216e-01,  9.6159e-01,  9.9223e-01,  ..., -2.0662e-02,
          -7.8524e-04,  8.0009e-01],
         [-9.1882e-01, -8.4057e-03, -1.8387e-03,  ..., -7.4186e-04

hiddens tensor([[[ 1.1878e-01, -2.8187e-02, -2.0204e-02,  ..., -8.1826e-03,
          -7.0518e-01,  5.4104e-02]],

        [[ 6.7835e-03,  8.9064e-01,  9.1429e-01,  ...,  9.1355e-01,
          -3.8151e-01,  8.4504e-03]],

        [[ 3.9108e-01, -2.6256e-03, -1.3900e-03,  ..., -3.3196e-03,
          -9.3250e-01,  1.8947e-02]],

        ...,

        [[ 3.9787e-02,  4.6953e-02, -1.2364e-03,  ...,  8.3952e-01,
          -9.0389e-01, -7.6257e-02]],

        [[ 3.4540e-03,  2.7472e-01,  2.6960e-01,  ...,  9.4742e-01,
          -9.4734e-01,  3.4846e-02]],

        [[-9.8432e-01,  9.5227e-01,  9.7874e-01,  ..., -2.7077e-02,
           9.5720e-04,  1.4023e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 1.1878e-01, -2.8187e-02, -2.0204e-02,  ..., -8.1826e-03,
          -7.0518e-01,  5.4104e-02],
         [ 6.7835e-03,  8.9064e-01,  9.1429e-01,  ...,  9.1355e-01,
          -3.8151e-01,  8.4504e-03],
         [ 3.9108e-01, -2.6256e-03, -1.3900e-03,  ..., -3.3196e-03

hiddens tensor([[[ 6.5740e-01, -2.6028e-02, -1.7345e-02,  ..., -2.1327e-03,
          -7.9792e-01,  2.0696e-02]],

        [[-3.5448e-01,  9.4971e-01,  9.6289e-01,  ...,  9.2698e-01,
          -4.1922e-01,  7.0016e-02]],

        [[ 6.3351e-01, -2.8540e-03, -8.7299e-04,  ..., -8.8562e-04,
          -9.5192e-01,  1.4388e-02]],

        ...,

        [[ 6.6798e-02,  7.3442e-02,  4.4778e-02,  ...,  8.6053e-01,
          -9.5536e-01, -4.9540e-02]],

        [[ 5.5714e-03,  3.4603e-01,  3.7210e-01,  ...,  9.6874e-01,
          -9.7596e-01,  7.6998e-02]],

        [[-9.9841e-01,  9.6595e-01,  9.9082e-01,  ..., -5.7174e-03,
          -9.7018e-04,  1.7163e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 6.5740e-01, -2.6028e-02, -1.7345e-02,  ..., -2.1327e-03,
          -7.9792e-01,  2.0696e-02],
         [-3.5448e-01,  9.4971e-01,  9.6289e-01,  ...,  9.2698e-01,
          -4.1922e-01,  7.0016e-02],
         [ 6.3351e-01, -2.8540e-03, -8.7299e-04,  ..., -8.8562e-04

hiddens tensor([[[-0.8059, -0.0470,  0.0128,  ..., -0.0013, -0.3480,  0.6939]],

        [[-0.5357,  0.9494,  0.9805,  ...,  0.6561, -0.0047,  0.9359]],

        [[-0.1410,  0.1383,  0.3897,  ...,  0.0310, -0.3484,  0.6299]],

        ...,

        [[-0.9406,  0.9207,  0.9143,  ...,  0.7251, -0.1763, -0.0102]],

        [[-0.5960,  0.0012, -0.0230,  ...,  0.1591, -0.8761, -0.0019]],

        [[-0.9385,  0.6810,  0.7912,  ..., -0.0078, -0.0270,  0.8543]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.8059, -0.0470,  0.0128,  ..., -0.0013, -0.3480,  0.6939],
         [-0.5357,  0.9494,  0.9805,  ...,  0.6561, -0.0047,  0.9359],
         [-0.1410,  0.1383,  0.3897,  ...,  0.0310, -0.3484,  0.6299],
         ...,
         [-0.9406,  0.9207,  0.9143,  ...,  0.7251, -0.1763, -0.0102],
         [-0.5960,  0.0012, -0.0230,  ...,  0.1591, -0.8761, -0.0019],
         [-0.9385,  0.6810,  0.7912,  ..., -0.0078, -0.0270,  0.8543]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.7239e-01,  5.9119e-02,  1.9544e-01,  ..., -3.3182e-04,
          -5.8543e-01,  5.6342e-01]],

        [[-8.3037e-01,  9.8252e-01,  9.9441e-01,  ...,  7.0179e-01,
          -1.0340e-02,  9.6349e-01]],

        [[-7.3362e-01,  3.9588e-01,  5.9983e-01,  ...,  2.2326e-02,
          -5.3542e-01,  7.2906e-01]],

        ...,

        [[-9.9559e-01,  9.5447e-01,  9.5320e-01,  ...,  5.6402e-01,
          -1.9571e-01, -6.6482e-03]],

        [[-9.5324e-01,  6.7001e-02,  5.4700e-02,  ...,  8.6906e-02,
          -9.1589e-01, -7.8072e-04]],

        [[-9.9486e-01,  8.4127e-01,  8.9793e-01,  ..., -1.7147e-03,
          -5.6986e-02,  7.9697e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.7239e-01,  5.9119e-02,  1.9544e-01,  ..., -3.3182e-04,
          -5.8543e-01,  5.6342e-01],
         [-8.3037e-01,  9.8252e-01,  9.9441e-01,  ...,  7.0179e-01,
          -1.0340e-02,  9.6349e-01],
         [-7.3362e-01,  3.9588e-01,  5.9983e-01,  ...,  2.2326e-02

hiddens tensor([[[ 5.1476e-01, -9.4795e-04, -4.1789e-04,  ...,  4.5657e-02,
          -9.4973e-01, -6.0258e-03]],

        [[-8.8951e-01, -1.5564e-02, -1.0020e-02,  ..., -1.1287e-03,
          -7.0739e-01, -2.9403e-03]],

        [[ 2.3914e-02, -1.8864e-02,  6.8803e-03,  ..., -1.1857e-02,
          -5.8122e-01,  1.5210e-02]],

        ...,

        [[-4.9180e-01,  7.0903e-01,  1.1757e-01,  ...,  5.8312e-01,
          -4.4545e-01, -2.6187e-02]],

        [[-7.6295e-01, -5.9978e-02,  2.4567e-05,  ..., -7.0712e-04,
          -2.7794e-01,  5.5631e-01]],

        [[-6.6460e-01,  9.2732e-01,  9.2366e-01,  ...,  1.4585e-01,
           2.1042e-03,  7.8827e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 5.1476e-01, -9.4795e-04, -4.1789e-04,  ...,  4.5657e-02,
          -9.4973e-01, -6.0258e-03],
         [-8.8951e-01, -1.5564e-02, -1.0020e-02,  ..., -1.1287e-03,
          -7.0739e-01, -2.9403e-03],
         [ 2.3914e-02, -1.8864e-02,  6.8803e-03,  ..., -1.1857e-02

hiddens tensor([[[-0.2791, -0.0063, -0.0075,  ..., -0.0035, -0.2626, -0.0049]],

        [[-0.0254,  0.0759,  0.0649,  ..., -0.1145, -0.0719,  0.0107]],

        [[ 0.0173, -0.0344, -0.0315,  ...,  0.5121, -0.4656, -0.0435]],

        ...,

        [[-0.0446, -0.0482, -0.0449,  ...,  0.3993, -0.3769, -0.0169]],

        [[-0.1179,  0.5392,  0.5389,  ..., -0.1360,  0.0210,  0.5488]],

        [[-0.4232, -0.0498, -0.0328,  ..., -0.0095,  0.0651,  0.5826]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
states (tensor([[[-0.2791, -0.0063, -0.0075,  ..., -0.0035, -0.2626, -0.0049],
         [-0.0254,  0.0759,  0.0649,  ..., -0.1145, -0.0719,  0.0107],
         [ 0.0173, -0.0344, -0.0315,  ...,  0.5121, -0.4656, -0.0435],
         ...,
         [-0.0446, -0.0482, -0.0449,  ...,  0.3993, -0.3769, -0.0169],
         [-0.1179,  0.5392,  0.5389,  ..., -0.1360,  0.0210,  0.5488],
         [-0.4232, -0.0498, -0.0328,  ..., -0.0095,  0.0651,  0.5826]]],
       device='cuda:0', grad_fn=<Cudnn

hiddens tensor([[[-9.4250e-01, -5.0872e-03, -2.2120e-03,  ..., -6.1572e-05,
          -8.4633e-01,  2.1784e-02]],

        [[-7.5359e-01,  7.1752e-01,  6.0590e-01,  ..., -1.2791e-02,
          -6.3401e-01,  3.6207e-01]],

        [[ 4.9585e-02,  1.4948e-01,  1.2898e-01,  ...,  8.4642e-01,
          -9.3524e-01, -1.7350e-02]],

        ...,

        [[-8.5941e-01,  2.5130e-01,  1.5118e-01,  ...,  4.5658e-01,
          -8.6585e-01, -5.4380e-04]],

        [[-9.3151e-01,  9.7752e-01,  9.7790e-01,  ..., -8.3842e-02,
          -1.0716e-02,  9.4937e-01]],

        [[-9.8473e-01,  6.0069e-02,  3.4537e-01,  ..., -1.9028e-04,
          -1.6993e-01,  8.5555e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.4250e-01, -5.0872e-03, -2.2120e-03,  ..., -6.1572e-05,
          -8.4633e-01,  2.1784e-02],
         [-7.5359e-01,  7.1752e-01,  6.0590e-01,  ..., -1.2791e-02,
          -6.3401e-01,  3.6207e-01],
         [ 4.9585e-02,  1.4948e-01,  1.2898e-01,  ...,  8.4642e-01

hiddens tensor([[[-8.0059e-01,  2.1636e-01,  1.7621e-01,  ...,  1.0664e-01,
          -2.5744e-01, -2.5379e-02]],

        [[ 8.3362e-02,  1.2931e-03, -2.2129e-03,  ...,  5.7585e-01,
          -9.4882e-01, -4.4729e-03]],

        [[ 1.3804e-02,  8.6664e-01,  7.9214e-01,  ...,  6.7209e-01,
          -3.7443e-02,  6.7857e-01]],

        ...,

        [[-6.7199e-01,  8.2902e-01,  9.2235e-01,  ..., -7.3768e-02,
           3.4496e-03,  8.1585e-01]],

        [[-4.0142e-01, -4.3050e-02,  1.9766e-01,  ..., -2.2237e-02,
           3.3754e-02,  7.6818e-01]],

        [[ 1.1602e-01, -2.1459e-03, -5.5094e-04,  ...,  2.3518e-02,
          -9.1900e-01, -1.5505e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-8.0059e-01,  2.1636e-01,  1.7621e-01,  ...,  1.0664e-01,
          -2.5744e-01, -2.5379e-02],
         [ 8.3362e-02,  1.2931e-03, -2.2129e-03,  ...,  5.7585e-01,
          -9.4882e-01, -4.4729e-03],
         [ 1.3804e-02,  8.6664e-01,  7.9214e-01,  ...,  6.7209e-01

hiddens tensor([[[-9.9326e-01,  7.7374e-01,  7.5443e-01,  ...,  7.2257e-02,
          -4.0467e-01, -6.4084e-03]],

        [[ 3.8170e-01,  1.8021e-02, -7.4238e-04,  ...,  4.4378e-01,
          -9.9607e-01, -7.7940e-04]],

        [[-4.7851e-01,  9.8060e-01,  9.6534e-01,  ...,  8.1732e-01,
          -1.6692e-01,  8.7167e-01]],

        ...,

        [[-9.7427e-01,  9.7323e-01,  9.9423e-01,  ..., -2.7115e-02,
          -1.6023e-03,  9.4134e-01]],

        [[-9.2747e-01,  3.6671e-01,  6.8339e-01,  ..., -2.4898e-03,
          -5.9990e-02,  8.8147e-01]],

        [[ 4.7237e-01, -1.4395e-03, -2.0017e-04,  ...,  9.2385e-03,
          -9.7891e-01, -3.2982e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.9326e-01,  7.7374e-01,  7.5443e-01,  ...,  7.2257e-02,
          -4.0467e-01, -6.4084e-03],
         [ 3.8170e-01,  1.8021e-02, -7.4238e-04,  ...,  4.4378e-01,
          -9.9607e-01, -7.7940e-04],
         [-4.7851e-01,  9.8060e-01,  9.6534e-01,  ...,  8.1732e-01

hiddens tensor([[[ 1.5878e-01,  2.3812e-01, -6.9140e-04,  ...,  8.4773e-01,
          -9.4351e-01, -1.6964e-04]],

        [[-9.4047e-01,  9.0971e-01,  9.6278e-01,  ...,  4.9799e-01,
           9.0098e-04, -1.0783e-01]],

        [[ 6.4935e-03,  5.0638e-03, -1.0647e-02,  ...,  8.2031e-01,
          -9.7451e-01, -5.0110e-02]],

        ...,

        [[-9.7006e-01,  8.9884e-01,  9.0683e-01,  ..., -3.3906e-02,
          -9.3640e-03,  2.2405e-02]],

        [[ 7.0135e-02, -9.5502e-02, -6.2777e-02,  ..., -2.2044e-02,
          -4.1126e-01,  8.0436e-01]],

        [[-1.9895e-01,  5.4196e-02,  2.9838e-01,  ...,  3.1658e-02,
          -4.2287e-01,  7.6492e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[ 1.5878e-01,  2.3812e-01, -6.9140e-04,  ...,  8.4773e-01,
          -9.4351e-01, -1.6964e-04],
         [-9.4047e-01,  9.0971e-01,  9.6278e-01,  ...,  4.9799e-01,
           9.0098e-04, -1.0783e-01],
         [ 6.4935e-03,  5.0638e-03, -1.0647e-02,  ...,  8.2031e-01

prob true tensor([[1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [1],
        [1],
        [0],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1],
        [1],
        [0],
        [1],
        [1],
        [1]], device='cuda:0')
hiddens tensor([[[-2.6956e-01,  3.5515e-01,  7.0628e-02,  ...,  8.0219e-01,
          -9.5771e-01, -5.9884e-05]],

        [[-9.9357e-01,  9.5968e-01,  9.8829e-01,  ...,  2.9319e-01,
          -5.8120e-03, -2.2183e-02]],

        [[ 1.2086e-02,  9.7374e-03, -2.9800e-03,  ...,  8.2647e-01,
          -9.8885e-01, -3.2804e-02]],

        ...,

        [[-9.9766e-01,  9.4551e-01,  9.5896e-01,  ..., -2.8908e-03,
          -3.7593e-02,  6.6325e-02]],

        [[-2.2468e-01,  1.9916e-02,  6.7845e-02,  ..., -6.0801e-03,
          -6.1420e-01,  8.3673e-01]],


pred prob tensor([[[6.3553e-04, 9.9936e-01]],

        [[5.0120e-06, 9.9999e-01]],

        [[2.3712e-02, 9.7629e-01]],

        [[9.8979e-06, 9.9999e-01]],

        [[6.7388e-06, 9.9999e-01]],

        [[7.9635e-06, 9.9999e-01]],

        [[2.7698e-05, 9.9997e-01]],

        [[3.6069e-06, 1.0000e+00]],

        [[4.1596e-06, 1.0000e+00]],

        [[1.0978e-06, 1.0000e+00]],

        [[1.1963e-04, 9.9988e-01]],

        [[1.7336e-05, 9.9998e-01]],

        [[5.5172e-06, 9.9999e-01]],

        [[1.8750e-05, 9.9998e-01]],

        [[1.3557e-04, 9.9986e-01]],

        [[8.6595e-06, 9.9999e-01]],

        [[1.2094e-04, 9.9988e-01]],

        [[2.2861e-06, 1.0000e+00]],

        [[3.7798e-06, 1.0000e+00]],

        [[2.0349e-03, 9.9797e-01]],

        [[4.5110e-06, 1.0000e+00]],

        [[1.0181e-04, 9.9990e-01]],

        [[4.7744e-05, 9.9995e-01]],

        [[7.5906e-07, 1.0000e+00]],

        [[1.5551e-06, 1.0000e+00]],

        [[3.5236e-05, 9.9996e-01]],

        [[3.0608e-06, 1.0000

hiddens tensor([[[-9.5959e-01,  8.1316e-01,  9.4425e-01,  ..., -1.0679e-02,
          -2.7791e-02,  4.7577e-01]],

        [[-8.0208e-01, -1.1182e-02, -8.2939e-04,  ..., -1.8319e-04,
          -8.5422e-01,  3.4444e-01]],

        [[-1.8595e-01,  1.6883e-01,  1.3271e-01,  ...,  1.2337e-01,
          -6.2953e-01,  2.6984e-01]],

        ...,

        [[-9.9118e-01,  2.5883e-01,  1.2803e-01,  ...,  9.6556e-02,
          -8.3236e-01, -3.4789e-04]],

        [[ 2.6878e-03,  7.4408e-01,  6.2682e-01,  ...,  9.7836e-01,
          -8.8683e-01, -4.1727e-02]],

        [[-9.6128e-01,  6.3258e-01,  4.6326e-01,  ...,  4.6766e-01,
          -8.2352e-01, -2.4935e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-9.5959e-01,  8.1316e-01,  9.4425e-01,  ..., -1.0679e-02,
          -2.7791e-02,  4.7577e-01],
         [-8.0208e-01, -1.1182e-02, -8.2939e-04,  ..., -1.8319e-04,
          -8.5422e-01,  3.4444e-01],
         [-1.8595e-01,  1.6883e-01,  1.3271e-01,  ...,  1.2337e-01

hiddens tensor([[[-1.5595e-01, -2.8523e-03,  8.7282e-05,  ..., -4.5159e-02,
          -8.4117e-01,  1.4065e-01]],

        [[-9.5860e-01,  1.1190e-01,  5.2370e-01,  ..., -1.6171e-03,
          -1.8110e-02,  3.7451e-01]],

        [[-4.4678e-02,  2.6113e-01,  1.6210e-02,  ...,  8.2957e-01,
          -8.8580e-01, -2.2275e-03]],

        ...,

        [[-6.6925e-02,  9.4190e-01,  7.4654e-01,  ...,  8.2989e-01,
          -4.2659e-01,  5.5826e-01]],

        [[-9.7227e-01,  9.7018e-01,  9.8781e-01,  ...,  2.6932e-02,
           2.2004e-04,  6.9726e-01]],

        [[ 9.7592e-02,  3.1190e-02, -8.3585e-03,  ...,  3.5681e-01,
          -9.2271e-01, -1.8481e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-1.5595e-01, -2.8523e-03,  8.7282e-05,  ..., -4.5159e-02,
          -8.4117e-01,  1.4065e-01],
         [-9.5860e-01,  1.1190e-01,  5.2370e-01,  ..., -1.6171e-03,
          -1.8110e-02,  3.7451e-01],
         [-4.4678e-02,  2.6113e-01,  1.6210e-02,  ...,  8.2957e-01

hiddens tensor([[[-7.9878e-01,  7.6531e-02,  3.1311e-02,  ..., -6.5765e-03,
          -9.2413e-01,  9.1563e-02]],

        [[-9.9593e-01,  4.8173e-01,  7.2485e-01,  ..., -4.3448e-04,
          -7.8888e-02,  2.6023e-01]],

        [[-7.7492e-01,  4.2350e-01,  1.3172e-01,  ...,  7.5452e-01,
          -9.2160e-01, -6.2447e-04]],

        ...,

        [[-3.5570e-01,  9.7696e-01,  8.7108e-01,  ...,  8.4407e-01,
          -4.7082e-01,  7.0445e-01]],

        [[-9.9640e-01,  9.8592e-01,  9.9698e-01,  ...,  2.0428e-02,
           2.0498e-04,  7.0434e-01]],

        [[ 4.4956e-01,  6.2690e-02,  2.9338e-03,  ...,  1.6916e-01,
          -9.6325e-01, -3.9889e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-7.9878e-01,  7.6531e-02,  3.1311e-02,  ..., -6.5765e-03,
          -9.2413e-01,  9.1563e-02],
         [-9.9593e-01,  4.8173e-01,  7.2485e-01,  ..., -4.3448e-04,
          -7.8888e-02,  2.6023e-01],
         [-7.7492e-01,  4.2350e-01,  1.3172e-01,  ...,  7.5452e-01

hiddens tensor([[[-2.2766e-01,  8.8902e-01,  9.6123e-01,  ...,  7.5157e-01,
          -1.7532e-02,  8.0987e-01]],

        [[-7.0582e-01,  3.4834e-02, -3.5774e-02,  ..., -5.5333e-02,
          -6.7675e-01, -1.6353e-02]],

        [[-9.2652e-01,  5.9119e-01,  7.8108e-01,  ..., -1.6590e-02,
          -2.3095e-03,  3.4118e-01]],

        ...,

        [[-9.7149e-01,  9.3158e-01,  9.7365e-01,  ...,  2.2815e-02,
           2.8206e-04,  2.3337e-01]],

        [[-7.1067e-01, -3.8013e-03, -9.4482e-04,  ...,  2.2789e-02,
          -9.5886e-01, -2.8068e-04]],

        [[-1.2888e-01,  6.8663e-01,  5.7066e-01,  ...,  9.2364e-01,
          -9.1196e-01, -1.7951e-02]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-2.2766e-01,  8.8902e-01,  9.6123e-01,  ...,  7.5157e-01,
          -1.7532e-02,  8.0987e-01],
         [-7.0582e-01,  3.4834e-02, -3.5774e-02,  ..., -5.5333e-02,
          -6.7675e-01, -1.6353e-02],
         [-9.2652e-01,  5.9119e-01,  7.8108e-01,  ..., -1.6590e-02

hiddens tensor([[[-6.9046e-01,  9.6660e-01,  9.8648e-01,  ...,  8.2474e-01,
          -3.4764e-02,  9.1543e-01]],

        [[-9.7898e-01,  3.6038e-01,  1.1290e-01,  ..., -1.9539e-03,
          -8.0018e-01, -6.8025e-04]],

        [[-9.9544e-01,  8.0836e-01,  9.0283e-01,  ..., -2.8582e-03,
          -4.8905e-02,  2.4179e-01]],

        ...,

        [[-9.9728e-01,  9.6133e-01,  9.9255e-01,  ...,  2.0997e-02,
          -1.5141e-03,  2.2059e-01]],

        [[-9.6775e-01,  1.4239e-03, -1.0593e-03,  ...,  9.1559e-03,
          -9.7778e-01, -1.7425e-04]],

        [[-5.8808e-01,  8.0160e-01,  6.9580e-01,  ...,  9.1440e-01,
          -9.4901e-01, -7.4842e-03]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-6.9046e-01,  9.6660e-01,  9.8648e-01,  ...,  8.2474e-01,
          -3.4764e-02,  9.1543e-01],
         [-9.7898e-01,  3.6038e-01,  1.1290e-01,  ..., -1.9539e-03,
          -8.0018e-01, -6.8025e-04],
         [-9.9544e-01,  8.0836e-01,  9.0283e-01,  ..., -2.8582e-03

hiddens tensor([[[-1.2180e-01,  9.0310e-01,  8.6093e-01,  ...,  5.9496e-01,
          -2.2308e-01,  9.1276e-01]],

        [[-2.8835e-01, -1.9147e-02,  4.9706e-02,  ..., -2.0869e-03,
          -5.5503e-01,  7.1799e-01]],

        [[-6.3431e-01,  9.7145e-01,  9.8360e-01,  ...,  7.0915e-01,
          -4.9674e-04,  9.0450e-01]],

        ...,

        [[-8.3988e-01, -2.6377e-02, -7.4990e-03,  ..., -1.1165e-03,
          -6.9113e-01,  2.9245e-01]],

        [[ 1.1021e-01, -2.5557e-03, -5.5696e-04,  ...,  2.0419e-02,
          -9.6110e-01, -1.4873e-03]],

        [[-8.5427e-01,  9.0452e-01,  9.8150e-01,  ...,  1.4794e-01,
           4.5746e-04,  9.0951e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-1.2180e-01,  9.0310e-01,  8.6093e-01,  ...,  5.9496e-01,
          -2.2308e-01,  9.1276e-01],
         [-2.8835e-01, -1.9147e-02,  4.9706e-02,  ..., -2.0869e-03,
          -5.5503e-01,  7.1799e-01],
         [-6.3431e-01,  9.7145e-01,  9.8360e-01,  ...,  7.0915e-01

hiddens tensor([[[-4.0353e-01,  9.6818e-01,  9.3029e-01,  ...,  7.5075e-01,
          -2.7604e-01,  9.5604e-01]],

        [[-8.7997e-01,  7.2327e-02,  1.5408e-01,  ..., -4.5230e-04,
          -7.1947e-01,  6.3468e-01]],

        [[-8.8988e-01,  9.9054e-01,  9.9568e-01,  ...,  7.5219e-01,
          -6.5565e-03,  9.4264e-01]],

        ...,

        [[-9.7107e-01,  2.7309e-02,  2.4038e-02,  ..., -3.2544e-04,
          -8.0105e-01,  1.7288e-01]],

        [[-1.0287e-01, -1.8460e-03, -6.1454e-04,  ...,  9.0938e-03,
          -9.7795e-01, -7.0767e-04]],

        [[-9.8043e-01,  9.6696e-01,  9.9635e-01,  ...,  1.1134e-01,
           1.2192e-04,  9.3513e-01]]], device='cuda:0',
       grad_fn=<CudnnRnnBackward>)
states (tensor([[[-4.0353e-01,  9.6818e-01,  9.3029e-01,  ...,  7.5075e-01,
          -2.7604e-01,  9.5604e-01],
         [-8.7997e-01,  7.2327e-02,  1.5408e-01,  ..., -4.5230e-04,
          -7.1947e-01,  6.3468e-01],
         [-8.8988e-01,  9.9054e-01,  9.9568e-01,  ...,  7.5219e-01

In [20]:
im2p.save_model(22.2373046875,"model_small")

Saved Model in model_small


In [7]:
# Randomly sample a caption length, and sample indices with that length.
(images, image_id, target, prob) = next(iter(data_loader))

In [8]:
images.shape

torch.Size([3, 3, 224, 224])

In [9]:
target.shape

torch.Size([3, 6, 18])

In [10]:
prob

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])

In [45]:
# Specify the dimensionality of the image embedding.
embed_size = 256

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Initialize the encoder. (Optional: Add additional arguments if necessary.)
encoder = EncoderCNN(embed_size)

# Move the encoder to GPU if CUDA is available.
encoder.to(device)
    
# Move last batch of images (from Step 2) to GPU if CUDA is available.   
images = images.to(device)

# Pass the images through the encoder.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

# Check that your encoder satisfies some requirements of the project! :D
assert type(features)==torch.Tensor, "Encoder output needs to be a PyTorch Tensor." 
assert (features.shape[0]==batch_size) & (features.shape[1]==embed_size), "The shape of the encoder output is incorrect."

TypeError: super(type, obj): obj must be an instance or subtype of type

In [159]:
from model import SentenceRNN

In [160]:
# Specify the dimensionality of the image embedding.
hiddem_size = 256

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Initialize the encoder. (Optional: Add additional arguments if necessary.)
sentRnn = SentenceRNN(256,256,256)

# Move the encoder to GPU if CUDA is available.
sentRnn.to(device)
    
# Move last batch of images (from Step 2) to GPU if CUDA is available.   
images = images.to(device)

# Pass the images through the encoder.
features = encoder(images)

print('type(features):', type(features))
print('features.shape:', features.shape)

probs, topic, hiddens = sentRnn(features = features)

type(features): <class 'torch.Tensor'>
features.shape: torch.Size([3, 256])


In [41]:
probs.shape

torch.Size([3, 2])

In [40]:
topic.shape

torch.Size([3, 256])

In [38]:
hiddens.shape

torch.Size([3, 256])

In [15]:
from model import WordRNN

In [19]:
target[0].shape

torch.Size([6, 18])

In [154]:
from model import WordRNN, SentenceRNN

In [59]:
topic.shape

torch.Size([3, 1, 256])

In [86]:
target_0.shape

torch.Size([3, 18])

In [155]:
# Specify the number of features in the hidden state of the RNN decoder.
hidden_size = 512

#-#-#-# Do NOT modify the code below this line. #-#-#-#

# Store the size of the vocabulary.
vocab_size = len(vocab)

# Initialize the decoder.
wordRnn = WordRNN(embed_size, hidden_size, vocab_size)

# Move the decoder to GPU if CUDA is available.
wordRnn.to(device)
 
# Move last batch of captions (from Step 1) to GPU if CUDA is available 
target_0 = target[:,0,:].to(device)
topic = topic.to(device)
print(target_0.shape)
print(topic.shape)

# Pass the encoder output and captions through the decoder.
outputs = wordRnn(topic, target_0)

print('type(outputs):', type(outputs))
print('outputs.shape:', outputs.shape)

# Check that your decoder satisfies some requirements of the project! 
assert type(outputs)==torch.Tensor, "Decoder output needs to be a PyTorch Tensor."
assert (outputs.shape[0]==batch_size) & (outputs.shape[1]==target_0.shape[1]) & (outputs.shape[2]==vocab_size), "The shape of the decoder output is incorrect."
 

torch.Size([3, 18])
torch.Size([3, 1, 256])
topics torch.Size([3, 1, 256])
embedding torch.Size([3, 17, 256])
inner inputs torch.Size([3, 18, 256])
type(outputs): <class 'torch.Tensor'>
outputs.shape: torch.Size([3, 18, 4667])


In [70]:
outputs.shape

torch.Size([3, 18, 4667])

In [54]:
target_0.shape

torch.Size([6, 18])

In [101]:
s_max = 6
n_max = 50
states = None

In [175]:
for i in range(s_max):
    p, topic, states = sentRnn.sample(features, states)
    samples_ids = wordRnn.sample(topic, max_len=n_max)
    print(p)
    p = (p > 0.5).squeeze(1)
    print(p.shape)
    print(p)
    print(samples_ids.shape)
    print(p[:,0].view(3,1))
    print(torch.Tensor(samples_ids).to(device))
    samples_ids = samples_ids * p[:,0].cpu().data.numpy().reshape(3,1)
    print(samples_ids)
    break

torch.Size([3, 1, 2])
tensor([[[0.5179, 0.4821]],

        [[0.4756, 0.5244]],

        [[0.4941, 0.5059]]], device='cuda:0', grad_fn=<SoftmaxBackward>)
torch.Size([3, 2])
tensor([[1, 0],
        [0, 1],
        [0, 1]], device='cuda:0', dtype=torch.uint8)
(3, 50)
tensor([[1],
        [0],
        [0]], device='cuda:0', dtype=torch.uint8)
tensor([[2589., 2437., 1423.,  540.,  618., 4498.,  577.,  140.,  701., 2128.,
         3688.,   96., 3841., 4473., 1564., 2486., 1455., 4657., 4593., 2525.,
         4358., 3038., 3270., 2296., 2296., 3365., 1014., 2012.,  275., 2894.,
         4281., 2814.,  956., 4091., 1694., 3925., 3571., 1751., 4643., 2490.,
         2490., 4296., 1930., 1500., 3848., 1691., 2791., 1118., 4214., 1244.],
        [2589., 2437., 1423.,  540.,  618., 4498.,  577.,  140.,  701., 2128.,
         3688.,   96., 3841., 4473., 1564., 2486., 1455., 4657., 4593., 2525.,
         4358., 3038., 3270., 2296., 2296., 3365., 1014., 2012.,  275., 2894.,
         4281., 2814.,  95