In [1]:
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

In [2]:
import torch
import torch. nn as nn
import torch . optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from tensorboardX import SummaryWriter # to print to tensorboard

import inspect

In [3]:
spacy_ger = spacy.load("de_core_news_sm")
spacy_eng = spacy.load("en_core_web_sm")

In [4]:
def tokenizer_ger (text) :
    return [tok. text for tok in spacy_ger.tokenizer(text)]
def tokenizer_eng (text) :
    return [tok. text for tok in spacy_eng.tokenizer(text)]


german = Field(tokenize=tokenizer_ger, lower=True,
               init_token= '<sos>', eos_token= '<eos>' )
english = Field(tokenize=tokenizer_eng, lower=True, 
                init_token= '<sos>', eos_token= '<eos>')

In [5]:
# Pre-process
train_data , test_data, validation_data = Multi30k.splits(path = 'dataset/multi30k/', exts=('.de', '.en'), fields=(german, english))
vars(train_data.examples[0])

{'src': ['zwei',
  'junge',
  'weiße',
  'männer',
  'sind',
  'im',
  'freien',
  'in',
  'der',
  'nähe',
  'vieler',
  'büsche',
  '.'],
 'trg': ['two',
  'young',
  ',',
  'white',
  'males',
  'are',
  'outside',
  'near',
  'many',
  'bushes',
  '.']}

In [6]:
german.build_vocab(train_data, max_size = 10000, min_freq = 2)
english.build_vocab(train_data, max_size = 10000, min_freq = 2)


print(f"Unique tokens in source (de) vocabulary: {len(german.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(english.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)



Unique tokens in source (de) vocabulary: 7853
Unique tokens in target (en) vocabulary: 5893


In [7]:
train_iterator

<torchtext.data.iterator.BucketIterator at 0x7fb2f39091c0>

# LSTM Output detail

In [8]:
import ipyplot
images_list = ["img/LSTM1.png" , "img/LSTM2.jpg"]
ipyplot.plot_images(images_list, max_images=10, img_width=400)


In [9]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size,hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(dropout)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout)
    
    def forward(self , x): 
        #x.shape : (seq_length, Batch size (N))
        embedding = self.dropout(self.embedding(x))
        output , (hidden,cell) = self.rnn(embedding)
        
        return hidden, cell
        
        

In [10]:
class Decoder(nn.Module):
    def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout):
        super().__init__()
        
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(output_size, embedding_size)
        
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = dropout)
        
        self.fc_out = nn.Linear(hidden_size, output_size)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [seq len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #seq len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.fc_out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell


In [11]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hidden_size == decoder.hidden_size, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.num_layers == decoder.num_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        #src = [src len, batch size]
        #trg = [trg len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_size
        
        #tensor to store decoder outputs
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, trg_len):
            
            #insert input token embedding, previous hidden and previous cell states
            #receive output tensor (predictions) and new hidden and cell states
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            #place predictions in a tensor holding predictions for each token
            outputs[t] = output
            
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            
            #get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = trg[t] if teacher_force else top1
        
        return outputs


In [12]:
INPUT_DIM = len(german.vocab)
OUTPUT_DIM = len(english.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)


In [13]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)


Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7853, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  )
  (decoder): Decoder(
    (embedding): Embedding(5893, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=5893, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')


The model has 13,898,501 trainable parameters


In [15]:
optimizer = optim.Adam(model.parameters())


In [16]:
german_PAD_IDX = german.vocab.stoi[german.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = german_PAD_IDX)


In [17]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        print(output)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


In [18]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)


In [19]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [20]:
import time
import math
N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')


tensor([[-9.1250e-03,  5.9617e-02, -2.2392e-02,  ..., -6.7885e-02,
          1.1833e-02,  6.3940e-02],
        [ 8.2838e-03,  5.7121e-02, -5.9192e-03,  ..., -3.2309e-02,
          6.7698e-03,  6.5129e-02],
        [ 1.4593e-02,  5.9003e-02, -1.1490e-02,  ..., -6.7660e-02,
         -4.3615e-03,  4.4321e-02],
        ...,
        [-8.6233e-05, -2.8174e-05, -1.1163e-01,  ..., -5.7044e-02,
          1.1153e-03,  6.7094e-02],
        [ 1.2327e-02,  2.1827e-03, -1.0614e-01,  ..., -1.0215e-01,
          2.3264e-02,  1.8117e-02],
        [ 9.3751e-03, -2.6570e-03, -1.0497e-01,  ..., -1.0068e-01,
          4.3729e-03,  1.9723e-02]], device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 0.0470,  0.0581,  0.0253,  ..., -0.0176, -0.0406,  0.0379],
        [ 0.0253,  0.0142,  0.0281,  ..., -0.0299, -0.0540,  0.0478],
        [ 0.0498,  0.0314,  0.0144,  ..., -0.0205, -0.0520,  0.0346],
        ...,
        [ 0.0201, -0.0632, -0.1774,  ..., -0.1299, -0.0518, -0.0081],
        [ 0.0423, -0.0361, -0.1509

tensor([[ 4.1359, -3.8560, -4.0241,  ..., -3.3123, -3.4429, -2.8834],
        [ 4.1229, -3.8974, -4.0905,  ..., -3.2599, -3.3998, -2.9682],
        [ 3.9871, -3.9965, -4.0501,  ..., -3.2201, -3.3764, -2.6193],
        ...,
        [ 4.1017, -4.8615, -6.0527,  ..., -4.0497, -3.2019, -3.9366],
        [ 4.1302, -4.8061, -6.1336,  ..., -3.8221, -3.3062, -3.8573],
        [ 4.1505, -4.8947, -6.1366,  ..., -4.0090, -3.4121, -3.7013]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.9426, -4.0973, -4.2981,  ..., -3.4113, -3.4025, -2.8644],
        [ 3.8033, -4.1261, -4.1546,  ..., -3.3857, -3.4607, -3.0475],
        [ 3.9838, -4.1261, -4.0463,  ..., -3.5803, -3.6337, -2.7831],
        ...,
        [ 3.9624, -4.7003, -6.2427,  ..., -3.9504, -3.4225, -3.7042],
        [ 4.0273, -4.8789, -6.1957,  ..., -3.8142, -3.4606, -3.8695],
        [ 4.0812, -4.7681, -6.2090,  ..., -4.0053, -3.4933, -3.8730]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.8681, -4.0514, -4.0973, 

tensor([[ 3.7226, -4.8341, -4.0933,  ..., -4.2499, -4.6985, -2.6589],
        [ 3.6463, -4.5804, -4.0913,  ..., -4.1937, -4.7135, -2.7613],
        [ 3.5182, -4.6165, -4.1645,  ..., -4.1398, -4.4445, -2.5933],
        ...,
        [ 3.8990, -5.3587, -5.9045,  ..., -5.0257, -4.8974, -2.8423],
        [ 3.8220, -5.0930, -5.8737,  ..., -4.8096, -4.8377, -2.6407],
        [ 3.7719, -5.3647, -6.0527,  ..., -4.9464, -4.8601, -2.6559]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.7787, -4.7409, -4.1366,  ..., -4.2191, -4.7399, -2.6042],
        [ 3.7190, -4.6260, -4.2356,  ..., -4.1996, -4.7308, -2.6745],
        [ 3.7785, -4.7412, -4.2376,  ..., -4.2426, -4.6196, -2.6727],
        ...,
        [ 3.7764, -5.2307, -5.9399,  ..., -5.0436, -4.5594, -2.7020],
        [ 3.8778, -5.0966, -5.8165,  ..., -4.8248, -4.6987, -2.7367],
        [ 3.8871, -5.2196, -5.9985,  ..., -4.9296, -4.7289, -2.7067]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.7696, -4.7474, -4.2227, 

tensor([[ 4.2044, -4.7097, -4.2400,  ..., -4.1504, -4.8123, -2.6461],
        [ 4.1747, -4.6754, -4.1955,  ..., -4.2416, -4.9741, -2.6885],
        [ 4.2622, -4.6524, -4.3058,  ..., -4.2894, -4.9307, -2.7622],
        ...,
        [ 3.5438, -5.4985, -5.9924,  ..., -4.8400, -5.0023, -2.7384],
        [ 3.6363, -5.4673, -5.9466,  ..., -5.0199, -4.9134, -2.8988],
        [ 3.5628, -5.4281, -5.6179,  ..., -5.0009, -4.7696, -2.8285]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.2579, -4.8597, -4.3707,  ..., -4.2418, -4.8847, -2.7402],
        [ 4.1257, -4.6432, -4.2559,  ..., -4.2402, -4.9619, -2.7661],
        [ 4.2453, -4.7786, -4.2670,  ..., -4.1916, -4.8277, -2.6526],
        ...,
        [ 3.6537, -5.5220, -5.7783,  ..., -4.9495, -4.8942, -2.8021],
        [ 3.5790, -5.4783, -5.9402,  ..., -5.1086, -4.9157, -2.8737],
        [ 3.7369, -5.5835, -5.9601,  ..., -5.0502, -5.0519, -2.9266]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.3215, -4.7333, -4.2443, 

tensor([[ 4.4495, -4.5169, -4.2387,  ..., -4.1778, -4.9773, -2.8256],
        [ 4.2400, -4.6446, -4.2754,  ..., -4.2612, -4.9111, -2.8231],
        [ 4.3767, -4.5236, -4.0812,  ..., -4.3025, -4.9624, -2.9081],
        ...,
        [ 3.8010, -5.7068, -6.1520,  ..., -5.1959, -4.9989, -3.2943],
        [ 3.7586, -5.5765, -6.2350,  ..., -5.0739, -4.8778, -3.2259],
        [ 3.7995, -5.6913, -6.0935,  ..., -5.1708, -5.1657, -3.3273]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.2891, -4.6309, -4.1585,  ..., -4.3603, -5.0368, -2.9535],
        [ 4.3120, -4.7251, -4.1342,  ..., -4.3802, -5.0425, -2.9875],
        [ 4.2660, -4.5733, -4.1220,  ..., -4.2658, -5.0450, -2.8418],
        ...,
        [ 3.7784, -5.5781, -6.0960,  ..., -4.9741, -5.1268, -3.0750],
        [ 3.7204, -5.6147, -6.0841,  ..., -5.0059, -5.0545, -3.1660],
        [ 3.7682, -5.7286, -6.2565,  ..., -5.1771, -5.0747, -3.1411]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.3451, -4.6394, -4.3228, 

tensor([[ 4.1605, -4.6487, -4.1504,  ..., -4.2287, -5.1475, -2.6627],
        [ 3.9969, -4.6189, -4.0902,  ..., -4.2709, -5.0257, -2.6189],
        [ 4.0422, -4.5435, -4.3107,  ..., -4.3610, -5.1458, -2.7464],
        ...,
        [ 3.5654, -5.9391, -6.3439,  ..., -5.2406, -5.4443, -3.1861],
        [ 3.5818, -5.9960, -6.3040,  ..., -5.2144, -5.4566, -3.0847],
        [ 3.5254, -5.9206, -6.2477,  ..., -5.1501, -5.2895, -3.0206]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.0431, -4.6595, -4.2256,  ..., -4.3356, -5.0616, -2.6377],
        [ 4.1255, -4.7159, -4.2681,  ..., -4.2426, -5.0198, -2.7268],
        [ 3.9658, -4.4811, -4.1474,  ..., -4.2693, -5.1388, -2.7809],
        ...,
        [ 3.3665, -5.6238, -5.8122,  ..., -4.7889, -5.1441, -2.7633],
        [ 3.3147, -5.6910, -5.9627,  ..., -4.8534, -5.1199, -2.7588],
        [ 3.3064, -5.8060, -6.0242,  ..., -4.9765, -5.1347, -2.8448]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.0715, -4.6731, -4.2774, 

tensor([[ 4.6336, -5.0440, -4.6136,  ..., -3.5042, -5.4098, -3.3077],
        [ 4.4716, -4.8019, -4.3854,  ..., -3.3483, -5.2718, -3.1231],
        [ 4.5592, -4.9687, -4.5339,  ..., -3.6017, -5.3768, -3.1976],
        ...,
        [ 3.6856, -6.0708, -6.3737,  ..., -3.6629, -5.4640, -3.3507],
        [ 3.7400, -5.9594, -6.4720,  ..., -3.6352, -5.6389, -3.4580],
        [ 3.7200, -6.0375, -6.2250,  ..., -3.4757, -5.4234, -3.3286]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.5044, -4.8276, -4.4064,  ..., -3.3319, -5.3120, -3.0677],
        [ 4.4798, -4.9831, -4.5165,  ..., -3.4473, -5.3926, -3.1648],
        [ 4.4941, -4.7931, -4.4213,  ..., -3.2829, -5.3295, -3.1233],
        ...,
        [ 3.7486, -6.0179, -6.0697,  ..., -3.4805, -5.2324, -3.0338],
        [ 3.6884, -5.8632, -5.9141,  ..., -3.3355, -5.1577, -3.0390],
        [ 3.7655, -5.7502, -6.0043,  ..., -3.2894, -5.2647, -2.9696]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.3412, -4.8250, -4.4291, 

tensor([[ 4.1827, -4.9736, -4.5912,  ..., -3.5061, -5.2800, -3.3128],
        [ 4.1425, -4.9699, -4.4620,  ..., -3.5217, -5.4206, -3.4231],
        [ 4.1793, -4.9430, -4.4755,  ..., -3.5169, -5.3357, -3.5250],
        ...,
        [ 3.8424, -5.7777, -5.9382,  ..., -3.3608, -5.0193, -3.1676],
        [ 3.9988, -6.3192, -6.6649,  ..., -3.7094, -5.5880, -3.6754],
        [ 3.9339, -5.9051, -6.0093,  ..., -3.4056, -5.1070, -3.3260]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.1867, -4.9991, -4.6323,  ..., -3.5880, -5.4026, -3.5901],
        [ 4.1029, -4.8608, -4.5091,  ..., -3.4457, -5.2913, -3.5212],
        [ 4.1626, -4.9246, -4.4679,  ..., -3.5384, -5.3835, -3.4631],
        ...,
        [ 4.1313, -6.2826, -6.6007,  ..., -3.6911, -5.4282, -3.5457],
        [ 4.1575, -6.2730, -6.3532,  ..., -3.6848, -5.4542, -3.5344],
        [ 4.1767, -6.3463, -6.5716,  ..., -3.7349, -5.5020, -3.6394]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.1172, -5.0090, -4.5581, 

tensor([[ 4.1976, -5.0954, -4.8157,  ..., -3.8367, -5.5423, -3.8104],
        [ 4.2310, -5.0962, -4.6494,  ..., -3.6399, -5.5423, -3.7580],
        [ 4.1061, -5.1389, -4.8289,  ..., -3.7068, -5.6851, -3.8211],
        ...,
        [ 3.7028, -6.4611, -6.6589,  ..., -4.1297, -5.4719, -3.8987],
        [ 3.6339, -6.4134, -6.6358,  ..., -4.2039, -5.5629, -4.0456],
        [ 3.6825, -6.3855, -6.5461,  ..., -4.1567, -5.5715, -3.9487]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.1402, -5.0529, -4.7423,  ..., -3.7852, -5.6037, -3.8371],
        [ 4.2478, -5.0523, -4.6780,  ..., -3.7278, -5.4369, -3.7520],
        [ 4.2026, -5.0098, -4.5950,  ..., -3.6624, -5.4317, -3.7257],
        ...,
        [ 3.6409, -6.6517, -7.0203,  ..., -4.3707, -5.9283, -4.3801],
        [ 3.6168, -6.6477, -6.8348,  ..., -4.2141, -5.8735, -4.2468],
        [ 3.6174, -6.5771, -6.8661,  ..., -4.1836, -5.7743, -4.3739]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.3416, -5.1424, -4.8002, 

tensor([[ 4.3558, -5.1736, -4.8725,  ..., -4.0707, -5.7718, -3.8604],
        [ 4.4209, -5.2147, -4.9938,  ..., -3.9965, -5.7504, -3.9068],
        [ 4.2980, -5.1340, -4.7663,  ..., -3.9914, -5.7544, -3.9740],
        ...,
        [ 4.0655, -6.5234, -6.4964,  ..., -4.3750, -5.7129, -4.0979],
        [ 4.0204, -6.1587, -6.1142,  ..., -4.2035, -5.3443, -3.8330],
        [ 4.1055, -6.3976, -6.3081,  ..., -4.3328, -5.6311, -3.9947]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.4358, -5.3016, -4.9309,  ..., -4.1226, -5.7363, -4.0061],
        [ 4.3717, -5.2472, -4.8499,  ..., -4.0465, -5.7286, -3.9009],
        [ 4.3454, -5.3128, -4.8183,  ..., -4.0542, -5.8158, -3.9568],
        ...,
        [ 4.1413, -6.9251, -7.0712,  ..., -4.8037, -5.9998, -4.6162],
        [ 4.2019, -6.8492, -6.9863,  ..., -4.7030, -6.0206, -4.3751],
        [ 4.2269, -6.9152, -7.0283,  ..., -4.6912, -6.0675, -4.6145]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 4.1873, -5.2307, -4.8869, 

tensor([[ 3.7572, -4.9477, -4.7320,  ..., -3.7890, -5.4933, -3.8996],
        [ 3.7849, -5.0676, -4.8309,  ..., -3.9320, -5.5630, -4.0168],
        [ 3.7114, -5.0518, -4.7980,  ..., -3.8008, -5.5693, -3.9718],
        ...,
        [ 3.5070, -6.5639, -6.5590,  ..., -4.5824, -5.8307, -4.3815],
        [ 3.5635, -6.6008, -6.5339,  ..., -4.5798, -5.8565, -4.3346],
        [ 3.4866, -6.3405, -6.4098,  ..., -4.3600, -5.5724, -4.0658]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.7904, -5.0771, -4.9194,  ..., -3.6739, -5.5768, -3.8887],
        [ 3.7832, -5.2533, -4.9761,  ..., -4.0328, -5.6920, -4.2108],
        [ 3.7057, -4.8078, -4.6821,  ..., -3.6996, -5.5250, -3.7778],
        ...,
        [ 3.5608, -6.3034, -6.1264,  ..., -4.3098, -5.7330, -4.1376],
        [ 3.6052, -6.7020, -6.6059,  ..., -4.6334, -6.0488, -4.4237],
        [ 3.4951, -6.2983, -6.1852,  ..., -4.2796, -5.5929, -4.0546]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.7772, -5.0644, -4.7942, 

tensor([[ 3.7806, -5.2391, -5.4031,  ..., -4.0356, -5.8734, -3.9331],
        [ 3.7656, -5.2638, -5.3280,  ..., -3.9224, -5.8121, -3.9641],
        [ 3.8595, -5.3049, -5.3775,  ..., -4.0047, -5.8668, -4.0064],
        ...,
        [ 3.8462, -7.0573, -7.1098,  ..., -5.1190, -6.3302, -4.8586],
        [ 3.9004, -7.1873, -7.2487,  ..., -5.2253, -6.4536, -4.8820],
        [ 3.9157, -7.1883, -7.2188,  ..., -5.3124, -6.4630, -4.9035]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.8824, -5.2279, -5.4264,  ..., -4.0024, -5.9049, -4.0088],
        [ 3.8801, -5.2371, -5.3408,  ..., -3.9933, -5.9586, -4.0253],
        [ 3.8241, -5.1519, -5.2521,  ..., -3.8794, -5.8428, -3.9582],
        ...,
        [ 3.7360, -6.5034, -6.3384,  ..., -4.4280, -5.8434, -4.2079],
        [ 3.7881, -6.7300, -6.5308,  ..., -4.5378, -5.9514, -4.3637],
        [ 3.7377, -6.3506, -6.3176,  ..., -4.3370, -5.6968, -4.3108]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.8604, -5.0774, -5.2268, 

tensor([[ 3.9737, -5.2497, -5.0691,  ..., -4.1515, -4.9477, -4.1047],
        [ 3.9691, -5.2807, -5.1807,  ..., -4.1790, -4.9241, -4.1673],
        [ 3.9652, -5.2275, -5.0705,  ..., -4.1637, -4.9140, -4.1022],
        ...,
        [ 3.9525, -7.0343, -7.1370,  ..., -5.0679, -3.8800, -4.7624],
        [ 3.8873, -6.8961, -6.9617,  ..., -5.1098, -3.7354, -4.6429],
        [ 3.8539, -6.7598, -6.9832,  ..., -4.8961, -3.6736, -4.5907]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.8931, -5.1784, -4.9008,  ..., -4.0555, -4.7956, -3.9452],
        [ 3.9115, -5.1530, -4.9803,  ..., -4.0817, -4.8167, -3.9480],
        [ 3.8956, -5.1771, -4.8381,  ..., -4.0388, -4.7475, -3.9940],
        ...,
        [ 4.0148, -7.0634, -7.1385,  ..., -5.2784, -3.8277, -4.6855],
        [ 3.8921, -6.9115, -7.1290,  ..., -5.1461, -3.7395, -4.7483],
        [ 4.1019, -7.2519, -7.3061,  ..., -5.3017, -3.9614, -4.8269]],
       device='cuda:0', grad_fn=<ViewBackward>)
tensor([[ 3.9558, -5.2142, -5.0428, 

In [21]:
for i in train_iterator:
    print(i.trg.shape[0])

37
29
31
25
27
33
34
28
27
27
30
28
29
30
37
26
25
29
31
26
28
30
35
28
35
34
25
30
36
28
30
26
28
36
27
29
29
42
31
30
32
32
30
30
28
40
32
27
34
30
27
32
30
41
38
28
34
28
38
43
28
37
26
28
31
30
29
28
36
40
31
31
26
33
25
30
34
35
33
34
28
25
29
26
25
26
26
33
33
30
36
28
37
28
40
30
32
27
27
26
34
33
29
28
31
28
30
29
24
31
26
27
35
28
24
25
29
32
30
28
28
32
25
31
26
33
33
35
37
31
25
28
28
25
27
30
29
27
27
28
30
25
27
26
28
27
28
30
27
32
37
26
26
35
28
26
28
30
38
25
32
28
30
27
31
39
33
30
33
30
30
31
27
30
33
34
31
28
31
29
37
29
31
29
29
32
28
24
37
28
27
27
31
41
27
25
26
25
36
29
30
28
36
40
30
27
27
36
27
30
29
31
30
29
30
34
42
28
34
29
35
31
28
29
28
32
36
