In [1]:
%matplotlib inline

In [2]:
cd ..

/home/mehdi2277/Documents/HarveyMuddWork/Neural_Nets_Research/neural_nets_research


In [3]:
from neural_nets_library import training

In [4]:
# A lot of inspiration from https://github.com/loudinthecloud/pytorch-ntm. Hyperparameters were chosen based
# upon his experiments.

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

import numpy as np
import random

In [5]:
def init_seed(seed=None):
    """Seed the RNGs for predicatability/reproduction purposes."""
    if seed is None:
        seed = int(get_ms() // 1000)

    print("Using seed=%d", seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

In [6]:
class DNC_Memory(nn.Module):
    """
    Class that stores memory so that a single representation of memory can be passed easily
    throughout the program.
    """
    def __init__(self, address_count, address_dimension, batch_size):
        """
        Initializes DNC_Memory and prepares for training
        
        :param address_count: Number of addresses in memory
        :param address_dimension: The size of a current location in memory
        """
        super(DNC_Memory, self).__init__()
        self.initial_memory = nn.Parameter(torch.zeros(1, address_count, address_dimension))
        self.batch_size = batch_size
        self.reset_parameters()
        self.initialize_state()
    
    def reset_parameters(self):
        """
        Resets the parameters in memory, should be used before beginning training
        """
        _, N, M = self.initial_memory.size()
        stdev = 1 / np.sqrt(N + M)
        nn.init.uniform(self.initial_memory, -stdev, stdev)
    
    def initialize_state(self):
        """
        Initializes the state of DNC_Memory, specifically expands memory to match batch_size for training
        """
        self.memory = self.initial_memory.repeat(self.batch_size, 1, 1)
    
    def content_address_memory(self, key_vec, β):
        """
        Used to find memory addresses to read/write to based on controller output for read/write heads
        by cosine similarity.
        
        :param key_vec: A vector of size address_count that points to places in memory to access
        :param β: The write strength outputted by the controller
        :return: A softmax over the addresses based on cosine similarity of key_vec to keys in memory
        """
        result = F.cosine_similarity(key_vec.unsqueeze(1).expand_as(self.memory), 
                                     self.memory, dim = 2)
        result = β * result
        result = result.exp()
        result = result / result.sum(1, keepdim=True)
        return result
    
    def read_memory(self, address_vec):
        """
        :param address_vec: Vector of length address_count corresponding to the read weight
                at each memory location.
        :return: the result of a read to memory based on an address_vec
        """
        return torch.bmm(self.memory.transpose(1,2), address_vec.unsqueeze(2)).squeeze(2)
    
    def update_memory(self, address_vec, erase_vec, add_vec):
        """
        Updates memory based on the results of a write head and controller output
        :param address_vec: Vector of length address_count corresponding to the write weight
                at each memory location.
        :param erase_vec: A of length address_count vector used in conjuntion with the address_vec to determine where
                in memory to erase
        :param add_vec: A of length address_count vector used in conjuntion with the address_vec to determine where
                in memory to write
        """
        self.memory = self.memory * (1 - torch.bmm(address_vec.unsqueeze(2), erase_vec.unsqueeze(1)))
        self.memory += torch.bmm(address_vec.unsqueeze(2), add_vec.unsqueeze(1))

In [7]:
class DNC_Usage(nn.Module):
    """
    Class that represents the usage vector for the DNC, or the vector that tracks which places in memory have
    been recently written to or recently read from in order to determine which memory locations should be freed.
    
    """
    def __init__(self, address_count, batch_size):
        """
        Initialize a usage vector
        :param address_count: The number of memory locations in the DNC
        :param batch_size: The batch size for training
        """
        super(DNC_Usage, self).__init__()
        self.register_buffer('initial_usage', Variable(torch.zeros(1, address_count)))
        self.batch_size = batch_size
        self.initialize_state()
        
    def initialize_state(self):
        """
        Initializes the usage vector to the proper demention, should be used before training starts.
        """
        self.usage = self.initial_usage.repeat(self.batch_size, 1)
    
    def read_update_usage(self, address_vec, rfree_weights):
        """
        Updates the usage vector based on recent reads, decreases usage where locations are heavily read or
        controller output dictates
        :param address_vec: Vector that was used to read from locations in memory (length address_count)
        :param rfree_weights: Controller output for how much usage should be freed after a round of reading.
        """
        self.usage *= 1 - rfree_weights * address_vec
    
    def write_update_usage(self, address_vec):
        """
        Updates usage vector based on recent reads, increases usage where locations are heavily written to.
        :param address_vec: Vector that was used to write to memory (length address_count)
        """
        self.usage += (1 - self.usage) * address_vec
    
    def allocation_weights(self):
        """
        Calculates the allocation vector used to calculate where the next writes can occur.
        :return: Allocation vector that tells the DNC where it can write to without freeing up memory.
        """
        sorted_usage, indices_usage = torch.sort(self.usage)
        prod_sorted_usage = torch.cumprod(torch.cat((Variable(torch.ones(self.batch_size, 1).cuda()), 
                                                     sorted_usage), dim=1), dim=1)[:, :-1]
        sorted_allocation = (1 - sorted_usage) * prod_sorted_usage
        return sorted_allocation.gather(1, indices_usage)

In [8]:
def _split_cols(mat, lengths):
    """Split a 2D matrix to variable length columns."""
    assert mat.size()[1] == sum(lengths), "Lengths must be summed to num columns"
    l = np.cumsum([0] + lengths)
    results = []
    for s, e in zip(l[:-1], l[1:]):
        results += [mat[:, s:e]]
    return results

class DNC_Head(nn.Module):
    """
    Parent class of read/write heads, used to read and write to memory.
    """
    def __init__(self, address_count, address_dimension, 
                 controller_output_size):
        """
        Initializes a read or write head
        :param address_count: Number of addresses in memory
        :param address_dimension: Size of each memory location
        :param controller_output_size: Size of controller output
        """
        super(DNC_Head, self).__init__()
        
        self.controller_output_size = controller_output_size
        self.N = address_count
        self.M = address_dimension
    
    def is_read_head(self):
        raise NotImplementedError
    
    def reset_parameters(self):
        raise NotImplementedError
    
    def initialize_state(self):
        raise NotImplementedError

In [9]:
class DNC_Read_Head(DNC_Head):
    def __init__(self, address_count, address_dimension, controller_output_size, batch_size, num_write_heads):
        """
        Initializes a read head
        :param address_count: Number of addresses in memory
        :param address_dimension: Size of each memory location
        :param controller_output_size: Size of controller output
        :param num_write_heads: Number of write heads in the DNC
        """
        super(DNC_Read_Head, self).__init__(address_count, address_dimension, controller_output_size)
        # key_vec, β, read_mode, rfree_gate
        # self.M is the number of rows
        self.num_write_heads = num_write_heads
        
        key_vec_dim = self.M
        β_dim = 1
        # There are 2 * num_write_heads + 1 read modes per head, 1 forward 1 backward per head,
        # 1 Content based address.
        read_mode_dim = 2 * self.num_write_heads + 1
        rfree_gate_dim = self.N
        
        self.read_parameters_lengths = [key_vec_dim, β_dim, read_mode_dim, rfree_gate_dim]
        self.fc_read_parameters = nn.Linear(controller_output_size, sum(self.read_parameters_lengths))
        
        self.batch_size = batch_size

        self.reset_parameters()
        self.initialize_state()
    
    def reset_parameters(self):
        """
        Resets the parameters in read head, should be used before beginning training
        """
        nn.init.xavier_uniform(self.fc_read_parameters.weight, gain=1.4)
        nn.init.normal(self.fc_read_parameters.bias, std=0.01)
        
        self.initial_address_vec = nn.Parameter(torch.zeros(self.N))
        self.initial_read = nn.Parameter(torch.randn(1, self.M) * 0.01)
    
    def initialize_state(self):
        """
        Initializes the read head, should be used before training starts.
        """
        self.prev_address_vec = self.initial_address_vec.repeat(self.batch_size, 1)
        self.prev_read = self.initial_read.repeat(self.batch_size, 1)
    
    def is_read_head(self):
        """
        :returns: True for read heads
        """
        return True
    
    def forward(self, x, memory, usage, linked_matrices):
        """
        Function called on a forward pass of the network
        :param x: The output of the controller
        :param memory: The DNC_Memory object to read from
        :param usage: The usage vector to update after a read
        :param linked_matrices: A list of linked matrices from each read head, used to read forward and backward
        for different read modes
        :return: The value of a read
        """
        read_parameters = self.fc_read_parameters(x)
        # -------------------- Controller Parameter Description ----------------------
        # Key_Vec represents the vector key for content based addressing and is used to find read locations
        # β corresponds to the write strength and is not used in the read head
        # read_modes is a weighted vector representing how strong the read should be based on content based
        #     addressing and the forward and backward moves along the linked matrices of different write heads
        # rfree_weight determines how much locations should be erased after this round of reading
        key_vec, β, read_modes, rfree_weight = _split_cols(read_parameters, self.read_parameters_lengths)
        β = F.softplus(β)
        rfree_weight = F.sigmoid(rfree_weight)
        read_modes = F.softmax(read_modes, dim = 1)
        content_address_vec = memory.content_address_memory(key_vec, β)
        
        forward = []
        backward = []
        
        address_vec = content_address_vec * read_modes[:, 0].unsqueeze(1)
        
        for i, linked_matrix in enumerate(linked_matrices):
            address_vec += read_modes[:, i+1].unsqueeze(1) * torch.bmm(linked_matrix, self.prev_address_vec.unsqueeze(2)).squeeze(2)
            address_vec += read_modes[:, i+self.num_write_heads+1].unsqueeze(1) * torch.bmm(linked_matrix.transpose(1,2), 
                                                                               self.prev_address_vec.unsqueeze(2)).squeeze(2)
        
        new_read = memory.read_memory(address_vec)
        self.prev_address_vec = address_vec
        self.prev_read = new_read
        usage.read_update_usage(address_vec, rfree_weight)
        return new_read

In [10]:
class DNC_Write_Head(DNC_Head):
    
    def __init__(self, address_count, address_dimension, controller_output_size, batch_size):
        """
        Initializes a write head
        :param address_count: Number of addresses in memory
        :param address_dimension: Size of each memory location
        :param controller_output_size: Size of controller output
        
        """
        super(DNC_Write_Head, self).__init__(address_count, address_dimension, controller_output_size)
        self.batch_size = batch_size
        self.write_parameters_lengths = [self.M, 1, 1, 1, self.M, self.M]
        self.fc_write_parameters = nn.Linear(controller_output_size, sum(self.write_parameters_lengths))
        
        #initialize the precedence vector
        self.register_buffer('initial_precedence_vec', Variable(torch.zeros(batch_size, self.N)))
        #initialize the link matrix
        self.register_buffer('initial_link_matrix', Variable(torch.zeros(batch_size, self.N, self.N)))
        
        self.reset_parameters()
        self.initialize_state()
    
    def reset_parameters(self):
        """
        Resets the parameters in write head, should be used before beginning training
        """        
        nn.init.xavier_uniform(self.fc_write_parameters.weight, gain=1.4)
        nn.init.normal(self.fc_write_parameters.bias, std=0.01)
        
        self.initial_address_vec = nn.Parameter(torch.zeros(self.N))
    
    def initialize_state(self):
        """
        Initializes the write head, should be used before training starts.
        """
        self.prev_address_vec = self.initial_address_vec.clone()
        self.precedence_vec = self.initial_precedence_vec
        self.link_matrix = self.initial_link_matrix
    
    def is_read_head(self):
        """
        :return: If the head is a read head (False)
        """
        return False
    
    def forward(self, x, memory, usage):
        """
        Function called on a forward pass of the network
        :param x: The output of the controller
        :param memory: The DNC_Memory object to write to
        :param usage: The usage vector to update after a write
        """
        write_parameters = self.fc_write_parameters(x)
        # -------------------- Controller Parameter Description ----------------------
        # Key_Vec represents the vector key for content based addressing and is used to find write locations
        # β is the write strength, multiplied by write weights, gives controller control over writes
        # g is the write gate, used to determine where final writes occur
        # alloc_gate Determines how strongly allocated memory locations are
        # erase_vec Similar to the forget get in LSTM, helps the write head get rid of info it no longer needs
        # add_vec A vector that represents what the write head will write to memory
        key_vec, β, g, alloc_gate, erase_vec, add_vec = _split_cols(write_parameters, 
                                                                    self.write_parameters_lengths)
        β = F.softplus(β)
        g = F.sigmoid(g)
        alloc_gate = F.sigmoid(alloc_gate)
        erase_vec = F.sigmoid(erase_vec)
                                               
        content_address_vec = memory.content_address_memory(key_vec, β)
        address_vec = g * (alloc_gate * usage.allocation_weights() + (1 - alloc_gate) * content_address_vec)
        self.prev_address_vec = address_vec
        memory.update_memory(address_vec, erase_vec, add_vec)
        
        # Update the link matrix and precedence vector.
        tempMatrix = 1 - address_vec.unsqueeze(1).repeat(1,self.N,1) - address_vec.unsqueeze(1).repeat(1,self.N,1)
        tempMatrix2 = torch.bmm(address_vec.unsqueeze(2), self.precedence_vec.unsqueeze(1))
        self.link_matrix = tempMatrix * self.link_matrix + tempMatrix2
        diag_mask = 1 - Variable(torch.eye(self.N).unsqueeze(0).expand_as(self.link_matrix).cuda())
        self.link_matrix *= diag_mask
        
        self.precedence_vec = (1 - torch.sum(address_vec, 1, keepdim=True)) * self.precedence_vec + address_vec
        
        # Update usage.
        usage.write_update_usage(address_vec)

In [11]:
class DNC(nn.Module):
    """
    
    """
    def __init__(self, batch_size, controller, controller_output_size, 
                 output_size, address_count, address_dimension, heads):
        super(DNC, self).__init__()
        
        self.batch_size = batch_size
        
        # Initialize controller
        self.controller = controller
        
        # Create output gate. No activation function is used with it because
        # I used BCEWithLogitsLoss which deals with the sigmoid in a more
        # numerically stable manner.
        self.outputGate = nn.Linear(controller_output_size, output_size)
        
        # Initialize memory
        self.memory = DNC_Memory(address_count, address_dimension, batch_size)

        # Construct the heads.
        self.heads = nn.ModuleList()
        
        # Initialize usage vector, might not need batch size
        self.usage = DNC_Usage(address_count, batch_size)
        num_writes = heads.count(1)
        
        self.linked_matrices = []
        for head_id in heads:
            if head_id == 0:
                self.heads.append(DNC_Read_Head(address_count, address_dimension, controller_output_size, batch_size, num_writes))
            else:
                self.heads.append(DNC_Write_Head(address_count, address_dimension, controller_output_size, batch_size))
        
        self.initialize_state()
        
    def initialize_state(self):
        self.prev_reads = []
        self.linked_matrices = []
        for head in self.heads:
            head.initialize_state()
            if head.is_read_head():
                self.prev_reads.append(head.prev_read)
            #Added this else statement to initialize the linked matrices for write heads
            else:
                self.linked_matrices.append(head.link_matrix)
        
        self.memory.initialize_state()
        self.usage.initialize_state()
        
    def reset_parameters(self):
        nn.init.xavier_uniform(self.outputGate.weight)
        nn.init.normal(self.outputGate.bias, std=0.01)
        
    def forward(self, x):
        self.initialize_state()
        outputs = []
        
        for current_observation in x.transpose(0,1):
            self.prev_reads.append(current_observation)
            controller_input = torch.cat(self.prev_reads, 1)
            controller_output = self.controller(controller_input).squeeze()

            self.prev_reads = []
            write_index = 0
            for i in range(len(self.heads)):
                head = self.heads[i]
                if head.is_read_head():
                    self.prev_reads.append(head(controller_output, self.memory, self.usage, self.linked_matrices))
                else:
                    head(controller_output, self.memory, self.usage)
                    self.linked_matrices[write_index] = head.link_matrix
                    write_index += 1
            
            current_output = self.outputGate(controller_output)
            outputs.append(current_output)
        
        return torch.stack(outputs).transpose(0, 1)

In [12]:
class CopyTaskDataset(data.Dataset):
    def __init__(self, num_batches, batch_size, lower, upper, seq_size):
        self.input_list = []
        self.label_list = []
        
        for _ in range(num_batches):
            data, label = self.generate_batch(batch_size, lower, upper, seq_size)
            self.input_list.append(data)
            self.label_list.append(label)
        
        self.batch_size = batch_size

    def generate_batch(self, batch_size, lower, upper, seq_size):
        seq_length = random.randint(lower, upper)
        label = torch.from_numpy(
                np.random.binomial(1, 0.5, (seq_length, batch_size, seq_size))).float()
        end_marker = torch.zeros(seq_length, batch_size, 1)
        seq = torch.cat((label, end_marker), 2)
        delimiter_column = torch.zeros(1, batch_size, seq_size+1)
        delimiter_column[0, :, seq_size] = 1
        seq = torch.cat((seq, delimiter_column), 0)
        output_time = torch.zeros(seq_length, batch_size, seq_size+1)
        seq = torch.cat((seq, output_time), 0)
        return seq, label
    
    def __len__(self):
        return len(self.input_list)*self.batch_size
    
    def __getitem__(self, i):
        batch_index = i//self.batch_size
        index_in_batch = i % self.batch_size
        return self.input_list[batch_index][:, index_in_batch, :], self.label_list[batch_index][:, index_in_batch, :]

In [13]:
class EncapsulatedLSTM(nn.Module):
    def __init__(self, batch_size, all_hiddens, *args, **kwargs):
        super(EncapsulatedLSTM, self).__init__()
        self.lstm = nn.LSTM(*args, **kwargs)
        self.all_hiddens = all_hiddens
        self.batch_size = batch_size
                
        self.num_inputs = args[0]
        self.hidden_size = args[1]
        self.num_layers = args[2]
        
        self.reset_parameters()
        self.initialize_state()
          
    def initialize_state(self):
        self.state_tuple = (self.initial_hidden_state.repeat(1, self.batch_size, 1), 
                            self.initial_cell_state.repeat(1, self.batch_size, 1))
    
    def reset_parameters(self):
        self.initial_hidden_state = nn.Parameter(torch.randn(self.num_layers, 1, self.hidden_size) * 0.05)
        self.initial_cell_state = nn.Parameter(torch.randn(self.num_layers, 1, self.hidden_size) * 0.05)
        
        for p in self.lstm.parameters():
            if p.dim() == 1:
                nn.init.constant(p, 0)
            else:
                stdev = 5 / (np.sqrt(self.num_inputs +  self.hidden_size))
                nn.init.uniform(p, -stdev, stdev)
        
    def forward(self, input):
        self.initialize_state()
        output, self.state_tuple = self.lstm(input.unsqueeze(0), self.state_tuple)
        
        if self.all_hiddens:
            return self.state_tuple[0]
        else:
            return output

In [14]:
def copy_task_loss(output, label):
    
    _, seq_length, _ = label.size()
    return F.binary_cross_entropy_with_logits(output[:, -seq_length:, :], label.type(torch.FloatTensor).cuda())

In [15]:
def error_bits_per_sequence(output, label):
    batch_size, seq_length, _ = label.size()
    binarized_output = output[:, -seq_length:, :].sign()/2 + 0.5
    
    # The cost is the number of error bits per sequence
    return torch.sum(torch.abs(binarized_output - label))/batch_size

In [16]:
# def construct_clipped_optimizer(optimizer_type):
#     class ClippedOptimizer(optimizer_type):
#         def step(closure=None):
#             for group in self.param_groups:
#                 for p in group['params']:
#                     if p.grad is not None:
#                         p.grad = p.grad.clamp(-10,10)
            
#             super().step(closure)
    
#     return ClippedOptimizer

In [17]:
# ClippedRMSProp = construct_clipped_optimizer(optim.RMSprop)

In [18]:
batch_size = 64
hidden_size = 100
num_layers = 3
seq_size = 8
address_size = 20
controller = EncapsulatedLSTM(batch_size, False, # all hiddens
                              seq_size + address_size + 1, hidden_size, 
                              num_layers)

In [19]:
address_count = 128
controller_output_size = hidden_size

dnc = DNC(batch_size, controller, controller_output_size, 
          seq_size, address_count, address_size, [0, 1]) 

In [20]:
dnc = dnc.cuda()

In [21]:
lower_seq_length = 3
upper_seq_length = 10
num_batches = 10000

dataset = CopyTaskDataset(num_batches, batch_size, lower_seq_length, upper_seq_length, seq_size)
data_loader = data.DataLoader(dataset, batch_size=batch_size)

In [22]:
optimizer = optim.RMSprop(dnc.parameters(), momentum=0.9,
                          alpha=0.95, lr=1e-6)

In [23]:
best_model, train_plot_losses, validation_plot_losses = training.train_model(dnc, data_loader, copy_task_loss, optimizer, None, print_every=25, num_epochs = 1, deep_copy_desired=False, validation_criterion=error_bits_per_sequence)

Epoch 0/0
----------
Epoch Number: 0, Batch Number: 25, Training Loss: 0.6937, Validation Loss: 24.9906
Time so far is 0m 7s
Epoch Number: 0, Batch Number: 50, Training Loss: 0.6936, Validation Loss: 27.1350
Time so far is 0m 14s
Epoch Number: 0, Batch Number: 75, Training Loss: 0.6937, Validation Loss: 28.0269
Time so far is 0m 22s
Epoch Number: 0, Batch Number: 100, Training Loss: 0.6934, Validation Loss: 25.3400
Time so far is 0m 29s
Epoch Number: 0, Batch Number: 125, Training Loss: 0.6935, Validation Loss: 25.0900
Time so far is 0m 36s
Epoch Number: 0, Batch Number: 150, Training Loss: 0.6935, Validation Loss: 29.6250
Time so far is 0m 44s
Epoch Number: 0, Batch Number: 175, Training Loss: 0.6936, Validation Loss: 26.1244
Time so far is 0m 51s
Epoch Number: 0, Batch Number: 200, Training Loss: 0.6935, Validation Loss: 27.0206
Time so far is 0m 58s
Epoch Number: 0, Batch Number: 225, Training Loss: 0.6934, Validation Loss: 23.1831
Time so far is 1m 5s
Epoch Number: 0, Batch Number:

Epoch Number: 0, Batch Number: 1950, Training Loss: 0.6931, Validation Loss: 27.1737
Time so far is 7m 55s
Epoch Number: 0, Batch Number: 1975, Training Loss: 0.6932, Validation Loss: 25.8531
Time so far is 8m 0s
Epoch Number: 0, Batch Number: 2000, Training Loss: 0.6931, Validation Loss: 30.6300
Time so far is 8m 7s
Epoch Number: 0, Batch Number: 2025, Training Loss: 0.6931, Validation Loss: 26.7006
Time so far is 8m 14s
Epoch Number: 0, Batch Number: 2050, Training Loss: 0.6932, Validation Loss: 25.4262
Time so far is 8m 20s
Epoch Number: 0, Batch Number: 2075, Training Loss: 0.6932, Validation Loss: 28.8181
Time so far is 8m 25s
Epoch Number: 0, Batch Number: 2100, Training Loss: 0.6932, Validation Loss: 22.7144
Time so far is 8m 29s
Epoch Number: 0, Batch Number: 2125, Training Loss: 0.6931, Validation Loss: 26.8244
Time so far is 8m 34s
Epoch Number: 0, Batch Number: 2150, Training Loss: 0.6932, Validation Loss: 26.4537
Time so far is 8m 41s
Epoch Number: 0, Batch Number: 2175, Tr

Epoch Number: 0, Batch Number: 3875, Training Loss: 0.6931, Validation Loss: 25.5369
Time so far is 16m 29s
Epoch Number: 0, Batch Number: 3900, Training Loss: 0.6931, Validation Loss: 25.4444
Time so far is 16m 36s
Epoch Number: 0, Batch Number: 3925, Training Loss: 0.6931, Validation Loss: 24.2100
Time so far is 16m 44s
Epoch Number: 0, Batch Number: 3950, Training Loss: 0.6931, Validation Loss: 25.6469
Time so far is 16m 51s
Epoch Number: 0, Batch Number: 3975, Training Loss: 0.6931, Validation Loss: 31.2881
Time so far is 17m 0s
Epoch Number: 0, Batch Number: 4000, Training Loss: 0.6931, Validation Loss: 25.3412
Time so far is 17m 8s
Epoch Number: 0, Batch Number: 4025, Training Loss: 0.6931, Validation Loss: 22.8412
Time so far is 17m 15s
Epoch Number: 0, Batch Number: 4050, Training Loss: 0.6931, Validation Loss: 24.9225
Time so far is 17m 23s
Epoch Number: 0, Batch Number: 4075, Training Loss: 0.6931, Validation Loss: 22.8681
Time so far is 17m 30s
Epoch Number: 0, Batch Number:

Epoch Number: 0, Batch Number: 5775, Training Loss: 0.6931, Validation Loss: 25.5087
Time so far is 25m 16s
Epoch Number: 0, Batch Number: 5800, Training Loss: 0.6931, Validation Loss: 27.7131
Time so far is 25m 21s
Epoch Number: 0, Batch Number: 5825, Training Loss: 0.6931, Validation Loss: 27.3356
Time so far is 25m 25s
Epoch Number: 0, Batch Number: 5850, Training Loss: 0.6931, Validation Loss: 25.3881
Time so far is 25m 29s
Epoch Number: 0, Batch Number: 5875, Training Loss: 0.6931, Validation Loss: 24.3100
Time so far is 25m 33s
Epoch Number: 0, Batch Number: 5900, Training Loss: 0.6932, Validation Loss: 26.0550
Time so far is 25m 38s
Epoch Number: 0, Batch Number: 5925, Training Loss: 0.6931, Validation Loss: 27.2269
Time so far is 25m 45s
Epoch Number: 0, Batch Number: 5950, Training Loss: 0.6931, Validation Loss: 26.6506
Time so far is 25m 53s
Epoch Number: 0, Batch Number: 5975, Training Loss: 0.6931, Validation Loss: 23.9256
Time so far is 26m 1s
Epoch Number: 0, Batch Number

Epoch Number: 0, Batch Number: 7675, Training Loss: 0.6931, Validation Loss: 26.2681
Time so far is 34m 48s
Epoch Number: 0, Batch Number: 7700, Training Loss: 0.6931, Validation Loss: 24.4956
Time so far is 34m 55s
Epoch Number: 0, Batch Number: 7725, Training Loss: 0.6931, Validation Loss: 24.5675
Time so far is 35m 2s
Epoch Number: 0, Batch Number: 7750, Training Loss: 0.6931, Validation Loss: 22.7606
Time so far is 35m 8s
Epoch Number: 0, Batch Number: 7775, Training Loss: 0.6931, Validation Loss: 25.6131
Time so far is 35m 16s
Epoch Number: 0, Batch Number: 7800, Training Loss: 0.6931, Validation Loss: 23.0444
Time so far is 35m 22s
Epoch Number: 0, Batch Number: 7825, Training Loss: 0.6931, Validation Loss: 25.2544
Time so far is 35m 30s
Epoch Number: 0, Batch Number: 7850, Training Loss: 0.6931, Validation Loss: 24.2900
Time so far is 35m 37s
Epoch Number: 0, Batch Number: 7875, Training Loss: 0.6931, Validation Loss: 25.8244
Time so far is 35m 44s
Epoch Number: 0, Batch Number:

Epoch Number: 0, Batch Number: 9575, Training Loss: 0.6931, Validation Loss: 26.8525
Time so far is 43m 45s
Epoch Number: 0, Batch Number: 9600, Training Loss: 0.6931, Validation Loss: 25.9850
Time so far is 43m 53s
Epoch Number: 0, Batch Number: 9625, Training Loss: 0.6931, Validation Loss: 25.6062
Time so far is 44m 1s
Epoch Number: 0, Batch Number: 9650, Training Loss: 0.6931, Validation Loss: 26.8631
Time so far is 44m 9s
Epoch Number: 0, Batch Number: 9675, Training Loss: 0.6931, Validation Loss: 27.1206
Time so far is 44m 17s
Epoch Number: 0, Batch Number: 9700, Training Loss: 0.6931, Validation Loss: 26.6837
Time so far is 44m 25s
Epoch Number: 0, Batch Number: 9725, Training Loss: 0.6931, Validation Loss: 22.8994
Time so far is 44m 32s
Epoch Number: 0, Batch Number: 9750, Training Loss: 0.6931, Validation Loss: 27.7838
Time so far is 44m 40s
Epoch Number: 0, Batch Number: 9775, Training Loss: 0.6931, Validation Loss: 27.2138
Time so far is 44m 48s
Epoch Number: 0, Batch Number: