# Antes de empezar

conda activate python3.6_cv2

In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader
import cv2

from scipy import ndimage
import math
import random
import skimage 
import h5py

import cProfile, pstats
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline 

# Ignore harmless warnings:

import warnings
warnings.filterwarnings("ignore")
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [2]:
import platform
print(torch.__version__)
print(platform.python_version())
torch.cuda.get_device_name(1)
path_IAM = "/home/abarreiro/data/handwriting/seq2seq/IAM_words_48_192.hdf5"

path_OSB = "/home/abarreiro/data/handwriting/seq2seq/OSBORNE_words_48_192.hdf5"

1.8.1
3.6.10


## Definiendo diccionario, codificación y longitud máxima

In [3]:
# Dictionary used in seq2seq paper
decoder_dict = {0: '0', 1: '!', 2: 'L', 3: 'z', 4: 'G', 5: 'm', 6: '6', 7: '/', 8: 'j', 9: 's', 10: 'S', 11: '5',
                12: 'R', 13: ')', 14: 'u', 15: 'y', 16: '9', 17: 'g', 18: '3', 19: '1', 20: 'e', 21: "'", 22: ':',
                23: 'Q', 24: '2', 25: 'a', 26: 't', 27: 'A', 28: '7', 29: ';', 30: 'i', 31: 'H', 32: 'W', 33: ',',
                34: '(', 35: 'O', 36: 'U', 37: 'K', 38: 'd', 39: '*', 40: '.', 41: '?', 42: 'q', 43: '-', 44: 'r',
                45: 'n', 46: '&', 47: 'C', 48: '"', 49: 'h', 50: 'v', 51: 'f', 52: 'E', 53: 'p', 54: 'x', 55: '+',
                56: 'w', 57: 'b', 58: 'o', 59: ' ', 60: 'B', 61: 'P', 62: 'D', 63: 'I', 64: 'J', 65: 'V', 66: 'N',
                67: 'M', 68: '8', 69: 'k', 70: 'c', 71: '4', 72: 'T', 73: 'X', 74: 'l', 75: 'Z', 76: 'F', 77: 'Y',
                78: 'START', 79: 'END', 80: 'PAD'}

inverse_decoder_dict = {v: k for k, v in decoder_dict.items()}
print(inverse_decoder_dict['END'])

79


In [4]:
# One_hot_mapping assigns to each number in decoder_dict its corresponding one-hot vector:

one_hot_mapping = {}

cont = 0
for item in decoder_dict:
    vector = torch.zeros(1, 1, len(decoder_dict))
    vector[0, 0, cont] = 1.0
    one_hot_mapping[item] = vector
    cont += 1

# Inverse_one_hot_mapping assigns to each one-hot vector its corresponding number in decoder_dict
inverse_one_hot_mapping = {v: k for k, v in one_hot_mapping.items()}

# One_hot_to_char assigns to each possible one-hot vector its corresponding character from decoder_dict
one_hot_to_char = {}
for one_hot, char in zip(inverse_one_hot_mapping, inverse_decoder_dict):
    one_hot_to_char[one_hot] = char 
    
# char_to_one_hot converts each character 'END', 'a', etc into a one-hot vector
char_to_one_hot = {}
for char, one_hot in zip(inverse_decoder_dict, inverse_one_hot_mapping): 
    char_to_one_hot[char] = one_hot
    
# Some examples...

print(one_hot_mapping[80])
print(inverse_one_hot_mapping[one_hot_mapping[80]])
print(one_hot_to_char[one_hot_mapping[80]])
print(char_to_one_hot['END'])

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]]])
80
PAD
tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]])


In [5]:
MAX_LENGTH = 19
output_size = len(decoder_dict)

# Definiendo funciones para el Data Augmentation

In [6]:
def get_bbox(img):
    
    height = img.shape[0]
    width = img.shape[1]
    a = np.where(img != 0) 
    
    bottom = np.max(a[0])
    right = np.max(a[1])
    x = np.min(a[1])
    y = np.min(a[0])

    bbox = img[y:bottom+1, x:right+1]    
    w = bbox.shape[1]
    h = bbox.shape[0]
    
    bbox_params = [x, y, w, h]
    
    left_margin = x
    right_margin = width - right
    top_margin = y
    bottom_margin = height - bottom
    
    margin_sizes = [left_margin, right_margin, top_margin, bottom_margin]
    
    return bbox, bbox_params, margin_sizes 

In [7]:
def erosion(img, kernel_size):
    kernel = np.ones((kernel_size, kernel_size), 'uint8')
    eroded = cv2.erode(img, kernel, iterations=1)
    maximum = eroded.max()
    if maximum > 0: # we need at least one non-zero pixel after eroding
        eroded = eroded # otherwise, we won't be able to compute any bbox
        
    else:
        eroded = img
        
    return eroded

In [8]:
def dilation(img, margin_sizes, kernel_size):
    pixel_excess = math.ceil((kernel_size - 1)//2) # number of pixels that might overflow the image limits
    # after dilation
    condition = np.any(np.array(margin_sizes) < pixel_excess) # if we overflow the image limits
    if condition == True:
        dilated = img # we don't do anything
        
    else:
        kernel = np.ones((kernel_size, kernel_size), 'uint8')
        dilated = cv2.dilate(img, kernel, iterations=1)
        
    return dilated

In [9]:
def rotation(img, angle):
    rotated = ndimage.rotate(img, angle, reshape=True)
    return rotated

In [10]:
def zoom_X(img, new_width):
    dsize = (new_width, img.shape[0])
    zoomed_X = cv2.resize(img, dsize, interpolation = cv2.INTER_AREA)
    return zoomed_X

In [11]:
def zoom_Y(img, new_height):
    dsize = (img.shape[1], new_height)
    zoomed_Y = cv2.resize(img, dsize, interpolation = cv2.INTER_AREA)
    return zoomed_Y

In [12]:
def resize(new_img, src_img):
    
    height = src_img.shape[0]
    new_height = new_img.shape[0]
    width = src_img.shape[1] 
    new_width = new_img.shape[1]
    
    width_ratio = width/new_width
    height_resized = math.ceil(new_height*width_ratio)

    height_ratio = height/new_height
    width_resized = math.ceil(new_width*height_ratio)
    
    if new_width > width and new_height > height:
        
        if height_resized < height:
            
            dsize = (width, height_resized)
            img_rescaled = cv2.resize(new_img, dsize, interpolation = cv2.INTER_AREA)
            
        else:
            dsize = (width_resized, height)
            img_rescaled = cv2.resize(new_img, dsize, interpolation = cv2.INTER_AREA)
            
    elif new_width > width and new_height < height:
        
        dsize = (width, height_resized)
        img_rescaled = cv2.resize(new_img, dsize, interpolation = cv2.INTER_AREA)
        
    elif new_width < width and new_height > height:
        
        dsize = (width_resized, height)
        img_rescaled = cv2.resize(new_img, dsize, interpolation = cv2.INTER_AREA)
        
    else:
        
        img_rescaled = new_img
        
        
    return img_rescaled

In [13]:
def fitting(src_img, new_bbox, bbox, bbox_params, margin_sizes):
    
    height = src_img.shape[0]
    width = src_img.shape[1]
    x, y, w, h = bbox_params # parameters of the original bounding box to put top-left edge of the new_bbox
    # where the original was
    left_margin, right_margin, top_margin, bottom_margin = margin_sizes
    
    new_height = new_bbox.shape[0]
    new_width = new_bbox.shape[1]
    new_img = np.zeros(src_img.shape)
    
    fits_vertical = new_height + top_margin < height
    fits_horizont = new_width + left_margin < width

    if fits_vertical == False and fits_horizont == False:
        
        diff_vertical = (new_height + top_margin) - height
        diff_horizont = (new_width + left_margin) - width
        new_x = x - diff_horizont
        new_y = y - diff_vertical
        
        new_img[new_y:(new_y + new_height), new_x:(new_x + new_width)] = new_bbox
        
    elif fits_vertical == False and fits_horizont == True:
        
        diff_vertical = (new_height + top_margin) - height
        new_x = x
        new_y = y - diff_vertical
        new_img[new_y:(new_y + new_height), new_x:(new_x + new_width)] = new_bbox
        
    elif fits_vertical == True and fits_horizont == False:
        
        diff_horizont = (new_width + left_margin) - width
        new_x = x - diff_horizont
        new_y = y
        new_img[new_y:(new_y + new_height), new_x:(new_x + new_width)] = new_bbox
        
    else:
        new_x = x
        new_y = y
        new_img[new_y:(new_y + new_height), new_x:(new_x + new_width)] = new_bbox
        
    new_bbox_params = [new_x, new_y, new_width, new_height]
    
    return new_img, new_bbox_params

In [14]:
def displacement(new_img, new_bbox_params, d_x):
    # d_x has to be the amount of pixels
    new_x, new_y, new_width, new_height = new_bbox_params
    total_width = new_x + new_width + d_x
    if total_width > new_img.shape[1]:
        new_img_disp = new_img
        
    else:
        new_img_disp = new_img
        new_img_disp[:, d_x:] = new_img_disp[:, 0:-d_x]
        new_img_disp[:, 0:d_x] = 0
        
    return new_img_disp


In [15]:
def DataAugmentation(img):
    #img = img.numpy()
    bbox, bbox_params, margin_sizes = get_bbox(img)
    kernel_size = 3
    # Sequence of possible transformations:
    
    # Erosion/Dilation:
    p_er_dil = random.uniform(0,1)
    
    if p_er_dil < 0.33:
        #kernel_size = random.randint(2,3)
        new_img = erosion(img, kernel_size)
        
    elif p_er_dil > 0.66:
        #kernel_size = random.randint(2,5)
        new_img = dilation(img, margin_sizes, kernel_size)
        
    else:
        new_img = img
        
    updated_bbox, update_bbox_params, updated_margin_sizes = get_bbox(new_img)
    
    #Rotation:
    p_rot = random.uniform(0,1)
    if (p_rot < 0.5 and updated_bbox.max() > 0): # we need to rotate a non-zero matrix...
        angle = random.randint(-3,3)
        new_img = rotation(updated_bbox, angle)
        new_img, new_bbox_params, new_margin_sizes = get_bbox(new_img) # ... as we want a new bbox after rotation

    else:
        new_img = updated_bbox
        
    #Zoom in X axis:
    p_zoom_X = random.uniform(0,1)
    
    if p_zoom_X < 0.5:
        percentage_zoom_X = random.uniform(0.9,1.1)
        new_width = math.ceil(percentage_zoom_X*new_img.shape[1])
        new_img = zoom_X(new_img, new_width)
        
    else:
        new_img = new_img
        
    #Zoom in Y axis:
    p_zoom_Y = random.uniform(0,1)
    
    if p_zoom_Y < 0.5:
        percentage_zoom_Y = random.uniform(0.9,1.1)
        new_height = math.ceil(percentage_zoom_Y*new_img.shape[0])
        new_img = zoom_Y(new_img, new_height)
        
    else:
        new_img = new_img
    
    # Rescaling the new bbox in order to fit in the 48x192 original format:
    img_rescaled = resize(new_img, img)
    
    # Pasting the new bbox in the 48x192 image
    new_img, new_bbox_params = fitting(img, img_rescaled, bbox, bbox_params, margin_sizes)
    
    #Displacement:
    p_disp = random.uniform(0,1)
    
    if p_disp < 0.7:
        percentage_disp = random.uniform(0,0.1)
        d_x = math.ceil(percentage_disp*img.shape[1])
        new_img = displacement(new_img, new_bbox_params, d_x)
        
    else:
        new_img = new_img
        
    # Salt & Pepper noise:
    p_noise = random.uniform(0,1)
    if p_noise < 0.5:
        new_img = new_img/255 # normalisation
        new_img = skimage.util.random_noise(new_img, mode = 's&p')

    else:
        new_img = new_img/255 # normalisation
    
    #new_img = new_img/255    
    new_img = torch.from_numpy(new_img) # converting numpy-type image to torch tensor

    return new_img

In [16]:
def get_Data_Aug(image_set):
    images_da = []
    for image in image_set:
        try:
            image = DataAugmentation(image) # normalisation and torch.from_numpy included
            images_da.append(image)
            
        except:
            image = image / 255 # normalisation
            image = torch.from_numpy(image) # torch.from_numpy
            images_da.append(image)
            
    return images_da

# Generando patches y etiquetas para muestra finita

In [17]:
def patch_gen(image_set, batch_size, color_channels, height, width, patch_height, patch_width, stepsize):
    total_pt = []
    n_patches = int((width - patch_width)/stepsize + 1)
       
    for image in image_set:
        
        patches_tensor = torch.empty(n_patches, color_channels, patch_height, patch_width)    
        start = 0

        for p in range(n_patches):

            patches_tensor[p, 0, :, :] = image[:, start:start + patch_width] # sliding window
            start += stepsize # updating the bottom-left position of the patch adding the stepsize
            
        total_pt.append(patches_tensor)
 
    return total_pt

In [18]:
def get_one_hot_target_IAM(labels, seq_len, output_size, batch_size):
    # labels: tensor containing the labels of the words in the batch
    # each word label consists of a vector of length 19 (MAX LENGTH). The 19 elements are the encoded characters of the word
    # (according to Jorge's decoder dict, and completed with PADs to reach length = 19)
    one_hot_target = torch.empty(batch_size, seq_len, output_size) # future one-hot encoding tensor for the words of the batch
    START = inverse_decoder_dict['START'] # code number of the START token (according to Jorge's decoder_dict)
    END = inverse_decoder_dict['END']
    PAD = inverse_decoder_dict['PAD']

    for j, word in enumerate(labels):
        
        It_has_PADs = torch.any(word == PAD).item() # (majority case: the label vector of the word is completed with PADs)
        one_hot_target[j, 0, :] = one_hot_mapping[START] # START token's one-hot vector goes first
        
        for k, letter in enumerate(word):
            one_hot_target[j, k + 1, :] = one_hot_mapping[letter.item()] # one-hot encoding of the rest of letters (including PADs)
            
        one_hot_target[j, -1, :] = one_hot_mapping[END] # last = END token
        
        if It_has_PADs == True: # if we had PADs
            
            array_of_PADs = torch.where(word == PAD)[0] 
            first_PAD = torch.min(array_of_PADs).item() # we store the first position where it appeared
            first_PAD = first_PAD + 1 # (recall that we added the START as first element, so the indices won't match)
            one_hot_target[j, first_PAD, :] = one_hot_mapping[END] # we replace that first PAD by an END
            one_hot_target[j, -1, :] = one_hot_mapping[PAD] # then the last element was a PAD, and not the END token
            
    return one_hot_target
                

In [19]:
def one_hot_conversion(decoder_output, output_size):
    
    one_hot_output_letter = torch.zeros(1, 1, output_size)
    index = torch.argmax(decoder_output, dim = 2).item()
    one_hot_output_letter[0, 0, index] = 1.
    
    return one_hot_output_letter

# Generando datos por batch

In [22]:
def sort_by_batch(set_random_sample, set_length, batch_size):
    sorted_set_rs = []
    j = 0
    while (j + batch_size < set_length):
        sorted_set_rs.append(np.sort(set_random_sample[j:j+batch_size]))
        j = j + batch_size
    
    sorted_set_rs.append(np.sort(set_random_sample[j:]))
    sorted_set_rs = np.concatenate(sorted_set_rs)
    return sorted_set_rs

def data_generator(batch_size, len_set_IAM, len_set_OSB, image_set, target_set, random_sampling_IAM, random_sampling_OSB, mode):
    
    f_IAM = h5py.File(path_IAM, "r")
    f_OSB = h5py.File(path_OSB, "r")
    
    IAM_times = 0 # times we have covered the IAM dataset during this epoch
    OSB_times = 0 # times we have covered the OSB dataset during this epoch
    j = 0
    k = 0

    while 1:
        IAM = False
        OSB = False
        if random.random() > 0.125:
            IAM = True
            indices = random_sampling_IAM[j:j+batch_size]
            data_X = f_IAM[image_set][indices]
            data_y = f_IAM[target_set][indices]
                       
        else:
            OSB = True
            indices = random_sampling_OSB[k:k+batch_size]
            data_X = f_OSB[image_set][indices]
            data_y = f_OSB[target_set][indices]
        
        if mode == 'training':
            data_X = get_Data_Aug(data_X)
            
        elif mode == 'training_without_data_aug':
            data_X = data_X/255
            data_X = torch.from_numpy(data_X)
        
        elif mode == 'validation':
            data_X = data_X/255
            data_X = torch.from_numpy(data_X)
        
        # Getting patches from images (data augmented or not)
        data_X = patch_gen(image_set = data_X, batch_size = batch_size, color_channels = 1, 
                              height = 48, width = 192, patch_height = 48, patch_width = 10, stepsize = 2)
        data_X = torch.cat(data_X, dim = 0) # stacking patches tensors of the whole batch along the dim 0 (CNN input)
        
        data_y = torch.from_numpy(data_y) # converting numpy objects to PyTorch tensors
        data_y[data_y == 100.] = 80. # replacing Jorge's coding of PAD token (100.) by ours (80.)
        
        yield data_X, data_y
        
        # covered_data: ~ number of patterns that the network has seen during this epoch;  OSB (~7.000 training patterns) 
        # might have been picked and covered several times during 1 epoch (57.000 training patterns):
        covered_data = (IAM_times*len_set_IAM) + (OSB_times*len_set_OSB) + (j + k + 2*batch_size) 
        data_length = len_set_IAM + len_set_OSB # total number of patterns for both datasets combined
        
        if covered_data >= data_length: # epoch has been completed
            j = 0
            k = 0
            IAM_times = 0
            OSB_times = 0
            break
        
        else: # if not, parameters for each dataset are updated
            if IAM == True:
                if j + 2*batch_size >= len_set_IAM: # drop_last = True
                    j = 0
                    IAM_times += 1

                else:
                    j += batch_size

            if OSB == True:     
                if k + 2*batch_size >= len_set_OSB: # drop_last = True
                    k = 0
                    OSB_times += 1
                else:
                    k += batch_size
        
    f_IAM.close()
    f_OSB.close()

# Definiendo la arquitectura

In [23]:
class ConvolutionalNetwork(nn.Module):
    
    def __init__(self, IN_CHANNELS, FILTERS_CNN_1, FILTERS_CNN_2, NEURONS_IN_DENSE_LAYER,
                 PATCH_HEIGHT, PATCH_WIDTH, STRIDE, PADDING, KERNEL_SIZE, dropout_p):
        super().__init__()
        self.IN_CHANNELS = IN_CHANNELS
        self.FILTERS_CNN_1 = FILTERS_CNN_1
        self.FILTERS_CNN_2 = FILTERS_CNN_2
        self.NEURONS_IN_DENSE_LAYER = NEURONS_IN_DENSE_LAYER
        self.PATCH_HEIGHT_AFTER_POOLING = PATCH_HEIGHT//4
        self.PATCH_WIDTH_AFTER_POOLING = PATCH_WIDTH//4
        self.STRIDE = STRIDE
        self.PADDING = PADDING
        self.KERNEL_SIZE = KERNEL_SIZE
        self.dropout = nn.Dropout(dropout_p)
        
        self.conv1 = nn.Conv2d(in_channels = self.IN_CHANNELS, out_channels = self.FILTERS_CNN_1,
                               kernel_size = self.KERNEL_SIZE, stride = self.STRIDE, padding = self.PADDING)
        self.conv2 = nn.Conv2d(in_channels = self.FILTERS_CNN_1, out_channels = self.FILTERS_CNN_2,
                               kernel_size = self.KERNEL_SIZE, stride = self.STRIDE, padding = self.PADDING)
        self.fc1 = nn.Linear(self.PATCH_HEIGHT_AFTER_POOLING * self.PATCH_WIDTH_AFTER_POOLING * self.FILTERS_CNN_2, 
                             self.NEURONS_IN_DENSE_LAYER)
        
    def forward(self, X):
        X = F.relu((self.conv1(X)))
        X = F.max_pool2d(X, 2, 2)
        X = F.relu((self.conv2(X)))
        X = F.max_pool2d(X, 2, 2)
        X = X.view(-1, self.PATCH_HEIGHT_AFTER_POOLING*self.PATCH_WIDTH_AFTER_POOLING*self.FILTERS_CNN_2)
        X = self.dropout(self.fc1(X))

        return X

In [24]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, batch_size, encoder_seq_len, num_layers, num_directions, dropout_p):        
        super(EncoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.seq_len = encoder_seq_len
        self.num_layers = num_layers
        self.num_directions = num_directions
        self.dropout = dropout_p
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, batch_first = True, dropout = self.dropout, 
                           bidirectional = True)

    def forward(self, input, hidden):        
        output = input.view(self.batch_size, self.seq_len, self.input_size)
        output, hidden = self.lstm(output, hidden)
        return output, hidden

    def initHidden(self):
        return (torch.zeros(self.num_layers * self.num_directions, self.batch_size, self.hidden_size, device=device),
                torch.zeros(self.num_layers * self.num_directions, self.batch_size, self.hidden_size, device=device))

In [25]:
class BahdanauDecoder(nn.Module):
    
    def __init__(self, output_size, hidden_size, dropout_p, batch_size, encoder_seq_len, decoder_seq_len):
        super(BahdanauDecoder, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.encoder_seq_len = encoder_seq_len
        self.decoder_seq_len = decoder_seq_len
        self.dropout_p = dropout_p

        #self.embedding = nn.Embedding(self.output_size, self.hidden_size)

        self.fc_hidden = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.fc_encoder = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.weight_vector = torch.FloatTensor(self.batch_size, self.hidden_size, self.decoder_seq_len)
        self.weight = nn.Parameter(nn.init.xavier_uniform_(self.weight_vector)) #xavier initializer avoids nans
        #self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.output_size + self.hidden_size, self.hidden_size, batch_first=True)
        self.classifier = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, inputs, hidden, encoder_outputs):
        #encoder_outputs = encoder_outputs.squeeze()
        # Embed input words
        #embedded = self.embedding(inputs).view(1, -1)
        #embedded = self.dropout(embedded)

        # Calculating Alignment Scores
        hidden_state = hidden[0].view(self.batch_size, self.decoder_seq_len, self.hidden_size)
        x = torch.tanh(self.fc_hidden(hidden_state) + self.fc_encoder(encoder_outputs))

        alignment_scores = torch.bmm(x, self.weight)
        alignment_scores = alignment_scores.view(self.batch_size, self.decoder_seq_len, self.encoder_seq_len)
 
        # Softmaxing alignment scores to get Attention weights
        attn_weights = F.softmax(alignment_scores, dim = 2)

        # Multiplying the Attention weights with encoder outputs to get the context vector
        context_vector = torch.bmm(attn_weights, encoder_outputs)

        # Concatenating context vector with embedded input word
        output = torch.cat((inputs, context_vector), 2)
        # Passing the concatenated vector as input to the LSTM cell
        output, hidden = self.lstm(output, hidden)
        # Passing the LSTM output through a Linear layer acting as a classifier
        output = F.log_softmax(self.classifier(output), dim = 2)
        return output, hidden, attn_weights

In [26]:
torch.manual_seed(1234)

CNN_model = ConvolutionalNetwork(IN_CHANNELS = 1, FILTERS_CNN_1 = 20, FILTERS_CNN_2 = 50, NEURONS_IN_DENSE_LAYER = 1024, 
                                 PATCH_HEIGHT = 48, PATCH_WIDTH = 10, STRIDE = 1, PADDING = 2, KERNEL_SIZE = 5, dropout_p = 0.5).cuda(1)
CNN_optimizer = torch.optim.Adam(CNN_model.parameters(), lr = 0.001)
CNN_scheduler = torch.optim.lr_scheduler.StepLR(CNN_optimizer, step_size = 1, gamma = 0.98) # decreasing lr 2% every epoch

Encoder_model = EncoderRNN(input_size = 1024, hidden_size = 256, batch_size = 256, encoder_seq_len = 92, 
                           num_layers = 2, num_directions = 2, dropout_p = 0.5).cuda(1)
Encoder_optimizer = torch.optim.Adam(Encoder_model.parameters(), lr = 0.001)
Encoder_scheduler = torch.optim.lr_scheduler.StepLR(Encoder_optimizer, step_size = 1, gamma = 0.98)

#Decoder_model = AttnDecoderRNN(output_size = len(decoder_dict), hidden_size = 256, dropout_p = 0, batch_size = 256,
#                               encoder_seq_len = 92, decoder_seq_len = 1).cuda(1)
Decoder_model = BahdanauDecoder(output_size = len(decoder_dict), hidden_size = 256, dropout_p = 0, batch_size = 256,
                               encoder_seq_len = 92, decoder_seq_len = 1).cuda(1)
Decoder_optimizer = torch.optim.Adam(Decoder_model.parameters(), lr = 0.001)
Decoder_scheduler = torch.optim.lr_scheduler.StepLR(Decoder_optimizer, step_size = 1, gamma = 0.98)
#criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()

## Entrenando

In [27]:
import time

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [28]:
def train():
    train_losses = []
    
    random_sampling_IAM = random.sample(range(len_trn_IAM), len_trn_IAM)
    random_sampling_IAM = sort_by_batch(random_sampling_IAM, len_trn_IAM, batch_size)
    
    random_sampling_OSB = random.sample(range(len_trn_OSB), len_trn_OSB)
    random_sampling_OSB = sort_by_batch(random_sampling_OSB, len_trn_OSB, batch_size)

    train_loader = data_generator(batch_size, len_trn_IAM, len_trn_OSB, image_set = 'X_trn', target_set = 'target_trn',
                                  random_sampling_IAM = random_sampling_IAM, random_sampling_OSB = random_sampling_OSB, 
                                  mode = 'training')
    
    for num_batch, (images, labels) in enumerate(train_loader):         
        num_batch += 1

        encoder_hidden = Encoder_model.initHidden()
        images = images.cuda(1)
        encoder_input = CNN_model(images)
        encoder_outputs, encoder_hidden = Encoder_model(encoder_input, encoder_hidden)
        encoder_outputs = encoder_outputs.reshape(batch_size, encoder_seq_len, enc_num_directions*enc_hidden_size)
        encoder_outputs = encoder_outputs.view(batch_size, encoder_seq_len, enc_num_directions, enc_hidden_size)
        encoder_outputs = encoder_outputs[:, :, 0, :] + encoder_outputs[:, :, 1, :]
        encoder_outputs = encoder_outputs.view(batch_size, encoder_seq_len, enc_hidden_size)

        #hidden_states = encoder_outputs[:, -1, 0, :].view(1, 256, 256)
        #hidden_last = hidden_states[:, 0, :].view(1,1,256)
        #h_n = encoder_hidden[0][2, 0, :].view(1,1,256)
        #print(hidden_last)
        #print(h_n)
        #decoder_hidden = encoder_hidden

        #decoder_hidden = (encoder_hidden[0][-2, :, :].unsqueeze(0), encoder_hidden[1][-2, :, :].unsqueeze(0)) # last encoder layer (and forward) hidden state
        hidden_state = encoder_hidden[0][-2, :, :].view(1, batch_size, enc_hidden_size) + encoder_hidden[0][-1, :, :].view(1, batch_size, enc_hidden_size)
        cell_state = encoder_hidden[1][-2, :, :].view(1, batch_size, enc_hidden_size) + encoder_hidden[1][-1, :, :].view(1, batch_size, enc_hidden_size)
        decoder_hidden = (hidden_state, cell_state)
        decoder_input = get_one_hot_target_IAM(labels=labels, seq_len = MAX_LENGTH + 2, output_size = output_size, batch_size = 256).cuda(1)
        
        decoder_output_total = []
        for num_letter in range(MAX_LENGTH + 2):
            
            decoder_input_letter = decoder_input[:, num_letter, :].unsqueeze(1)
            
            decoder_output, decoder_hidden, attn_weights = Decoder_model(decoder_input_letter, decoder_hidden, encoder_outputs)
            
            decoder_output_total.append(decoder_output)
            
        decoder_output_total = torch.cat(decoder_output_total, dim = 1)
        
        
        output_indices = torch.tensor(list(range(0, MAX_LENGTH + 2 -1))).cuda(1) # removing last token from the output
        decoder_output = torch.index_select(decoder_output_total, dim = 1, index = output_indices)

        ground_truth = torch.argmax(decoder_input, dim = 2)
        target_indices = torch.tensor(list(range(1, MAX_LENGTH + 2))).cuda(1) # remove SOS token from the input
        ground_truth = torch.index_select(ground_truth, dim = 1, index = target_indices)
        
        decoder_output = decoder_output.view(batch_size*(MAX_LENGTH + 1), output_size)
        ground_truth = ground_truth.view(batch_size*(MAX_LENGTH + 1))

        loss = criterion(decoder_output, ground_truth)
        
        CNN_optimizer.zero_grad()
        Encoder_optimizer.zero_grad()
        Decoder_optimizer.zero_grad()
        loss.backward()
        CNN_optimizer.step()
        Encoder_optimizer.step()
        Decoder_optimizer.step()
        train_losses.append(loss.item())
        
    return np.mean(train_losses)

In [29]:
def validation():
    
    valid_losses = []
    
    random_sampling_val_IAM = random.sample(range(len_val_IAM), len_val_IAM)
    random_sampling_val_IAM = sort_by_batch(random_sampling_val_IAM, len_val_IAM, batch_size)
    
    random_sampling_val_OSB = random.sample(range(len_val_OSB), len_val_OSB)
    random_sampling_val_OSB = sort_by_batch(random_sampling_val_OSB, len_val_OSB, batch_size)
    
    val_loader = data_generator(batch_size, len_val_IAM, len_val_OSB, image_set = 'X_val', target_set = 'target_val',
                                random_sampling_IAM = random_sampling_val_IAM, random_sampling_OSB = random_sampling_val_OSB, 
                                mode = 'validation')
    
    with torch.no_grad():       
        for num_batch_val, (images_val, labels_val) in enumerate(val_loader):        
            num_batch_val += 1
            encoder_hidden_val = Encoder_model.initHidden()
            images_val = images_val.cuda(1)
            encoder_input_val = CNN_model(images_val)
            encoder_outputs_val, encoder_hidden_val = Encoder_model(encoder_input_val, encoder_hidden_val)
            encoder_outputs_val = encoder_outputs_val.reshape(batch_size, encoder_seq_len, enc_num_directions*enc_hidden_size)
            encoder_outputs_val = encoder_outputs_val.view(batch_size, encoder_seq_len, enc_num_directions, enc_hidden_size)
            encoder_outputs_val = encoder_outputs_val[:, :, 0, :] + encoder_outputs_val[:, :, 1, :]
            
            #decoder_hidden_val = (encoder_hidden_val[0][-2, :, :].unsqueeze(0), encoder_hidden_val[1][-2, :, :].unsqueeze(0)) # last encoder layer (and forward) hidden state
            hidden_state_val = encoder_hidden_val[0][-2, :, :].view(1, batch_size, enc_hidden_size) + encoder_hidden_val[0][-1, :, :].view(1, batch_size, enc_hidden_size)
            cell_state_val = encoder_hidden_val[1][-2, :, :].view(1, batch_size, enc_hidden_size) + encoder_hidden_val[1][-1, :, :].view(1, batch_size, enc_hidden_size)
            decoder_hidden_val = (hidden_state_val, cell_state_val)
            decoder_input_val = get_one_hot_target_IAM(labels=labels_val, seq_len = MAX_LENGTH + 2, output_size = output_size, batch_size = 256).cuda(1)
            
            decoder_output_total_val = []
            for num_letter_val in range(MAX_LENGTH + 2):
            
                decoder_input_letter_val = decoder_input_val[:, num_letter_val, :].unsqueeze(1)

                decoder_output_val, decoder_hidden_val, attn_weights_val = Decoder_model(decoder_input_letter_val, decoder_hidden_val, encoder_outputs_val)
                
                decoder_output_total_val.append(decoder_output_val)
                
            decoder_output_total_val = torch.cat(decoder_output_total_val, dim = 1)
            
            output_indices_val = torch.tensor(list(range(0, MAX_LENGTH + 2 - 1))).cuda(1) # remove last token from the output
            decoder_output_val = torch.index_select(decoder_output_total_val, dim = 1, index = output_indices_val)

            ground_truth_val = torch.argmax(decoder_input_val, dim = 2)
            target_indices_val = torch.tensor(list(range(1, MAX_LENGTH + 2))).cuda(1) # remove START token from the input
            ground_truth_val = torch.index_select(ground_truth_val, dim = 1, index = target_indices_val)
            
            
            decoder_output_val = decoder_output_val.view(batch_size*(MAX_LENGTH + 1), output_size)
            ground_truth_val = ground_truth_val.view(batch_size*(MAX_LENGTH + 1))

            loss_val = criterion(decoder_output_val, ground_truth_val)
            valid_losses.append(loss_val.item())
    return np.mean(valid_losses)

In [30]:
class Patience():
    
    def __init__(self, patience):
        self.patience = patience
        self.current_patience = patience
        self.min_loss_val = float('inf')

    def more_patience(self,loss_val):
        self.current_patience -= 1
        if self.current_patience == 0:
            return False

        if loss_val < self.min_loss_val:
            self.min_loss_val = loss_val
            self.current_patience = patience

            model_name = f"2BILSTM_Bahdanau_IAM_OSBORNE"
            print(", saved best model.")
            
            torch.save({
                'CNN_model_state_dict_IAM_OSB': CNN_model.state_dict(),
                'CNN_optimizer_state_dict_IAM_OSB': CNN_optimizer.state_dict(),
                'Encoder_model_state_dict_IAM_OSB': Encoder_model.state_dict(),
                'Encoder_optimizer_state_dict_IAM_OSB': Encoder_optimizer.state_dict(),
                'Decoder_model_state_dict_IAM_OSB': Decoder_model.state_dict(),
                'Decoder_optimizer_state_dict_IAM_OSB': Decoder_optimizer.state_dict(),
            }, 'Attention_IAM_OSB'+model_name)
            
            torch.save(CNN_model.state_dict(), 'CNN_'+model_name)
            torch.save(Encoder_model.state_dict(), 'Encoder_'+model_name)
            torch.save(Decoder_model.state_dict(), 'Decoder_'+model_name)
    
        return True

In [None]:
torch.manual_seed(1234)
patience = 150

patience_controler = Patience(patience)
start_time = time.time()

len_trn_IAM = 47926
len_val_IAM = 7558
len_tst_IAM = 20292

len_trn_OSB = 7149
len_val_OSB = 286
len_tst_OSB = 194

encoder_seq_len = 92
enc_num_directions = 2
batch_size = 256
enc_hidden_size = 256
dec_hidden_size = 256
#hidden_size = 256

for num_epoch in range(5000000):

    train_loss = train()        
    valid_loss = validation()
    
    CNN_scheduler.step()
    Encoder_scheduler.step()
    Decoder_scheduler.step()
    
    writer.add_scalar('Loss/train', train_loss, num_epoch)
    writer.add_scalar('Loss/validation', valid_loss, num_epoch)
    
    print(f'Epoch: {num_epoch} Train loss: {train_loss} Valid loss: {valid_loss} Duration: {(time.time() - start_time)/60} minutes',)

    if not patience_controler.more_patience(valid_loss):
        print("Se acabó la paciencia")
        break


Epoch: 0 Train loss: 0.9544492380641331 Valid loss: 0.6936579902966817 Duration: 3.2211692690849305 minutes
, saved best model.
Epoch: 1 Train loss: 0.6971235216778016 Valid loss: 0.6151993115743001 Duration: 6.468083480993907 minutes
, saved best model.
Epoch: 2 Train loss: 0.6182667629741062 Valid loss: 0.5430951019128164 Duration: 9.71184621254603 minutes
, saved best model.
Epoch: 3 Train loss: 0.5773695260286331 Valid loss: 0.5342610627412796 Duration: 12.957438496748606 minutes
, saved best model.
Epoch: 4 Train loss: 0.553491602294913 Valid loss: 0.48882017433643343 Duration: 16.207496503988903 minutes
, saved best model.
Epoch: 5 Train loss: 0.5229471952558677 Valid loss: 0.45822919805844625 Duration: 19.45602447191874 minutes
, saved best model.
Epoch: 6 Train loss: 0.4994939879000744 Valid loss: 0.4619922677675883 Duration: 22.712537145614625 minutes
Epoch: 7 Train loss: 0.47409003569143954 Valid loss: 0.4270054340362549 Duration: 25.961822581291198 minutes
, saved best model

In [None]:
# activar pytorch_estoril (environment) en la terminal y ejecutar tensorboard --host 0.0.0.0 --logdir ./runs
# Tensorboard se ejecutará en un cierto puerto y nos dará el enlace. Habrá que sustituir la IP 0.0.0.0 por la del equipo
# en remoto en la que esté corriendo en el caso de Estoril 212.128.3.86: