In [2]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import time
import numpy
import h5py

from PIL import Image, ImageSequence

import cv2

import torchtext.data as data
import torchtext.datasets as datasets

import os
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, Sentence

import tensorwatch as tw

flair_embedding_forward = FlairEmbeddings('news-forward')

In [3]:
# Dataset creation with image directory, image -> 'RGB' -> transformed to Mobilenetv2 input, Ocr,
# Class and Segmentation

data_transforms = {
        'train': transforms.Compose([
            transforms.Resize(256),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])}
#Independent train and test transformations can be done

class H5Dataset(Dataset):

    def __init__(self, hdf5_file, data_transforms):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.h5_file = h5py.File(hdf5_file, "r")
        self.data = self.h5_file.get('train_img')
        self.target = self.h5_file.get('train_labels')
        self.ocr = self.h5_file.get('train_ocrs')
        self.data_transforms = data_transforms

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        
        img = self.data[idx,:,:,:],
        img = Image.fromarray(img[0].astype('uint8'), 'RGB')
        #doc_class = torch.from_numpy(self.target[idx,:,:,:]).float()
        doc_class = self.target[idx]
        doc_class = doc_class.astype(np.uint8)
        doc_class = torch.tensor(doc_class)
        
        ocr_text = self.ocr[idx]
        

        if self.data_transforms is not None:
            try:
                image = self.data_transforms(img)
            except:
                print("Cannot transform image: {}")
        

        ocr = ocr_text #ocr_text

        sentence = Sentence(ocr)

        #flair_embedding_fast = FlairEmbeddings('multi-forward-fast')
        #flair_embedding_fast.embed(sentence)

        flair_embedding_forward.embed(sentence)



        #flair_embedding_fast.embed(sentence)
        counter = 0
        for token in sentence:
            #print(token)
            token_embedding = token.embedding
            token_embedding = token_embedding.unsqueeze(0)
            #print(token_embedding)
            #print(token_embedding.shape)
            if counter == 0:
                prev_token_embedding = token_embedding
            if counter != 0:
                prev_token_embedding = torch.cat((prev_token_embedding, token_embedding),0)
            counter += 1

        
        
        #print('Sentence embedded size',prev_token_embedding.shape)
            
        #document_embeddings.embed(sentence_doc)
        
        #print('Document embedding', sentence_doc.get_embedding().shape)
        
        #prev_token_embedding = sentence_doc.get_embedding()
        
        sample = {'image': image, 'class': doc_class, 'ocr': prev_token_embedding}

        return sample

In [4]:
h5_dataset = H5Dataset(hdf5_file='./HDF5_files/hdf5_10.hdf5', data_transforms=data_transforms['train'])

In [5]:
dataloader = DataLoader(h5_dataset, batch_size=1,
                        shuffle=False, num_workers=0)

In [6]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [17]:
from models.imagenet import mobilenetv2
import torch.nn as nn

net = mobilenetv2()
net.load_state_dict(torch.load('pretrained/mobilenetv2_1.0-0c6065bc.pth', map_location='cpu'))
feature_extracting = True
set_parameter_requires_grad(net, feature_extracting)
net.classifier = nn.Linear(1280, 300)
print(net.classifier)

Linear(in_features=1280, out_features=300, bias=True)


## CNN Text model

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class CNN_Text(nn.Module):
    
    def __init__(self, image_model):
        super(CNN_Text, self).__init__()
        #self.args = args
        
        #V = args.embed_num
        D = 2048 #embed_dim, 4196 for doc_embeddings
        C = 10 #class_num
        Ci = 1
        Co = 100 #kernel_num -> number of kernel with the same size
        Ks = [3,4,5] #kernel_sizes -> size = number of words

        #self.embed = nn.Embedding(V, D)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        
        
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(0.5)
        #self.fc1 = nn.Linear(len(Ks)*Co, C)
        self.fc1 = nn.Linear(600, C)
        
        self.image_model = image_model

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        
        #Output will be size (1,Ks*Co) -> Maxpool will get one ĉ value =  max(c_1,c_2...), where c_i is
        #the result of the convolution operation of the kernel over the input
        
        return x
    
    

    def forward(self, x, x2):
        #x = self.embed(x)  # (N, W, D)
        
        #if self.args.static:
            #x = Variable(x)
        #print('CNN Text entry',x.shape)

        x = x.unsqueeze(1)  # (N, Ci, W, D)
        #print('unsqueeze',x.shape)
        
        
        #print(x.shape)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        

        x = torch.cat(x, 1) #[1,100] + [1,100] + [1,100] = [1,300]
        
        #print('After cat', x.shape)

        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        x = self.dropout(x)  # (N, len(Ks)*Co)
        
        x2 = self.image_model(x2)
        
        x2 = torch.cat((x,x2),1)
        
        
        logit = self.fc1(x2)
        #logit = self.fc1(x)  # (N, C)
        return logit

In [19]:
# get model
model = CNN_Text(net)
#print(model)
# define loss function
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.00004)

In [20]:
params_to_update = model.parameters()
print("Params to learn:")
if feature_extracting:
    params_to_update = []
    for name,param in model.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)
else:
    for name,param in net.named_parameters():
        if param.requires_grad == True:
            print("\t",name)

Params to learn:
	 convs1.0.weight
	 convs1.0.bias
	 convs1.1.weight
	 convs1.1.bias
	 convs1.2.weight
	 convs1.2.bias
	 fc1.weight
	 fc1.bias
	 image_model.classifier.weight
	 image_model.classifier.bias


In [15]:

max_epochs = 30
optimizer = optimizer_ft
batch_size=1
running_loss = 0.0
steps = 0
loss_values = []
epoch_values = []

# Loop over epochs
for epoch in range(max_epochs):
    running_loss = 0
    # Training
    
    for local_batch in dataloader:
        image, ocr_text, labels = Variable(local_batch['image']), Variable(local_batch['ocr']), Variable(local_batch['class'])
        # zero the parameter gradients
        optimizer.zero_grad()
        #print(local_batch['image_dir'])

        # forward
        outputs = model(ocr_text, image)
        _, preds = torch.max(outputs.data, 1)
        #print(preds, labels.long())
        loss = criterion(outputs, labels.long())
        print(outputs)

        # backward + optimize only if in training phase
        loss.backward()
        optimizer.step()
        #running_loss += loss.data[0]
        steps += 1
        #if steps % 5 == 0:
            #save(model,'./snapshot/', 'model', steps)
        
        running_loss += loss.item()
    #print(outputs)ack
    print('[Epoch {}/{}], loss {}'.format(
                      epoch, max_epochs,running_loss/10))
        
   
    loss_values.append(running_loss/10)
    epoch_values.append(epoch)




plt.plot(epoch_values, loss_values, '--', color="#111111",  label="Training score")

#save(model,'./snapshot/', 'model', steps)

tensor([[ 0.2897,  0.0826, -0.2104,  0.2128, -0.1190, -0.2183, -0.3800,  0.2326,
         -0.2472,  0.0993]], grad_fn=<AddmmBackward>)


KeyboardInterrupt: 