In [6]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import time
import numpy
import h5py

from PIL import Image, ImageSequence

import cv2

import torchtext.data as data
import torchtext.datasets as datasets

import os
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, Sentence

In [7]:
hdf5_file = './HD5F_files/hdf5_images_gzip.hdf5'
h5_file = h5py.File(hdf5_file, "r")

In [165]:
# Dataset creation with image directory, image -> 'RGB' -> transformed to Mobilenetv2 input, Ocr,
# Class and Segmentation

data_transforms = {
        'train': transforms.Compose([
            transforms.Resize(256),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])}
#Independent train and test transformations can be done

class H5Dataset(Dataset):

    def __init__(self, hdf5_file, data_transforms):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.h5_file = h5py.File(hdf5_file, "r")
        self.data = h5_file.get('train_img')
        self.target = h5_file.get('train_labels')
        self.data_transforms = data_transforms

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        
        img = self.data[idx,:,:,:],
        img = Image.fromarray(img[0].astype('uint8'), 'RGB')
        #doc_class = torch.from_numpy(self.target[idx,:,:,:]).float()
        doc_class = self.target[idx]
        doc_class = doc_class.astype(np.uint8)
        doc_class = torch.tensor(doc_class)
        print(doc_class)

        if self.data_transforms is not None:
            try:
                image = self.data_transforms(img)
            except:
                print("Cannot transform image: {}")
        
        ocr = 'Hello World is awesome .'

        sentence = Sentence(ocr)

        # just embed a sentence using the StackedEmbedding as you would with any single embedding.
        #ocr_embedd = []
        #stacked_embeddings.embed(sentence)
        #for token in sentence:
            #ocr_embedd.append(token.embedding)
            # init embedding
        flair_embedding_forward = FlairEmbeddings('news-forward')
            
        flair_embedding_forward.embed(sentence)
        counter = 0
        for token in sentence:
            #print(token)
            token_embedding = token.embedding
            token_embedding = token_embedding.unsqueeze(0)
            #print(token_embedding)
            #print(token_embedding.shape)
            if counter == 0:
                prev_token_embedding = token_embedding
            if counter != 0:
                prev_token_embedding = torch.cat((prev_token_embedding, token_embedding),0)
            counter += 1
        #print(prev_token_embedding.shape)
        #prev_token_embedding = prev_token_embedding.unsqueeze(0)
        
        sample = {'image': image, 'class': doc_class, 'ocr': prev_token_embedding}

        return sample

In [166]:
h5_dataset = H5Dataset(hdf5_file='./HD5F_files/hdf5_images_gzip.hdf5', data_transforms=data_transforms['train'])

In [167]:
dataloader = DataLoader(h5_dataset, batch_size=1,
                        shuffle=False, num_workers=1)

## Mobilenetv2 model

In [168]:
import torch.nn as nn
import math

__all__ = ['MobileNetV2', 'mobilenetv2_19']

class Bottleneck(nn.Module):

    def __init__(self, inplanes, planes, stride=1, downsample=None, expansion=1):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, inplanes*expansion, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(inplanes*expansion)
        self.conv2 = nn.Conv2d(inplanes*expansion, inplanes*expansion, kernel_size=3, stride=stride,
                               padding=1, bias=False, groups=inplanes*expansion)
        self.bn2 = nn.BatchNorm2d(inplanes*expansion)
        self.conv3 = nn.Conv2d(inplanes*expansion, planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class MobileNetV2(nn.Module):

    def __init__(self, block, layers, num_classes=16):
        self.inplanes = 32
        super(MobileNetV2, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)
        self.layer1 = self._make_layer(block, 16, layers[0], stride=1, expansion = 1)
        self.layer2 = self._make_layer(block, 24, layers[1], stride=2, expansion = 6)
        self.layer3 = self._make_layer(block, 32, layers[2], stride=2, expansion = 6)
        self.layer4 = self._make_layer(block, 64, layers[3], stride=2, expansion = 6)
        self.layer5 = self._make_layer(block, 96, layers[4], stride=1, expansion = 6)
        self.layer6 = self._make_layer(block, 160, layers[5], stride=2, expansion = 6)
        self.layer7 = self._make_layer(block, 320, layers[6], stride=1, expansion = 6)
        self.conv8 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, bias=False)
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.conv9 = nn.Conv2d(1280,num_classes, kernel_size=1, stride=1, bias=False)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride, expansion):

        downsample = nn.Sequential(
            nn.Conv2d(self.inplanes, planes,
                      kernel_size=1, stride=stride, bias=False),
            nn.BatchNorm2d(planes),
        )

        layers = []
        layers.append(block(self.inplanes, planes, stride=stride, downsample=downsample, expansion=expansion))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, expansion=expansion))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        x = self.layer6(x)
        x = self.layer7(x)

        x = self.conv8(x)
        x = self.avgpool(x)
        x = self.conv9(x)
        x = x.view(x.size(0),-1)
        #print(x.size(0))#1Xnum_classes size

        return x


def mobilenetv2_19(**kwargs):
    """Constructs a MobileNetV2-19 model.
    """
    model = MobileNetV2(Bottleneck, [1, 2, 3, 4, 3, 3, 1], **kwargs)
    return model

In [169]:
 # get model
model = mobilenetv2_19(num_classes = 16)
# define loss function
criterion = nn.CrossEntropyLoss()
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.00004)

In [171]:
max_epochs = 5
optimizer = optimizer_ft
batch_size=1
running_loss = 0.0

# Loop over epochs
for epoch in range(max_epochs):
    # Training
    for local_batch in dataloader:
        inputs, labels = Variable(local_batch['image']), Variable(local_batch['class'])
        # zero the parameter gradients
        optimizer.zero_grad()
        #print(local_batch['image_dir'])

        # forward
        #print(labels)
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        #print(preds)
        loss = criterion(outputs, labels.long())
        print(outputs)

        # backward + optimize only if in training phase
        loss.backward()
        optimizer.step()
        #running_loss += loss.data[0]
        
        #print(outputs)
        print('[Epoch {}/{}], loss {}'.format(
                          epoch, max_epochs,loss))

tensor(1, dtype=torch.uint8)
tensor(14, dtype=torch.uint8)
tensor([[-4.5266e+00,  7.8963e+00,  2.9438e+00,  3.1608e+00, -1.3839e+00,
          1.7627e+00, -2.1926e+00, -3.7390e+01,  4.2367e+00, -6.3124e+00,
         -1.7491e+00,  2.0745e+00, -1.5561e+00, -2.5088e+00,  3.1310e+01,
         -3.7051e-02]], grad_fn=<ViewBackward>)
tensor(14, dtype=torch.uint8)
[Epoch 0/5], loss 23.41333770751953
tensor([[ -4.4977,  21.4836,   2.8972,   3.3685,  -2.4084,   2.4687,  -1.6788,
         -48.2318,   3.9849,  -5.8777,  -3.2028,   2.3334,  -0.5693,  -2.1612,
          28.3483,  -0.1412]], grad_fn=<ViewBackward>)
tensor(14, dtype=torch.uint8)
[Epoch 0/5], loss 0.0010433197021484375
tensor([[ -4.6183,  34.4781,   3.4406,   2.0351,  -2.8026,   2.9564,  -1.2115,
         -58.2991,   4.6932,  -6.6107,  -3.5630,   1.4221,   1.7193,  -2.4530,
          25.2038,   0.8171]], grad_fn=<ViewBackward>)
[Epoch 0/5], loss 9.27444076538086
tensor(1, dtype=torch.uint8)
tensor([[ -3.5077,  30.3247,   3.7095,   3.49

KeyboardInterrupt: 