In [None]:
from flair.embeddings import WordEmbeddings, Sentence

from flair.embeddings import FlairEmbeddings, ELMoEmbeddings

# init embedding
flair_embedding_forward = FlairEmbeddings('news-forward')

# create a sentence
sentence = Sentence('The grass is green .')

# embed words in sentence
flair_embedding_forward.embed(sentence)

In [1]:
import flair

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
from flair.data import Corpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [3]:
from typing import List, Dict, Union
import logging
log = logging.getLogger("flair")

In [4]:
class Dictionary:
    """
    This class holds a dictionary that maps strings to IDs, used to generate one-hot encodings of strings.
    """

    def __init__(self, add_unk=True):
        # init dictionaries
        self.item2idx: Dict[str, int] = {}
        self.idx2item: List[str] = []
        self.multi_label: bool = False

        # in order to deal with unknown tokens, add <unk>
        if add_unk:
            self.add_item("<unk>")

    def add_item(self, item: str) -> int:
        """
        add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID.
        :param item: a string for which to assign an id.
        :return: ID of string
        """
        item = item.encode("utf-8")
        if item not in self.item2idx:
            self.idx2item.append(item)
            self.item2idx[item] = len(self.idx2item) - 1
        return self.item2idx[item]

    def get_idx_for_item(self, item: str) -> int:
        """
        returns the ID of the string, otherwise 0
        :param item: string for which ID is requested
        :return: ID of string, otherwise 0
        """
        item = item.encode("utf-8")
        if item in self.item2idx.keys():
            return self.item2idx[item]
        else:
            return 0

    def get_items(self) -> List[str]:
        items = []
        for item in self.idx2item:
            items.append(item.decode("UTF-8"))
        return items

    def __len__(self) -> int:
        return len(self.idx2item)

    def get_item_for_index(self, idx):
        return self.idx2item[idx].decode("UTF-8")

    def save(self, savefile):
        import pickle

        with open(savefile, "wb") as f:
            mappings = {"idx2item": self.idx2item, "item2idx": self.item2idx}
            pickle.dump(mappings, f)

    @classmethod
    def load_from_file(cls, filename: str):
        import pickle

        dictionary: Dictionary = Dictionary()
        with open(filename, "rb") as f:
            mappings = pickle.load(f, encoding="latin1")
            idx2item = mappings["idx2item"]
            item2idx = mappings["item2idx"]
            dictionary.item2idx = item2idx
            dictionary.idx2item = idx2item
        return dictionary

    @classmethod
    def load(cls, name: str):
        from flair.file_utils import cached_path

        if name == "chars" or name == "common-chars":
            base_path = "https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models/common_characters"
            char_dict = cached_path(base_path, cache_dir="datasets")
            return Dictionary.load_from_file(char_dict)

        return Dictionary.load_from_file(name)

In [5]:
from flair.file_utils import Tqdm
def make_label_dictionarya(self) -> Dictionary:
        """
        Creates a dictionary of all labels assigned to the sentences in the corpus.
        :return: dictionary of labels
        """
        label_dictionary: Dictionary = Dictionary(add_unk=False)
        label_dictionary.multi_label = False

        from datasets import DataLoader

        loader = DataLoader(self.train, batch_size=1)

        log.info("Computing label dictionary. Progress:")
        for batch in Tqdm.tqdm(iter(loader)):

            for sentence in batch:
 

                for label in sentence.labels:
                    label_dictionary.add_item(label.value)

                if not label_dictionary.multi_label:
                    if len(sentence.labels) > 1:
                        label_dictionary.multi_label = True

        log.info(label_dictionary.idx2item)

        return label_dictionary

In [7]:
from data import Corpus
from datasets import ClassificationCorpus

# this is the folder in which train, test and dev files reside
data_folder = './'

# load corpus containing training, test and dev data
corpus: Corpus = ClassificationCorpus(data_folder,
                                      test_file='test.txt',
                                      dev_file='dev.txt',
                                      train_file='train.txt')


# 2. create the label dictionary
label_dict = make_label_dictionarya(corpus)
print(get_label_distribution(corpus))

# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('glove'),

                   # comment in flair embeddings for state-of-the-art results
                   # FlairEmbeddings('news-forward'),
                   # FlairEmbeddings('news-backward'),
                   ]

2019-07-21 18:25:02,465 Reading data from .
2019-07-21 18:25:02,466 Train: train.txt
2019-07-21 18:25:02,468 Dev: dev.txt
2019-07-21 18:25:02,469 Test: test.txt
2019-07-21 18:25:02,584 Computing label dictionary. Progress:


100%|██████████| 345/345 [00:03<00:00, 104.09it/s]

2019-07-21 18:25:05,998 [b'1', b'9', b'2', b'0', b'4', b'5', b'6', b'8', b'7', b'3']





NameError: name 'get_label_distribution' is not defined

In [None]:
# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     )

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('resources/taggers/ag_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

In [None]:
from flair.embeddings import FlairEmbeddings
from flair.data import Sentence

# init embedding
flair_embedding_forward = FlairEmbeddings(model = 'news-forward')

print('pt loaded')

# create a sentence


In [None]:
sentence = Sentence('The grass is green .')

# embed words in sentence
flair_embedding_forward.embed(sentence)

for token in sentence:
    token_embedding = token.embedding
    print(token_embedding)

In [None]:
#embedding = ELMoEmbeddings('small')
glove_embedding = WordEmbeddings('glove')

sentence = Sentence('the grass is green .')

glove_embedding.embed(sentence)

In [None]:
for token in sentence:
    print(token.embedding)

## Sentences segmentation

In [None]:
text = "This is a sentence. This is another sentence."

from segtok.segmenter import split_single
#sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)]
sentences = [sent for sent in split_single(text)]

In [None]:
print(sentences)

In [None]:
flair_embedding_forward.embed(sentences)

In [None]:
for indiv_sentence in sentences:
    for token in indiv_sentence:
        print(token.embedding)

## Glove

In [None]:
import bcolz
import numpy as np
import pickle

In [None]:


words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'./Data/6B.100.dat', mode='w')

with open(f'./Data/glove.6B.100d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
    
vectors = bcolz.carray(vectors[1:].reshape((400000, 100)), rootdir=f'./Data/6B.100.dat', mode='w')
vectors.flush()
pickle.dump(words, open(f'./Data/6B.100_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'./Data/6B.100_idx.pkl', 'wb'))

In [None]:
vectors = bcolz.open(f'./Data/6B.100.dat')[:]
words = pickle.load(open(f'./Data/6B.100_words.pkl', 'rb'))
word2idx = pickle.load(open(f'./Data/6B.100_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

In [None]:
glove['the']

In [None]:
from segtok.tokenizer import split_contractions
from segtok.tokenizer import word_tokenizer

In [None]:
import h5py
from itertools import *
import matplotlib.pyplot as plt
plt.style.use("ggplot")

hdf5_file='./HDF5_files/hdf5_10.hdf5'
h5_file = h5py.File(hdf5_file, "r")
ocr = h5_file.get('train_ocrs')

In [None]:
contractions_full = []
for counter, text in enumerate(ocr):
    contractions = split_contractions(word_tokenizer(text))
    for element in contractions:
        element = element.lower()
        contractions_full.append(element)

In [None]:
print(contractions_full)
target_vocab = contractions_full

In [None]:
emb_dim =100
matrix_len = len(target_vocab)
print(matrix_len)
weights_matrix = np.zeros((matrix_len, 100))
words_found = 0

In [None]:
target_vocab[1]

In [None]:
for i, word in enumerate(target_vocab):
    try: 
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(emb_dim, ))

In [None]:
weights_matrix[1]

In [None]:
words_found

In [None]:
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [26]:
import torch

In [14]:
from mobilenetv3 import mobilenetv3_large, mobilenetv3_small

net_large = mobilenetv3_large()

net_large.load_state_dict(torch.load('./pretrained/mobilenetv3-large-657e7b3d.pth',map_location='cpu'))

RuntimeError: Error(s) in loading state_dict for MobileNetV3:
	Missing key(s) in state_dict: "classifier.0.weight", "classifier.0.bias", "classifier.3.weight", "classifier.3.bias". 
	Unexpected key(s) in state_dict: "classifier.1.weight", "classifier.1.bias", "classifier.5.weight", "classifier.5.bias". 

In [46]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [47]:
from models.imagenet import mobilenetv2
import torch.nn as nn

In [48]:
net = mobilenetv2()
net.load_state_dict(torch.load('pretrained/mobilenetv2_1.0-0c6065bc.pth', map_location='cpu'))
feature_extracting = True
set_parameter_requires_grad(net, feature_extracting)

In [49]:
net.classifier = nn.Linear(1280, 300)

In [50]:
print(net.classifier)

Linear(in_features=1280, out_features=300, bias=True)


In [51]:
params_to_update = net.parameters()
print("Params to learn:")
if feature_extracting:
    params_to_update = []
    for name,param in net.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)
else:
    for name,param in net.named_parameters():
        if param.requires_grad == True:
            print("\t",name)

Params to learn:
	 classifier.weight
	 classifier.bias


In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class CNN_Text(nn.Module):
    
    def __init__(self, image_model):
        super(CNN_Text, self).__init__()
        #self.args = args
        
        #V = args.embed_num
        D = 2048 #embed_dim, 4196 for doc_embeddings
        C = 10 #class_num
        Ci = 1
        Co = 100 #kernel_num -> number of kernel with the same size
        Ks = [3,4,5] #kernel_sizes -> size = number of words

        #self.embed = nn.Embedding(V, D)
        # self.convs1 = [nn.Conv2d(Ci, Co, (K, D)) for K in Ks]
        self.convs1 = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
        
        
        '''
        self.conv13 = nn.Conv2d(Ci, Co, (3, D))
        self.conv14 = nn.Conv2d(Ci, Co, (4, D))
        self.conv15 = nn.Conv2d(Ci, Co, (5, D))
        '''
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(len(Ks)*Co, C)
        
        self.image_model = image_model

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, Co, W)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        
        #Output will be size (1,Ks*Co) -> Maxpool will get one ĉ value =  max(c_1,c_2...), where c_i is
        #the result of the convolution operation of the kernel over the input
        
        return x
    
    

    def forward(self, x, x2):
        #x = self.embed(x)  # (N, W, D)
        
        #if self.args.static:
            #x = Variable(x)
        #print('CNN Text entry',x.shape)

        x = x.unsqueeze(1)  # (N, Ci, W, D)
        #print('unsqueeze',x.shape)
        
        
        #print(x.shape)

        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W), ...]*len(Ks)
        

        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]  # [(N, Co), ...]*len(Ks)
        

        x = torch.cat(x, 1) #[1,100] + [1,100] + [1,100] = [1,300]
        
        #print('After cat', x.shape)

        '''
        x1 = self.conv_and_pool(x,self.conv13) #(N,Co)
        x2 = self.conv_and_pool(x,self.conv14) #(N,Co)
        x3 = self.conv_and_pool(x,self.conv15) #(N,Co)
        x = torch.cat((x1, x2, x3), 1) # (N,len(Ks)*Co)
        '''
        x = self.dropout(x)  # (N, len(Ks)*Co)
        
        x2 = self.image_model(x2)
        
        x2 = torch.cat((x,x2),1)
        
        
        logit = x2
        #logit = self.fc1(x)  # (N, C)
        return logit

In [53]:
text_model = CNN_Text(net)


In [54]:
print(text_model)

CNN_Text(
  (convs1): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 2048), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 2048), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 2048), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5)
  (fc1): Linear(in_features=300, out_features=10, bias=True)
  (image_model): MobileNetV2(
    (features): Sequential(
      (0): Sequential(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU6(inplace)
      )
      (1): InvertedResidual(
        (conv): Sequential(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace)
          (3): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (4): BatchNorm

In [None]:
base_path = './resources/combined'
flair_embedding_forward = FlairEmbeddings('./Data/news-forward-0.4.1.pt')

# Dataset creation with image directory, image -> 'RGB' -> transformed to Mobilenetv2 input, Ocr,
# Class and Segmentation
__all__ = ['MobileNetV2', 'mobilenetv2_19']

data_transforms = {
        'train': transforms.Compose([
            transforms.Resize(256),
            transforms.RandomCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])}
#Independent train and test transformations can be done
h5_dataset = H5Dataset(path='./HDF5_files/hdf5_small_tobacco_cover.hdf5', data_transforms=data_transforms['train'], embedding_model = flair_embedding_forward, phase = 'train')

dataloader_train = DataLoader(h5_dataset, batch_size=1, shuffle=False, num_workers=0)
# for x in dataloader:
#     x = x.to('cuda', non_blocking=True)

#https://github.com/d-li14/mobilenetv2.pytorch
net = mobilenetv2()
net.load_state_dict(torch.load('pretrained/mobilenetv2_1.0-0c6065bc.pth'))
feature_extracting = True
set_parameter_requires_grad(net, feature_extracting)
net.classifier = nn.Linear(1280, 300)

# get model
model = CNN_Text(net)
#print(model)
# define loss function
criterion = nn.CrossEntropyLoss()
# set optimize
optimizer_ft = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.00004)

max_epochs = 1
optimizer = optimizer_ft
batch_size=1
running_loss = 0.0
loss_values = []
epoch_values = []

# Loop over epochs
for epoch in range(max_epochs):
    running_loss = 0
    steps = 0
    # Training
    print('Antes del loop')
    for local_batch in dataloader_train:
        image, ocr_text, labels = Variable(local_batch['image']), Variable(local_batch['ocr']), Variable(local_batch['class'])
        #print(ocr_text.shape)
        steps += 1
        if ocr_text.shape[1]< 5:
            print(steps)
            pass
        else:
            # zero the parameter gradients
            optimizer.zero_grad()
            #print(local_batch['image_dir'])

            # forward
            outputs = model(ocr_text, image)
            _, preds = torch.max(outputs.data, 1)
            #print(preds, labels.long())
            loss = criterion(outputs, labels.long())
            #print(outputs)

            # backward + optimize only if in training phase
            loss.backward()
            optimizer.step()
            #running_loss += loss.data[0]
            if steps % 100 == 0:
                print(steps)
                #save(model,'./snapshot/', 'model', steps)
            running_loss += loss.item()

    #print(outputs)
    print('[Epoch {}/{}], loss {}'.format(
                    epoch, max_epochs,running_loss/800))

    loss_values.append(running_loss/800)
    epoch_values.append(epoch)