In [50]:
%matplotlib inline
import numpy as np
from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
import theano.tensor as T
import theano
import lasagne
import lasagne.nonlinearities as nonlin
import pylab
import pickle
import re
import os
import imageio
import time
from IPython.display import display, clear_output
import PIL

from net.vgg16 import make_network_from_file
from net.process import preprocess, resize_img

In [2]:
# Root folder of the COCO 2017 annotation bundle and the split used in this notebook.
dataDir = 'annotations_trainval2017'
dataType = 'val2017'
# Captions file of the chosen split: <dataDir>/annotations/captions_<dataType>.json
annFile = '%s/annotations/captions_%s.json' % (dataDir, dataType)

# Read the captions file into memory and build the COCO lookup index.
coco = COCO(annFile)

loading annotations into memory...
Done (t=0.11s)
creating index...
index created!


In [3]:
# Total number of caption annotations, followed by a peek at the last three entries.
annotation_items = list(coco.anns.items())
print(len(coco.anns))
display(annotation_items[-3:])

25014


[(541200,
  {'caption': 'a glass vase with some flower coming out of it ',
   'id': 541200,
   'image_id': 159282}),
 (262135,
  {'caption': 'A tennis player is taking a swing on a red court.',
   'id': 262135,
   'image_id': 551804}),
 (589821,
  {'caption': 'there is a yellow notebook on a black desk ',
   'id': 589821,
   'image_id': 176446})]

# Build and load VGG16

In [4]:
# net, prob_and_vec = make_network_from_file()

# img2vec

In [5]:
from net.imgs import *
      
# %time imgs_process('annotations_trainval2017/img') # Wall time: 4h 43min 38s

In [6]:
# Load the stored per-image data (presumably the output of imgs_process — confirm in net.imgs)
# and show how many images are available. Later cells iterate it with .items().
imgs = imgs_load(dataDir)
display(len(imgs))

4991

In [7]:
# Sanity check on one stored chunk: print the value range of a sample vector.
# The path is built with os.path.join so it works on any OS (the previous literal
# mixed an invalid '\d' escape with '\\' and only resolved on Windows).
# NOTE(review): pickle.load can execute arbitrary code — only open files produced locally.
sample_path = os.path.join(dataDir, 'data', '130.data')
with open(sample_path, 'rb') as fl:  # context manager closes the handle
    temp = pickle.load(fl)
temp = list(temp.items())
print('max =', np.max(temp[10][1][1]))
print('min =', np.min(temp[10][1][1]))

max = 12.5911690933
min = 0.0


# texts

In [8]:
from net.sents_process import *

# sents_process(dict_from_anns(coco.anns), 'annotations_trainval2017')

In [9]:
# Load the processed captions. Later cells iterate this as (id, sentence) pairs, where
# a sentence is a sequence mixing str items with non-str items (presumably word vectors).
anno = sent_load(dataDir)

In [10]:
# Group the processed sentences by id: danno[key] lists every sentence stored under key,
# in the order they appear in anno.
danno = {}
for key, sentence in anno:
    danno.setdefault(key, []).append(sentence)

In [11]:
# Pull the raw caption string out of every annotation and preview the first five.
captions = [ann['caption'] for ann in coco.anns.values()]
display(captions[:5])

['Two mean are playing tennis and both are wearing sunglasses.  ',
 'a couple of people that are playing in a field',
 'A purple and white bus in a parking lot.',
 'a tennis player swinging a racket at a ball',
 'a person attempting a jump with a skateboard']

In [12]:
def mix_text_vec2vec(sent):
    """Collapse a sentence into one fixed-size vector.

    Every item of ``sent`` that is not a ``str`` is treated as a vector; these
    are stacked row-wise and reduced column-wise. The result is the
    concatenation ``[mean, max, min]``, i.e. three times the vector length.
    Raises ``ValueError`` (from ``np.vstack``) when ``sent`` holds no vectors.
    """
    rows = [item for item in sent if not isinstance(item, str)]
    stacked = np.vstack(rows)
    pooled = [stacked.mean(axis=0), stacked.max(axis=0), stacked.min(axis=0)]
    return np.concatenate(pooled)

# For every id, collect the pooled (mean/max/min) vector of each of its sentences.
vects = {}
for key, sentence in anno:
    pooled = mix_text_vec2vec(sentence)
    vects.setdefault(key, []).append(pooled)

# neural networks

In [13]:
from net.train import *

# CNN

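Generate X, y: X holds one zero-padded word-vector matrix per caption, y holds the vector of the image that caption belongs to.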

In [14]:
# Geometry of one network input sample: an H x W matrix per caption.
H = 300  # length of a single word vector (first axis of a sample) — presumably the embedding size; confirm in net.sents_process
W = 14  # word slots per caption: shorter captions are zero-padded, longer ones are cut off

In [15]:
train_size = 23300  # the first train_size samples are used for training, the rest for validation

# Build one (caption matrix, image vector) pair per caption.
#   X[i] is [matrix] with matrix of shape (vector length, W): the caption's word vectors,
#        zero-padded / truncated to W words and transposed.
#   y[i] is data[1] of the image the caption belongs to.
X, y = [], []
n_failed_imgs = 0  # images with no captions, or whose processing stopped on an error
for img_id, data in imgs.items():
    # Image keys carry leading zeros while danno keys do not. lstrip (unlike indexing
    # img_id[0] in a loop) cannot raise on a key made only of zeros.
    caption_key = img_id.lstrip('0')
    try:
        for ls in danno[caption_key]:
            # Named word_vecs so the notebook-level `vects` dict is not overwritten.
            word_vecs = [elem for elem in ls if not isinstance(elem, str)]
            sample = np.pad(word_vecs, [(0, W), (0, 0)], mode='constant', constant_values=0)[:W].T
            target = data[1]
            # Append only once both halves exist, so X and y can never drift apart.
            X.append([sample])
            y.append(target)
    except Exception:
        # Deliberately best-effort: an image without captions (KeyError) or with a caption
        # that has no word vectors (np.pad raises) is skipped. As before, a failure drops the
        # remaining captions of that image but keeps the pairs already appended.
        # Catching Exception instead of a bare except lets KeyboardInterrupt through.
        n_failed_imgs += 1

assert len(X) == len(y)
print(len(X))
print('images skipped or cut short:', n_failed_imgs)

X_train, y_train, X_val, y_val = X[:train_size], y[:train_size], X[train_size:], y[train_size:]

23968


In [16]:
#with open('data.net', 'wb') as fl:
#    pickle.dump((X_train, y_train, X_val, y_val), fl)

# CNN model (new version)

In [17]:
def build_cnn(file_name=None):
    """Assemble the caption network and compile its Theano functions.

    Architecture (input is a batch of ``H x W`` matrices):
      * three global poolings of the raw input (max, min, mean);
      * three 1-D convolutions (64 filters, filter sizes 2, 3, 4, no
        nonlinearity), each followed by a global max pooling;
      * the six pooled branches are concatenated and passed through two
        500-unit ELU dense layers, each followed by dropout (p=0.5);
      * a 4096-unit dense output layer.

    Training minimises the mean squared error with Adam (initial learning
    rate 0.001, kept in a shared variable so it can be changed later).

    Parameters
    ----------
    file_name : optional
        When truthy, parameters are restored with
        ``load_net(net['last'], file_name, dataDir)``.

    Returns
    -------
    Whatever ``NeuralNet.check()`` returns for the populated container. The
    container exposes ``input_shape``, ``learning_rate``, ``train_fun``,
    ``loss_fun`` (dropout active), ``loss_fun_det`` and ``predict_fun_det``
    (both deterministic). The callables expect each input sample wrapped in
    a one-element sequence and unwrap it before calling Theano.
    """
    cnn = NeuralNet(must_have=[
        'input_shape',
        'learning_rate',
        'train_fun',
        'loss_fun',
        'loss_fun_det',
        'predict_fun_det',
    ])
    layers = lasagne.layers

    shape = [None, H, W]
    x_sym = T.tensor3("input X", dtype='float32')
    y_sym = T.matrix("target Y", dtype='float32')

    cnn['inp'] = layers.InputLayer(shape, input_var=x_sym)

    # Branch group 1: global statistics taken directly from the input.
    raw_pools = (
        ('max', theano.tensor.max),
        ('min', theano.tensor.min),
        ('mean', theano.tensor.mean),
    )
    for name, pool in raw_pools:
        cnn[name] = layers.GlobalPoolLayer(cnn['inp'], pool_function=pool)

    # Branch group 2: linear convolutions of several widths, then global max pooling.
    # All convolutions are created before any pooling layer, matching the original
    # construction order.
    filter_sizes = (2, 3, 4)
    for size in filter_sizes:
        cnn['con_%d' % size] = layers.Conv1DLayer(
            cnn['inp'], num_filters=64, filter_size=size, nonlinearity=None)
    for size in filter_sizes:
        cnn['gmax_%d' % size] = layers.GlobalPoolLayer(
            cnn['con_%d' % size], pool_function=T.max)

    branch_names = ('max', 'min', 'mean', 'gmax_2', 'gmax_3', 'gmax_4')
    cnn['merge'] = layers.ConcatLayer(tuple(cnn[name] for name in branch_names))

    # Two identical dense + dropout stages on top of the merged features.
    previous = 'merge'
    for stage in (1, 2):
        dense_name = 'dens_%d' % stage
        drop_name = 'drop_%d' % stage
        cnn[dense_name] = layers.DenseLayer(cnn[previous], num_units=500, nonlinearity=nonlin.elu)
        cnn[drop_name] = layers.DropoutLayer(cnn[dense_name], p=0.5)
        previous = drop_name

    cnn['last'] = layers.DenseLayer(cnn['drop_2'], num_units=4096)

    # Symbolic outputs: with dropout (training) and without (deterministic).
    out_train = layers.get_output(cnn['last'])
    out_det = layers.get_output(cnn['last'], deterministic=True)

    trainable = layers.get_all_params(cnn['last'], trainable=True)

    step_size = theano.shared(lasagne.utils.floatX(0.001))
    mse = lasagne.objectives.squared_error(y_sym, out_train).mean()
    mse_det = lasagne.objectives.squared_error(y_sym, out_det).mean()

    # L2 regularisation of the last layer was tried and is currently disabled:
    # mse = mse + lasagne.regularization.regularize_layer_params(cnn['last'], lasagne.regularization.l2) * 0.01
    adam_updates = lasagne.updates.adam(mse, trainable, learning_rate=step_size)

    train_fun = theano.function([x_sym, y_sym], mse, updates=adam_updates, allow_input_downcast=True)
    loss_fun = theano.function([x_sym, y_sym], mse, allow_input_downcast=True)
    loss_fun_det = theano.function([x_sym, y_sym], mse_det, allow_input_downcast=True)
    predict_fun_det = theano.function([x_sym], out_det, allow_input_downcast=True)

    if file_name:
        load_net(cnn['last'], file_name, dataDir)

    def unwrap(batch):
        """Return the first element of every sample in the batch."""
        return [sample[0] for sample in batch]

    def unwrap_x(fun):
        """Wrap a one-argument function so its input batch is unwrapped first."""
        def call(v):
            return fun(unwrap(v))
        return call

    def unwrap_xy(fun):
        """Wrap a two-argument function so its first argument is unwrapped first."""
        def call(v, u):
            return fun(unwrap(v), u)
        return call

    cnn.input_shape = shape
    cnn.learning_rate = step_size
    cnn.train_fun = unwrap_xy(train_fun)
    cnn.loss_fun = unwrap_xy(loss_fun)
    cnn.loss_fun_det = unwrap_xy(loss_fun_det)
    cnn.predict_fun_det = unwrap_x(predict_fun_det)

    return cnn.check()

In [18]:
# Build the network with freshly initialised parameters (no file_name, so nothing is loaded).
net = build_cnn()

In [21]:
# Summarise every layer held by the network: name, output shape, parameter shapes.
for key, val in net.items():
    if not isinstance(val, lasagne.layers.Layer):
        continue
    param_shapes = [p.get_value().shape for p in val.get_params()]
    print(key, val.output_shape, param_shapes)

gmax_3 (None, 64) []
last (None, 4096) [(500, 4096), (4096,)]
dens_2 (None, 500) [(500, 500), (500,)]
inp (None, 300, 14) []
mean (None, 300) []
con_3 (None, 64, 12) [(64, 300, 3), (64,)]
max (None, 300) []
drop_2 (None, 500) []
dens_1 (None, 500) [(1092, 500), (500,)]
gmax_2 (None, 64) []
con_4 (None, 64, 11) [(64, 300, 4), (64,)]
gmax_4 (None, 64) []
min (None, 300) []
con_2 (None, 64, 13) [(64, 300, 2), (64,)]
drop_1 (None, 500) []
merge (None, 1092) []


In [None]:
# Create the training driver for the network and hand it the train/validation split.
# NOTE(review): the meaning of the second argument (40) is defined in net.train — confirm.
training_c = Training(net, 40)
training_c.set_Xy(X_train, y_train, X_val, y_val)

In [46]:
try:
    # Top-level pandas.ewma only exists in old pandas (it was removed in 0.23).
    from pandas import ewma
except ImportError:
    import pandas as pd

    def ewma(arg, span):
        """Exponentially weighted moving average of a 1-D sequence.

        Stand-in for the removed ``pandas.ewma`` built on ``Series.ewm``, which
        uses the same defaults (adjust=True, min_periods=0). Returns a NumPy array.
        """
        return pd.Series(arg).ewm(span=span).mean().values


def my_plot(losses_val, losses_train):
    """Plot smoothed training and validation losses in a 2x2 grid.

    Both series are smoothed with an exponentially weighted moving average
    (span 70 for training, span 1.2 for validation). Panels:
      * 221: validation (red) against ``index * Training.mod`` and training (blue);
      * 222: the last 120 smoothed training losses;
      * 223: all smoothed validation losses;
      * 224: the last 60 smoothed validation losses.

    Parameters
    ----------
    losses_val, losses_train : sequences of numbers
    """
    plt.figure(figsize=(16, 8))

    losses_train = ewma(np.array(losses_train), span=70)
    losses_val = ewma(np.array(losses_val), span=1.2)

    plt.subplot(221)
    # Validation points are stretched by Training.mod on the x axis so they share it
    # with the training curve (presumably one validation loss per Training.mod steps — confirm).
    plt.plot(np.arange(len(losses_val)) * Training.mod, losses_val, 'r', losses_train, 'b')
    plt.grid()
    plt.legend(['y = loss valid', 'y = loss train'], loc='upper right')

    plt.subplot(222)
    plt.plot(losses_train[-120:])
    plt.grid()

    plt.subplot(223)
    plt.plot(losses_val)
    plt.grid()

    plt.subplot(224)
    plt.plot(losses_val[-60:])
    plt.grid()

    plt.show()
    
    
# Register my_plot with the trainer — presumably called during training to redraw the
# loss curves; confirm in net.train.
training_c.set_ploter(my_plot)

In [None]:
# Run the training loop. NOTE(review): what Training.training does with dataDir is
# defined in net.train (presumably where intermediate state is written) — confirm.
training_c.training(dataDir)

In [None]:
# Save the network, identified by its output layer, under the given file name.
# NOTE(review): build_cnn calls load_net(layer, file_name, dataDir) with a directory argument,
# while save_net gets none here — confirm both resolve to the same location.
save_net(net['last'], 'new_cnn2_0.net')