In [None]:
# Core imports
import numpy as np
import scipy.sparse as sprs
import sys
import shutil
import os
import tarfile
import argparse
import gzip
from scipy.special import expit as sigmoid

# Evaluation imports
from oct2py import octave
octave.addpath('../../evaluation/')

# Python 2 - 3 imports
import six

# Tensorflow imports
import tensorflow as tf
from tensorflow.contrib import learn as learn
from tensorflow.contrib import layers 
import tflearn

# Attalos specific imports
sys.path.append('/home/kni/local-kni/attalos')
from setops import replaceword, union, difference, intersection
from load_entire_dataset import load_entire_dataset, load_entire_dataset_di
import attalos.imgtxt_algorithms.util.readw2v as readw2v
from attalos.imgtxt_algorithms.util.readw2v import initVo, readvocab
from attalos.evaluation.evaluation import Evaluation
from models import imageWmodel, updateVoX, updateVoSum, get_batch, get_batch_image
from models import adaptWords, costYViVo

# Display
from IPython.display import clear_output
import matplotlib.pylab as plt
%matplotlib inline 

### Parameter Definitions
1. `epochs` = number of epochs
2. `bsize` = batch size
3. `updateVo` = None/updateVoSum/updateVoX <- types of update
4. `nsampims` = None/negativeims <-- sample from independent words / words from one image
5. `initWVVo` = True/False  <-- initialize Vo with word vectors or randomly
6. `learnrate` = learning rate
7. `datadir` = path to the data directory. Replace the value set in the next cell with the location of your own datasets.

In [None]:
# --- Training schedule ---
epochs = 5000                  # number of passes over the training set
bsize = 1024                   # images per mini-batch
learnrate = 0.1                # starting learning rate; decayed inside the training loop
minlearnrate = 1.0e-6          # floor applied to the decayed learning rate

# --- Word-vector handling ---
updateVo = 'updateVoX'         # one of: None / 'updateVoSum' / 'updateVoX'
nsampims = None                # None or 'negativeims'
initWVVo = True                # True: start Vo from word2vec vectors; False: random start
wweight = 0.3                  # scale applied to the tuned unseen-word vectors

# --- Network and data ---
hidden_units = [2048, 1024, 200]                 # layer sizes handed to imageWmodel
datadir = '/data/fs4/teams/attalos/features/'    # root directory of the feature files

## Load in the *image* dataset

#### Load datasets  
The function `load_entire_dataset` will load an entire dataset in. Here,
- `dataset` can be `yfcc`, `iaprtc12`, or `espgame`  
- `split` can be `train` or `test`

#### The data that has been loaded in
- `x**`, image features
  - `xTr`, training features
  - `xTe`, testing features
  - `xVa`, validation features
- `y**`, labels
- `d**`, dictionary of word labels
- `****list`, list of images originally being used
- `trHot` is used as a one hot encoding object so that the validation has the same hot encoding

#### You need to fill in `datadir`

In [None]:
# Every call is unpacked in the same order: image features, labels, label
# dictionary, image list, image directory, one-hot encoder.

# Training split; trHot is kept so that validation shares the same encoding.
xTr, yTr, dTr, trainlist, dirTr, trHot = load_entire_dataset_di('iaprtc12', datadir=datadir, split='train')

# Validation uses the iaprtc12 test split, encoded with the training encoder.
xVa, yVa, dVa, validlist, dirVa, _ = load_entire_dataset_di('iaprtc12', datadir=datadir, split='test', allhot=trHot)

# Test uses a different corpus (espgame) with its own label dictionary.
# The unpacking order now matches the two calls above; previously the image
# list was discarded, `testlist` received the directory and `dirTe` the encoder.
xTe, yTe, dTe, testlist, dirTe, _ = load_entire_dataset_di('espgame', datadir=datadir, split='test')

## Read vocabulary from word2vec file

In [None]:
# Open the pre-trained word2vec binary and read 100000 entries from it
# (presumably the first 100000 words of the file -- TODO confirm in readw2v).
w2vfile = readw2v.ReadW2V('/local_data/kni/data/vectors-phrase.bin')
wordvecs = w2vfile.readlines(100000)
# Require rescale of word vectors to avoid NaNs: every stored vector is
# multiplied by 0.1 before it is used to initialise the output vectors.
for word in wordvecs.keys():
    wordvecs[word] *= 0.1
# NOTE(review): presumably Wd is the vocabulary word list and Id its index
# lookup -- confirm against readvocab. Neither name is read again in the
# visible part of the notebook.
Wd, Id = readvocab('/local_data/kni/data/vectors-phrase.vocab')

## Joint Vocabulary and Custom Set Operations

Recall that the training and testing dictionaries are different. The set operations implemented below produce:
- `VoU` is the union of word vectors in both training and test set
- `VoD` is the set difference between test set and training set

Additionally, you will notice that I have only used the first one hundred thousand words above. Some words in the image corpus labels are not present in the word vectors; this is remedied by replacing them in the dictionary with equivalent words that are. The words below are replaced with equivalents found in word2vec.

In [None]:
# Replace training-dictionary words that have no word2vec entry with
# near-synonyms that do.
replaceword(dTr, 'bedcover', 'bedding')
replaceword(dTr, 'tussock', 'turf')
replaceword(dTr, 'tee-shirt', 'shirts')
replaceword(dTr, 'table-cloth', 'tablecloth')
replaceword(dTr, 'cobblestone', 'stones')

# Cannot use Python set operations as we require indices and sorting.
# Each helper returns the resulting dictionary plus index arrays into its inputs.
dUnion, iUtr, iUte = union(dTr, dTe)
dDiff, iDte = difference(dTe, dTr)
dXsect, iXtr, iXte = intersection(dTr, dTe)

# Word-vector matrices for the joint vocabulary and for the words that occur
# only in the test dictionary.
VoU = initVo(wordvecs, dUnion)
VoD = initVo(wordvecs, dDiff)

# Single-argument print(...) behaves identically under Python 2 and Python 3.
print('----------------------------------')
print('Union:{}, Xsect:{}, dTe-dTr:{}'.format(len(dUnion), len(dXsect), len(dDiff)))

## Initialization

### Tensorflow Instantiation

In [None]:
# Let TensorFlow grow its GPU allocation on demand instead of reserving all
# device memory up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# The model is built in the default graph, so bind the session to it
# explicitly. The previous `tf.Graph().as_default()` only created a context
# manager that was never entered: `graph` was not a Graph object and the
# freshly created graph was never used.
graph = tf.get_default_graph()
sess = tf.Session(graph=graph, config=config)

In [None]:
# Initialize Vo to word vectors or randomly
# NOTE(review): only the word-vector initialisation is implemented here; the
# `initWVVo` flag from the configuration cell is not consulted.
Vo = initVo(wordvecs, dTr)
    
# Use sum model or cross-entropy model
# Builds the image-to-word-vector network. The returned handles are the input
# placeholders, the prediction and loss tensors, the optimiser op and the
# learning-rate placeholder, all fed or fetched by the training loop.
inputs,pvecs,nvecs,wvecs,wcorr,preds,imloss,wdloss,loss,opt,lrate = imageWmodel(hidden_units=hidden_units, 
                                                                                vec_size=Vo.shape[1])

# Tensorflow Initialization
# The saver is created after the model so it can see the model's variables;
# running init_op assigns the initial values inside the session.
init_op = tf.initialize_all_variables()
saver = tf.train.Saver()
sess.run(init_op)

### Neural Networks

#### Update $V_o$

Both functions take in:
- The input batch vectors ($v_{in}$: `vin`)
- The positive vectors ($V_p$: `pVecs`)
- The negative vectors ($V_n$: `nVecs`)
- The indices in $V_o$ of the positive vectors ($V_p[i]$: `vpindex`)
- The indices in $V_o$ of the negative vectors ($V_n[i]$: `vnindex`)
- The output vectors $V_o$, `Vo`.

## Begin Training

In [None]:
# Loss bookkeeping: one entry per batch (lossvals) and one entry per epoch
# for the training and validation curves (losstrain / lossvalid).
lossvals = []
losstrain = []
lossvalid = []

# Consistent validation batches: one fixed random subset is reused every
# epoch. It is capped at the size of the validation set so that sampling
# without replacement cannot fail on small sets.
nvalid = min(1024, len(yVa))
valididx = np.random.choice(len(yVa), size=nvalid, replace=False)

# Correlation between the training output vectors and the unseen-word vectors
Cmat = sigmoid(Vo.dot(VoD.T))
for epoch in range(epochs):

    # Reshuffle the training set at the start of every epoch
    randidx = np.random.permutation(len(yTr))
    yTrr = yTr[randidx]
    xTrr = xTr[randidx]

    for b in range(0, len(yTr), bsize):

        # Get a batch
        xBatch = xTrr[b:b+bsize]
        pVecs, nVecs, vpindex, vnindex = get_batch(yTrr[b:b+bsize], Vo, [5,5])

        # Run the image updates
        _, lossval, vin = sess.run([opt,loss,preds],
                                   feed_dict={inputs:xBatch, pvecs: pVecs,
                                              nvecs: nVecs, lrate:learnrate,
                                              wvecs: VoD, wcorr: Cmat[vpindex].transpose(1,0,2)})

        # Run the word updates
        lossvals.append(lossval)
        if updateVo == 'updateVoX':
            Vo = updateVoX(vin, pVecs, nVecs, vpindex, vnindex, Vo, learnrate=learnrate)
            # Stack the updated training vectors on top of the unseen-word
            # vectors once and reuse the result (it used to be built twice).
            VoU = np.array(list(Vo)+list(VoD))
            # NOTE(review): the adaptation target is recomputed from the
            # current vectors on every batch rather than from the original
            # word2vec correlation -- confirm this is intended.
            Cmat = sigmoid(VoD.dot(VoU.T))
            VoD = adaptWords(VoD, VoU, Cmat, wordlr=learnrate)
            Cmat = sigmoid(Vo.dot(VoD.T))

        # Printout status
        sys.stdout.write("\rEpoch {}/{}: loss={}".format(epoch, epochs, lossval))

    # Validation on the fixed subset
    pVecs, nVecs, vpindex, vnindex = get_batch(yVa[valididx], Vo, [5,5])
    vapred, valossval = sess.run([preds,imloss], feed_dict={inputs:xVa[valididx], pvecs: pVecs, nvecs: nVecs})
    yHat = sigmoid(vapred.dot(Vo.T))
    precision,recall,f1score = Evaluation(yVa[valididx], yHat, 5).evaluate()
    # Report the validation loss in the "Val" slot; the training loss used to
    # be printed twice.
    outstring = '\rEpoch: {}, LR: {}, Train/Val: {}/{}, P: {}, R: {}\n'.format(epoch,learnrate,lossval,
                                                                               valossval,precision,recall)
    sys.stdout.write(outstring)

    # Keep track of training and validation loss. The training value is the
    # loss of the last batch of the epoch.
    losstrain.append(lossval)
    lossvalid.append(valossval)

    # Learning rate updating: decay by 10% every 150 epochs, never going
    # below minlearnrate.
    if len(losstrain) % 150 == 0:
        learnrate = max(learnrate*0.9, minlearnrate)


## Final Word Vector Tuning

### Given the image vectors, tune the word vectors to match based on original word vector correlation


Either run nonlinear or linear (below).

#### Nonlinear optimization

Nonlinear optimization has a cost function of:

$$\mathcal{L} = \frac{1}{N} \sum_{i,o} \left[ C_{i,o} \log \sigma \left( V_i^T V_o \right) + (1 - C_{i,o}) \log \left( 1 - \sigma \left( V_i^T V_o \right) \right) \right]$$

Here, $C_{i,o}$ is the *original* correlation between word *i* and word *o*. We may wish to adapt the nonlinearity that Kyle uses as the final layer for the sum of word vectors. 

Let $V_{oD}$ be the output vectors of the set difference between training and testing. Since we're in numpy, the updates to maximize are:

$$\frac{\partial \mathcal{L}^+}{ \partial V_{oD} } = C \left( 1 - \sigma\left( V_{oD} V_{oU}^T \right) \right) \cdot V_{oU}$$  

$$\frac{\partial \mathcal{L}^-}{ \partial V_{oD} } = (C - 1) \, \sigma\left( V_{oD} V_{oU}^T \right) \cdot V_{oU}$$



#### Linear optimization

Linear matrix optimization is done here:

Let $V_{oD}$ be the set difference output vectors between training set and testing set. That is, the vectors that have *not* been updated in the image optimization. We know that the semantic concepts are correlated with the original correlation matrix $C_m$, and at minimum, an unseen word $v_i$ is correlated with an optimized word from the image training corpus $v_o$ with linear correlation $c_{i,o}$, the $(i, o)^{th}$ entry in $C_m$. 

Similarly, in the absence of any image data, we know that the vectors are also correlated with each other with that same correlation. If we only optimize the unseen word vectors, then with $V_{oU}$ being the union of the *original* vectors of image labels (before image optimization) and words not in the image label set, then:

$$C_m = V_{oD} \cdot V_{oU}^T$$

With $\hat{V}_{oU}$ being the union of the *updated* vectors of image labels (after image optimization) and words not in the image label set, the solution to the inverse problem is then:

$$\hat{V}_{oD} = C_m \hat{V}_{oU} \left( \hat{V}_{oU}^T \hat{V}_{oU} \right)^{-1} $$



In [None]:
# Start again from the original word2vec vectors: VoD holds the words that
# only occur in the test dictionary, VoU the joint vocabulary.
VoD = initVo(wordvecs, dDiff)
VoU = initVo(wordvecs, dUnion)

# word parameter optimization
wordlr = 1.0e-4
nonlinear = False

# New full vectors: the image-tuned training vectors stacked on top of the
# untouched unseen-word vectors. Assumes that Vo is optimized through the
# image space.
# NOTE(review): Cmat below has its columns in dUnion order while VoUnew is
# ordered "dTr then dDiff" -- confirm that union() yields the same ordering.
VoUnew = np.array(list(Vo) + list(VoD))

if nonlinear:
    # Gradient steps that pull sigmoid(VoD . VoUnew^T) towards the original
    # correlation.
    Cmat = sigmoid(VoD.dot(VoU.T))
    # A dedicated loop variable keeps the training `epoch` (plotted and saved
    # further down) from being overwritten.
    for wordepoch in range(epochs):
        VoD = adaptWords(VoD, VoUnew, Cmat, wordlr=wordlr)
        sys.stdout.write('\r{}, Word Adaptation Cost = {}'.format(wordepoch, costYViVo(Cmat, VoD, VoUnew)))
    VoDnew = VoD
else:
    # Linear optimization: VoDnew = Cmat . VoUnew . (VoUnew^T VoUnew)^-1
    Cmat = 0.3*VoD.dot(VoU.T)
    gram = VoUnew.T.dot(VoUnew)
    # Solve the linear system rather than forming the explicit inverse;
    # X . G^-1 == solve(G^T, X^T)^T, which is numerically better behaved.
    VoDnew = np.linalg.solve(gram.T, Cmat.dot(VoUnew).T).T

### Assign new $V_{oD}^{new}$ to optimized value for set difference $V_{oD}$.

$V_{o Te}= V_{o Tr} ( D[ Tr\cap Te ] ) \cup V_{o Te} (  D[Te]-D[Tr]  )$

In [None]:
# Output vectors for the test dictionary, with the same width as Vo (the
# width was previously hard-coded to 200).
newVo = np.zeros((len(dTe), Vo.shape[1]))
# Words shared with the training dictionary reuse their image-tuned vectors.
newVo[iXte] = Vo[iXtr]
# Words unique to the test dictionary take the tuned vectors scaled by
# wweight. The earlier assignment of the raw VoD to the same rows was a dead
# store, overwritten immediately, and has been dropped.
newVo[iDte] = wweight*VoDnew

## Evaluation

In [None]:
def getsplit(splitname):
    """Return the notebook-level arrays belonging to one data split.

    Parameters
    ----------
    splitname : str
        'train' or 'valid'; any other value selects the test split.

    Returns
    -------
    tuple
        (data, labels, output vectors, image list, directory of images,
        dictionary) for the requested split. Train and validation share the
        training output vectors `Vo` and dictionary `dTr`; the test split
        uses `newVo` and `dTe`.
    """
    if splitname == 'train':
        return xTr, yTr, Vo, trainlist, dirTr, dTr
    elif splitname == 'valid':
        # The validation directory is named `dirVa`; the former `dirVal`
        # was undefined and raised NameError on this branch.
        return xVa, yVa, Vo, validlist, dirVa, dTr
    else:
        return xTe, yTe, newVo, testlist, dirTe, dTe

In [None]:
# Choose the split to evaluate and fetch its arrays
splitname = 'test'
xEv, yEv, VE, evallist, dirEv, dEv = getsplit(splitname)

# Training / validation loss curves recorded by the training loop
plt.plot(np.array(losstrain))
plt.plot(np.array(lossvalid), 'r')
plt.xlabel('Epoch Number')
plt.ylabel('Loss Value')
plt.legend(['Training Loss', 'Validation Loss'])
plt.title('Epoch = {}'.format(epoch))

# Project the images into word-vector space and score every label of the split
prediction = sess.run(preds, feed_dict={inputs:xEv})
yHat = sigmoid(prediction.dot(VE.T))

# Python evaluation at k=5; the result used to be computed and then discarded
evaluated = Evaluation(yEv, yHat, k=5)
pyprecision, pyrecall, pyf1score = evaluated.evaluate()
print("Python P: {},R: {},F1: {}".format(pyprecision, pyrecall, pyf1score))

# Octave reference evaluation. `octave` is imported and its path is set up in
# the import cell at the top, so neither is repeated here.
[precision, recall, f1score] = octave.evaluate(yEv.T, yHat.T, 5)
print("P: {},R: {},F1: {}".format(precision,recall,f1score))

# F1 recomputed from the Octave precision and recall as a cross-check
print(2*(precision*recall) / (precision+recall))

## Save variables

In [None]:
# Checkpoint name for the TensorFlow variables
modelname = 'save-name.model'

# Write the session's variables through the saver created at initialisation
save_path = saver.save(sess, modelname)

# Single-argument print(...) behaves identically under Python 2 and Python 3.
print("Saved model to {}".format(save_path))

# The numpy-side state (per-batch losses, last epoch index and the tuned
# output vectors) is stored next to the checkpoint.
modelinfo = modelname + '.info.npz'
np.savez(modelinfo, lossvals=lossvals, epoch=epoch, Vo=Vo)

print("Saved model information to {}".format(modelinfo))

In [None]:
# Close the TensorFlow session; the cells above that call sess.run or
# saver.save need a new session before they can be run again.
sess.close()