## Generate corpus and ground-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in format [corpus_widxs, vidxs, corpus_pidxs, corpus], where:
    - corpus_widxs is a list of lists with the index of words in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the index of POS tags in the POS tagging vocabulary
    - corpus is the list of raw caption strings
1. val_data: same format of train_data.
2. test_data: [None, vidxs, None]
3. vocabulary: in format {'word': count}.
4. idx2word: the vocabulary in format {idx: 'word'}.
5. word_embeddings: the vectors of each word. The i-th row is the word vector of the i-th word in idx2word.
6. idx2pos: the vocabulary of POS tags in format {idx: 'POSTAG'}.

In [1]:
import json
# Load the three VATEX annotation files. Each one is parsed into a list of
# records; later cells read the 'videoID' key of every record and, for the
# training and validation splits, the 'enCap' list of captions.
#
# JSON text is UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the locale default of the machine running the cell.
with open('../../../data/VATEX/v1.0/vatex_training_v1.0.json', encoding='utf-8') as f:
    train_data = json.load(f)

with open('../../../data/VATEX/v1.0/vatex_validation_v1.0.json', encoding='utf-8') as f:
    valid_data = json.load(f)

with open('../../../data/VATEX/v1.0/vatex_public_test_without_annotations.json', encoding='utf-8') as f:
    test_data = json.load(f)

### Generate list.txt file with the order of videos for extracting features

In [2]:
import os

def _index_split(list_file, folder, prefix, records, next_vidx, caption_key='enCap'):
    """Register the videos of one split that are present on disk.

    For every record whose 'videoID' matches a file in `folder` (file name
    without its extension), one line `prefix + file name` is written to
    `list_file` and the video receives the next running index.

    With a `caption_key`, the index is repeated once per caption so that the
    returned index list and caption list stay aligned. With
    `caption_key=None` the index is recorded exactly once per video.

    Returns (stem -> file name map, video indexes, captions, next free index).
    """
    stem_to_name = {}
    for name in os.listdir(folder):
        stem_to_name[name.strip().rsplit('.', 1)[0]] = name

    vidxs, captions = [], []
    for record in records:
        name = stem_to_name.get(record['videoID'])
        if name is None:
            # No video file for this annotation: it gets no index at all.
            continue
        list_file.write(prefix + name + '\n')
        if caption_key is None:
            vidxs.append(next_vidx)
        else:
            video_captions = record[caption_key]
            captions.extend(video_captions)
            vidxs.extend([next_vidx] * len(video_captions))
        next_vidx += 1
    return stem_to_name, vidxs, captions, next_vidx


# The running index is shared by the three splits, in the order train,
# validation, test, so it matches the line number of each video in list.txt.
vidx = 0
with open('../../../data/VATEX/v1.0/videos/list.txt', 'w') as f:
    train_videos_filenames, train_vidxs, train_corpus, vidx = _index_split(
        f, '../../../data/VATEX/v1.0/videos/train', '/train/', train_data, vidx)

    valid_videos_filenames, valid_vidxs, valid_corpus, vidx = _index_split(
        f, '../../../data/VATEX/v1.0/videos/validation', '/validation/', valid_data, vidx)

    test_videos_filenames, test_vidxs, _test_captions, vidx = _index_split(
        f, '../../../data/VATEX/v1.0/videos/public_test_without_annotations',
        '/public_test_without_annotations/', test_data, vidx, caption_key=None)

print(f'Train: unique videos: {len(set(train_vidxs))}, count of training pairs: {len(train_vidxs)}')
print(f'Val: unique videos: {len(set(valid_vidxs))}, count of validation pairs: {len(valid_vidxs)}')
print(f'Test: unque videos: {len(test_vidxs)} (w/o reference captions)')

Train: unique videos: 24631, count of training pairs: 246310
Val: unique videos: 2821, count of validation pairs: 28210
Test: unque videos: 5668 (w/o reference captions)


### Discard videos without features
**IMPORTANT**: The features do not need to be removed from the .h5 file. Only the corresponding indices and captions need to be removed from the corpus.

In [7]:
import h5py
import numpy as np

# The file is only inspected here, so it is opened read-only ('r'). The
# previous 'r+' mode requested write access that this cell never uses.
with h5py.File('../../../data/VATEX/v1.0/features/features_linspace16_20-cnn_globals-cnn_sem_globals-cnn_features-c3d_features-eco_globals.h5', 'r') as feats_file:
    dataset = feats_file['VATEX']
    print(dataset.keys())
    # `[...]` reads the complete 'c3d_features' dataset into memory.
    fts = dataset['c3d_features'][...]
    print(fts.shape)

# A video has no usable features when every value of its feature block is
# zero. `v.any()` tests that directly, without allocating a zeros array per
# video and without hard-coding the per-video shape.
vidxs_to_discard = [i for i, v in enumerate(fts) if not v.any()]
print(len(vidxs_to_discard), ' videos to discard')

<KeysViewHDF5 ['c3d_features', 'cnn_features', 'cnn_globals', 'cnn_sem_globals', 'count_features', 'eco_globals', 'frames_tstamp']>
(33120, 20, 4096)
19  videos to discard


In [8]:
# Set of indexes to drop: membership is tested once per caption, so a set
# avoids scanning the `vidxs_to_discard` list every time.
_discard = set(vidxs_to_discard)


def _drop_discarded(vidxs, captions):
    """Return (vidxs, captions) as tuples without the discarded videos.

    `vidxs` and `captions` are parallel sequences. A pair is kept only when
    its video index is not in the discard set. Two empty tuples are returned
    when nothing survives, because `zip(*[])` cannot be unpacked into two
    names.
    """
    kept = [(vidx, cap) for vidx, cap in zip(vidxs, captions) if vidx not in _discard]
    if not kept:
        return (), ()
    kept_vidxs, kept_captions = zip(*kept)
    return kept_vidxs, kept_captions


train_vidxs, train_corpus = _drop_discarded(train_vidxs, train_corpus)
valid_vidxs, valid_corpus = _drop_discarded(valid_vidxs, valid_corpus)

# Fix: the filtered list used to be stored in the misspelled name `test_vidx`,
# so `test_vidxs` (the name used when the corpus is saved) was never filtered.
test_vidxs = [vidx for vidx in test_vidxs if vidx not in _discard]
test_vidx = test_vidxs  # old misspelled name kept as an alias

print('count of training pairs: ', len(train_vidxs))
print('count of validation pairs: ', len(valid_vidxs))

count of training pairs:  246310
count of validation pairs:  28210


### Get pretrained embeddings

In [9]:
import os
import numpy as np

# Map each word of the pretrained GloVe file to its vector.
wordvectors = {}
# with open('./glove.42B.300d.txt') as f:
# The encoding is explicit so that words containing non-ASCII characters are
# decoded the same way regardless of the locale of the machine.
with open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        s = line.strip().split(' ')
        # A valid line is the word followed by 300 values. Lines that split
        # into a different number of fields are skipped.
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
    print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [10]:
import nltk
nltk.download('punkt')

# Count how often each lower-cased token occurs in the training captions.
# `dict.get` replaces the previous bare `except:` that was used to detect a
# word seen for the first time (a bare except also hides unrelated errors).
vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        vocab[w] = vocab.get(w, 0) + 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

# Words without a pretrained vector are removed from the vocabulary. They are
# collected first because a dict cannot be modified while it is iterated.
to_del = []
for w in vocab:
    if w not in wordvectors:
        to_del.append(w)
        print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))

for w in to_del:
    del vocab[w]

# Index 0 and 1 are reserved for the end-of-sentence and unknown-word tokens;
# the remaining words keep the order in which they were first seen.
EOS, UNK = 0, 1
_indexed_words = ['<eos>', '<unk>'] + list(vocab)
idx2word = dict(enumerate(_indexed_words))
word2idx = {word: idx for idx, word in enumerate(_indexed_words)}

print(len(vocab), len(idx2word), len(word2idx))

# Row i of the embedding matrix is the vector of idx2word[i]. The two special
# tokens use the vectors of the plain words 'eos' and 'unk'.
word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 15.245361536275425
count of unique words:  28634
missing word: rapelling
missing word: abseils
missing word: rappels
missing word: decending
missing word: rappling
missing word: mountaing
missing word: trecking
missing word: shoe/cleat
missing word: attatches
missing word: rappell
missing word: rapell
missing word: enbankment
missing word: bungee-jumping
missing word: rope-work
missing word: parasails
missing word: paraglide
missing word: absailing
missing word: frontwards
missing word: rappeling
missing word: bunjee
missing word: beutiful
missing word: offf
missing word: wering
missing word: audiencce
missing word: visably
missing word: girll
missing word: spectales
missing word: drama-comedy
missing word: preforming
missing word: poising
missing word: demostrative
missing word: audium
missing word: lound
missing word: ppay
missing word: aduidenc
missing word: monlogues
missing word: applaudes
missing word: acoss
missing word: scool
missing word: malea

missing word: traingles
missing word: lyfting
missing word: perfors
missing word: circler
missing word: 25lb
missing word: uexercising
missing word: execises
missing word: strenght
missing word: exersizing
missing word: half-lift
missing word: straight-arm
missing word: shoulder-lifts
missing word: repititions
missing word: bokchoy
missing word: prepairing
missing word: stir-fries
missing word: stir-fry
missing word: stirfry
missing word: sautés
missing word: stir-frying
missing word: whtn
missing word: suchs
missing word: reipe
missing word: process..
missing word: stir-frys
missing word: brocolli
missing word: somehting
missing word: stir-fryed
missing word: dumblings
missing word: inmeat
missing word: sqeezes
missing word: vegitables
missing word: guaged
missing word: leaf-covered
missing word: marbels
missing word: open-toe
missing word: camaflouge
missing word: it.a
missing word: screw-top
missing word: yoddles
missing word: geocoaching
missing word: jotter
missing word: canister-

missing word: grapping
missing word: pamplet
missing word: re-assembling
missing word: anoise
missing word: shreded
missing word: wrip
missing word: laughes
missing word: re-positions
missing word: unsnaps
missing word: bombfire
missing word: barell
missing word: marsh-mellows
missing word: marshmellow
missing word: marshmalow
missing word: fire.then
missing word: mashmellow
missing word: smores
missing word: marchmillen
missing word: roats
missing word: marsmallows
missing word: smore
missing word: s'more
missing word: mashmallows
missing word: spitroasted
missing word: rosted
missing word: spit-roasted
missing word: rotisseried
missing word: rotissery
missing word: crands
missing word: scewer
missing word: barbque
missing word: fork-tong
missing word: skeweer
missing word: barbecue-pit
missing word: raost
missing word: barbacue
missing word: stapping
missing word: charcola
missing word: belly-dances
missing word: panys
missing word: futuritic
missing word: streeet
missing word: publc

### Determine POS-tagging vocabulary from train split

In [11]:
import nltk

# pos_vocab:        {POS tag: number of tokens that received the tag}
# pos_unique_words: {POS tag: {word: number of times it received the tag}}
# Explicit lookups replace the previous nested bare `except:` blocks, which
# used exceptions to detect unseen keys and could mask unrelated errors.
pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        pos_vocab[tag] = pos_vocab.get(tag, 0) + 1
        words_with_tag = pos_unique_words.setdefault(tag, {})
        words_with_tag[word] = words_with_tag.get(word, 0) + 1

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))

# Index 0 and 1 are reserved for 'eos' and 'unk'; the tags follow in the
# order in which they were first seen.
_indexed_tags = ['eos', 'unk'] + list(pos_vocab)
idx2pos = dict(enumerate(_indexed_tags))
pos2idx = {tag: idx for idx, tag in enumerate(_indexed_tags)}
EOS, UNK = 0, 1

Unique words per tag:
 NNS:	5272
 VBG:	2909
 TO:	2
 VB:	2942
 RP:	140
 DT:	50
 NN:	13923
 .:	3
 IN:	405
 VBZ:	2521
 PRP:	59
 MD:	17
 VBN:	2067
 CC:	53
 RB:	1444
 ,:	1
 VBD:	1341
 WRB:	25
 VBP:	2037
 JJ:	8072
 JJR:	165
 CD:	186
 POS:	23
 PRP$:	13
 WP:	13
 NNP:	50
 ``:	2
 '':	6
 WDT:	19
 PDT:	47
 EX:	8
 ::	5
 RBR:	75
 JJS:	51
 UH:	7
 (:	1
 ):	1
 RBS:	14
 FW:	55
 WP$:	1
 $:	4
 #:	1
 SYM:	4


### Determine Universal POS-tagging from train split

In [10]:
import nltk
nltk.download('universal_tagset')

# upos_vocab:        {universal tag: number of tokens that received the tag}
# upos_unique_words: {universal tag: {word: number of times it received the tag}}
# Explicit lookups replace the previous nested bare `except:` blocks, which
# used exceptions to detect unseen keys and could mask unrelated errors.
upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        upos_vocab[tag] = upos_vocab.get(tag, 0) + 1
        words_with_tag = upos_unique_words.setdefault(tag, {})
        words_with_tag[word] = words_with_tag.get(word, 0) + 1

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))

# Index 0 and 1 are reserved for 'eos' and 'unk'; the tags follow in the
# order in which they were first seen.
_indexed_utags = ['eos', 'unk'] + list(upos_vocab)
idx2upos = dict(enumerate(_indexed_utags))
upos2idx = {tag: idx for idx, tag in enumerate(_indexed_utags)}

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 NOUN:	18463
 VERB:	10356
 PRT:	165
 DET:	117
 .:	24
 ADP:	405
 PRON:	84
 CONJ:	53
 ADV:	1515
 ADJ:	8209
 NUM:	186
 X:	65


### Generate ground-truth references files (validation split only)

In [9]:
# Write one "<video index>\t<caption>" line per validation caption.
# The encoding is explicit because captions are not guaranteed to be ASCII
# (the training vocabulary, for instance, contains 'sautés'), and the locale
# default may be unable to encode such characters.
with open('../results/VATEX_val_references.txt', 'w', encoding='utf-8') as f:
    for vidx, cap in zip(valid_vidxs, valid_corpus):
        f.write('{}\t{}\n'.format(vidx, cap))

### Generate corpus.pkl file

In [10]:
import pickle

def _encode_captions(captions):
    """Convert captions to word-index and POS-index sequences.

    Each caption is lower-cased and tokenized once; the same tokens feed both
    the word lookup and the POS tagger (previously every caption was
    tokenized twice). Words missing from `vocab` and tags missing from
    `pos_vocab` are mapped to UNK, and every sequence is terminated with EOS.

    Returns (list of word-index lists, list of POS-index lists), both
    parallel to `captions`.
    """
    widxs, pidxs = [], []
    for cap in captions:
        tokens = nltk.word_tokenize(cap.lower())
        widxs.append([word2idx[w] if w in vocab else UNK for w in tokens] + [EOS])
        pidxs.append([pos2idx[tag] if tag in pos_vocab else UNK
                      for _, tag in nltk.pos_tag(tokens)] + [EOS])
    return widxs, pidxs


train_corpus_widxs, train_corpus_pidxs = _encode_captions(train_corpus)
valid_corpus_widxs, valid_corpus_pidxs = _encode_captions(valid_corpus)

# Sanity check: the four parallel sequences of each split must have one entry
# per caption.
assert len(train_corpus_widxs) == len(train_vidxs) and len(train_vidxs) == len(train_corpus_pidxs) and len(train_vidxs) == len(train_corpus)
assert len(valid_corpus_widxs) == len(valid_vidxs) and len(valid_vidxs) == len(valid_corpus_pidxs) and len(valid_vidxs) == len(valid_corpus)

# Each annotated split is stored as [word idxs, video idxs, POS idxs, raw
# captions]. The test split has no captions, so only video idxs are kept.
train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
test_data = [None, test_vidxs, None]

with open('../../../data/VATEX/v1.0/vatex_corpus_pos.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)