## Generate corpus and references of video segments

### Corpus file contents
0. `train_data`: captions and indexes of the training video segments, in format `[corpus_widxs, vidxs, corpus_pidxs, corpus]`, where:
    - `corpus_widxs` is a list of lists with the vocabulary indexes of the words of each caption (each list ends with the `<eos>` index)
    - `vidxs` is a list of indexes of video features in the features file
    - `corpus_pidxs` is a list of lists with the POS-tagging-vocabulary indexes of the POS tags of each caption (each list ends with the `eos` index)
    - `corpus` is the list of caption strings
1. `val1_data`: same format as `train_data`.
2. `val2_data`: same format as `train_data`.
3. `vocabulary`: in format `{'word': count}`.
4. `idx2word`: the index-to-word mapping, in format `{idx: 'word'}`; indexes 0 and 1 are the special tokens `<eos>` and `<unk>`.
5. `word_embeddings`: the pretrained vector of each word. The *i*-th row is the vector of the word `idx2word[i]`.
6. `idx2pos`: the POS-tagging vocabulary, in format `{idx: 'POSTAG'}`.

In [3]:
import json
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


# Per-split caption annotation files.
datainfo_train = _read_json('../../../data/ActivityNet/captions/train.json')
datainfo_val_1 = _read_json('../../../data/ActivityNet/captions/val_1.json')
datainfo_val_2 = _read_json('../../../data/ActivityNet/captions/val_2.json')

# Per-split id files.
datainfo_train_ids = _read_json('../../../data/ActivityNet/captions/train_ids.json')
datainfo_val_ids = _read_json('../../../data/ActivityNet/captions/val_ids.json')
datainfo_test_ids = _read_json('../../../data/ActivityNet/captions/test_ids.json')

## Generate datainfo (used for feature extraction of segments)

In [None]:
# Merge the annotations of the three splits into a single mapping and save it.
# NOTE(review): on a repeated key the later split wins (train < val_1 < val_2);
# confirm that the splits do not share video ids, or that overwriting is intended.
datainfo = dict(datainfo_train)
datainfo.update(datainfo_val_1)
datainfo.update(datainfo_val_2)

with open('../../../data/ActivityNet/datainfo.json', 'w') as sink:
    json.dump(datainfo, sink)

In [None]:
# Number of entries in each split, then the size of the merged mapping and
# the total number of timestamps over all its entries.
split_sizes = (
    len(datainfo_train),
    len(datainfo_val_1),
    len(datainfo_val_2),
    len(datainfo_test_ids),
)
print(*split_sizes)

timestamp_count = sum(len(info['timestamps']) for info in datainfo.values())
print(len(datainfo), timestamp_count)

In [1]:
# Build `vid2idx`: video id -> {segment index within the video: feature index}.
# Every line of the mapping file must hold five ' \t '-separated fields; only
# the first (video id), fourth (segment index) and fifth (feature index) are
# used. The feature index is stored as read (a string).
vid2idx = {}
with open('../../../data/ActivityNet/features/fragments_videos_mapping_OK.txt') as f:
    for l in f:
        vid, _, _, fidx, vidx = l.strip().split(' \t ')
        # setdefault replaces the former bare `except:`, which caught every
        # exception rather than only the "video not seen yet" case.
        vid2idx.setdefault(vid, {})[int(fidx)] = vidx

In [4]:
def _collect_pairs(split_info, clean=False):
    """Pair each caption of a split with the feature index of its segment.

    Segments that have no entry in `vid2idx` are skipped.

    Args:
        split_info: mapping from video id to an annotation dict that holds a
            'sentences' list.
        clean: when True, strip each caption and replace its newlines with
            spaces.

    Returns:
        A `(vidxs, captions)` pair of equally long lists.
    """
    vidxs, captions = [], []
    for vid, d in split_info.items():
        segments = vid2idx.get(vid, {})
        for fidx, cap in enumerate(d['sentences']):
            if fidx in segments:
                vidxs.append(int(segments[fidx]))
                captions.append(cap.strip().replace('\n', ' ') if clean else cap)
    return vidxs, captions


def _write_references(path, vidxs, captions):
    """Write one tab-separated `vidx`, caption line per pair to `path`."""
    with open(path, 'w') as f:
        f.writelines('{}\t{}\n'.format(vidx, cap) for vidx, cap in zip(vidxs, captions))


# Training captions are kept exactly as they appear in the annotations.
train_vidxs, train_corpus = _collect_pairs(datainfo_train)

# Validation captions are cleaned so each one fits on a single line of the
# references file.
val_1_vidxs, val_1_corpus = _collect_pairs(datainfo_val_1, clean=True)
_write_references('../results/ActivityNet-Fragments_val_1_references.txt', val_1_vidxs, val_1_corpus)

val_2_vidxs, val_2_corpus = _collect_pairs(datainfo_val_2, clean=True)
_write_references('../results/ActivityNet-Fragments_val_2_references.txt', val_2_vidxs, val_2_corpus)

print('count of training pairs: ', len(train_vidxs))
print('count of val-1 pairs: ', len(val_1_vidxs))
print('count of val-2 pairs: ', len(val_2_vidxs))

count of training pairs:  33808
count of val-1 pairs:  13162
count of val-2 pairs:  15391


### Get pretrained embeddings

In [5]:
import os
import numpy as np

# Load the pretrained word vectors into `wordvectors`: token -> numpy vector.
# Alternative vector file: './glove.42B.300d.txt'
wordvectors = {}
# The encoding is given explicitly so that parsing does not depend on the
# platform's locale default.
with open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        s = line.strip().split(' ')
        # Keep only rows made of one token followed by 300 components.
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [6]:
import nltk
nltk.download('punkt')

# Count how often every lower-cased token occurs in the training captions.
vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        # dict.get replaces the former bare `except:`, which caught every
        # exception rather than only the missing-key case.
        vocab[w] = vocab.get(w, 0) + 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

# Words without a pretrained vector cannot get an embedding row, so they are
# removed from the vocabulary.
to_del = [w for w in vocab if w not in wordvectors]
for w in to_del:
    print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))

for w in to_del:
    del vocab[w]

# Indexes 0 and 1 are reserved for the special tokens; the remaining words
# follow in the order in which they were first seen.
idx2word = dict(enumerate(['<eos>', '<unk>'] + list(vocab)))
word2idx = {word: idx for idx, word in idx2word.items()}
EOS, UNK = 0, 1

print(len(vocab), len(idx2word), len(word2idx))

# Row i of `word_embeddings` is the vector of idx2word[i]. The special tokens
# reuse the pretrained vectors of the plain words 'eos' and 'unk'.
word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 14.72441433980123
Count of unique words:  10162
missing word: hulte
missing word: intertubes
missing word: javeline
missing word: sideward
missing word: jump-stilts
missing word: woodfire
missing word: unscrews
missing word: leathe
missing word: gymasts
missing word: voleyball
missing word: liquir
missing word: sepperates
missing word: eliptical
missing word: rubic
missing word: canoesport
missing word: intertube
missing word: cheer-leading
missing word: kufiyyas
missing word: sanitizes
missing word: wearinf
missing word: aiter
missing word: unhooks
missing word: thorougly
missing word: absorbant
missing word: parasails
missing word: plyas
missing word: wreslers
missing word: sizzors
missing word: gargles
missing word: zucky
missing word: -end-
missing word: unboxing
missing word: rubix
missing word: gabs
missing word: themiddle
missing word: garnishments
missing word: whie
missing word: trakc
missing word: forearm-mounted
missing word: re-equips
missin

### Determine POS-tagging vocabulary from train split

In [7]:
import nltk

# pos_vocab:        POS tag -> number of training tokens that received it
# pos_unique_words: POS tag -> {word: number of times it received that tag}
pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        # get/setdefault replace the former nested bare `except:` blocks,
        # which caught every exception rather than only missing keys.
        pos_vocab[tag] = pos_vocab.get(tag, 0) + 1
        words_of_tag = pos_unique_words.setdefault(tag, {})
        words_of_tag[word] = words_of_tag.get(word, 0) + 1

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the special tags; the remaining tags
# follow in the order in which they were first seen.
idx2pos = dict(enumerate(['eos', 'unk'] + list(pos_vocab)))
pos2idx = {tag_name: idx for idx, tag_name in idx2pos.items()}
EOS, UNK = 0, 1

Unique words per tag:
 DT:	25
 JJ:	2061
 NN:	4671
 VBZ:	1043
 VBN:	642
 VBG:	968
 IN:	146
 CC:	15
 PRP$:	7
 .:	3
 NNS:	2056
 PRP:	21
 VBP:	693
 TO:	1
 POS:	3
 VB:	973
 CD:	113
 RB:	470
 RP:	28
 ,:	1
 EX:	6
 VBD:	477
 WRB:	7
 WDT:	7
 JJS:	17
 JJR:	62
 WP:	4
 RBR:	20
 RBS:	3
 '':	2
 MD:	15
 ``:	2
 PDT:	13
 WP$:	1
 (:	1
 ):	1
 ::	3
 NNP:	11
 FW:	18
 #:	1
 SYM:	1
 $:	1
 UH:	1


### Determine Universal POS-tagging from train split

In [8]:
import nltk
nltk.download('universal_tagset')

# upos_vocab:        universal POS tag -> number of training tokens that received it
# upos_unique_words: universal POS tag -> {word: number of times it received that tag}
upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        # get/setdefault replace the former nested bare `except:` blocks,
        # which caught every exception rather than only missing keys.
        upos_vocab[tag] = upos_vocab.get(tag, 0) + 1
        words_of_tag = upos_unique_words.setdefault(tag, {})
        words_of_tag[word] = words_of_tag.get(word, 0) + 1

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the special tags; the remaining tags
# follow in the order in which they were first seen.
idx2upos = dict(enumerate(['eos', 'unk'] + list(upos_vocab)))
upos2idx = {tag_name: idx for idx, tag_name in idx2upos.items()}

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 DET:	45
 ADJ:	2131
 NOUN:	6540
 VERB:	3748
 ADP:	146
 CONJ:	15
 PRON:	32
 .:	15
 PRT:	32
 NUM:	113
 ADV:	493
 X:	20


In [10]:
import pickle

def _encode_split(corpus):
    """Encode captions as word-index and POS-index sequences.

    Each caption is lower-cased and tokenized once; the same tokens feed both
    the word lookup and the POS tagger. Words outside `vocab` and tags outside
    `pos_vocab` map to UNK, and every sequence is terminated with EOS.

    Args:
        corpus: list of caption strings.

    Returns:
        A `(widxs, pidxs)` pair of lists holding one index list per caption.
    """
    widxs, pidxs = [], []
    for cap in corpus:
        tokens = nltk.word_tokenize(cap.lower())
        widxs.append([word2idx[w] if w in vocab else UNK for w in tokens] + [EOS])
        pidxs.append([pos2idx[t] if t in pos_vocab else UNK for _, t in nltk.pos_tag(tokens)] + [EOS])
    return widxs, pidxs


train_corpus_widxs, train_corpus_pidxs = _encode_split(train_corpus)
val_1_corpus_widxs, val_1_corpus_pidxs = _encode_split(val_1_corpus)
val_2_corpus_widxs, val_2_corpus_pidxs = _encode_split(val_2_corpus)

# The four parallel lists of each split must have the same length.
assert len(train_corpus_widxs) == len(train_vidxs) == len(train_corpus_pidxs) == len(train_corpus), f'{len(train_vidxs)}, {len(train_corpus_widxs)}, {len(train_corpus_pidxs)}, {len(train_corpus)}'
assert len(val_1_corpus_widxs) == len(val_1_vidxs) == len(val_1_corpus_pidxs) == len(val_1_corpus), f'{len(val_1_vidxs)}, {len(val_1_corpus_widxs)}, {len(val_1_corpus_pidxs)}, {len(val_1_corpus)}'
assert len(val_2_corpus_widxs) == len(val_2_vidxs) == len(val_2_corpus_pidxs) == len(val_2_corpus), f'{len(val_2_vidxs)}, {len(val_2_corpus_widxs)}, {len(val_2_corpus_pidxs)}, {len(val_2_corpus)}'

# Per-split layout: [word indexes, feature indexes, POS indexes, raw captions].
train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
val_1_data = [val_1_corpus_widxs, val_1_vidxs, val_1_corpus_pidxs, val_1_corpus]
val_2_data = [val_2_corpus_widxs, val_2_vidxs, val_2_corpus_pidxs, val_2_corpus]

with open('../../../data/ActivityNet/activitynet_fragments_corpus.pkl', 'wb') as outfile:
    pickle.dump([train_data, val_1_data, val_2_data, vocab, idx2word, word_embeddings, idx2pos], outfile)

In [11]:
# Display the feature indexes collected for the training pairs.
train_vidxs

[20845,
 20846,
 20847,
 32659,
 32660,
 32661,
 32662,
 32663,
 32664,
 40002,
 40003,
 40539,
 40540,
 40541,
 39508,
 39509,
 39510,
 39511,
 5460,
 5461,
 5462,
 10635,
 10636,
 10637,
 41359,
 41360,
 41361,
 40324,
 40325,
 40326,
 40327,
 33677,
 33678,
 33679,
 33680,
 33681,
 33682,
 33683,
 33684,
 33685,
 17141,
 17142,
 17143,
 44048,
 44049,
 44050,
 15073,
 15074,
 28798,
 28799,
 25977,
 25978,
 25979,
 25980,
 25981,
 5964,
 5965,
 5966,
 5967,
 11444,
 11445,
 35688,
 35689,
 31201,
 31202,
 31203,
 30413,
 30414,
 30415,
 21858,
 21859,
 28812,
 28813,
 31482,
 31483,
 49097,
 49098,
 49099,
 49100,
 49101,
 49102,
 49103,
 49104,
 49105,
 0,
 1,
 2,
 32731,
 32732,
 32733,
 32734,
 13101,
 13102,
 25132,
 25133,
 25134,
 38484,
 38485,
 38486,
 38487,
 24786,
 24787,
 24788,
 24789,
 40968,
 40969,
 40970,
 40971,
 33194,
 33195,
 33196,
 33197,
 33198,
 33199,
 33200,
 33201,
 33202,
 33203,
 33204,
 33205,
 33206,
 3159,
 3160,
 23371,
 23372,
 23373,
 4466,
 4467,