## Generate corpus and gruond-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in format [corpus_widxs, vidxs, corpus_pidxs], where:
    - corpus_widxs is a list of lists with the index of words in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the index of POS tags in the POS tagging vocabulary
1. val_data: same format of train_data.
2. test_data: same format of train_data.
3. vocabulary: in format {'word': count}.
4. idx2word: is the vocabulary in format {idx: 'word'}.
5. word_embeddings: are the vectors of each word. The i-th row is the word vector of the i-th word in the vocabulary.
6. idx2pos: is the vocabulary of POS tagging in format {idx: 'POSTAG'}

### Generate split for training and validation

In [1]:
import json
with open('../../../data/MSVD/allvideodatainfo.json') as f:
    vidxs, corpus = zip(*[(s['video_id'], s['caption']) for s in json.load(f)['sentences']]) 
    vidxs = [int(s[5:]) for s in vidxs]
    
train_vidxs, train_corpus = zip(*[(int(vidx), corpus[i]) for (i, vidx) in enumerate(vidxs) if vidx <= 1199])
valid_vidxs, valid_corpus = zip(*[(int(vidx), corpus[i]) for (i, vidx) in enumerate(vidxs) if vidx >= 1200 and vidx <= 1299])
test_vidxs, test_corpus = zip(*[(int(vidx), corpus[i]) for (i, vidx) in enumerate(vidxs) if vidx >= 1300])

print('count of training pairs: ', len(train_vidxs))
print('count of validation pairs: ', len(valid_vidxs))
print('count of testing pairs: ', len(test_vidxs))

count of training pairs:  48779
count of validation pairs:  4291
count of testing pairs:  27768


### Seach videos with specific captions (for paper examples only)

In [4]:
print(test_vidxs[test_corpus.index('A little girl is pushing a stroller through a grocery store')])
print(test_vidxs[test_corpus.index('A kid pushes a stroller')])
print(test_vidxs[test_corpus.index('A man is seasoning some bacon')])
print(test_vidxs[test_corpus.index('A person seasons some meat')])

1564
1564
1742
1742


In [58]:
patterns_to_search = ["holding a pet animal"]

# matchs = [(train_vidxs[i], s) for (i, s) in enumerate(train_corpus) if all([(p in s) for p in patterns_to_search])]
matchs = [(test_vidxs[i], s) for (i, s) in enumerate(test_corpus) if all([(p in s) for p in patterns_to_search])]
matchs

[(1362, 'a man is holding a pet animal')]

In [59]:
import os
from IPython.display import display, HTML

vidx = 1362
with open('../../../data/MSVD/list.txt') as f:
    mapping = f.read().splitlines() 

print(mapping[vidx])
display(Video(os.path.join('../../../data/MSVD', mapping[vidx])))
print('\n'.join([train_corpus[i] for i, idx in enumerate(train_vidxs) if idx == vidx]))

YouTubeClips/gp8XjWSoP2k_0_10.avi





In [60]:
os.path.join('../../../data/MSVD', mapping[vidx])

'../../../data/MSVD/YouTubeClips/gp8XjWSoP2k_0_10.avi'

In [89]:
ids, s, e = zip(*[x.split('_', -2) for x in mapping])
[x.rsplit('_', 2) for x in mapping]
print(f"{len(ids)} clips from {len(set(ids))} distinct videos")

1970 clips from 1464 distinct videos


### Get pretrained embeddings

In [74]:
import os
import numpy as np

wordvectors = {}
# with open('./glove.42B.300d.txt') as f:
with open('./glove.6B.300d.txt') as f:
    for line in f:
        s = line.strip().split(' ')
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
    print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [75]:
import nltk
nltk.download('punkt')

vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        try:
            vocab[w] += 1
        except:
            vocab[w] = 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

to_del = []
for w in vocab.keys():
    if not w in wordvectors:
        to_del.append(w)
        print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))
        
for w in to_del:
    del vocab[w]
        
idx2word = {idx: word for idx, word in enumerate(['<eos>', '<unk>'] + list(vocab.keys()))}
word2idx = {word: idx for idx, word in enumerate(['<eos>', '<unk>'] + list(vocab.keys()))}
EOS, UNK = 0, 1

print(len(vocab), len(idx2word), len(word2idx))

word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 7.141946329363045
Count of unique words:  9629
missing word: squirrel=l
missing word: ardilla
missing word: coreana
missing word: carboard
missing word: botal
missing word: platic
missing word: pionting
missing word: swoard
missing word: ocra
missing word: ladiesfinger
missing word: kichen
missing word: key-board
missing word: nusic
missing word: piyano
missing word: stroubery
missing word: strobery
missing word: strawberrys
missing word: pre-washed
missing word: ahild
missing word: bassket
missing word: dibbling
missing word: swiming
missing word: perlove
missing word: fluit
missing word: excersizing
missing word: shoting
missing word: hitted
missing word: ingrediants
missing word: banto
missing word: vegtable
missing word: english/seedless
missing word: preperation
missing word: vedgetables
missing word: raciepe
missing word: japanes
missing word: foode
missing word: leady
missing word: griddle-like
missing word: personis
missing word: wiyh
missing wo

### Determine POS-tagging vocabulary from train split

In [76]:
import nltk

pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        try:
            pos_vocab[tag[1]] += 1
            try: 
                pos_unique_words[tag[1]][tag[0]] += 1
            except:
                pos_unique_words[tag[1]][tag[0]] = 1
        except:
            pos_vocab[tag[1]] = 1
            pos_unique_words[tag[1]] = {tag[0]: 1}

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))
            
idx2pos = {idx: tag for idx, tag in enumerate(['eos', 'unk'] + list(pos_vocab.keys()))}
pos2idx = {tag: idx for idx, tag in enumerate(['eos', 'unk'] + list(pos_vocab.keys()))}
EOS, UNK = 0, 1
print(len(idx2pos))

Unique words per tag:
 DT:	18
 NN:	4756
 VBZ:	675
 VBG:	1182
 IN:	118
 PRP:	24
 JJ:	1708
 NNS:	1479
 CC:	9
 VB:	519
 WRB:	4
 TO:	1
 CD:	76
 RP:	30
 PRP$:	8
 VBD:	505
 JJR:	32
 ::	4
 POS:	4
 RBR:	17
 VBN:	522
 VBP:	332
 ,:	1
 MD:	12
 RB:	358
 .:	2
 EX:	4
 WDT:	8
 (:	1
 ):	1
 JJS:	19
 SYM:	8
 FW:	15
 ``:	2
 '':	2
 WP:	3
 NNP:	9
 UH:	2
 RBS:	4
 PDT:	7
 $:	2
 NNPS:	1
 WP$:	1
45


### Determine Universal POS-tagging from train split

In [77]:
import nltk
nltk.download('universal_tagset')

upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        try:
            upos_vocab[tag[1]] += 1
            try: 
                upos_unique_words[tag[1]][tag[0]] += 1
            except:
                upos_unique_words[tag[1]][tag[0]] = 1
        except:
            upos_vocab[tag[1]] = 1
            upos_unique_words[tag[1]] = {tag[0]: 1}

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))
            
idx2upos = {idx: word for idx, word in enumerate(['eos', 'unk'] + list(upos_vocab.keys()))}
upos2idx = {word: idx for idx, word in enumerate(['eos', 'unk'] + list(upos_vocab.keys()))}
print(len(idx2upos))

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 DET:	35
 NOUN:	6057
 VERB:	3211
 ADP:	118
 PRON:	35
 ADJ:	1751
 CONJ:	9
 ADV:	378
 PRT:	35
 NUM:	76
 .:	15
 X:	25
14


### Generate ground-truth references files

In [78]:
with open('../results/MSVD_val_references.txt', 'w') as f:
    for vidx, cap in zip(valid_vidxs, valid_corpus):
        f.write('{}\t{}\n'.format(vidx, cap.lower()))
        
with open('../results/MSVD_test_references.txt', 'w') as f:
    for vidx, cap in zip(test_vidxs, test_corpus):
        f.write('{}\t{}\n'.format(vidx, cap.lower()))

### Generate corpus.pkl file

In [79]:
import pickle

train_corpus_widxs = [[word2idx[w] if w in vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in train_corpus]
valid_corpus_widxs = [[word2idx[w] if w in vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in valid_corpus]
test_corpus_widxs = [[word2idx[w] if w in vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in test_corpus]

train_corpus_pidxs = [[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in train_corpus]
valid_corpus_pidxs = [[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in valid_corpus]
test_corpus_pidxs = [[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in test_corpus]

assert len(train_corpus_widxs) == len(train_vidxs) and len(train_vidxs) == len(train_corpus_pidxs) and len(train_vidxs) == len(train_corpus), f'{len(train_vidxs)}, {len(train_corpus_widxs)}, {len(train_corpus_pidxs)}, {len(train_corpus)}'
assert len(valid_corpus_widxs) == len(valid_vidxs) and len(valid_vidxs) == len(valid_corpus_pidxs) and len(valid_vidxs) == len(valid_corpus), f'{len(valid_vidxs)}, {len(valid_corpus_widxs)}, {len(valid_corpus_pidxs)}, {len(valid_corpus)}'
assert len(test_corpus_widxs) == len(test_vidxs) and len(test_vidxs) == len(test_corpus_pidxs) and len(test_vidxs) == len(test_corpus), f'{len(test_vidxs)}, {len(test_corpus_widxs)}, {len(test_corpus_pidxs)}, {len(test_corpus)}'

train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
test_data = [test_corpus_widxs, test_vidxs, test_corpus_pidxs, test_corpus]

with open('../../../data/MSVD/my_msvd_corpus.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)

# ------END------

# De aqui hacia delante son solo pruebas para sacar ejemplos y esas cosas

In [131]:
patterns_to_search = ["DT NN VBZ VBG NN IN DT NN"]

pos_templates = [" ".join([idx2pos[pidx] for pidx in c]) for c in train_corpus_pidxs]
matchs = [(train_vidxs[i], s, train_corpus[i]) for (i, s) in enumerate(pos_templates) if all([(p in s) for p in patterns_to_search])]
matchs

[(4,
  'DT NN VBZ VBG NN IN DT NN eos',
  'A woman is boiling finger in the water'),
 (7, 'DT NN VBZ VBG NN IN DT NN eos', 'A woman is shaking sugar into a pan'),
 (9,
  'DT NN VBZ VBG NN IN DT NN eos',
  'A woman starts swimming underwater in a swimming-pool'),
 (12,
  'DT NN VBZ VBG NN IN DT NN eos',
  'A woman is doing exercise in the room'),
 (12,
  'DT NN VBZ VBG NN IN DT NN NN eos',
  'A woman is doing exercise on a floor mat'),
 (14, 'DT NN VBZ VBG NN IN DT NN eos', 'A woman is mixing food in a bowl'),
 (16,
  'DT NN VBZ VBG NN IN DT NN eos',
  'A man is slicing cucumber with a knife'),
 (17,
  'DT NN VBZ VBG NN IN DT NN eos',
  'A person is slicing cucumber with a knife'),
 (19,
  'DT NN VBZ VBG NN IN DT NN IN NN eos',
  'A woman is removing food from a pot of oil'),
 (19, 'DT NN VBZ VBG NN IN DT NN eos', 'A man is removing food from a pot'),
 (20, 'DT NN VBZ VBG NN IN DT NN eos', 'A woman is forming rice into a ball'),
 (21, 'DT NN VBZ VBG NN IN DT NN eos', 'A person is arrang

In [136]:
mapping[25]

'YouTubeClips/-bjOB4zS0uE_100_105.avi'

In [97]:
d = {}
for i, t in enumerate(pos_templates):
    if not t in d:
        d[t] = [i]
    else:
        d[t].append(i)
for tags, idxs in d.items():
    if len(idxs)>1:
        print(f'{tags}\n{idxs}\n')

DT NN VBZ VBG eos
[1, 10, 13, 17, 19, 20, 22, 28, 64, 69, 282, 330, 333, 351, 352, 353, 354, 362, 458, 459, 469, 470, 471, 477, 484, 487, 495, 502, 503, 529, 537, 541, 574, 753, 782, 790, 983, 985, 986, 987, 988, 990, 991, 994, 995, 998, 1001, 1005, 1007, 1009, 1010, 1011, 1013, 1015, 1017, 1019, 1023, 1026, 1027, 1137, 1147, 1184, 1191, 1196, 1210, 1212, 1240, 1339, 1363, 1373, 1374, 1465, 1505, 1678, 1701, 1702, 1746, 1880, 1882, 1900, 1901, 2297, 2352, 2494, 2499, 2505, 2511, 2513, 2514, 2516, 2524, 2537, 2541, 2549, 2553, 2557, 2564, 2583, 2585, 2650, 2664, 2666, 2672, 2683, 2718, 2724, 2810, 2870, 2899, 2989, 3021, 3098, 3116, 3119, 3120, 3129, 3133, 3143, 3147, 3148, 3157, 3161, 3274, 3299, 3518, 3623, 3706, 3736, 3875, 3887, 3920, 3970, 4012, 4049, 4051, 4059, 4062, 4066, 4194, 4230, 4231, 4251, 4255, 4260, 4261, 4265, 4270, 4272, 4273, 4281, 4295, 4318, 4321, 4324, 4343, 4382, 4404, 4416, 4423, 4437, 4439, 4441, 4442, 4449, 4453, 4456, 4462, 4463, 4610, 4650, 4660, 4665, 4678, 

DT NN VBD DT NN DT NN IN DT NN IN PRP$ NN eos
[19411, 19425]

DT NN VBD DT NN DT NN IN PRP$ NN eos
[19414, 20480, 25575, 25578]

DT NN CC DT NN NN IN DT NNS eos
[19440, 19447]

PRP VBZ DT NN NN eos
[19449, 20692, 23499, 36178]

DT NNS CC NNS VBP VBG IN DT NN eos
[19464, 42519]

DT NN VBZ JJ NNS IN DT NN IN JJ NN eos
[19475, 43669]

DT NN VBZ VBG DT NN IN DT VBG NN eos
[19478, 21458, 44079]

DT NN VBD RP DT NN IN NN eos
[19483, 46840]

VBG TO VB eos
[19492, 44136]

VBG NNS IN NN IN DT NN eos
[19510, 32423, 47472]

DT NN VBZ WRB TO VB NNS eos
[19514, 28702]

CD NNS VBG IN NN eos
[19546, 24470, 29439, 42885]

CD NN VBP VBG DT JJ eos
[19562, 30515, 30541]

DT NN VBZ VBG IN PRP VBZ DT JJ NN eos
[19666, 48403]

DT NN VBZ DT NN IN NN IN DT NN NN eos
[19764, 33081]

DT NN VBD IN NN IN DT NN eos
[19773, 23493, 23496, 23512]

JJ NN NN VBG eos
[19835, 42827]

DT NN VBZ VBG IN DT NN RP DT NN eos
[19877, 44247]

DT NN NNS IN NN eos
[19880, 25391]

DT NN VBZ VBG DT NN RB RB eos
[19883, 25925, 32534]

In [128]:
mor_than_two_templates = [k for k in d.keys() if len(d[k])>1]
most_used = sorted(mor_than_two_templates, key=lambda k: len(d[k]))
print(f"count of templates: {len(d)}\n\
        count of templats that appears at least 2 times: {len(mor_than_two_templates)}\n\
        the most used pattern: {most_used[1000]}")

count of templates: 14255
        count of templats that appears at least 2 times: 2884
        the most used pattern: DT NN VBD IN PRP IN DT NN NN eos
