## Generate corpus and gruond-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in format [corpus_widxs, vidxs, corpus_pidxs], where:
    - corpus_widxs is a list of lists with the index of words in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the index of POS tags in the POS tagging vocabulary
1. val_data: same format of train_data.
2. test_data: same format of train_data.
3. vocabulary: in format {'word': count}.
4. idx2word: is the vocabulary in format {idx: 'word'}.
5. word_embeddings: are the vectors of each word. The i-th row is the word vector of the i-th word in the vocabulary.
6. idx2pos: is the vocabulary of POS tagging in format {idx: 'POSTAG'}

### Generate split for training and validation

In [None]:
import json
with open('../../../data/HowTo100M/caption.json') as f:
    data = json.load(f)

data

In [2]:
train_vidxs, train_corpus = [], []
for vid, subtitles in data.items():
    train_vidxs.append(vid)
    for subtitle in subtitles['text']:
        train_vidxs.append(vid)
        train_corpus.append(str(subtitle))
        
print('count of training pairs: ', len(train_vidxs))

count of training pairs:  139668840


### Get pretrained embeddings

In [3]:
import os
import numpy as np

wordvectors = {}
# with open('./glove.42B.300d.txt') as f:
with open('./glove.6B.300d.txt') as f:
    for line in f:
        s = line.strip().split(' ')
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
    print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [9]:
import sys
import multiprocessing as mp
from multiprocessing import Pool

import nltk
nltk.download('punkt')

def determine_vocab(train_corpus):
    vocab, total_len = {}, 0
    for cap in train_corpus:
        tokens = nltk.word_tokenize(cap.lower())
        total_len += len(tokens)
        for w in tokens:
            try:
                vocab[w] += 1
            except:
                vocab[w] = 1
    return vocab, total_len

def merge_vocabs(partial_results):
    vocab, total_len  = {}, 0
    for i, (vocab_partial, total_len_partial) in enumerate(partial_results):
        total_len += total_len_partial
        for word, count in vocab_partial.items():
            try: 
                vocab[word] += count
            except:
                vocab[word] = count
                
        print(f'{i} partial merged')
    return vocab, total_len    

num_threads = mp.cpu_count()-2 #30
step = len(train_corpus) // num_threads 
args = [train_corpus[idx:idx+step] for idx in range(0, len(train_corpus), step)]

print('parallel processing')
pool = Pool(num_threads)
partial_results = pool.map(determine_vocab, args)

print('merging partials')
vocab, total_len = merge_vocabs(partial_results)

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

to_del = []
for w in vocab.keys():
    if not w in wordvectors:
        to_del.append(w)
#         print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))
        
for w in to_del:
    del vocab[w]
        
idx2word = {idx: word for idx, word in enumerate(['<eos>', '<unk>'] + list(vocab.keys()))}
word2idx = {word: idx for idx, word in enumerate(['<eos>', '<unk>'] + list(vocab.keys()))}
EOS, UNK = 0, 1

print(len(vocab), len(idx2word), len(word2idx))

word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word] 

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


parallel processing
merging partials
0 partial merged
1 partial merged
2 partial merged
3 partial merged
4 partial merged
5 partial merged
6 partial merged
7 partial merged
8 partial merged
9 partial merged
10 partial merged
11 partial merged
12 partial merged
13 partial merged
14 partial merged
15 partial merged
16 partial merged
17 partial merged
18 partial merged
19 partial merged
20 partial merged
21 partial merged
22 partial merged
23 partial merged
24 partial merged
25 partial merged
26 partial merged
27 partial merged
28 partial merged
29 partial merged
30 partial merged
Avg. count of words per caption: 4.160451942440858
Count of unique words:  593238
count of missing words:  375877
217361 217363 217363


### Determine POS-tagging vocabulary from train split

In [14]:
import sys
import multiprocessing as mp
from multiprocessing import Pool
from functools import partial

import nltk
nltk.download('punkt')

def count_words_per_tag(train_corpus, tagset=None):
    upos_vocab,upos_unique_words  = {},{}
    for cap in train_corpus:
        for tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset=tagset):
            try:
                upos_vocab[tag[1]] += 1
                try: 
                    upos_unique_words[tag[1]][tag[0]] += 1
                except:
                    upos_unique_words[tag[1]][tag[0]] = 1
            except:
                upos_vocab[tag[1]] = 1
                upos_unique_words[tag[1]] = {tag[0]: 1}
    return upos_vocab, upos_unique_words

def merge_tags(partial_results):
    tags_vocab, tag_unique_words  = {}, {}
    for i, (tags_vocab_partial, tag_unique_words_partial) in enumerate(partial_results):
        for tag, count in tags_vocab_partial.items():
            try: 
                tags_vocab[tag] += count
            except:
                tags_vocab[tag] = count

        for tag, unique_words in tag_unique_words_partial.items():
            for word, count in unique_words.items():
                try: 
                    tag_unique_words[tag][word] += count
                except:
                    try:
                        tag_unique_words[tag][word] = count
                    except:
                        tag_unique_words[tag] = {word: count}
        print(f'{i} partial merged')
    return tags_vocab, tag_unique_words

num_threads = mp.cpu_count()-2  # 30
step = len(train_corpus) // num_threads
args = [train_corpus[idx:idx+step] for idx in range(0, len(train_corpus), step)]

print('parallel processing')
pool = Pool(num_threads)
partial_results = pool.map(partial(count_words_per_tag, tagset=None), args)

print('merging partials')
pos_vocab, pos_unique_words = merge_tags(partial_results)

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))
            
idx2pos = {idx: tag for idx, tag in enumerate(['eos', 'unk'] + list(pos_vocab.keys()))}
pos2idx = {tag: idx for idx, tag in enumerate(['eos', 'unk'] + list(pos_vocab.keys()))}
EOS, UNK = 0, 1
print(len(idx2pos))

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


parallel processing
merging partials
0 partial merged
1 partial merged
2 partial merged
3 partial merged
4 partial merged
5 partial merged
6 partial merged
7 partial merged
8 partial merged
9 partial merged
10 partial merged
11 partial merged
12 partial merged
13 partial merged
14 partial merged
15 partial merged
16 partial merged
17 partial merged
18 partial merged
19 partial merged
20 partial merged
21 partial merged
22 partial merged
23 partial merged
24 partial merged
25 partial merged
26 partial merged
27 partial merged
28 partial merged
29 partial merged
30 partial merged
Unique words per tag:
 NNS:	136311
 VBP:	131078
 NN:	422846
 VBN:	23227
 RB:	72254
 VBD:	56598
 JJ:	216737
 VBG:	13493
 VB:	62911
 IN:	28770
 POS:	3100
 VBZ:	57678
 WP:	886
 RP:	8588
 CD:	40352
 RBR:	10734
 PRP:	5495
 MD:	4046
 JJR:	7427
 UH:	54
 TO:	69
 JJS:	1984
 FW:	16056
 DT:	3841
 NNP:	17979
 CC:	8943
 WRB:	1997
 WDT:	537
 EX:	1373
 RBS:	4463
 LS:	60
 '':	1738
 SYM:	48
 PRP$:	431
 $:	4896
 PDT:	50
 ``:	1
 N

### Determine Universal POS-tagging from train split

In [6]:
import sys
import multiprocessing as mp
from multiprocessing import Pool

import nltk
nltk.download('universal_tagset')

num_threads = mp.cpu_count()-2  # 30
step = len(train_corpus) // num_threads
args = [train_corpus[idx:idx+step] for idx in range(0, len(train_corpus), step)]

print('parallel processing')
pool = Pool(num_threads)
partial_results = pool.map(partial(count_words_per_tag, tagset='universal'), args)

print('merging partials')
upos_vocab, upos_unique_words = merge_tags(partial_results)

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))
            
idx2upos = {idx: word for idx, word in enumerate(['eos', 'unk'] + list(upos_vocab.keys()))}
upos2idx = {word: idx for idx, word in enumerate(['eos', 'unk'] + list(upos_vocab.keys()))}
print(len(idx2upos))

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


parallel processing
merging partials
0 partial merged
1 partial merged
2 partial merged
3 partial merged
4 partial merged
5 partial merged
6 partial merged
7 partial merged
8 partial merged
9 partial merged
10 partial merged
11 partial merged
12 partial merged
13 partial merged
14 partial merged
15 partial merged
16 partial merged
17 partial merged
18 partial merged
19 partial merged
20 partial merged
21 partial merged
22 partial merged
23 partial merged
24 partial merged
25 partial merged
26 partial merged
27 partial merged
28 partial merged
29 partial merged
30 partial merged
Unique words per universal tag:
 NOUN:	491488
 VERB:	198859
 ADV:	76535
 ADJ:	219719
 ADP:	28770
 PRT:	11311
 PRON:	6477
 NUM:	40352
 X:	16121
 DET:	5473
 CONJ:	8943
 .:	6072
14


### Generate ground-truth references files

In [31]:
with open('../results/MSVD_val_references.txt', 'w') as f:
    for vidx, cap in zip(valid_vidxs, valid_corpus):
        f.write('{}\t{}\n'.format(vidx, cap.lower()))
        
with open('../results/MSVD_test_references.txt', 'w') as f:
    for vidx, cap in zip(test_vidxs, test_corpus):
        f.write('{}\t{}\n'.format(vidx, cap.lower()))

### Generate corpus.pkl file

In [8]:
import pickle

train_corpus_widxs = [[word2idx[w] if w in vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in train_corpus]
valid_corpus_widxs = [[word2idx[w] if w in vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in valid_corpus]
test_corpus_widxs = [[word2idx[w] if w in vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in test_corpus]

train_corpus_pidxs = [[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in train_corpus]
valid_corpus_pidxs = [[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in valid_corpus]
test_corpus_pidxs = [[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in test_corpus]

assert len(train_corpus_widxs) == len(train_vidxs) and len(train_vidxs) == len(train_corpus_pidxs) and len(train_vidxs) == len(train_corpus), f'{len(train_vidxs)}, {len(train_corpus_widxs)}, {len(train_corpus_pidxs)}, {len(train_corpus)}'
assert len(valid_corpus_widxs) == len(valid_vidxs) and len(valid_vidxs) == len(valid_corpus_pidxs) and len(valid_vidxs) == len(valid_corpus), f'{len(valid_vidxs)}, {len(valid_corpus_widxs)}, {len(valid_corpus_pidxs)}, {len(valid_corpus)}'
assert len(test_corpus_widxs) == len(test_vidxs) and len(test_vidxs) == len(test_corpus_pidxs) and len(test_vidxs) == len(test_corpus), f'{len(test_vidxs)}, {len(test_corpus_widxs)}, {len(test_corpus_pidxs)}, {len(test_corpus)}'

train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
test_data = [test_corpus_widxs, test_vidxs, test_corpus_pidxs, test_corpus]

with open('../../../data/MSVD/my_msvd_corpus.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)