## Generate corpus and ground-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in the format [corpus_widxs, vidxs, corpus_pidxs, corpus], where:
    - corpus_widxs is a list of lists with the indexes of the caption words in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the indexes of the POS tags in the POS tagging vocabulary
    - corpus is the list of raw caption strings
1. val_data: same format as train_data.
2. test_data: only the video idxs are available, so it is stored as [None, vidxs, None].
3. vocabulary: in format {'word': count}.
4. idx2word: is the vocabulary in format {idx: 'word'}.
5. word_embeddings: the vectors of each word. The i-th row is the word vector of the word idx2word[i] (rows 0 and 1 correspond to '<eos>' and '<unk>').
6. idx2pos: is the vocabulary of POS tagging in format {idx: 'POSTAG'}

### Generate split for training and validation

In [1]:
import json

# Load the label files of the three released splits. Each file holds a JSON
# list of entries; the 'id' and 'label' keys of every entry are read below.
with open('../../../data/Something-Something-v2/label/something-something-v2-train.json', 'r') as f:
    train_data = json.load(f)

with open('../../../data/Something-Something-v2/label/something-something-v2-validation.json') as f:
    valid_data = json.load(f)

with open('../../../data/Something-Something-v2/label/something-something-v2-test.json') as f:
    test_data = json.load(f)

# Split every (video id, caption) pair into two parallel tuples.
train_vidxs, train_corpus = zip(*((int(entry['id']), entry['label']) for entry in train_data))
valid_vidxs, valid_corpus = zip(*((int(entry['id']), entry['label']) for entry in valid_data))

# Only the video ids are read for the test split.
test_vidxs = [int(entry['id']) for entry in test_data]

### Get pretrained embeddings

In [2]:
import os
import numpy as np

# Map each pretrained token to its 300-dimensional vector.
wordvectors = {}
# Alternative embeddings file: './glove.42B.300d.txt'
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which may fail or mis-decode non-ASCII tokens (the published GloVe
# text files are expected to be UTF-8 -- confirm for other embedding files).
with open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        s = line.strip().split(' ')
        # Keep only well-formed rows: one token followed by 300 values.
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
    print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [3]:
import nltk
nltk.download('punkt')

# Count how often each lowercased token appears in the training captions.
# dict.get replaces the former bare `except:`, which swallowed every exception
# (even KeyboardInterrupt) and silently reset the count of the word to 1.
vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        vocab[w] = vocab.get(w, 0) + 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

# Words without a pretrained vector are removed from the vocabulary; they are
# collected first because a dict cannot be modified while it is iterated.
to_del = [w for w in vocab if w not in wordvectors]
for w in to_del:
    print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))

for w in to_del:
    del vocab[w]

# Indexes 0 and 1 are reserved for the end-of-sentence and unknown-word tokens;
# the remaining words keep the order of their first appearance.
EOS, UNK = 0, 1
idx2word = dict(enumerate(['<eos>', '<unk>'] + list(vocab)))
word2idx = {word: idx for idx, word in idx2word.items()}

print(len(vocab), len(idx2word), len(word2idx))

# Row i of the embedding matrix is the pretrained vector of idx2word[i]; the
# two special tokens reuse the vectors of the plain words 'eos' and 'unk'.
word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 6.757715510351483
Count of unique words:  7433
missing word: invitaion
missing word: 'enak
missing word: 'tiga
missing word: taskpane
missing word: pencilbox
missing word: eatable
missing word: pendrive
missing word: flashdisk
missing word: wash-basin
missing word: lollypop
missing word: razor-blades
missing word: nailpolish
missing word: dettol
missing word: hairbands
missing word: powdertin
missing word: tooth-paste
missing word: music-player
missing word: chille
missing word: harddisk
missing word: tearable
missing word: controler
missing word: regulater
missing word: footcare
missing word: corector
missing word: bancle
missing word: dehodorant
missing word: palstic
missing word: laudry
missing word: rubix
missing word: powerbank
missing word: mixie
missing word: chapathi
missing word: agarbathy
missing word: reciever
missing word: plyers
missing word: fidgetspinner
missing word: dishwash
missing word: facepack
missing word: harpic
missing word: karc

### Determine POS-tagging vocabulary from train split

In [18]:
import nltk

# pos_vocab: {tag: number of tokens that received the tag}
# pos_unique_words: {tag: {word: number of times the word received the tag}}
pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        # Explicit default lookups replace the former nested bare `except:`
        # clauses, which treated any exception as "first occurrence".
        pos_vocab[tag] = pos_vocab.get(tag, 0) + 1
        tag_words = pos_unique_words.setdefault(tag, {})
        tag_words[word] = tag_words.get(word, 0) + 1

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the 'eos' and 'unk' pseudo-tags; the real
# tags follow in the order of their first appearance.
EOS, UNK = 0, 1
idx2pos = dict(enumerate(['eos', 'unk'] + list(pos_vocab)))
pos2idx = {tag: idx for idx, tag in idx2pos.items()}
print(len(idx2pos))

Unique words per tag:
 VBG:	247
 NN:	5186
 JJ:	1806
 TO:	1
 NNS:	1161
 FW:	78
 IN:	91
 DT:	22
 RP:	199
 CD:	83
 ,:	1
 RB:	343
 PRP:	151
 VBZ:	200
 VB:	1179
 VBN:	112
 WP:	6
 (:	1
 ):	1
 CC:	21
 VBP:	252
 '':	6
 VBD:	363
 POS:	2
 MD:	11
 RBR:	21
 PRP$:	13
 WDT:	6
 WRB:	5
 JJR:	101
 NNP:	28
 JJS:	14
 .:	1
 UH:	1
 $:	4
 ``:	2
 PDT:	1
 RBS:	1
 EX:	1
 ::	1
 #:	1
43


### Determine Universal POS-tagging from train split

In [19]:
import nltk
nltk.download('universal_tagset')

# upos_vocab: {universal tag: number of tokens that received the tag}
# upos_unique_words: {universal tag: {word: times the word received the tag}}
upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        # Explicit default lookups replace the former nested bare `except:`
        # clauses, which treated any exception as "first occurrence".
        upos_vocab[tag] = upos_vocab.get(tag, 0) + 1
        tag_words = upos_unique_words.setdefault(tag, {})
        tag_words[word] = tag_words.get(word, 0) + 1

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the 'eos' and 'unk' pseudo-tags; the real
# tags follow in the order of their first appearance.
idx2upos = dict(enumerate(['eos', 'unk'] + list(upos_vocab)))
upos2idx = {tag: idx for idx, tag in idx2upos.items()}
print(len(idx2upos))

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 VERB:	1874
 NOUN:	6087
 ADJ:	1889
 PRT:	202
 X:	79
 ADP:	91
 DET:	28
 NUM:	83
 .:	18
 ADV:	361
 PRON:	166
 CONJ:	21
14


### Generate ground-truth references files

In [20]:
# One "<video id>\t<lowercased caption>" line per validation video.
reference_lines = (
    '{}\t{}\n'.format(vidx, cap.lower())
    for vidx, cap in zip(valid_vidxs, valid_corpus)
)
with open('../results/20B-SS-v2_val_references.txt', 'w') as f:
    f.writelines(reference_lines)

### Generate corpus.pkl file

In [21]:
import pickle


def _encode_captions(captions):
    """Return (word index lists, POS index lists) for the given captions.

    Each caption is lowercased and tokenized once; the same tokens feed both
    the word lookup and the POS tagger (previously every caption was tokenized
    twice). Words missing from `vocab` and tags missing from `pos_vocab` map to
    UNK, and every sequence is terminated with EOS.
    """
    widxs, pidxs = [], []
    for cap in captions:
        tokens = nltk.word_tokenize(cap.lower())
        widxs.append([word2idx[w] if w in vocab else UNK for w in tokens] + [EOS])
        pidxs.append([pos2idx[tag] if tag in pos_vocab else UNK for _, tag in nltk.pos_tag(tokens)] + [EOS])
    return widxs, pidxs


train_corpus_widxs, train_corpus_pidxs = _encode_captions(train_corpus)
valid_corpus_widxs, valid_corpus_pidxs = _encode_captions(valid_corpus)

# Sanity check: every per-split list must have one entry per video.
assert len(train_corpus_widxs) == len(train_vidxs) and len(train_vidxs) == len(train_corpus_pidxs) and len(train_vidxs) == len(train_corpus)
assert len(valid_corpus_widxs) == len(valid_vidxs) and len(valid_vidxs) == len(valid_corpus_pidxs) and len(valid_vidxs) == len(valid_corpus)

# Per-split layout: [word idxs, video idxs, POS idxs, raw captions].
train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
# Only the video idxs are stored for the test split.
test_data = [None, test_vidxs, None]

# Pickled list order: train, val, test, vocabulary, idx2word, word_embeddings, idx2pos.
with open('../../../data/Something-Something-v2/20b-ss-v2_corpus_pos.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)