## Generate corpus and ground-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in format [corpus_widxs, vidxs, corpus_pidxs, corpus], where:
    - corpus_widxs is a list of lists with the indexes of the words in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the indexes of the POS tags in the POS tagging vocabulary
    - corpus is the list of the raw caption sentences
1. val_data: same format as train_data.
2. test_data: same format as train_data.
3. vocabulary: in format {'word': count}.
4. idx2word: the vocabulary in format {idx: 'word'}.
5. word_embeddings: the vectors of each word. The i-th row is the word vector of the word with index i in idx2word.
6. idx2pos: the vocabulary of POS tags in format {idx: 'POSTAG'}.

### Generate split for training and validation

In [1]:
import pandas as pd

# Load the LSMDC16 annotation files (tab-separated). Only columns 0 and 5 are
# kept and they are named 'video-id' and 'sentence'. Because `names` is given
# explicitly, pandas treats the first row as data rather than as a header.
# The separator is passed by keyword: the positional form is deprecated and
# no longer accepted by recent pandas releases.
train_data = pd.read_csv('../../../data/LSMDC/LSMDC16_annos_training.csv', sep='\t', usecols=[0, 5], names=['video-id', 'sentence'])
valid_data = pd.read_csv('../../../data/LSMDC/LSMDC16_annos_val.csv', sep='\t', usecols=[0, 5], names=['video-id', 'sentence'])
test_data = pd.read_csv('../../../data/LSMDC/LSMDC16_annos_test.csv', sep='\t', usecols=[0, 5], names=['video-id', 'sentence'])

In [2]:
# Turn the two annotation columns of every split into plain Python lists:
# *_vidxs holds the 'video-id' column, *_corpus the 'sentence' column.
train_vidxs = list(train_data['video-id'])
train_corpus = list(train_data['sentence'])

valid_vidxs = list(valid_data['video-id'])
valid_corpus = list(valid_data['sentence'])

test_vidxs = list(test_data['video-id'])
test_corpus = list(test_data['sentence'])

### Get pretrained embeddings

In [3]:
import os
import numpy as np

# Map every token of the pretrained GloVe file to its vector.
wordvectors = {}
# Alternative embeddings file kept from the original cell:
# with open('./glove.42B.300d.txt') as f:
# The encoding is given explicitly so that reading does not depend on the
# platform's default locale.
with open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        s = line.strip().split(' ')
        # Keep only rows made of one token followed by exactly 300 values;
        # anything else is skipped silently.
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)

# Number of word vectors that were loaded.
print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [4]:
import nltk
nltk.download('punkt')

# Count how often each lower-cased token occurs in the training captions.
# `vocab` stays a plain dict ({'word': count}); its insertion order defines
# the word indexes built further down.
vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        # dict.get instead of a bare try/except, which would also hide
        # unrelated errors (and KeyboardInterrupt).
        vocab[w] = vocab.get(w, 0) + 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

# Words without a pretrained vector are dropped from the vocabulary. They are
# collected first because a dict must not change size while being iterated.
to_del = [w for w in vocab if w not in wordvectors]
for w in to_del:
    print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))

for w in to_del:
    del vocab[w]

# Index 0 and 1 are reserved for the special tokens; the remaining words
# follow in vocabulary (insertion) order.
EOS, UNK = 0, 1
_vocab_tokens = ['<eos>', '<unk>'] + list(vocab)
idx2word = dict(enumerate(_vocab_tokens))
word2idx = {word: idx for idx, word in idx2word.items()}

print(len(vocab), len(idx2word), len(word2idx))

# Row i of the matrix is the 300-d vector of idx2word[i]. The special tokens
# reuse the pretrained vectors of the plain words 'eos' and 'unk'.
word_embeddings = np.zeros((len(idx2word), 300))
word_embeddings[EOS] = wordvectors['eos']
word_embeddings[UNK] = wordvectors['unk']
for idx, word in idx2word.items():
    if idx not in (EOS, UNK):
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 10.664331859238812
Count of unique words:  24267
missing word: c-
missing word: red-hands
missing word: kickstands
missing word: make-shift
missing word: ballsack
missing word: hand-scrawled
missing word: mini-van
missing word: manlier
missing word: cooldeets88
missing word: doe-like
missing word: one-percenter
missing word: head-butts
missing word: four-shot
missing word: squeamishly
missing word: tiddlywink
missing word: blithers
missing word: gun-shy
missing word: shaggy-haired
missing word: t-boned
missing word: understandingly
missing word: pistol-whips
missing word: tantastic
missing word: butter-colored
missing word: fast-forwards
missing word: pornos
missing word: on-suite
missing word: daps
missing word: hollow-eyed
missing word: someone's
missing word: expressionlessly
missing word: tipsily
missing word: broad-barreled
missing word: cold-eyed
missing word: unsheathes
missing word: stibble
missing word: dvs216
missing word: dv250
missing word: 

missing word: contenti
missing word: saremo
missing word: 'holy
missing word: inkstand
missing word: orsini-
missing word: recitativo
missing word: confutatis
missing word: postillions
missing word: guttered
missing word: toe-searches
missing word: hair-combing
missing word: oil-lease
missing word: hot-breath
missing word: money-filled
missing word: self-angry
missing word: resignment
missing word: undiscernible
missing word: rain-sheeted
missing word: pushbell
missing word: soft-
missing word: lock-folds
missing word: napkin-covered
missing word: halfslip
missing word: rehangs
missing word: by-
missing word: shower-bar
missing word: closed-for-lunch
missing word: bird-ridden
missing word: one-lamp
missing word: shrike-
missing word: desk-counter
missing word: relaxedly
missing word: uncanoped
missing word: chiffonier
missing word: big-doored
missing word: full-
missing word: pier-glass
missing word: recamier
missing word: clock-wise
missing word: leathery-brown
missing word: high-neck

### Determine POS-tagging vocabulary from train split

In [12]:
import nltk

# pos_vocab:        {POS tag: number of tagged tokens in the training captions}
# pos_unique_words: {POS tag: {word: number of times the word got that tag}}
pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        # get/setdefault replace the nested bare try/except blocks, which
        # would also have swallowed unrelated errors.
        pos_vocab[tag] = pos_vocab.get(tag, 0) + 1
        tag_words = pos_unique_words.setdefault(tag, {})
        tag_words[word] = tag_words.get(word, 0) + 1

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))

# Index 0 and 1 are reserved for the special tags; the observed tags follow
# in order of first appearance.
_pos_tags = ['eos', 'unk'] + list(pos_vocab)
idx2pos = dict(enumerate(_pos_tags))
pos2idx = {tag: idx for idx, tag in idx2pos.items()}
EOS, UNK = 0, 1
print(len(idx2pos))

Unique words per tag:
 RB:	1503
 ,:	1
 NN:	11146
 NNS:	4269
 IN:	207
 DT:	29
 JJ:	6950
 .:	3
 VBG:	1867
 VBZ:	2226
 RP:	34
 VBD:	1351
 CD:	266
 PRP:	32
 PRP$:	7
 CC:	30
 VBP:	1491
 ::	4
 ``:	2
 '':	4
 POS:	6
 WDT:	8
 VB:	1596
 EX:	3
 JJR:	93
 VBN:	1451
 TO:	1
 WRB:	10
 WP:	9
 MD:	23
 RBR:	45
 UH:	6
 $:	1
 PDT:	16
 JJS:	52
 NNP:	36
 FW:	29
 WP$:	1
 RBS:	5
 (:	1
 ):	1
 SYM:	6
44


### Determine Universal POS-tagging from train split

In [13]:
import nltk
nltk.download('universal_tagset')

# upos_vocab:        {universal POS tag: number of tagged tokens in the training captions}
# upos_unique_words: {universal POS tag: {word: number of times the word got that tag}}
upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for word, tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        # get/setdefault replace the nested bare try/except blocks, which
        # would also have swallowed unrelated errors.
        upos_vocab[tag] = upos_vocab.get(tag, 0) + 1
        tag_words = upos_unique_words.setdefault(tag, {})
        tag_words[word] = tag_words.get(word, 0) + 1

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))

# Index 0 and 1 are reserved for the special tags; the observed tags follow
# in order of first appearance.
_upos_tags = ['eos', 'unk'] + list(upos_vocab)
idx2upos = dict(enumerate(_upos_tags))
upos2idx = {tag: idx for idx, tag in idx2upos.items()}
print(len(idx2upos))

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 ADV:	1545
 .:	17
 NOUN:	15095
 ADP:	207
 DET:	52
 ADJ:	7078
 VERB:	7726
 PRT:	41
 NUM:	266
 PRON:	48
 CONJ:	30
 X:	41
14


### Generate ground-truth references files

In [20]:
# Write the ground-truth reference files of the validation and test splits:
# one line per annotation, formatted as "<video-id>\t<lower-cased caption>".
# The files are written as UTF-8 explicitly so that captions with non-ASCII
# characters do not depend on the platform's default encoding.
# NOTE(review): the file names spell 'LSDMC' while the annotation inputs are
# read from 'LSMDC' paths; kept as-is because other code may rely on these names.
for _ref_path, _ref_vidxs, _ref_corpus in (
    ('../results/LSDMC_val_references.txt', valid_vidxs, valid_corpus),
    ('../results/LSDMC_test_references.txt', test_vidxs, test_corpus),
):
    with open(_ref_path, 'w', encoding='utf-8') as f:
        f.writelines('{}\t{}\n'.format(vidx, cap.lower())
                     for vidx, cap in zip(_ref_vidxs, _ref_corpus))

### Generate corpus.pkl file

In [21]:
import pickle


def _encode_split(corpus):
    """Encode the captions of one split as word indexes and POS-tag indexes.

    Each caption is lower-cased and tokenized once; the same tokens feed both
    the word lookup and the POS tagger. Words missing from `vocab` and tags
    missing from `pos_vocab` are mapped to UNK, and every sequence is
    terminated with EOS.

    Returns:
        (widxs, pidxs): two lists of lists, aligned with `corpus`.
    """
    widxs, pidxs = [], []
    for cap in corpus:
        tokens = nltk.word_tokenize(cap.lower())
        widxs.append([word2idx[w] if w in vocab else UNK for w in tokens] + [EOS])
        pidxs.append([pos2idx[tag] if tag in pos_vocab else UNK
                      for _, tag in nltk.pos_tag(tokens)] + [EOS])
    return widxs, pidxs


train_corpus_widxs, train_corpus_pidxs = _encode_split(train_corpus)
valid_corpus_widxs, valid_corpus_pidxs = _encode_split(valid_corpus)
test_corpus_widxs, test_corpus_pidxs = _encode_split(test_corpus)

# Sanity check: every split must have one entry per caption in all four lists.
assert len(train_corpus_widxs) == len(train_vidxs) == len(train_corpus_pidxs) == len(train_corpus), f'{len(train_vidxs)}, {len(train_corpus_widxs)}, {len(train_corpus_pidxs)}, {len(train_corpus)}'
assert len(valid_corpus_widxs) == len(valid_vidxs) == len(valid_corpus_pidxs) == len(valid_corpus), f'{len(valid_vidxs)}, {len(valid_corpus_widxs)}, {len(valid_corpus_pidxs)}, {len(valid_corpus)}'
assert len(test_corpus_widxs) == len(test_vidxs) == len(test_corpus_pidxs) == len(test_corpus), f'{len(test_vidxs)}, {len(test_corpus_widxs)}, {len(test_corpus_pidxs)}, {len(test_corpus)}'

# NOTE: these names previously held the annotation DataFrames; from here on
# they are lists in the format [corpus_widxs, vidxs, corpus_pidxs, corpus].
train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
test_data = [test_corpus_widxs, test_vidxs, test_corpus_pidxs, test_corpus]

# NOTE(review): the output directory is spelled 'LSDMC' while the annotations
# are read from 'LSMDC'; kept as-is because other code may rely on this path.
with open('../../../data/LSDMC/lsdmc_corpus_pos.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)