## Generate corpus and ground-truth references of released videos

### Corpus file contents
0. train_data: captions and idxs of training videos in format [corpus_widxs, vidxs, corpus_pidxs, corpus], where:
    - corpus_widxs is a list of lists with the index of each word in the vocabulary
    - vidxs is a list of indexes of video features in the features file
    - corpus_pidxs is a list of lists with the index of each POS tag in the POS tagging vocabulary
    - corpus is the list of raw caption strings the indexes were computed from
1. val_data: same format as train_data.
2. test_data: same format as train_data.
3. vocabulary: in format {'word': count}.
4. idx2word: the vocabulary in format {idx: 'word'}, with the special tokens '<eos>' and '<unk>' at indexes 0 and 1.
5. word_embeddings: the vectors of each word. The i-th row is the word vector of the i-th word in idx2word.
6. idx2pos: the vocabulary of POS tags in format {idx: 'POSTAG'}.

### Generate split for training and validation

In [10]:
import pandas as pd
# Load the Charades annotation tables.
# The separator is passed by keyword: pandas deprecated positional arguments
# after the path in 1.3 and rejects them from 2.0 on.
train_data = pd.read_csv('../../../data/Charades/annotations/Charades_v1_train.csv', sep=',')
# The Charades_v1_test.csv annotations are loaded as the validation table.
valid_data = pd.read_csv('../../../data/Charades/annotations/Charades_v1_test.csv', sep=',')
# test_data = pd.read_csv('../../../data/Charades/annotations/Charades_v1_test.csv', sep=',')

# Bare expression so the notebook displays the training table.
train_data

Unnamed: 0,id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length
0,46GP8,HR43,Kitchen,6.0,7.0,Yes,A person cooking on a stove while watching som...,food;stove;window,A person cooks food on a stove before looking ...,c092 11.90 21.20;c147 0.00 12.60,24.83
1,N11GT,0KZ7,Stairs,6.0,7.0,Yes,"One person opens up a folded blanket, then sne...",blanket;broom;floor,Person at the bottom of the staircase shakes a...,c098 8.60 14.20;c075 0.00 11.70;c127 0.00 15.2...,18.33
2,0IH69,6RE8,Bedroom,6.0,5.0,Yes,A person is seen leaving a cabinet. They then ...,book;box;cabinet;shelf,A person is standing in a bedroom. They walk o...,,30.25
3,KRF68,YA10,Laundry room,6.0,7.0,Yes,A person runs into their laundry room. They gr...,clothes;door;phone,A person runs in and shuts door. The person gr...,c018 22.60 27.80;c141 4.10 9.60;c148 10.30 25....,30.33
4,MJO7C,6RE8,Kitchen,6.0,6.0,Yes,A person runs into their pantry holding a bott...,cup;phone,A person runs in place while holding a bottle ...,c015 0.00 32.00;c107 0.00 32.00,31.38
...,...,...,...,...,...,...,...,...,...,...,...
7980,7K2CS,HJZQ,Garage,6.0,6.0,Yes,Person enters the garage while sneezing. Perso...,chair;clothes;door;food;sandwich;shirt;spoon,"A enters through a doorway, sneezes, then clos...",c065 17.60 31.00;c067 17.60 31.00;c153 0.00 5....,30.08
7981,S2A89,KL48,Bathroom,7.0,7.0,Yes,"A person takes a chair and walks it over, plac...",chair;door,A PERSON IS TAKING A CHAIR FROM ONE ROOM TO TH...,c006 4.00 10.80;c141 4.40 10.90;c151 12.80 20....,19.29
7982,01O27,18IT,Bathroom,6.0,7.0,Yes,A person enters a bathroom and closes the door...,door;floor;mirror,A person is walking towards the bathroom. A pe...,c006 5.10 11.50;c008 0.50 6.60;c124 39.00 47.0...,46.08
7983,2MJ72,6RE8,Bedroom,6.0,6.0,Yes,A person opens a window in their laundry room....,door;towel;window,A person opens a window and looks out of it. ...,c006 11.00 17.00;c037 20.70 31.00;c092 0.60 8....,30.25


In [42]:
# Flatten the training annotations into two parallel lists: train_corpus holds
# one caption per entry and train_vidxs holds the id of the video that caption
# belongs to. Every video contributes its script first, followed by each
# ';'-separated piece of its descriptions field.
train_vidxs, train_corpus = [], []
annotation_rows = zip(train_data['id'], train_data['script'], train_data['descriptions'])
for video_id, script_text, joined_descriptions in annotation_rows:
    captions = [script_text, *joined_descriptions.split(';')]
    train_corpus.extend(captions)
    train_vidxs.extend([video_id] * len(captions))
# Earlier one-caption-per-video variants, kept for reference.
# NOTE(review): valid_vidxs/valid_corpus and test_vidxs/test_corpus are read by
# later cells but are only assigned in these commented-out lines.
# train_vidxs, train_corpus = list(train_data['id']), list(train_data['descriptions'])
# valid_vidxs, valid_corpus = list(valid_data['id']), list(valid_data['descriptions'])
# test_vidxs, test_corpus = list(test_data['video-id']), list(test_data['sentence'])

### Get pretrained embeddings

In [36]:
import os
import numpy as np

# Load the pretrained GloVe vectors into a {word: np.ndarray} dict.
wordvectors = {}
# with open('./glove.42B.300d.txt') as f:
# The encoding is stated explicitly so the load does not depend on the
# platform's default locale encoding (GloVe text files are UTF-8).
with open('./glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        s = line.strip().split(' ')
        # Keep only well-formed rows: one token followed by 300 components.
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
    print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [47]:
import nltk
nltk.download('punkt')

# Count how often each lower-cased token occurs in the training captions.
# A plain dict is used; its insertion order (first occurrence of each token)
# determines the word indexes assigned below.
vocab, total_len = {}, 0
for cap in train_corpus:
    tokens = nltk.word_tokenize(cap.lower())
    total_len += len(tokens)
    for w in tokens:
        # dict.get replaces the former bare try/except, which would also have
        # swallowed unrelated errors (e.g. KeyboardInterrupt).
        vocab[w] = vocab.get(w, 0) + 1

print('Avg. count of words per caption:', total_len/len(train_corpus))
print('Count of unique words: ', len(vocab))

# Tokens without a pretrained vector cannot get an embedding row, so they are
# removed from the vocabulary (they are collected first because a dict must
# not change size while it is being iterated).
to_del = []
for w in vocab:
    if w not in wordvectors:
        to_del.append(w)
        print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))

for w in to_del:
    del vocab[w]

# Indexes 0 and 1 are reserved for the special tokens; real words follow in
# vocabulary order. word2idx is the exact inverse of idx2word.
idx2word = dict(enumerate(['<eos>', '<unk>'] + list(vocab)))
word2idx = {word: idx for idx, word in idx2word.items()}
EOS, UNK = 0, 1

print(len(vocab), len(idx2word), len(word2idx))

# Row i of word_embeddings is the vector of idx2word[i]. The special tokens
# reuse the pretrained vectors of the plain words 'eos' and 'unk'.
word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 23.913934007559504
Count of unique words:  4010
missing word: shoes.this
missing word: floor/bed
missing word: rinces
missing word: shelf.the
missing word: .the
missing word: selfies
missing word: up.the
missing word: papers.the
missing word: towel.the
missing word: bedroom.they
missing word: table/desk
missing word: manac
missing word: something..
missing word: begings
missing word: stiring
missing word: bedroom.the
missing word: doorwa
missing word: food.the
missing word: tidys
missing word: bed.the
missing word: bench.the
missing word: stand.the
missing word: selfie
missing word: mid-stairs
missing word: place.another
missing word: thews
missing word: cookie..
missing word: picture/video
missing word: cup.the
missing word: brooming
missing word: decines
missing word: stairway.the
missing word: dreser
missing word: pillor
missing word: bowelslk
missing word: proccess
missing word: lieing
missing word: jar.the
missing word: neaten
missing word: vauumin

### Determine POS-tagging vocabulary from train split

In [45]:
import nltk

# pos_vocab:        {tag: number of tokens that received this tag}
# pos_unique_words: {tag: {token: number of times the token received this tag}}
# Tags come from nltk.pos_tag with its default tagset.
pos_vocab = {}
pos_unique_words = {}
for cap in train_corpus:
    for token, pos in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
        # Explicit get/setdefault replaces the former nested bare try/except
        # blocks, which silently reset the counters on *any* exception.
        pos_vocab[pos] = pos_vocab.get(pos, 0) + 1
        words_for_tag = pos_unique_words.setdefault(pos, {})
        words_for_tag[token] = words_for_tag.get(token, 0) + 1

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the special tags 'eos' and 'unk'; real tags
# follow in order of first appearance. pos2idx is the inverse of idx2pos.
idx2pos = dict(enumerate(['eos', 'unk'] + list(pos_vocab)))
pos2idx = {tag: idx for idx, tag in idx2pos.items()}
EOS, UNK = 0, 1
print(len(idx2pos))

Unique words per tag:
 DT:	18
 NN:	1669
 VBG:	482
 IN:	94
 RP:	25
 .:	3
 VBZ:	454
 CD:	18
 JJ:	629
 ,:	1
 RB:	252
 CC:	11
 NNS:	582
 PRP:	18
 VBN:	207
 VBP:	346
 WDT:	5
 TO:	1
 VB:	465
 PRP$:	9
 VBD:	285
 WRB:	6
 POS:	4
 MD:	10
 EX:	1
 WP:	4
 ::	3
 RBR:	11
 FW:	7
 NNP:	8
 JJR:	18
 (:	2
 ):	2
 PDT:	5
 JJS:	8
 #:	1
 SYM:	2
 '':	1
40


### Determine Universal POS-tagging from train split

In [46]:
import nltk
nltk.download('universal_tagset')

# upos_vocab:        {universal tag: number of tokens that received this tag}
# upos_unique_words: {universal tag: {token: times the token received this tag}}
upos_vocab = {}
upos_unique_words = {}
for cap in train_corpus:
    for token, upos in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
        # Explicit get/setdefault replaces the former nested bare try/except
        # blocks, which silently reset the counters on *any* exception.
        upos_vocab[upos] = upos_vocab.get(upos, 0) + 1
        words_for_tag = upos_unique_words.setdefault(upos, {})
        words_for_tag[token] = words_for_tag.get(token, 0) + 1

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))

# Indexes 0 and 1 are reserved for the special tags 'eos' and 'unk'; real tags
# follow in order of first appearance. upos2idx is the inverse of idx2upos.
idx2upos = dict(enumerate(['eos', 'unk'] + list(upos_vocab)))
upos2idx = {tag: idx for idx, tag in idx2upos.items()}
print(len(idx2upos))

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 DET:	27
 NOUN:	2199
 VERB:	1780
 ADP:	94
 PRT:	30
 .:	13
 NUM:	18
 ADJ:	651
 ADV:	265
 CONJ:	11
 PRON:	30
 X:	9
14


### Generate ground-truth references files

In [20]:
# Write one "<video id>\t<lower-cased caption>" line per validation caption.
# The encoding is explicit so the file's bytes do not depend on the platform's
# default locale encoding.
# NOTE(review): valid_vidxs and valid_corpus have no active assignment in the
# cells above (only a commented-out one) -- confirm how they are meant to be built.
with open('../results/Charades_v1_val_references.txt', 'w', encoding='utf-8') as f:
    f.writelines(
        '{}\t{}\n'.format(vidx, cap.lower())
        for vidx, cap in zip(valid_vidxs, valid_corpus)
    )

### Generate corpus.pkl file

In [21]:
import pickle

def _encode_split(corpus):
    """Encode every caption of one split as word indexes and POS-tag indexes.

    Each caption is lower-cased and tokenized once; the same token list feeds
    both the word lookup and the POS tagger. Words missing from ``vocab`` and
    tags missing from ``pos_vocab`` map to ``UNK``; every sequence is
    terminated with ``EOS``.

    Returns:
        A ``(widxs, pidxs)`` pair of lists of lists, one inner list per caption,
        in the same order as ``corpus``.
    """
    widxs, pidxs = [], []
    for cap in corpus:
        tokens = nltk.word_tokenize(cap.lower())
        widxs.append([word2idx[w] if w in vocab else UNK for w in tokens] + [EOS])
        pidxs.append([pos2idx[tag] if tag in pos_vocab else UNK for _, tag in nltk.pos_tag(tokens)] + [EOS])
    return widxs, pidxs


def _assert_aligned(vidxs, widxs, pidxs, corpus):
    """Fail if the four per-caption lists of a split differ in length."""
    assert len(widxs) == len(vidxs) == len(pidxs) == len(corpus), f'{len(vidxs)}, {len(widxs)}, {len(pidxs)}, {len(corpus)}'


# NOTE(review): valid_* and test_* corpus/vidxs lists have no active assignment
# in the cells above (only commented-out ones) -- confirm where they come from.
train_corpus_widxs, train_corpus_pidxs = _encode_split(train_corpus)
valid_corpus_widxs, valid_corpus_pidxs = _encode_split(valid_corpus)
test_corpus_widxs, test_corpus_pidxs = _encode_split(test_corpus)

_assert_aligned(train_vidxs, train_corpus_widxs, train_corpus_pidxs, train_corpus)
_assert_aligned(valid_vidxs, valid_corpus_widxs, valid_corpus_pidxs, valid_corpus)
_assert_aligned(test_vidxs, test_corpus_widxs, test_corpus_pidxs, test_corpus)

# Each split is stored as [word indexes, video ids, POS indexes, raw captions].
# These assignments rebind train_data/valid_data, which earlier held the
# annotation DataFrames; re-running the earlier cells requires reloading them.
train_data = [train_corpus_widxs, train_vidxs, train_corpus_pidxs, train_corpus]
valid_data = [valid_corpus_widxs, valid_vidxs, valid_corpus_pidxs, valid_corpus]
test_data = [test_corpus_widxs, test_vidxs, test_corpus_pidxs, test_corpus]

with open('../../../data/LSDMC/charades_v1_corpus_pos.pkl', 'wb') as outfile:
    pickle.dump([train_data, valid_data, test_data, vocab, idx2word, word_embeddings, idx2pos], outfile)