# Generate ActivityNet Dense Corpus, with programs and captions

## Corpus file contents
0. `train_data`: captions and idxs of training video segments in format `[corpus_opidxs, vidxs, intervals, fps]`, where:
    - `corpus_opidxs` is a list of lists with the index of instructions (operations) in the vocabulary of operations
    - `vidxs` is a list of indexes of video features in the features file
    - `intervals` is a list of lists of tuples with the discretized intervals of each video
    - `fps` is a list of the frame per seconds rate used to discretize the intervals
    - `corpus_widxs` is a list of lists of lists with the index of words in the vocabulary of each caption of each video
    - `corpus_pidxs` is a list of lists of lists with the index of POS tags in the POS tagging vocabulary of each caption of each video
1. `val1_data`: same format as `train_data`.
2. `val2_data`: same format as `train_data`.
3. `programs_vocab`: in format `{'instruction': count}`.
4. `idx2op`: is the vocabulary in format `{idx: 'instruction'}`.
5. `caps_vocab`: in format `{'word': count}`.
6. `idx2word`: is the vocabulary in format `{idx: 'word'}`.
7. `word_embeddings`: are the vectors of each word. The *i*-th row is the word vector of the *i*-th word in the vocabulary.
8. `idx2pos`: is the vocabulary of POS tagging in format `{idx: 'POSTAG'}`.

In [2]:
import json
with open('../../../../data/ActivityNet/captions/train.json') as f:
    datainfo_train = json.load(f)
with open('../../../../data/ActivityNet/captions/val_1.json') as f:
    datainfo_val_1 = json.load(f)
with open('../../../../data/ActivityNet/captions/val_2.json') as f:
    datainfo_val_2 = json.load(f)
with open('../../../../data/ActivityNet/captions/train_ids.json') as f:
    datainfo_train_ids = json.load(f)
with open('../../../../data/ActivityNet/captions/val_ids.json') as f:
    datainfo_val_ids = json.load(f)
with open('../../../../data/ActivityNet/captions/test_ids.json') as f:
    datainfo_test_ids = json.load(f)

In [3]:
datainfo_train['v_nlkmPF8TBdQ']

{'duration': 174.57,
 'timestamps': [[0, 46.26],
  [46.26, 71.57],
  [72.45, 106.49],
  [106.49, 110.85],
  [110.85, 171.08]],
 'sentences': ['People gather in a restaurant, then a cooker shows pasta and ingredients while talking.',
  ' The cooker cuts and fries squid in a pot, then he adds salt and vinegar.',
  ' Then, the cooker cut tomatoes and add to the squid, also adds garlic, pepper and green vegetables.',
  ' After, the cooker adds water and covers the pot with aluminum paper.',
  ' Next, the cooker add the pasta to the squid and mix, then he and serves in a dish while talking.']}

## Analyzing intervals distribution

In [4]:
from statistics import stdev

s = list(sorted(datainfo_train.items(), key=lambda item: len(item[1]['timestamps'])))

M=s[-1]
print('max num of intervals: "{0}" {1}'.format(M[0], len(M[1]['timestamps'])))

m=s[0]
print('min num of intervals: "{0}" {1}'.format(m[0], len(m[1]['timestamps'])))

lens = [len(v[1]['timestamps']) for v in s]
total=sum(lens)
print(f'avg num of intervals: {total/len(s)}')
print(f'stdev num of intervals: {stdev(lens)}')
print(f'total num of intervals: {total}')

max num of intervals: "v_3l7quTy4c2s" 27
min num of intervals: "v_nwznKOuZM7w" 2
avg num of intervals: 3.738735138375462
stdev num of intervals: 1.8837039486692693
total num of intervals: 37421


In [5]:
max_len, max_i, max_vidx = 0, None, None
for k, data in datainfo_train.items():
    for i in data['timestamps']:
        if (i[1] - i[0]) > max_len:
            max_vidx = k
            max_len = i[1] - i[0]
            max_i = i
            
print(f'max interval: "{max_vidx}" {max_i}')

max interval: "v_YtgiDWEY_1A" [98.16, 505.93]


## Create mapping file for extracting features

In [31]:
import os
import sys
import imageio

video_fps = {}

def filter_vids(videos_folder, datainfo):
    # determine the video-ids of videos that can be processed
    correct_vids, error_vids = [], []
    for vid, _ in datainfo.items():
        path = os.path.join(videos_folder, vid+'.mp4')
        try:
            reader = imageio.get_reader(path)
            video_fps[vid] = reader.get_meta_data()['fps']
        except:
            error_vids.append(vid)
        else:
            correct_vids.append(vid)
        sys.stdout.write(f'\rerrors {len(error_vids)}/{len(error_vids)+len(correct_vids)}')
    return correct_vids, error_vids
    
# filter train set
print('*****Processing train set*****')    
train_vids, _ = filter_vids('../../../../data/ActivityNet/videos', datainfo_train)

# filter train set
print('\n\n*****Processing val_1 set*****')
val_1_vids, _ = filter_vids('../../../../data/ActivityNet/videos', datainfo_val_1)

# filter train set
print('\n\n*****Processing val_2 set*****')
val_2_vids, _ = filter_vids('../../../../data/ActivityNet/videos', datainfo_val_2)

# create list of indices for using an only h5 file of features
# train_vidxs = list(range(len(train_vids)))
# val_1_vidxs = list(range(len(train_vids), len(train_vids) + len(val_1_vids)))
# val_2_vidxs = list(range(len(train_vids) + len(val_1_vids), len(train_vids) + len(val_1_vids) + len(val_2_vids)))

# create list of videos with the order to be used for extracting features
# with open('../../../../data/ActivityNet/list_for_extraction.txt', 'w') as fo:
#     for vid in train_vids+val_1_vids+val_2_vids:
#         fo.write("%s\n" % vid)

# create list of indices for using an h5 file of features for each split
train_vidxs = list(range(len(train_vids)))
val_1_vidxs = list(range(len(val_1_vids)))
val_2_vidxs = list(range(len(val_2_vids)))

# create list of videos with the order to be used for extracting features
for split, vids in zip(['train', 'val_1', 'val_2'], [train_vids, val_1_vids, val_2_vids]):
    with open(f'../../../../data/ActivityNet/{split}_list_for_extraction.txt', 'w') as fo:
        for vid in vids:
            fo.write("%s\n" % vid)

### Update ground-truth references jeson files (for dense evaluation only)

In [55]:
new_datainfo_val_1 = {vidx: datainfo_val_1[vid] for vidx, vid in zip(val_1_vidxs, val_1_vids)}
with open('../../results/ActivityNet-Dense_val_1_ref_densecap.json', 'w') as f:
    json.dump(new_datainfo_val_1, f)
    
new_datainfo_val_2 = {vidx: datainfo_val_2[vid] for vidx, vid in zip(val_2_vidxs, val_2_vids)}
with open('../../results/ActivityNet-Dense_val_2_ref_densecap.json', 'w') as f:
    json.dump(new_datainfo_val_2, f)

## Generating programs

In [32]:
idx2op = {0:'<eos>', 1:'<unk>', 2:'skip', 3:'enqueue', 4:'generate'}
op2idx = {'<eos>':0, '<unk>':1, 'skip':2, 'enqueue':3, 'generate':4}
EOS, UNK = 0, 1

def get_program(intervals, num_chunks, complete_skips=True):
    program = []
    p,q = 0,1
    for s,e in intervals:
        while True:
            if p < s:
                program.append(op2idx['skip'])
                p+=1
                q=p+1
            if p >= s:
                if (q+1) > e:
                    program.append(op2idx['generate'])
                    break
                else:
                    program.append(op2idx['enqueue'])
                    q+=1
    if complete_skips:
        while p < num_chunks:
            program.append(op2idx['skip'])
            p+=1
    return program

In [33]:
frames_per_chunk = 16 

def get_intervals_and_programs(vids, datainfo):
    intervals, programs, fps, max_num_chunks, sum_num_chunks = [], [], [], 0, 0
    for vid in vids:
        # get the fps rate of video vidx
        vfps = video_fps[vid] 
        fps.append(vfps)

        # determine the time of each chunk, used to discretisize the video intervals
        chunk_duration = frames_per_chunk / vfps 

        # convert timestamps to chunk-indeces
        vintervals = [(ts[0]//chunk_duration, ts[1]//chunk_duration) for ts in datainfo[vid]['timestamps']]
        intervals.append(vintervals)
        
        # get the number of chunks to be processed
        num_chunks = datainfo[vid]['duration']//chunk_duration
        if num_chunks > max_num_chunks:
            max_num_chunks = num_chunks
        sum_num_chunks += num_chunks

        # get program
        programs.append(get_program(vintervals, num_chunks) + [EOS])
    return intervals, programs, fps, max_num_chunks, sum_num_chunks

train_intervals, train_programs, train_fps, m1, s1 = get_intervals_and_programs(train_vids, datainfo_train)
val_1_intervals, val_1_programs, val_1_fps, m2, s2 = get_intervals_and_programs(val_1_vids, datainfo_val_1)
val_2_intervals, val_2_programs, val_2_fps, m3, s3 = get_intervals_and_programs(val_2_vids, datainfo_val_2)

print(f'Max number of chunks in a video: {max([m1,m2,m3])}')
print(f'Sum number of chunks of all video: {sum([s1,s2,s3])}')

Max number of chunks in a video: 1414.0
Sum number of chunks of all video: 3646350.0


### Create ground-truth references files of programs (for evaluating the progemmer model only)

In [58]:
with open('../../results/ActivityNet-Dense_val_1_ref_programs.txt', 'w') as f:
    for vidx, prog in zip(val_1_vidxs, val_1_programs):
        f.write('{}\t{}\n'.format(vidx, ' '.join([idx2op[iop] for iop in prog])))
        
with open('../../results/ActivityNet-Dense_val_2_ref_programs.txt', 'w') as f:
    for vidx, prog in zip(val_2_vidxs, val_2_programs):
        f.write('{}\t{}\n'.format(vidx, ' '.join([idx2op[iop] for iop in prog])))

### Analyzing generated programs distribution

In [34]:
def print_statics(programs):
    programs_s = list(sorted(programs, key=lambda x: len(x)))
    print(f'max len: {len(programs_s[-1])}')
    print(f'min len: {len(programs_s[0])}')
    print(f'avg len: {sum([len(p) for p in programs_s])//len(programs_s)}')
    print(f'stdev len: {stdev([len(p) for p in programs_s])}')

    vocab = {}
    for op, i in op2idx.items():
        vocab[op] = sum([p.count(i) for p in programs])
        print(f'number of {op}: {vocab[op]}')
        
    return vocab
        
print('\n**train set statistics**')
programs_vocab = print_statics(train_programs)

print('\n**val-1 set statistics**')
print_statics(val_1_programs)

print('\n**val-2 set statistics**')
print_statics(val_2_programs)

print(f'\ntrain-vocab: {programs_vocab}')


**train set statistics**
max len: 2829
min len: 5
avg len: 428
stdev len: 266.8239265678711
number of <eos>: 9077
number of <unk>: 0
number of skip: 1840467
number of enqueue: 2003669
number of generate: 33844

**val-1 set statistics**
max len: 2435
min len: 5
avg len: 430
stdev len: 271.908609288794
number of <eos>: 4454
number of <unk>: 0
number of skip: 905896
number of enqueue: 991059
number of generate: 15826

**val-2 set statistics**
max len: 2317
min len: 5
avg len: 434
stdev len: 281.9635100672509
number of <eos>: 4424
number of <unk>: 0
number of skip: 899987
number of enqueue: 1004064
number of generate: 15413

train-vocab: {'<eos>': 9077, '<unk>': 0, 'skip': 1840467, 'enqueue': 2003669, 'generate': 33844}


In [35]:
# determine the maximum distance between two skips instructions
skip_max_dist, skip_max_vid, skip_max_i = 0, None, 0
for i, (vid, p) in enumerate(zip(train_vids, train_programs)):
    skips_pos = [i for i, o in enumerate(p) if o == op2idx['skip']]
    if len(skips_pos):
        prev = skips_pos[0]
        for pos in skips_pos[1:]:
            if pos - prev > skip_max_dist:
                skip_max_dist = pos - prev
                skip_max_vid = vid
                skip_max_i = i
    else:
        print(f'video w/o skips: {vid}')

print(f'\nmax distance between skips is {skip_max_dist} in the program of video "{skip_max_vid}", with {len(train_programs[skip_max_i])} instructions')
print(f'program of "{skip_max_vid}":', ''.join([idx2op[i][0] for i in train_programs[skip_max_i]]))


max distance between skips is 2644 in the program of video "v_YtgiDWEY_1A", with 2829 instructions
program of "v_YtgiDWEY_1A": eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeegssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssseeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

### Testing program generator

In [36]:
intervals = [(1,3), (2,3), (3,5), (4,8), (5,6), (6,7), (7,8)]
num_chunks = 9
print([idx2op[i][0] for i in get_program(intervals, num_chunks)])

intervals = [(3,4), (3,6)]
num_chunks = 7
print([idx2op[i][0] for i in get_program(intervals, num_chunks)])

['s', 'e', 'g', 's', 'g', 's', 'e', 'g', 's', 'e', 'e', 'e', 'g', 's', 'g', 's', 'g', 's', 'g', 's', 's']
['s', 's', 's', 'g', 'e', 'e', 'g', 's', 's', 's', 's']


## Processing Captions

### Create index of captions and ground-truth references files (for evaluating the captioning model only)

In [60]:
train_cidxs = []
current_idx = 0
for vid in train_vids:
    count = len(datainfo_train[vid]['sentences'])
    train_cidxs.append([idx for idx in range(current_idx, current_idx+count)])
    current_idx += count

val_1_cidxs = []
with open('../../results/ActivityNet-Dense_val_1_ref_captions.txt', 'w') as f:
    for vid in val_1_vids:
        caps = datainfo_val_1[vid]['sentences']
        cidxs = list(range(current_idx, current_idx+len(caps)))
        val_1_cidxs.append(cidxs)
        current_idx += len(caps)
        for cidx, cap in zip(cidxs, caps):
            f.write('{}\t{}\n'.format(cidx, cap.strip().replace('\n','').lower()))

val_2_cidxs = []
with open('../../results/ActivityNet-Dense_val_2_ref_captions.txt', 'w') as f:
    for vid in val_2_vids:
        caps = datainfo_val_1[vid]['sentences']
        cidxs = list(range(current_idx, current_idx+len(caps)))
        val_2_cidxs.append(cidxs)
        current_idx += len(caps)
        for cidx, cap in zip(cidxs, caps):
            f.write('{}\t{}\n'.format(cidx, cap.strip().replace('\n','').lower()))

### Get pretrained embeddings

In [38]:
import os
import numpy as np

wordvectors = {}
# with open('./glove.42B.300d.txt') as f:
with open('../../tools/glove.6B.300d.txt') as f:
    for line in f:
        s = line.strip().split(' ')
        if len(s) == 301:
            wordvectors[s[0]] = np.array(s[1:], dtype=float)
    print(len(wordvectors))

400000


### Determine the vocabulary from train split

In [39]:
import nltk
nltk.download('punkt')

caps_vocab, total_len, caps_count = {}, 0, 0
for vid in train_vids:
    caps = datainfo_train[vid]['sentences']
    caps_count += len(caps)
    for _, cap in enumerate(caps):
        tokens = nltk.word_tokenize(cap.lower())
        total_len += len(tokens)
        for w in tokens:
            try:
                caps_vocab[w] += 1
            except:
                caps_vocab[w] = 1

print('Avg. count of words per caption:', total_len/caps_count)
print('Count of unique words: ', len(caps_vocab))

to_del = []
for w in caps_vocab.keys():
    if not w in wordvectors:
        to_del.append(w)
        print('missing word: {}'.format(w))

print('count of missing words: ', len(to_del))
        
for w in to_del:
    del caps_vocab[w]
        
idx2word = {idx: word for idx, word in enumerate(['<eos>', '<unk>'] + list(caps_vocab.keys()))}
word2idx = {word: idx for idx, word in enumerate(['<eos>', '<unk>'] + list(caps_vocab.keys()))}
EOS, UNK = 0, 1

print(len(caps_vocab), len(idx2word), len(word2idx))

word_embeddings = np.zeros((len(idx2word), 300))
for idx, word in idx2word.items():
    if idx == EOS:
        word_embeddings[idx] = wordvectors['eos']
    elif idx == UNK:
        word_embeddings[idx] = wordvectors['unk']
    else:
        word_embeddings[idx] = wordvectors[word]

[nltk_data] Downloading package punkt to /home/jeperez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Avg. count of words per caption: 14.720659496513415
Count of unique words:  10167
missing word: hulte
missing word: intertubes
missing word: javeline
missing word: sideward
missing word: jump-stilts
missing word: woodfire
missing word: unscrews
missing word: leathe
missing word: gymasts
missing word: voleyball
missing word: liquir
missing word: sepperates
missing word: eliptical
missing word: rubic
missing word: canoesport
missing word: intertube
missing word: cheer-leading
missing word: kufiyyas
missing word: sanitizes
missing word: wearinf
missing word: aiter
missing word: unhooks
missing word: thorougly
missing word: absorbant
missing word: parasails
missing word: plyas
missing word: wreslers
missing word: sizzors
missing word: gargles
missing word: zucky
missing word: -end-
missing word: unboxing
missing word: rubix
missing word: gabs
missing word: themiddle
missing word: garnishments
missing word: whie
missing word: trakc
missing word: forearm-mounted
missing word: re-equips
missi

### Determine POS-tagging vocabulary from train split

In [40]:
import nltk

pos_vocab, pos_unique_words = {}, {}
for vid in train_vids:
    caps = datainfo_train[vid]['sentences']
    for cap in caps:
        for tag in nltk.pos_tag(nltk.word_tokenize(cap.lower())):
            try:
                pos_vocab[tag[1]] += 1
                try: 
                    pos_unique_words[tag[1]][tag[0]] += 1
                except:
                    pos_unique_words[tag[1]][tag[0]] = 1
            except:
                pos_vocab[tag[1]] = 1
                pos_unique_words[tag[1]] = {tag[0]: 1}

print('Unique words per tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in pos_unique_words.items()]))
            
idx2pos = {idx: word for idx, word in enumerate(['eos', 'unk'] + list(pos_vocab.keys()))}
pos2idx = {word: idx for idx, word in enumerate(['eos', 'unk'] + list(pos_vocab.keys()))}
EOS, UNK = 0, 1

Unique words per tag:
 DT:	25
 JJ:	2063
 NN:	4672
 VBZ:	1045
 VBN:	642
 VBG:	968
 IN:	146
 CC:	15
 PRP$:	7
 .:	3
 NNS:	2056
 PRP:	21
 VBP:	693
 TO:	1
 POS:	3
 VB:	973
 CD:	113
 RB:	470
 RP:	28
 ,:	1
 EX:	6
 VBD:	477
 WRB:	7
 WDT:	7
 JJS:	18
 JJR:	62
 WP:	4
 RBR:	20
 RBS:	3
 '':	2
 MD:	15
 ``:	2
 PDT:	13
 WP$:	1
 (:	1
 ):	1
 ::	3
 NNP:	11
 FW:	18
 #:	1
 SYM:	1
 $:	1
 UH:	1


### Determine Universal POS-tagging from train split

In [41]:
import nltk
nltk.download('universal_tagset')

upos_vocab, upos_unique_words = {}, {}
for vid in train_vids:
    caps = datainfo_train[vid]['sentences']
    for cap in caps:
        for tag in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset='universal'):
            try:
                upos_vocab[tag[1]] += 1
                try: 
                    upos_unique_words[tag[1]][tag[0]] += 1
                except:
                    upos_unique_words[tag[1]][tag[0]] = 1
            except:
                upos_vocab[tag[1]] = 1
                upos_unique_words[tag[1]] = {tag[0]: 1}

print('Unique words per universal tag:')
print('\n'.join([f' {k}:\t{len(words)}' for k, words in upos_unique_words.items()]))
            
idx2upos = {idx: word for idx, word in enumerate(['eos', 'unk'] + list(upos_vocab.keys()))}
upos2idx = {word: idx for idx, word in enumerate(['eos', 'unk'] + list(upos_vocab.keys()))}

[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/jeperez/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


Unique words per universal tag:
 DET:	45
 ADJ:	2134
 NOUN:	6541
 VERB:	3749
 ADP:	146
 CONJ:	15
 PRON:	32
 .:	15
 PRT:	32
 NUM:	113
 ADV:	493
 X:	20


## Generate corpus .pkl file

In [42]:
import pickle

train_corpus_widxs = [[[word2idx[w] if w in caps_vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in datainfo_train[vid]['sentences']] for vid in train_vids]
val_1_corpus_widxs = [[[word2idx[w] if w in caps_vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in datainfo_val_1[vid]['sentences']] for vid in val_1_vids]
val_2_corpus_widxs = [[[word2idx[w] if w in caps_vocab else UNK for w in nltk.word_tokenize(cap.lower())] + [EOS] for cap in datainfo_val_2[vid]['sentences']] for vid in val_2_vids]

train_corpus_pidxs = [[[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in datainfo_train[vid]['sentences']] for vid in train_vids]
val_1_corpus_pidxs = [[[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in datainfo_val_1[vid]['sentences']] for vid in val_1_vids]
val_2_corpus_pidxs = [[[pos2idx[w[1]] if w[1] in pos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()))] + [EOS] for cap in datainfo_val_2[vid]['sentences']] for vid in val_2_vids]

train_corpus_upidxs = [[[upos2idx[w[1]] if w[1] in upos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset="universal")] + [EOS] for cap in datainfo_train[vid]['sentences']] for vid in train_vids]
val_1_corpus_upidxs = [[[upos2idx[w[1]] if w[1] in upos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset="universal")] + [EOS] for cap in datainfo_val_1[vid]['sentences']] for vid in val_1_vids]
val_2_corpus_upidxs = [[[upos2idx[w[1]] if w[1] in upos_vocab else UNK for w in nltk.pos_tag(nltk.word_tokenize(cap.lower()), tagset="universal")] + [EOS] for cap in datainfo_val_2[vid]['sentences']] for vid in val_2_vids]

train_data = [train_vidxs, train_cidxs, train_intervals, train_fps, train_programs, train_corpus_widxs, train_corpus_pidxs, train_corpus_upidxs]
val_1_data = [val_1_vidxs, val_1_cidxs, val_1_intervals, val_1_fps, val_1_programs, val_1_corpus_widxs, val_1_corpus_pidxs, val_1_corpus_upidxs]
val_2_data = [val_2_vidxs, val_2_cidxs, val_2_intervals, val_2_fps, val_2_programs, val_2_corpus_widxs, val_2_corpus_pidxs, val_2_corpus_upidxs]

with open('../../../../data/ActivityNet/activitynet_dense_corpus.pkl', 'wb') as outfile:
    pickle.dump([train_data, val_1_data, val_2_data, programs_vocab, idx2op, caps_vocab, idx2word, word_embeddings, idx2pos, idx2upos], outfile)

In [80]:
len(train_vidxs) + len(val_1_vidxs) + len(val_2_vidxs)

17955