In [1]:
import json
import string
import nltk
import pickle
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from config import Constants

# Reading The Data

* base_path is the base path for all the code, datasets, and models
* the data is read using json library from python

In [2]:
base_path = "MSRVTT/raw-data/train_val_videodatainfo.json"

In [3]:
json_data = json.load(open(base_path, 'r'))

In [4]:
sentences = json_data['sentences']
videos = json_data['videos']

## Storing The Data

1. Get all the needed video id for training
2. Get all the needed captions

In [5]:
split = {'train': [], 'validate': [], 'test': []}

In [6]:
count = 0

for v in videos:
    if count < 849:
        split['train'].append(int(v['id']))
        count += 1
    elif count > 850 and count < 899:
        split['validate'].append(int(v['id']))
        count += 1
    elif count > 900 and count < 999:
        split['test'].append(int(v['id']))
        count += 1
    else:
        break

## Get Captions

In [7]:
raw_caps_all = defaultdict(list)
raw_caps_train = defaultdict(list)
references = defaultdict(list)

In [21]:
for item in tqdm(sentences):
    if int(item['video_id'][5:]) < 1000:
        vid = item['video_id']
        tokens = [token.lower() for token in item['caption'].split() if token not in string.punctuation]

        raw_caps_all[vid].append(tokens)

        if int(vid[5:]) in split['train']:
            raw_caps_train[vid].append(tokens)

        references[vid].append({
            'image_id': vid, 
            'cap_id': len(references[vid]), 
            'caption': ' '.join(tokens)
        })

100%|█████████████████████████████████████████████████████████████████████| 140200/140200 [00:00<00:00, 1353898.45it/s]


In [25]:
itoc = {}
split_category = {'train': defaultdict(list), 'validate': defaultdict(list), 'test': defaultdict(list)}
count = 0

for item in videos:
    if count < 100:
        itoc[item['id']] = item['category']
        split_category[item['split']][int(item["category"])].append(int(item['id']))
        count += 1

In [26]:
results = {
    'split': split, 
    'raw_caps_train': raw_caps_train, 
    'raw_caps_all': raw_caps_all, 
    'references': references,
    'itoc': itoc,
    'split_category': split_category
}

In [27]:
results

{'split': {'train': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   75,
   76,
   77,
   78,
   79],
  'validate': [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
  'test': [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]},
 'raw_caps_train': defaultdict(list,
             {'video24': [['a', 'man', 'is', 'driving', 'a', 'car'],
               ['a',
                'clip',
                'with',
                'a',
                'heavy',
                'man',
                'talking',
                'to',
                'the',
      

In [34]:
split = results['split']
raw_caps_train = results['raw_caps_train']
raw_caps_all = results['raw_caps_all']
references = results.get('references', None)

vid2id = results.get('vid2id', None)
itoc = results.get('itoc', None)
split_category = results.get('split_category', None)

In [35]:
def build_vocab(train_vid2caps, count_thr, sort_vocab=False):
    # count up the number of words
    counts = {}
    for vid, caps in train_vid2caps.items():
        for cap in caps:
            for w in cap:
                counts[w] = counts.get(w, 0) + 1

    bad_words = [w for w, n in counts.items() if n <= count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    total_words = sum(counts.values())

    print('- The number of bad words: %d/%d = %.2f%%' %
          (len(bad_words), len(counts), len(bad_words) * 100.0 / len(counts)))
    print('- The number of the vocabulary: %d' % (len(counts) - len(bad_words)))
    print('- The number of UNKs: %d/%d = %.2f%%' %
          (bad_count, total_words, bad_count * 100.0 / total_words))

    candidate_vocab = [(w, n) for w, n in counts.items() if n > count_thr]
    if sort_vocab:
        print('- Sort the vocabulary by the frequency of words, larger the first')
        candidate_vocab = sorted(candidate_vocab, key=lambda x: -x[1])

    vocab = [w for w, _ in candidate_vocab]

    assert len(vocab) == len(counts) - len(bad_words)

    print('- Top 100 words:')
    print(vocab[:100])
    
    return vocab

In [36]:
word_count_threshold = 2
sort_vocab = True

In [37]:
vocab = build_vocab(raw_caps_train, word_count_threshold, sort_vocab)

- The number of bad words: 271/2185 = 12.40%
- The number of the vocabulary: 1914
- The number of UNKs: 315/46499 = 0.68%
- Sort the vocabulary by the frequency of words, larger the first
- Top 100 words:
['a', 'is', 'man', 'the', 'and', 'in', 'on', 'of', 'to', 'woman', 'are', 'talking', 'with', 'about', 'girl', 'two', 'person', 'an', 'playing', 'video', 'for', 'singing', 'people', 'game', 'men', 'from', 'show', 'her', 'how', 'his', 'while', 'some', 'showing', 'car', 'at', 'stage', 'song', 'young', 'cat', 's', 'there', 'talks', 'screen', 'other', 'up', 'speaking', 'music', 'band', 'shown', 'shows', 'cartoon', 'guy', 'being', 'sitting', 'it', 'women', 'each', 'movie', 'soccer', 'tv', 'clip', 'by', 'computer', 'food', 'group', 'camera', 'football', 'boy', 'their', 'as', 'another', 'giving', 'that', 'something', 'kitchen', 'makeup', 'someone', 'front', 'dog', 'into', 'one', 'lady', 'black', 'walking', 'off', 'this', 'white', 'player', 'then', 'wearing', 'around', 'plays', 'play', 'she', '

In [38]:
def get_captions_and_pos_tags(raw_caps_all, vocab):
    itow = {i + 6: w for i, w in enumerate(vocab)}
    itow[Constants.PAD] = Constants.PAD_WORD
    itow[Constants.UNK] = Constants.UNK_WORD
    itow[Constants.BOS] = Constants.BOS_WORD
    itow[Constants.EOS] = Constants.EOS_WORD
    itow[Constants.MASK] = Constants.MASK_WORD
    itow[Constants.VIS] = Constants.VIS_WORD

    wtoi = {w: i for i, w in itow.items()}  # inverse table

    ptoi = {}
    ptoi[Constants.PAD_WORD] = Constants.PAD
    ptoi[Constants.UNK_WORD] = Constants.UNK
    ptoi[Constants.BOS_WORD] = Constants.BOS
    ptoi[Constants.EOS_WORD] = Constants.EOS
    ptoi[Constants.MASK_WORD] = Constants.MASK
    ptoi[Constants.VIS_WORD] = Constants.VIS
    tag_start_i = 6

    captions = defaultdict(list)
    pos_tags = defaultdict(list)
    for vid, caps in tqdm(raw_caps_all.items()):
        for cap in caps:
            tag_res = nltk.pos_tag(cap)

            caption_id = [Constants.BOS]
            tagging_id = [Constants.BOS]

            for w, t in zip(cap, tag_res):
                assert t[0] == w
                tag = Constants.pos_tag_mapping[t[1]]

                if w in wtoi.keys():
                    caption_id += [wtoi[w]]
                    if tag not in ptoi.keys():
                        ptoi[tag] = tag_start_i
                        tag_start_i += 1
                    tagging_id += [ptoi[tag]]
                else:
                    caption_id += [Constants.UNK]
                    tagging_id += [Constants.UNK]

            caption_id += [Constants.EOS]
            tagging_id += [Constants.EOS]

            captions[vid].append(caption_id)
            pos_tags[vid].append(tagging_id)

    itop = {i: t for t, i in ptoi.items()}
    return itow, captions, itop, pos_tags

In [39]:
itow, captions, itop, pos_tags = get_captions_and_pos_tags(raw_caps_all, vocab)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 19.45it/s]


In [40]:
def get_length_info(captions):
    length_info = {}
    max_length = 50

    for vid, caps in captions.items():
        length_info[vid] = [0] * max_length
        for cap in caps:
            length = len(cap) - 2 # exclude <bos>, <eos>
            if length >= max_length:
                continue
            length_info[vid][length] += 1

    return length_info

In [41]:
length_info = get_length_info(captions)

In [42]:
info = {
        'split': split,                # {'train': [0, 1, 2, ...], 'validate': [...], 'test': [...]}
        'vid2id': vid2id,
        'split_category': split_category,
        'itoc': itoc,
        'itow': itow,                       # id to word
        'itop': itop,                       # id to POS tag
        'length_info': length_info,         # id to length info
    }

In [43]:
pickle.dump({
            'info': info,
            'captions': captions,
            'pos_tags': pos_tags,
            }, 
            open("MSRVTT/info_corpus.pkl", 'wb')
)

if references is not None:
    pickle.dump(
        references,
        open("MSRVTT/refs.pkl", 'wb')
    )