In [15]:
import json
import string
import nltk
import pickle
import numpy as np

from tqdm import tqdm
from collections import defaultdict
from config import Constants

# Reading The Data

* `base_path` is the path to the MSR-VTT annotation file (`train_val_videodatainfo.json`)
* The data is read with Python's built-in `json` library

In [16]:
# Path to the annotation JSON file; the next cells parse it and read its
# 'sentences' and 'videos' entries.
base_path = "MSRVTT/raw-data/train_val_videodatainfo.json"

In [17]:
# Use a context manager so the file handle is closed as soon as parsing is
# done (a bare open() leaves that to the garbage collector), and pin the
# encoding so the result does not depend on the platform's default.
with open(base_path, 'r', encoding='utf-8') as annotation_file:
    json_data = json.load(annotation_file)

In [18]:
# Pull the two top-level tables out of the annotation data in one statement:
# the caption records and the per-video metadata records.
sentences, videos = (json_data[table] for table in ('sentences', 'videos'))

In [19]:
# NOTE: this value limits how many videos the caption and category cells
# further down keep, so change it together with the split sizes.

num_videos = 1000 # The number of videos you want to use

## Storing The Data

1. Collect the video ids for the train, validation, and test splits
2. Collect the captions for those videos

In [67]:
num_train = 850
num_val = 50
num_test = 100

# The caption and category cells keep `num_videos` videos, so the three
# partitions have to add up to exactly that many. Fail loudly here instead
# of silently producing videos that belong to no partition.
assert num_train + num_val + num_test == num_videos, (
    "num_train + num_val + num_test must equal num_videos"
)

# Partitions are consecutive slices of `videos`, in file order:
# train first, then validate, then test.
train_end = num_train
val_end = train_end + num_val
test_end = val_end + num_test

split = {
    'train': [int(v['id']) for v in videos[:train_end]],
    'validate': [int(v['id']) for v in videos[train_end:val_end]],
    'test': [int(v['id']) for v in videos[val_end:test_end]],
}

In [82]:
# Report the size and the first/last five ids of every partition. The label
# shown for the 'validate' partition is "validation".
for position, (label, key) in enumerate(
    [('train', 'train'), ('validation', 'validate'), ('test', 'test')]
):
    if position > 0:
        print()  # blank line between partitions
    ids = split[key]
    print("This is the len of " + label + " split: " + str(len(ids)))
    print("With the first index :" + str(ids[:5]) + " and the last index :" + str(ids[-5:]))

This is the len of train split: 850
With the first index :[0, 1, 2, 3, 4] and the last index :[845, 846, 847, 848, 849]

This is the len of validation split: 50
With the first index :[850, 851, 852, 853, 854] and the last index :[895, 896, 897, 898, 899]

This is the len of test split: 100
With the first index :[900, 901, 902, 903, 904] and the last index :[995, 996, 997, 998, 999]


## Get Captions

In [94]:
# Three independent video-id -> list accumulators:
# every caption, training captions only, and reference records.
raw_caps_all, raw_caps_train, references = (defaultdict(list) for _ in range(3))

In [96]:
# Start from empty containers so that re-running this cell does not append
# every caption a second time.
raw_caps_all = defaultdict(list)
raw_caps_train = defaultdict(list)
references = defaultdict(list)

# Set membership is O(1); looking the id up in the list itself would scan the
# whole training partition once per caption.
train_ids = set(split['train'])

for item in tqdm(sentences):
    vid = item['video_id']
    vid_num = int(vid[5:])  # numeric part of the id, after the 5-character prefix
    if vid_num >= num_videos:
        continue

    # Lower-cased whitespace-separated tokens. A token is dropped when it
    # occurs inside string.punctuation (this is a substring test, so it
    # removes stand-alone punctuation marks).
    tokens = [token.lower() for token in item['caption'].split() if token not in string.punctuation]

    raw_caps_all[vid].append(tokens)

    if vid_num in train_ids:
        raw_caps_train[vid].append(tokens)

    # cap_id is the position of this caption among the video's references.
    references[vid].append({
        'image_id': vid,
        'cap_id': len(references[vid]),
        'caption': ' '.join(tokens)
    })

100%|██████████████████████████████████████████████████████████████████████| 140200/140200 [00:00<00:00, 293120.54it/s]


In [85]:
itoc = {}
split_category = {'train': defaultdict(list), 'validate': defaultdict(list), 'test': defaultdict(list)}

# Partition name for every video id, taken from the custom `split` built in
# this notebook. The 'split' field stored with each video is the annotation
# file's own label and need not agree with the custom partitions, so using it
# would file videos under the wrong partition.
split_of = {vid_id: name for name, ids in split.items() for vid_id in ids}

# Only the first `num_videos` entries are used; slicing avoids walking the
# rest of the list.
for item in videos[:num_videos]:
    vid_id = int(item['id'])
    itoc[item['id']] = item['category']
    # Fall back to the annotated label for a video the custom split does not cover.
    split_name = split_of.get(vid_id, item['split'])
    split_category[split_name][int(item['category'])].append(vid_id)

In [86]:
# Bundle the intermediate artefacts under the keys the next cell looks up.
results = dict(
    split=split,
    raw_caps_train=raw_caps_train,
    raw_caps_all=raw_caps_all,
    references=references,
    itoc=itoc,
    split_category=split_category,
)

In [87]:
# Mandatory entries: a missing key raises KeyError.
split, raw_caps_train, raw_caps_all = (
    results[key] for key in ('split', 'raw_caps_train', 'raw_caps_all')
)

# Optional entries: a missing key yields None.
references, vid2id, itoc, split_category = (
    results.get(key) for key in ('references', 'vid2id', 'itoc', 'split_category')
)

In [88]:
def build_vocab(train_vid2caps, count_thr, sort_vocab=False):
    """Build the vocabulary from tokenised training captions.

    Args:
        train_vid2caps: mapping from video id to a list of captions, where
            each caption is a list of word tokens.
        count_thr: words occurring `count_thr` times or fewer are dropped.
        sort_vocab: if True, order the vocabulary by descending frequency
            (ties keep first-seen order); otherwise keep first-seen order.

    Returns:
        List of the words that occur more than `count_thr` times. Empty when
        there are no captions.
    """
    # count up the number of words
    counts = {}
    for caps in train_vid2caps.values():
        for cap in caps:
            for w in cap:
                counts[w] = counts.get(w, 0) + 1

    bad_words = [w for w, n in counts.items() if n <= count_thr]
    bad_count = sum(counts[w] for w in bad_words)
    total_words = sum(counts.values())

    def percent(part, whole):
        # With no captions both totals are zero; report 0% instead of
        # raising ZeroDivisionError.
        return part * 100.0 / whole if whole else 0.0

    print('- The number of bad words: %d/%d = %.2f%%' %
          (len(bad_words), len(counts), percent(len(bad_words), len(counts))))
    print('- The number of the vocabulary: %d' % (len(counts) - len(bad_words)))
    print('- The number of UNKs: %d/%d = %.2f%%' %
          (bad_count, total_words, percent(bad_count, total_words)))

    candidate_vocab = [(w, n) for w, n in counts.items() if n > count_thr]
    if sort_vocab:
        print('- Sort the vocabulary by the frequency of words, larger the first')
        # sorted() is stable, so equally frequent words keep first-seen order.
        candidate_vocab = sorted(candidate_vocab, key=lambda x: -x[1])

    vocab = [w for w, _ in candidate_vocab]

    assert len(vocab) == len(counts) - len(bad_words)

    print('- Top 100 words:')
    print(vocab[:100])

    return vocab

In [89]:
# Vocabulary settings passed to build_vocab: words occurring this many times
# or fewer are dropped, and the kept words are ordered by frequency.
word_count_threshold, sort_vocab = 2, True

In [90]:
# The vocabulary is built from the training captions only.
vocab = build_vocab(
    raw_caps_train,
    count_thr=word_count_threshold,
    sort_vocab=sort_vocab,
)

- The number of bad words: 4723/8204 = 57.57%
- The number of the vocabulary: 3481
- The number of UNKs: 5931/154695 = 3.83%
- Sort the vocabulary by the frequency of words, larger the first
- Top 100 words:
['a', 'is', 'the', 'in', 'man', 'and', 'of', 'on', 'to', 'woman', 'are', 'with', 'about', 'talking', 'video', 'person', 'playing', 'game', 'people', 'an', 'two', 'girl', 'some', 'for', 'men', 'from', 'car', 'his', 'while', 'at', 'showing', 'show', 'cartoon', 'her', 'being', 'singing', 'someone', 'how', 'group', 'talks', 'stage', 'it', 'there', 'shown', 'movie', 'other', 'black', 'young', 's', 'food', 'dancing', 'women', 'music', 'guy', 'by', 'something', 'into', 'cooking', 'walking', 'white', 'song', 'screen', 'around', 'down', 'another', 'boy', 'up', 'wearing', 'clip', 'as', 'minecraft', 'giving', 'kitchen', 'lady', 'sitting', 'speaking', 'scene', 'that', 'their', 'camera', 'one', 'shows', 'out', 'each', 'plays', 'red', 'then', 'tv', 'he', 'animated', 'through', 'characters', 'dif

In [91]:
def get_captions_and_pos_tags(raw_caps_all, vocab):
    """Encode tokenised captions and their POS tags as id sequences.

    Args:
        raw_caps_all: mapping from video id to a list of captions, each a
            list of word tokens.
        vocab: list of in-vocabulary words; the word at position i gets id i + 6.

    Returns:
        (itow, captions, itop, pos_tags): id -> word table, video id -> list
        of word-id sequences, id -> POS-tag table, and video id -> list of
        tag-id sequences. Every sequence starts with Constants.BOS and ends
        with Constants.EOS.
    """
    # Special tokens as (id, word) pairs, in the order they are registered.
    specials = [
        (Constants.PAD, Constants.PAD_WORD),
        (Constants.UNK, Constants.UNK_WORD),
        (Constants.BOS, Constants.BOS_WORD),
        (Constants.EOS, Constants.EOS_WORD),
        (Constants.MASK, Constants.MASK_WORD),
        (Constants.VIS, Constants.VIS_WORD),
    ]

    # Ordinary words first, numbered from 6; special tokens are written after
    # them (presumably they occupy ids 0-5 -- confirm against Constants).
    itow = {}
    for position, word in enumerate(vocab):
        itow[position + 6] = word
    for special_id, special_word in specials:
        itow[special_id] = special_word

    # Inverse table: word -> id.
    wtoi = {}
    for index, word in itow.items():
        wtoi[word] = index

    # POS-tag table starts with the special tokens; tags seen in the data are
    # numbered from 6 in order of first appearance.
    ptoi = {special_word: special_id for special_id, special_word in specials}
    next_tag_id = 6

    captions = defaultdict(list)
    pos_tags = defaultdict(list)
    for vid, caps in tqdm(raw_caps_all.items()):
        for cap in caps:
            tagged = nltk.pos_tag(cap)

            word_ids = [Constants.BOS]
            tag_ids = [Constants.BOS]

            for word, tagged_pair in zip(cap, tagged):
                assert tagged_pair[0] == word
                # The raw tag is mapped before the vocabulary check, so an
                # unmapped tag raises KeyError even for unknown words.
                tag = Constants.pos_tag_mapping[tagged_pair[1]]

                if word not in wtoi:
                    # Out-of-vocabulary word: both sequences get UNK.
                    word_ids.append(Constants.UNK)
                    tag_ids.append(Constants.UNK)
                    continue

                if tag not in ptoi:
                    ptoi[tag] = next_tag_id
                    next_tag_id += 1
                word_ids.append(wtoi[word])
                tag_ids.append(ptoi[tag])

            word_ids.append(Constants.EOS)
            tag_ids.append(Constants.EOS)

            captions[vid].append(word_ids)
            pos_tags[vid].append(tag_ids)

    itop = {}
    for tag, index in ptoi.items():
        itop[index] = tag
    return itow, captions, itop, pos_tags

In [92]:
# Encode every kept caption (not only the training ones) with the vocabulary.
itow, captions, itop, pos_tags = get_captions_and_pos_tags(
    raw_caps_all=raw_caps_all,
    vocab=vocab,
)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:22<00:00, 44.30it/s]


In [97]:
def get_length_info(captions, max_length=50):
    """Build a per-video histogram of caption lengths.

    Args:
        captions: mapping from video id to a list of encoded captions, each
            a sequence that includes the <bos> and <eos> tokens.
        max_length: number of histogram buckets; captions whose length
            (without <bos>/<eos>) is `max_length` or more are not counted.

    Returns:
        Dict mapping each video id to a list of `max_length` counts, where
        entry i is the number of captions of length i.
    """
    length_info = {}

    for vid, caps in captions.items():
        histogram = [0] * max_length
        for cap in caps:
            length = len(cap) - 2 # exclude <bos>, <eos>
            # Skip captions too long for the histogram, and malformed ones
            # shorter than the two boundary tokens: their negative length
            # would otherwise index the list from the end and be counted in
            # the wrong bucket.
            if 0 <= length < max_length:
                histogram[length] += 1
        length_info[vid] = histogram

    return length_info

In [98]:
# Caption-length histogram for every video.
length_info = get_length_info(captions=captions)

In [99]:
# Metadata stored alongside the encoded captions.
info = dict(
    split=split,                    # {'train': [0, 1, 2, ...], 'validate': [...], 'test': [...]}
    vid2id=vid2id,                  # None when `results` has no 'vid2id' entry
    split_category=split_category,  # partition -> category -> video ids
    itoc=itoc,                      # video id to category
    itow=itow,                      # id to word
    itop=itop,                      # id to POS tag
    length_info=length_info,        # video id to caption-length histogram
)

In [100]:
# Context managers flush and close each file even if pickling raises; a bare
# open() passed straight to pickle.dump never closes its handle explicitly.
with open("MSRVTT/info_corpus.pkl", 'wb') as corpus_file:
    pickle.dump({
                'info': info,
                'captions': captions,
                'pos_tags': pos_tags,
                },
                corpus_file
    )

# The reference captions go to their own file.
if references is not None:
    with open("MSRVTT/refs.pkl", 'wb') as refs_file:
        pickle.dump(
            references,
            refs_file
        )

# Check The Index

In [101]:
# Read the corpus back to verify what was written. The context manager closes
# the file once it is loaded. NOTE: pickle.load can execute arbitrary code,
# so only load files produced by this notebook.
with open("MSRVTT/info_corpus.pkl", 'rb') as corpus_file:
    data = pickle.load(corpus_file)

In [102]:
# Top-level entries of the saved corpus.
captions, pos_tags, info = data['captions'], data['pos_tags'], data['info']

# Required metadata: a missing key raises KeyError.
itow = info['itow']
length_info = info['length_info']
splits = info['split']

# Optional metadata: a missing key yields None.
itoc, itop, split_category = (
    info.get(key) for key in ('itoc', 'itop', 'split_category')
)

In [112]:
# 'video1000' lies outside the subset kept above (numeric ids below
# num_videos), so no captions are expected. `captions` was built as a
# defaultdict, so indexing it would insert an empty entry as a side effect;
# .get() shows the same result without modifying the data.
captions.get('video1000', [])

[]