In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter
from functionized_code.data_pipeline import get_labels_and_corpus, get_vocab
from run_pipeline import LABEL_TO_IDX, get_data, get_column_indices

# Reverse lookup of LABEL_TO_IDX: category index -> category name.
IDX_TO_LABEL = {idx: label for label, idx in LABEL_TO_IDX.items()}

In [2]:
# Vocabulary settings. Both values are handed straight to get_vocab()
# (as its second and third positional arguments) in the cells below.
MAX_VOCAB = 25000
NGRAMS = 1

# Column holding the category label of each video.
col_labels = 'category_id'
# Text columns that make up the corpus. 'caption' is deliberately last so
# the caption-free analysis can take just the leading columns.
col_corpus = [
    'title',
    'tags',
    'description',
    'caption',
]

### Vocabulary without captions

In [3]:
# Load the video data, leaving the captions out.
data_wo_captions = get_data(captions=False)

# Resolve the label column and the corpus columns. Only the first three
# corpus columns (title, tags, description) are requested; 'caption' is
# excluded here.
idx_labels, idx_corpus = get_column_indices(
    data_wo_captions,
    col_labels,
    col_corpus[:3],
)

# Pull the labels and the corpus out of the data using those indices.
labels_wo_captions, corpus_wo_captions = get_labels_and_corpus(
    data_wo_captions,
    idx_labels,
    idx_corpus,
    silent=True,
)

In [4]:
print('Intersection of vocabularies without captions')

# Reference category: build its vocabulary once, as a set, so it can be
# intersected against every other category's vocabulary below.
i = LABEL_TO_IDX['News & Politics']
is_target = labels_wo_captions == i
target_words, _ = get_vocab(corpus_wo_captions[is_target], MAX_VOCAB, NGRAMS)
target_vocab = set(target_words)
target_len = len(target_vocab)

# Walk the remaining categories (np.unique yields the distinct labels in
# sorted order); the reference category itself is filtered out up front.
for j in (label for label in np.unique(labels_wo_captions) if label != i):
    # Vocabulary of the category being compared against.
    other_words, _ = get_vocab(
        corpus_wo_captions[labels_wo_captions == j], MAX_VOCAB, NGRAMS)
    other_vocab = set(other_words)
    other_len = len(other_vocab)
    # Number of vocabulary entries the two categories share.
    n = len(target_vocab.intersection(other_vocab))
    # One line per pair: both names, both vocabulary sizes, overlap size.
    print('%s (%d) AND %s (%d): %d'
          % (IDX_TO_LABEL[i], target_len, IDX_TO_LABEL[j], other_len, n))

Intersection of vocabularies without captions


### Vocabulary with captions

In [None]:
# Read the video data with captions.
data_w_captions = get_data(captions=True)
# Get the labels and corpus.
# The column indices must be resolved against the frame loaded in this cell
# (the one that includes captions), not the caption-free frame from the
# earlier section; all four corpus columns, including 'caption', are used.
idx_labels2, idx_corpus2 = get_column_indices(data_w_captions, col_labels, col_corpus)
# Unpack into two distinct names: the first result is the labels, the second
# is the corpus that the following cell reads as `corpus_w_captions`.
labels_w_captions, corpus_w_captions = get_labels_and_corpus(
    data_w_captions, idx_labels2, idx_corpus2, silent=True)

In [None]:
# Header for the report; this cell analyses the data that includes captions.
print('Intersection of vocabularies with captions')
# Get vocabulary for News & Politics.
i = LABEL_TO_IDX['News & Politics']
target_corpus = corpus_w_captions[labels_w_captions == i]
target_vocab, _ = get_vocab(target_corpus, MAX_VOCAB, NGRAMS)
# Convert to a set so intersections with the other vocabularies are cheap.
target_vocab = set(target_vocab)
target_len = len(target_vocab)
# Compare to intersections of vocabulary with other categories.
for j in np.unique(labels_w_captions):
    # Skip self.
    if j == i:
        continue
    # Get vocabulary for this other category.
    other_corpus = corpus_w_captions[labels_w_captions == j]
    other_vocab, _ = get_vocab(other_corpus, MAX_VOCAB, NGRAMS)
    other_vocab = set(other_vocab)
    other_len = len(other_vocab)
    # Count the number of words at the intersection.
    n = len(target_vocab & other_vocab)
    # Report: both category names, their vocabulary sizes, and the overlap.
    values = (IDX_TO_LABEL[i], target_len, IDX_TO_LABEL[j], other_len, n)
    print('%s (%d) AND %s (%d): %d' % values)