In [1]:
import numpy as np
import pandas as pd
import re
from collections import Counter

In [2]:
data = pd.read_csv('USvideos.csv').drop_duplicates('video_id', 'first')

In [3]:
# Identify constants for vocabulary operations.
DELIMITERS = re.compile(r' |\\n|\|')
PUNCTUATION = re.compile(r'[.:;,?!\"|#()-]|^\'|\'$')
STOP_PATTERN = re.compile(r'http|www')
STOP_WORDS = set(['the', 'a', 'and', 'or', 'of'])

In [4]:
# Map categories indices to labels.
categories = {
    1: 'Film & Animation',
    2: 'Autos & Vehicles',
    10: 'Music',
    15: 'Pets & Animals',
    17: 'Sports',
    18: 'Short Movies',
    19: 'Travel & Events',
    20: 'Gaming',
    21: 'Videoblogging',
    22: 'People & Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'Howto & Style',
    27: 'Education',
    28: 'Science & Technology',
    29: 'Nonprofits & Activism',
    30: 'Movies',
    31: 'Anime/Animation',
    32: 'Action/Adventure',
    33: 'Classics',
    34: 'Comedy',
    35: 'Documentary',
    36: 'Drama',
    37: 'Family',
    38: 'Foreign',
    39: 'Horror',
    40: 'Sci-Fi/Fantasy',
    41: 'Thriller',
    42: 'Shorts',
    43: 'Shows',
    44: 'Trailers'
}

In [5]:
# Collect vocabulary by category from tags.
vocabularies = {}
n = 10000
# Consider each category in the data.
for category in data['category_id'].unique():
    vocab = {}
    # Consider tags for each video in this category.
    for tags in data.loc[data['category_id'] == category, 'tags']:
        words = re.split(DELIMITERS, tags)
        # Consider each word among these tags.
        for word in words:
            word = re.sub(PUNCTUATION, '', word).lower()
            # Skip this word if it matches the stop pattern.
            if re.search(STOP_PATTERN, word):
                continue
            # Update the vocabulary with this word.
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1
    # Keep only the n most frequent words.
    vocab = {word for word, _ in Counter(vocab).most_common(n)}
    # Add this set to the mapping of categories to vocabulary.
    vocabularies[category] = vocab

In [6]:
# Compare the intersections of vocabulary between categories.
intersections = []
keys = vocabularies.keys()
for i in keys:
    for j in keys:
        if j <= i:
            continue
        n = len(vocabularies[i] & vocabularies[j])
        intersections.append((i, j, n))
# Give the most similar and most different category pairings.
intersections.sort(key=lambda x: x[2])
print('\nMost similar categories:')
for k in range(1, 11):
    i, j, n = intersections[-k]
    print('%s AND %s: %d' % (categories[i], categories[j], n))
print('\nMost dissimilar categories:')
for k in range(10):
    i, j, n = intersections[k]
    print('%s AND %s: %d' % (categories[i], categories[j], n))


Most similar categories:
Comedy AND Entertainment: 2563
People & Blogs AND Entertainment: 2294
Entertainment AND Howto & Style: 2278
Music AND Entertainment: 2077
Film & Animation AND Entertainment: 1880
Entertainment AND News & Politics: 1751
People & Blogs AND Howto & Style: 1563
Sports AND Entertainment: 1548
People & Blogs AND Comedy: 1519
Entertainment AND Science & Technology: 1511

Most dissimilar categories:
Nonprofits & Activism AND Shows: 8
Travel & Events AND Nonprofits & Activism: 24
Pets & Animals AND Shows: 25
Travel & Events AND Shows: 25
Autos & Vehicles AND Nonprofits & Activism: 27
Autos & Vehicles AND Shows: 36
Gaming AND Shows: 36
Gaming AND Nonprofits & Activism: 38
Pets & Animals AND Nonprofits & Activism: 43
Education AND Shows: 58


In [7]:
# Show the size of the data and vocabulary in each category.
for i, vocab in vocabularies.items():
    n = data[data['category_id'] == i].shape[0]
    print('%s:' % categories[i])
    print('\tVideos: %5d (%.3f)' % (n, n / data.shape[0]))
    print('\t Words: %5d\n' % len(vocab))

People & Blogs:
	Videos:   498 (0.078)
	 Words:  4135

Entertainment:
	Videos:  1619 (0.255)
	 Words:  9552

Comedy:
	Videos:   547 (0.086)
	 Words:  4450

Science & Technology:
	Videos:   380 (0.060)
	 Words:  3507

Film & Animation:
	Videos:   318 (0.050)
	 Words:  3073

News & Politics:
	Videos:   505 (0.080)
	 Words:  3541

Sports:
	Videos:   451 (0.071)
	 Words:  3373

Music:
	Videos:   799 (0.126)
	 Words:  4306

Pets & Animals:
	Videos:   138 (0.022)
	 Words:  1460

Education:
	Videos:   250 (0.039)
	 Words:  3299

Howto & Style:
	Videos:   595 (0.094)
	 Words:  4496

Autos & Vehicles:
	Videos:    70 (0.011)
	 Words:   911

Travel & Events:
	Videos:    60 (0.009)
	 Words:   793

Gaming:
	Videos:   103 (0.016)
	 Words:  1124

Nonprofits & Activism:
	Videos:    14 (0.002)
	 Words:   215

Shows:
	Videos:     4 (0.001)
	 Words:   179

