In [123]:
import pandas as pd
import json
import string
import re
import nltk
from collections import defaultdict
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/wardhuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Loading files into dataframes

In [13]:
us_videos = pd.read_csv('./USvideos.csv')
us_videos.head(1)

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...


In [183]:
us_videos.count()

video_id                  40949
trending_date             40949
title                     40949
channel_title             40949
category_id               40949
publish_time              40949
tags                      40949
views                     40949
likes                     40949
dislikes                  40949
comment_count             40949
thumbnail_link            40949
comments_disabled         40949
ratings_disabled          40949
video_error_or_removed    40949
description               40379
dtype: int64

Load the json categories into a python dictionary

In [21]:
cat_str = "" # the json string

with open('./US_category_id.json', 'r') as f:
    cat_str = f.read()
    
# python dictionary
cat_obj = json.loads(cat_str)

# array of dictionaries
cats_arr = cat_obj['items']

# dataframe
us_cats = pd.json_normalize(cats_arr)

us_cats.head(1)

Unnamed: 0,kind,etag,id,snippet.channelId,snippet.title,snippet.assignable
0,youtube#videoCategory,"""m2yskBQFythfE4irbTIeOgYYfBU/Xy1mB4_yLrHy_BmKm...",1,UCBR8-60-B28hp2BmDPdntcQ,Film & Animation,True


## Select columns from us_videos

We select the columns with identifying information, classification, and text descriptions

In [81]:
df_us = us_videos.loc[:,['video_id', 'title', 'channel_title', 'category_id', 'tags', 'description']]
df_us.tail(1)

Unnamed: 0,video_id,title,channel_title,category_id,tags,description
40948,ooyjaVdt-jA,Official Call of Duty®: Black Ops 4 — Multipla...,Call of Duty,20,"call of duty|""cod""|""activision""|""Black Ops 4""",Call of Duty: Black Ops 4 Multiplayer raises t...


## Text Preprocessing

In [169]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

#### Cleaning the tags

In [170]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [173]:
tags = df_us['tags']
specials = r'[' + re.escape('\'"!()\\') + ']'

en_stops = set(stopwords.words('english'))
porter_stemmer = PorterStemmer()

def clean_tags(s, stemming=True):
    ss = s.replace('|', ' ')
    ss = re.sub(specials, '', ss)
    
    words = ss.split(' ')
    set_words = None
    
    words = [word.lower().strip() for word in words if word and word not in en_stops]
    
    if stemming:
        set_words = set(map(porter_stemmer.stem, words))
    else:
        set_words = set(words)
    
    return ' '.join(set_words)

ctags = tags.apply(clean_tags, stemming=False)
ctags_stem = tags.apply(clean_tags, stemming=True)

In [174]:
tags[2]

'racist superman|"rudy"|"mancuso"|"king"|"bach"|"racist"|"superman"|"love"|"rudy mancuso poo bear black white official music video"|"iphone x by pineapple"|"lelepons"|"hannahstocking"|"rudymancuso"|"inanna"|"anwar"|"sarkis"|"shots"|"shotsstudios"|"alesso"|"anitta"|"brazil"|"Getting My Driver\'s License | Lele Pons"'

In [175]:
ctags[2]

'x racist anitta anwar rudymancuso shots inanna brazil license mancuso superman black shotsstudios pineapple poo sarkis lele music rudy hannahstocking official lelepons drivers alesso my love bear getting king iphone pons white video bach'

In [176]:
ctags_stem[2]

'offici x racist pon anitta licens lelepon anwar rudymancuso inanna brazil shotsstudio driver mancuso superman black shot iphon poo lele music hannahstock alesso my love pineappl bear get king sarki white rudi video bach'

#### Cleaning the description

In [179]:
df_us.loc[1,'description'].replace('\\n', ' ')

"One year after the presidential election, John Oliver discusses what we've learned so far and enlists our catheter cowboy to teach Donald Trump what he hasn't.  Connect with Last Week Tonight online...  Subscribe to the Last Week Tonight YouTube channel for more almost news as it almost happens: www.youtube.com/user/LastWeekTonight  Find Last Week Tonight on Facebook like your mom would: http://Facebook.com/LastWeekTonight  Follow us on Twitter for news about jokes and jokes about news: http://Twitter.com/LastWeekTonight  Visit our official site for all that other stuff at once: http://www.hbo.com/lastweektonight"

In [181]:
urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', df_us.loc[1,'description'].replace('\\n', ' '))
urls

['http://Facebook.com', 'http://Twitter.com', 'http://www.hbo.com']

## Feature Importance

In [73]:
d = defaultdict(set) # channel_title: category_id
for row in df_us.itertuples():
    d[row.channel_title].add(row.category_id)

dl = [len(val) for val in d.values()]
print(f'Size of the dataset (US): {len(df_us)}')
print(f'Number of unique channels: {len(d)}')
print(f'Max # of unique categories for a channel: {max(dl)}')
print(f'Average # of unique categories per channel: {sum(dl) / len(dl)}')

Size of the dataset (US): 40949
Number of unique channels: 2207
Max # of unique categories for a channel: 7
Average # of unique categories per channel: 1.0693248753964657
