In [11]:
import pandas as pd
import json
import numpy as np
import spacy
import nltk
import re
import gensim

In [12]:
def splitTags(tag_list):
    tag_list = tag_list.split('|')
    output = ''
    for tag in tag_list:
        output += tag
    return output

In [13]:
#get rid of the punctuations and set all characters to lowercase
RE_PREPROCESS = r'\W+|\d+' #the regular expressions that matches all non-characters

#get rid of punctuation and make everything lowercase
#the code belows works by looping through the array of text
#for a given piece of text we invoke the `re.sub` command where we pass in the regular expression, a space ' ' to
#subsitute all the matching characters with
#we then invoke the `lower()` method on the output of the re.sub command
#to make all the remaining characters
#the cleaned document is then stored in a list
#once this list has been filed it is then stored in a numpy array

In [14]:
def processFeatures(desc):
    try:
        return re.sub(RE_PREPROCESS, ' ', desc)
    except:
        return " "

In [15]:
def processDataFrame(data_frame, country_code='US'):
    data_frame.sort_values(by=['video_id', 'trending_date'], ascending=True, inplace=True)
    grouped_videos = data_frame.groupby(['video_id']).last().reset_index()
    
    #Reading categories from the json file depending on country_code
    json_location = './data/' + country_code +'_category_id.json'
    with open(json_location) as data_file:
        data = json.load(data_file)    
    categories = []
    for item in data['items']:
        category = {}
        category['category_id'] = int(item['id'])
        category['title'] = item['snippet']['title']
        categories.append(category)

    categories_df = pd.DataFrame(categories)
    # Merging videos data with category data
    final_df = grouped_videos.merge(categories_df, on = ['category_id'])
    final_df.rename(columns={'title_y': 'category', 'title_x': 'video_name'}, inplace=True)
    
    # Splitting the tags by pipe (|) character
    final_df['tags'] = final_df['tags'].apply(splitTags)
    
    # Creating a features column that consists all features used for prediction.
    final_df['video_features'] = final_df['tags'].astype(str) + final_df['video_name'].astype(str) \
                        + final_df['channel_title'].astype(str) + final_df['description'] + final_df['category']
        
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    return final_df

In [21]:
def processCorpus(feature_corpus):
    feature_corpus = [comment.lower() for comment in feature_corpus]
    parser = spacy.load('en')
    processed_feature_corpus = [parser(feature) for feature in feature_corpus]
    token_corpus = [nltk.word_tokenize(str(feature)) for feature in processed_feature_corpus]
    return token_corpus

In [70]:
def recommendTags(token_corpus, input_words = ['pocahontas', 'president']):
    word2vec_model = gensim.models.Word2Vec(sentences=token_corpus, min_count=1, size = 32)
    return word2vec_model.most_similar(positive=input_words)

Running the algorithm for US videos

In [29]:
us_videos_df = pd.read_csv('./data/USvideos.csv')

In [30]:
us_final_df = processDataFrame(us_videos_df, country_code='US')

In [31]:
us_final_df.head()

Unnamed: 0,video_id,trending_date,video_name,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,video_features
0,-2RVw2_QyxQ,17.16.11,2017 Champions Showdown: Day 3,Saint Louis Chess Club,27,2017-11-12T02:39:01.000Z,"Chess""Saint Louis""""Club""",71089,460,27,20,https://i.ytimg.com/vi/-2RVw2_QyxQ/default.jpg,False,False,False,The Saint Louis Chess Club hosts a series of f...,Education,Chess Saint Louis Club Champions Showdown Day ...
1,-oXybog2IuI,17.21.11,24 Facts about Koalas - mental_floss List Show...,Mental Floss,27,2017-11-15T16:00:00.000Z,"john green""mental floss""""koalas""""marsupial""""jo...",38775,1373,16,140,https://i.ytimg.com/vi/-oXybog2IuI/default.jpg,False,False,False,A weekly show where knowledge junkies get thei...,Education,john green mental floss koalas marsupial joey ...
2,16W7c0mb-rE,17.24.11,Emergence – How Stupid Things Become Smart Tog...,Kurzgesagt – In a Nutshell,27,2017-11-16T15:01:58.000Z,"emergence""ants""""intelligence""""ant""""sum of its ...",2032821,124607,1183,8577,https://i.ytimg.com/vi/16W7c0mb-rE/default.jpg,False,False,False,How can many stupid things combine to form sma...,Education,emergence ants intelligence ant sum of its par...
3,5WUDfviiKRE,17.28.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,21342,107,312,201,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,辽阔的乌珠穆沁草原是摔跤手的摇篮。这里摔跤的传统源远流长，盛名至今不衰。康熙五年(1666年...,Education,hanyuqiao二贵摔跤 tienghoa netTina Nguyen辽阔的乌珠穆沁草原...
4,8-u5nd2GqNE,17.24.11,The Secret Protocol for When the Queen Dies,Half as Interesting,27,2017-11-16T15:30:00.000Z,"the""secret""""protocal""""procedure""""process""""for""...",1145464,28690,887,5083,https://i.ytimg.com/vi/8-u5nd2GqNE/default.jpg,False,False,False,Raise money for charity just by browsing the i...,Education,the secret protocal procedure process for when...


In [42]:
us_token_corpus = processCorpus(us_final_df['video_features'].values)



    Only loading the 'en' tokenizer.



In [43]:
print(recommendTags(us_token_corpus))

[('thanksgiving', 0.9993991851806641), ('moore', 0.9991637468338013), ('pacific', 0.9991582632064819), ('georgia', 0.999140202999115), ('future', 0.9990207552909851), ('old', 0.9988526105880737), ('moments', 0.998785138130188), ('hair', 0.9987520575523376), ('simon', 0.998725414276123), ('donald', 0.9986773729324341)]


In [44]:
ca_videos_df = pd.read_csv('./data/CAvideos.csv')

In [45]:
ca_final_df = processDataFrame(ca_videos_df, country_code='CA')

In [46]:
ca_final_df.head()

Unnamed: 0,video_id,trending_date,video_name,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,video_features
0,-3c-V5K2p88,17.21.11,10 Times Gordon Ramsay Actually LIKED THE FOOD!,BabbleTop,24,2017-11-17T16:00:00.000Z,"gordon ramsay""gordon ramsey""""ramsay""""ramsey""""c...",420149,2330,1497,1370,https://i.ytimg.com/vi/-3c-V5K2p88/default.jpg,False,False,False,We all know Gordon Ramsay is famous for dislik...,Entertainment,gordon ramsay gordon ramsey ramsay ramsey chef...
1,-4sU_AhRPY0,17.25.11,Ellen Teaches Sofia Vergara an English Word of...,TheEllenShow,24,2017-11-21T14:00:00.000Z,"Sofia Vergara""Sofia""""Vergara""""Modern family""""S...",2202412,47777,513,1480,https://i.ytimg.com/vi/-4sU_AhRPY0/default.jpg,False,False,False,Ellen taught the always hilarious Sofía Vergar...,Entertainment,Sofia Vergara Sofia Vergara Modern family Sofi...
2,-DxQnfAssuY,17.26.11,《明星大侦探3》03案完整版:【深夜麻辣烫】司机撒又开车了！一语道破鸥小妹和魏来的“眠伴”关...,湖南卫视芒果TV官方频道 China HunanTV Official Channel,24,2017-11-24T16:00:00.000Z,"明星大侦探3""明星大偵探""""深夜麻辣烫""""明星大侦探 深夜麻辣烫""""明星大偵探第三季""""明星...",430460,2762,108,1513,https://i.ytimg.com/vi/-DxQnfAssuY/default.jpg,False,False,False,【订阅湖南卫视官方频道Subscribe to HunanTV YouTube Channe...,Entertainment,明星大侦探 明星大偵探 深夜麻辣烫 明星大侦探 深夜麻辣烫 明星大偵探第三季 明星大侦探 W...
3,-k6j6Dkyl1M,17.15.11,Sau Ánh Hào Quang #7 FULL | Cát Phượng: Cái sa...,ĐÔNG TÂY PROMOTION OFFICIAL,24,2017-11-13T15:08:54.000Z,"sau anh hao quang""sau ánh hào quang""""sau anh h...",1544710,12417,803,2186,https://i.ytimg.com/vi/-k6j6Dkyl1M/default.jpg,False,False,False,"sau anh hao quang, sau ánh hào quang, sau anh ...",Entertainment,sau anh hao quang sau ánh hào quang sau anh ha...
4,-tS7Yecuqwo,17.27.11,Thanksgiving And Trump's (Lack Of) Giving,The Late Show with Stephen Colbert,24,2017-11-23T08:35:00.000Z,"The Late Show""Stephen Colbert""""Colbert""""Late S...",1709859,18717,1181,2561,https://i.ytimg.com/vi/-tS7Yecuqwo/default.jpg,False,False,False,President Trump is enjoying the season of givi...,Entertainment,The Late Show Stephen Colbert Colbert Late Sho...


In [48]:
ca_token_corpus = processCorpus(ca_final_df['video_features'].values)



    Only loading the 'en' tokenizer.



In [65]:
us_ca_final_df = pd.concat([us_final_df, ca_final_df])

In [68]:
us_ca_token_corpus = processCorpus(us_ca_final_df['video_features'].values)



    Only loading the 'en' tokenizer.



In [71]:
print(recommendTags(us_ca_token_corpus))

[('election', 0.9848800897598267), ('roy', 0.9822814464569092), ('award', 0.979668140411377), ('actress', 0.9791911840438843), ('soaps', 0.9780545234680176), ('winning', 0.9768493175506592), ('mother', 0.9761425256729126), ('against', 0.9753113985061646), ('donald', 0.9741940498352051), ('arasi', 0.9741353988647461)]


In [72]:
gb_videos_df = pd.read_csv('./data/GBvideos.csv')

In [73]:
gb_final_df = processDataFrame(gb_videos_df, country_code='GB')

In [75]:
us_ca_gb_final_df = pd.concat([us_final_df, ca_final_df, gb_final_df])

In [77]:
us_ca_gb_token_corpus = processCorpus(us_ca_gb_final_df['video_features'].values)



    Only loading the 'en' tokenizer.



In [None]:
len(us_ca_gb_token_corpus)