In [11]:
import pandas as pd
import json
import numpy as np
import spacy
import nltk
import re
import gensim

In [12]:
def splitTags(tag_list):
    tag_list = tag_list.split('|')
    output = ''
    for tag in tag_list:
        output += tag
    return output

In [13]:
#get rid of the punctuations and set all characters to lowercase
RE_PREPROCESS = r'\W+|\d+' #the regular expressions that matches all non-characters

#get rid of punctuation and make everything lowercase
#the code belows works by looping through the array of text
#for a given piece of text we invoke the `re.sub` command where we pass in the regular expression, a space ' ' to
#subsitute all the matching characters with
#we then invoke the `lower()` method on the output of the re.sub command
#to make all the remaining characters
#the cleaned document is then stored in a list
#once this list has been filed it is then stored in a numpy array

In [14]:
def processFeatures(desc):
    try:
        return re.sub(RE_PREPROCESS, ' ', desc)
    except:
        return " "

In [15]:
def processDataFrame(data_frame, country_code='US'):
    data_frame.sort_values(by=['video_id', 'trending_date'], ascending=True, inplace=True)
    grouped_videos = data_frame.groupby(['video_id']).last().reset_index()
    
    #Reading categories from the json file depending on country_code
    json_location = './data/' + country_code +'_category_id.json'
    with open(json_location) as data_file:
        data = json.load(data_file)    
    categories = []
    for item in data['items']:
        category = {}
        category['category_id'] = int(item['id'])
        category['title'] = item['snippet']['title']
        categories.append(category)

    categories_df = pd.DataFrame(categories)
    # Merging videos data with category data
    final_df = grouped_videos.merge(categories_df, on = ['category_id'])
    final_df.rename(columns={'title_y': 'category', 'title_x': 'video_name'}, inplace=True)
    
    # Splitting the tags by pipe (|) character
    final_df['tags'] = final_df['tags'].apply(splitTags)
    
    # Creating a features column that consists all features used for prediction.
    final_df['video_features'] = final_df['tags'].astype(str) + final_df['video_name'].astype(str) \
                        + final_df['channel_title'].astype(str) + final_df['description'] + final_df['category']
        
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    return final_df

Running the algorithm for US videos

In [16]:
us_videos_df = pd.read_csv('./data/USvideos.csv')

In [18]:
us_final_df = processDataFrame(us_videos_df, country_code='US')

In [19]:
us_final_df.head()

Unnamed: 0,video_id,trending_date,video_name,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,category,video_features
0,-2RVw2_QyxQ,17.16.11,2017 Champions Showdown: Day 3,Saint Louis Chess Club,27,2017-11-12T02:39:01.000Z,"Chess""Saint Louis""""Club""",71089,460,27,20,https://i.ytimg.com/vi/-2RVw2_QyxQ/default.jpg,False,False,False,The Saint Louis Chess Club hosts a series of f...,Education,Chess Saint Louis Club Champions Showdown Day ...
1,-oXybog2IuI,17.21.11,24 Facts about Koalas - mental_floss List Show...,Mental Floss,27,2017-11-15T16:00:00.000Z,"john green""mental floss""""koalas""""marsupial""""jo...",38775,1373,16,140,https://i.ytimg.com/vi/-oXybog2IuI/default.jpg,False,False,False,A weekly show where knowledge junkies get thei...,Education,john green mental floss koalas marsupial joey ...
2,16W7c0mb-rE,17.24.11,Emergence – How Stupid Things Become Smart Tog...,Kurzgesagt – In a Nutshell,27,2017-11-16T15:01:58.000Z,"emergence""ants""""intelligence""""ant""""sum of its ...",2032821,124607,1183,8577,https://i.ytimg.com/vi/16W7c0mb-rE/default.jpg,False,False,False,How can many stupid things combine to form sma...,Education,emergence ants intelligence ant sum of its par...
3,5WUDfviiKRE,17.28.11,二贵摔跤 - tienghoa.net,Tina Nguyen,27,2011-03-01T04:14:08.000Z,hanyuqiao,21342,107,312,201,https://i.ytimg.com/vi/5WUDfviiKRE/default.jpg,False,False,False,辽阔的乌珠穆沁草原是摔跤手的摇篮。这里摔跤的传统源远流长，盛名至今不衰。康熙五年(1666年...,Education,hanyuqiao二贵摔跤 tienghoa netTina Nguyen辽阔的乌珠穆沁草原...
4,8-u5nd2GqNE,17.24.11,The Secret Protocol for When the Queen Dies,Half as Interesting,27,2017-11-16T15:30:00.000Z,"the""secret""""protocal""""procedure""""process""""for""...",1145464,28690,887,5083,https://i.ytimg.com/vi/8-u5nd2GqNE/default.jpg,False,False,False,Raise money for charity just by browsing the i...,Education,the secret protocal procedure process for when...
