In [24]:
import pandas as pd
import json
import numpy as np
import spacy
import nltk
from nltk.corpus import stopwords
import re
import gensim
import math

In [48]:
global AVG_TAGS_PER_VIDEO

In [2]:
#get rid of the punctuations and set all characters to lowercase
RE_PREPROCESS = r'\W+|\d+' #the regular expressions that matches all non-characters

#get rid of punctuation and make everything lowercase
#the code belows works by looping through the array of text
#for a given piece of text we invoke the `re.sub` command where we pass in the regular expression, a space ' ' to
#subsitute all the matching characters with
#we then invoke the `lower()` method on the output of the re.sub command
#to make all the remaining characters
#the cleaned document is then stored in a list
#once this list has been filed it is then stored in a numpy array

In [3]:
def processFeatures(desc):
    try:
        return re.sub(RE_PREPROCESS, ' ', desc)
    except:
        return " "

In [4]:
def processDataFrame(data_frame, country_code='US'):
    data_frame.sort_values(by=['video_id', 'trending_date'], ascending=True, inplace=True)
    grouped_videos = data_frame.groupby(['video_id']).last().reset_index()
    
    #Reading categories from the json file depending on country_code
    json_location = './data/' + country_code +'_category_id.json'
    with open(json_location) as data_file:
        data = json.load(data_file)    
    categories = []
    for item in data['items']:
        category = {}
        category['category_id'] = int(item['id'])
        category['title'] = item['snippet']['title']
        categories.append(category)

    categories_df = pd.DataFrame(categories)
    # Merging videos data with category data
    final_df = grouped_videos.merge(categories_df, on = ['category_id'])
    final_df.rename(columns={'title_y': 'category', 'title_x': 'video_name'}, inplace=True)
    
    # Creating a features column that consists all features used for prediction.
    final_df['video_features'] = final_df['tags'].astype(str) + final_df['video_name'].astype(str) \
                        + final_df['channel_title'].astype(str) + final_df['description'] + final_df['category']
        
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    final_df['video_features'] = final_df['video_features'].apply(processFeatures)
    return final_df

In [36]:
def removeStopwords(documents):
    stopwords_list = stopwords.words('english')
    processed_corpus = []
    for document in documents:
        processed_document = []
        for word in document.split():
            if word not in stopwords_list:
                processed_document.append(word)
        processed_corpus.append(processed_document)
    return processed_corpus

In [37]:
def processCorpus(feature_corpus):
    feature_corpus = [comment.lower() for comment in feature_corpus]
    processed_feature_corpus = removeStopwords(feature_corpus)
    # processed_feature_corpus = [nltk.word_tokenize(str(feature)) for feature in feature_corpus]
    return processed_feature_corpus

In [38]:
def trainModel(token_corpus):
    model = gensim.models.Word2Vec(sentences=token_corpus, min_count=1, size = 32)
    model.train(token_corpus, total_examples=model.corpus_count, epochs=model.iter)
    model.save('word2vec_model.w2v')
    return model

In [59]:
def recommendTags(token_corpus, input_words = ['trump', 'president'], number_of_tags = 10):
    word2vec_model = gensim.models.Word2Vec.load('word2vec_model.w2v')
    tags = []
    try:
        tags = word2vec_model.most_similar(positive=input_words, topn=number_of_tags)
    except:
        token_corpus.append(input_words)
        word2vec_model.build_vocab(token_corpus, update=True)
        word2vec_model.train(token_corpus, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.iter)
        word2vec_model.save('word2vec_model.w2v')
        tags = word2vec_model.most_similar(positive=input_words, topn=number_of_tags)
    
    return token_corpus, tags

Running the algorithm for US videos

In [60]:
us_videos_df = pd.read_csv('./data/USvideos.csv')
us_final_df = processDataFrame(us_videos_df, country_code='US')

In [61]:
ca_videos_df = pd.read_csv('./data/CAvideos.csv')
ca_final_df = processDataFrame(ca_videos_df, country_code='CA')

In [62]:
gb_videos_df = pd.read_csv('./data/GBvideos.csv')
gb_final_df = processDataFrame(gb_videos_df, country_code='GB')

In [63]:
us_ca_gb_final_df = pd.concat([us_final_df, ca_final_df, gb_final_df])

In [64]:
us_ca_gb_token_corpus = processCorpus(us_ca_gb_final_df['video_features'].values)

In [65]:
total_tags = 0
for tag_list in us_ca_gb_final_df['tags'].values:
    total_tags += len(tag_list.split('|'))
print('Total tags in the corpus:', total_tags)
AVG_TAGS_PER_VIDEO = math.ceil(total_tags/len(us_ca_gb_final_df))
print('Average number of tags per video:', AVG_TAGS_PER_VIDEO)

Total tags in the corpus: 56012
Average number of tags per video: 19


In [66]:
us_ca_gb_final_df['video_features'] = us_ca_gb_token_corpus

In [67]:
len(us_ca_gb_token_corpus)

3029

In [68]:
trainModel(us_ca_gb_token_corpus)

<gensim.models.word2vec.Word2Vec at 0x1ab72b3e160>

Computing average number of tags per video

In [71]:
us_ca_gb_token_corpus, tags = recommendTags(us_ca_gb_token_corpus, input_words=['trump', 'president'], number_of_tags=AVG_TAGS_PER_VIDEO)

In [72]:
tags

[('donald', 0.9869476556777954),
 ('pocahontas', 0.9681788086891174),
 ('ceremony', 0.9679031372070312),
 ('rodrigo', 0.965264081954956),
 ('napakahusay', 0.9647417068481445),
 ('calling', 0.962499737739563),
 ('conway', 0.9615099430084229),
 ('kellyanne', 0.9595869779586792),
 ('wikileaks', 0.9582960605621338),
 ('roy', 0.9578506946563721),
 ('obama', 0.9525599479675293),
 ('duterte', 0.9497562646865845),
 ('manila', 0.9496163725852966),
 ('philippines', 0.9473019242286682),
 ('presidential', 0.9469354152679443),
 ('moore', 0.9460469484329224),
 ('franken', 0.9451020956039429),
 ('levied', 0.9448361992835999),
 ('asean', 0.9445689916610718)]

## Dividing the dataset into training (80%) and testing sets (20%).

In [220]:
np.random.seed(seed=13579)
us_ca_gb_final_df_shuffled = us_ca_gb_final_df.iloc[np.random.permutation(len(us_ca_gb_final_df))]
train_size = 0.80
us_ca_gb_df_train = us_ca_gb_final_df_shuffled[:int((train_size)*len(us_ca_gb_final_df_shuffled))]
us_ca_gb_df_test = us_ca_gb_final_df_shuffled[int((train_size)*len(us_ca_gb_final_df_shuffled)):]

In [227]:
us_ca_final_df['tags']

0                                Chess"Saint Louis""Club"
1       john green"mental floss""koalas""marsupial""jo...
2       emergence"ants""intelligence""ant""sum of its ...
3                                               hanyuqiao
4       the"secret""protocal""procedure""process""for"...
5       real life lore"real life lore maps""real life ...
6       the school of life"school""life""education""re...
7       SciShow"science""Hank""Green""education""learn...
8       TED"TED-Ed""TED Education""TED Ed""Nicholas Am...
9       black friday"black friday sales""How Likely Ar...
10      TED"TED-Ed""TED Education""TED Ed""Melvin Sani...
11      summit"conference""ideas""talks""performances"...
12      science"pbs digital studios""pbs""joe hanson""...
13      veritasium"mitosis""cell division""biology""dn...
14      TED"TED-Ed""TED Education""TED Ed""Hilary Coll...
15                                            googlevideo
16      tom scott"tomscott""built for science""nationa...
17      earthq