In [None]:
import pandas as pd
import numpy as np
import json

import datetime
import re

from numpy import save, load

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def lemmatize_words(x):
    """Split ``x`` on whitespace and lemmatize every token.

    Parameters
    ----------
    x : str
        A non-tokenized piece of text.

    Returns
    -------
    list of str
        The lemmas, in their original order, keeping only those whose
        lemmatized form is longer than two characters.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in x.split())
    # The length filter is applied to the lemma, not to the raw token
    return [lemma for lemma in lemmas if len(lemma) > 2]


def remove_stopwords(x, all_stopwords):
    """Filter stopwords out of a token sequence.

    Parameters
    ----------
    x : iterable of str
        Tokens to filter.
    all_stopwords : container of str
        Anything supporting ``in``; tokens found in it are discarded.

    Returns
    -------
    list of str
        A new list with the surviving tokens in their original order.
    """
    kept = []
    for token in x:
        if token in all_stopwords:
            continue
        kept.append(token)
    return kept

In [None]:
def preprocess_bow_file(df, version, max_features=2500):
    """Clean the text columns named in ``version`` and build a bag-of-words matrix.

    The selected columns are joined per row, literal ``\\n`` sequences and URLs
    are stripped, everything that is not an ASCII letter is replaced by a space,
    the text is lower-cased, lemmatized, and stopwords (NLTK English plus a
    YouTube-specific list) are removed before counting words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed in ``version``. The values are joined
        with ``' '.join`` so they are assumed to be strings (TODO confirm for
        other inputs). Side effect: a ``'words'`` column containing the cleaned
        text is written to ``df``.
    version : list of str or str
        Column name(s) whose text is combined for each row.
    max_features : int, optional
        Vocabulary size cap handed to ``CountVectorizer`` (default 2500).

    Returns
    -------
    index_word_dict : dict
        ``{column index in bow: word}``.
    bow : numpy.ndarray
        Dense count matrix; rows that contain none of the vocabulary words
        have been removed.
    no_words_index : numpy.ndarray
        Positional row numbers (relative to ``df``) of the removed rows.
    """
    # Accept a single column name as well as a list of names
    if isinstance(version, str):
        version = [version]

    # Define stopwords
    youtube_stopwords = ['watch', 'watching', 'previous', 'like', 'subscribe', 'official', 'website',
                         'newsletter', 'channel', 'video', 'videos', 'youtube',
                         'facebook', 'instagram', 'twitter', 'snapchat', 'reddit', 'here',
                         'join', 'follow', 'tweet', 'email', 'check', 'find',
                         'sponsor', 'sponsored', 'support', 'supported', 'music']
    nltk_stopwords = stopwords.words('english')

    # Lemmatize the stopwords so they match the lemmatized text, and keep them
    # in a set so every membership test during filtering is O(1)
    lem = WordNetLemmatizer()
    all_stopwords = {lem.lemmatize(word) for word in youtube_stopwords + nltk_stopwords}

    print('Combining columns...')
    # Combine columns that correspond to the defined version
    words = df[version].apply(lambda x: ' '.join(x), axis=1)

    print('Removing newline symbols...')
    # Replace the literal two-character sequence "\n" by a space. This runs
    # before URL removal because the sequence is not whitespace: left in place,
    # 'http\S+' would also swallow the word that follows a URL. A space (rather
    # than an empty string) keeps the neighbouring words from being glued together.
    words = words.apply(lambda x: x.replace('\\n', ' '))

    print('Removing URLs...')
    # Remove URLs
    words = words.apply(lambda x: re.sub(r'http\S+', '', x))

    print('Removing punctuations, numbers, and converting to lower case...')
    # Remove punctuation and numbers, and convert to lower case
    words = words.apply(lambda x: re.sub('[^A-Za-z]+', ' ', x).lower())

    print('Lemmatizing the words...')
    # Lemmatize the words (each row input is a non-tokenized sentence)
    words = words.apply(lemmatize_words)

    print('Removing stopwords...')
    # Remove stopwords
    words = words.apply(lambda x: remove_stopwords(x, all_stopwords))

    # Recombine all words and expose the cleaned text on the frame
    words = words.apply(lambda x: ' '.join(x))
    df['words'] = words

    # Create Bag of Words
    print('Creating Bag of Words...')
    vectorizer = CountVectorizer(max_features=max_features)
    X = vectorizer.fit_transform(words)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on versions that predate get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Create index to word dictionary (plain str values so it is JSON-serializable)
    index_word_dict = {i: str(word) for i, word in enumerate(feature_names)}

    # Counts are non-negative, so a row is empty exactly when its sum is zero;
    # summing the sparse matrix avoids building a dense boolean copy
    row_totals = np.asarray(X.sum(axis=1)).ravel()
    no_words_index = np.flatnonzero(row_totals == 0)
    bow = np.delete(X.toarray(), no_words_index, axis=0)

    return index_word_dict, bow, no_words_index

In [None]:
def preprocess_meta_file(df, category_dict):
    """Attach genre and trending-time indices to ``df`` and return the meta columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Video frame with at least ``category_id``, a datetime ``trending_date``
        (the ``.dt`` accessor is used on it), ``channel_title``, ``title``,
        ``tags`` and ``description``. The columns ``genre``, ``t_day``,
        ``t_week`` and ``t_month`` are added to it in place.
    category_dict : dict
        Parsed category file; every entry of ``category_dict['items']`` supplies
        ``['id']`` and ``['snippet']['title']``.

    Returns
    -------
    pandas.DataFrame
        The columns ``trending_date, t_day, t_week, t_month, channel_title,
        title, tags, description, genre``.
    """
    print('Preprocessing meta file...')

    # category_id -> genre name; ids without an entry map to NaN
    genre_by_id = {
        int(entry['id']): entry['snippet']['title']
        for entry in category_dict['items']
    }
    df['genre'] = df['category_id'].map(genre_by_id)

    # Rank the distinct trending dates chronologically (np.unique returns them sorted).
    # t_day is that rank; t_week groups seven consecutive ranks, i.e. rank // 7.
    distinct_dates = np.unique(df['trending_date'])
    day_by_date = {}
    week_by_date = {}
    for rank, trending_day in enumerate(distinct_dates):
        day_by_date[trending_day] = int(rank)
        week_by_date[trending_day] = int(rank // 7)

    df['t_day'] = df['trending_date'].map(day_by_date)
    df['t_week'] = df['trending_date'].map(week_by_date)

    # t_month is the chronological rank of the (year, month) period of the trending date
    trending_months = df['trending_date'].dt.to_period('M')
    month_by_period = {
        period: int(rank)
        for rank, period in enumerate(np.unique(trending_months))
    }
    df['t_month'] = trending_months.map(month_by_period)

    meta_columns = ['trending_date', 't_day', 't_week', 't_month', 'channel_title', 'title', 'tags', 'description', 'genre']
    return df[meta_columns]


In [None]:
def preprocess_youtube_dataset(df, category_dict, version):
    """Preprocess the Trending YouTube Video Statistics Dataset.

    Dataset source: https://www.kaggle.com/datasnaek/youtube-new

    INPUT
    - df: dataframe of a region's video.csv file. It is not modified; the
      function works on the copy returned by ``dropna``.
    - category_dict: dictionary of the corresponding region's category_id.json file
    - version: list of which columns to include in the scope of the latter experiment
      (a single column name is also accepted)
        - Could (recommended to) take one of the following forms:
            - ['title'], ['tags'], ['description'], ['title', 'tags'],
              ['title', 'description'], ['tags', 'description'],
              ['title', 'tags', 'description']

    OUTPUT (written to the current working directory, each file name prefixed
    with the ``version`` column names joined by '_'; nothing is returned)
    - Meta File (``<prefix>meta.csv``): CSV file that consists of the columns
      ['trending_date', 't_day', 't_week', 't_month', 'channel_title', 'title',
      'tags', 'description', 'genre'] for each video. Records are terminated
      by '^' instead of a newline.
    - Bag of Words (BOW) File (``<prefix>bow.npy``): Numpy file where each column
      represents a unique word, and each row represents the count of the
      appearance of words per video
    - BoW Index File (``<prefix>indexword.json``): Json file which contains a
      dictionary of the format {index: word} from the BoW (JSON stores the
      integer indices as string keys)

    Note that each row of the output files correspond to each other, and they
    represent a single video. Rows containing NaN values, and rows whose text
    contains none of the BoW vocabulary words, are dropped from all outputs.
    """
    # Accept a single column name as well as a list of names
    if isinstance(version, str):
        version = [version]

    # Remove rows with NaN values; the fresh 0..n-1 index lets the positional
    # row numbers returned by the BoW step be used as drop labels below
    df = df.dropna().reset_index(drop=True)

    # Convert 'trending_date' to datetime unless it already is one (any unit / timezone)
    if not pd.api.types.is_datetime64_any_dtype(df['trending_date']):
        df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

    # BOW File preprocessing
    index_word_dict, bow, no_words_index = preprocess_bow_file(df, version)

    # Meta File preprocessing, on the same rows that survived in the BoW
    meta_df = preprocess_meta_file(df.drop(no_words_index, axis=0).reset_index(drop=True), category_dict)

    # Version naming convention: every column name followed by an underscore
    version_name = ''.join(str(column) + '_' for column in version)

    # Save index to word dictionary to json
    with open(version_name + 'indexword.json', "w") as outfile:
        json.dump(index_word_dict, outfile)

    # Save bag of words to array
    save(version_name + 'bow.npy', bow)

    # Save meta dataframe to csv
    meta_path = version_name + 'meta.csv'
    try:
        meta_df.to_csv(meta_path, index=False, lineterminator='^')
    except TypeError:
        # pandas < 1.5 only knows the old spelling of this keyword
        meta_df.to_csv(meta_path, index=False, line_terminator='^')


In [None]:
# Read video csv and category_id json data
df = pd.read_csv('data/USvideos.csv')
# Open the JSON explicitly as UTF-8 so parsing does not depend on the
# platform's default text encoding
with open('data/US_category_id.json', encoding='utf-8') as f:
    category_dict = json.load(f)

# Set version of preprocessing: the list of text columns fed into the bag of words
version = ['description']

In [None]:
# Run the full pipeline. It returns nothing; it writes the index-word JSON, the
# BoW .npy and the meta CSV (file names prefixed by the `version` column names)
# into the current working directory.
preprocess_youtube_dataset(df=df, category_dict=category_dict, version=version)