In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# YouTube

## Load and transform the dataset

In [None]:
# Source:
# =======
# https://www.kaggle.com/datasnaek/youtube-new

import io, os, sys
import re
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text as fetext
import qgrid

from datetime import datetime

# Directory the notebook reads its input files from and writes the prepared
# HDF5 dataset to.
workdir = "/tmp/experiments/youtube"
source_data_h5 = "youtube.h5"
source_data_h5_path = os.path.join(workdir, source_data_h5)
# Key under which the combined dataframe is stored inside the HDF5 file.
h5_key = "dataset"

# NOTE(review): not used in the visible cells; presumably the location of the
# BitBoost checkout consumed further down the notebook -- confirm.
bitboost_path = ".."

# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of a separate os.path.isdir() test.
os.makedirs(workdir, exist_ok=True)

In [None]:
# Prepare the source data file, unless a prepared file already exists.
#
# Reads the three per-country CSV files from `workdir`, derives
#   * binary n-gram indicator columns (prefix "txt_") from title, channel
#     title and tags,
#   * numeric text-length, date and log-transformed engagement features,
#   * categorical features (category id, country, disabled/error flags),
#   * the regression target `viewslg` = log10(views + 1),
# and writes the combined frame to `source_data_h5_path` under `h5_key`.
if not os.path.isfile(source_data_h5_path):
    # One CSV per country; the position in this list becomes the integer
    # `country` code (CA=0, GB=1, US=2).
    country_files = ["CAvideos.csv", "GBvideos.csv", "USvideos.csv"]
    country_frames = []
    for country_code, csv_name in enumerate(country_files):
        country_df = pd.read_csv(os.path.join(workdir, csv_name))
        country_df["country"] = country_code
        display(country_df.shape)
        country_frames.append(country_df)
    # ignore_index=True gives the combined frame a clean 0..n-1 index, which
    # the feature frames built below rely on for alignment.
    df = pd.concat(country_frames, axis=0, ignore_index=True)
    display(df.shape)

    df = df.drop(columns=['video_id', 'thumbnail_link'])
    display(df.columns)

    print("[ ] cleaning tags and description")
    # Missing text cells are read as NaN (a float); replace anything that is
    # not a string with '' so the string operations below cannot fail.
    for text_col in ('title', 'channel_title', 'tags', 'description'):
        df[text_col] = df[text_col].apply(lambda s: s if isinstance(s, str) else '')
    # Tag separators ('|' and '/') become spaces; quotes and the literal
    # "[none]" placeholder are removed.
    re_tag1 = re.compile(r'(\||/)')
    re_tag2 = re.compile(r'("|\'|\[none\])')
    df['tags'] = df['tags'].apply(lambda s: re_tag2.sub('', re_tag1.sub(' ', s)))

    print("[ ] generating word corpus")
    # One document per video: title, channel title and cleaned tags.
    corpus = [
        title + ' ' + channel + ' ' + tags
        for title, channel, tags in zip(df['title'], df['channel_title'], df['tags'])
    ]

    print("[ ] generating term-document matrix")
    vectorizer = fetext.CountVectorizer(strip_accents='unicode', min_df=0.01, max_df=0.9, ngram_range=(1,4),
                                        binary=True, lowercase=True)
    tdm = vectorizer.fit_transform(corpus)
    # Cast while still sparse, then densify to a plain ndarray (todense()
    # would return the legacy np.matrix type).
    tdm_dense = tdm.astype(np.uint8).toarray()

    print("[ ] generating n° word features")
    features = {}
    re_space = re.compile(r'\s+')

    # Word counts use str.split(), so an empty string counts as 0 words and
    # leading/trailing whitespace does not create phantom words.
    for prefix, text_col in (('title', 'title'), ('channel', 'channel_title'), ('descr', 'description')):
        features[prefix + '_nchars'] = df[text_col].str.len()
        features[prefix + '_nwords'] = df[text_col].str.split().str.len()
    # Counts whitespace-separated words of the cleaned tag string.
    features['ntags'] = df['tags'].str.split().str.len()

    print("[ ] generating trend date features")
    # Parse each date column once instead of once per derived feature.
    trend_dt = pd.to_datetime(df['trending_date'], format='%y.%d.%m')
    features['trend_year'] = trend_dt.dt.year
    features['trend_month'] = trend_dt.dt.month
    features['trend_day'] = trend_dt.dt.day
    features['trend_wday'] = trend_dt.dt.weekday  # Monday=0, like datetime.weekday()

    print("[ ] generating publish date features")
    # Only the first 13 characters (date and hour) of the timestamp are used.
    publish_dt = pd.to_datetime(df['publish_time'].str[0:13], format='%Y-%m-%dT%H')
    features['publish_year'] = publish_dt.dt.year
    features['publish_month'] = publish_dt.dt.month
    features['publish_day'] = publish_dt.dt.day
    features['publish_hour'] = publish_dt.dt.hour
    features['publish_wday'] = publish_dt.dt.weekday

    print("[ ] generating transformed features")
    # The +1 keeps the logarithm defined for zero counts.
    features['dislikeslg'] = np.log(df['dislikes'] + 1, dtype=np.float32)
    features['likeslg'] = np.log(df['likes'] + 1, dtype=np.float32)
    features['dislikeratiolg'] = np.log((df['dislikes'] + 1) / (df['likes'] + df['dislikes'] + 1), dtype=np.float32)
    features['cmtslg'] = np.log(df['comment_count'] + 1, dtype=np.float32)
    features['likepcmtlg'] = np.log((df['comment_count'] + 1) / (df['likes'] + df['dislikes'] + 1), dtype=np.float32)

    print("[ ] generating categorical features")
    cat_features = {}
    cat_features['cat_id'] = list(df['category_id'])
    cat_features['country'] = list(df['country'])
    # Encode the flag columns as 0/1 integers.
    for cat_name, flag_col in (('cmtsdis', 'comments_disabled'),
                               ('likedis', 'ratings_disabled'),
                               ('err', 'video_error_or_removed')):
        cat_features[cat_name] = [1 if flag else 0 for flag in df[flag_col]]

    print("[ ] generating target")
    # Target: log10 of the view count. It is added last so it ends up as the
    # final column of the combined frame.
    # Alternative targets tried earlier: (likes > dislikes) as a binary label,
    # and log((likes + 1) / (dislikes + 1)).
    features["viewslg"] = np.log10(df['views'] + 1, dtype=np.float32)

    print("[ ] combining dataframe")
    n = df.shape[0]
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    colnames = ["txt_" + re_space.sub('_', term) for term in get_names()]
    text_df = pd.DataFrame(tdm_dense, columns=colnames, index=range(n), dtype=np.uint32)
    numeric_df = pd.DataFrame(features, index=range(n), dtype=np.float32)
    categorical_df = pd.DataFrame(cat_features, index=range(n), dtype=np.uint32)
    # Column order: text indicators, categorical features, numeric features.
    df_comb = pd.concat([text_df, categorical_df, numeric_df], axis=1)
    display(text_df.shape)
    display(numeric_df.shape)
    display(categorical_df.shape)
    display(df_comb.shape)

    print("[ ] write to hdf5")
    # `key` is passed by keyword: pandas is making every to_hdf argument
    # except the path keyword-only.
    df_comb.to_hdf(source_data_h5_path, key=h5_key, complevel=9)

    print("[ ] done")