In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm.notebook
from IPython.display import set_matplotlib_formats
from loaders import load_all_videos, load_thumbnail

tqdm.notebook.tqdm().pandas()

%matplotlib inline
set_matplotlib_formats('svg')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [2]:
vids = load_all_videos()
vids.drop_duplicates(subset='video_id', inplace=True)
vids

Unnamed: 0.1,Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,is_GB,is_US
0,0,Jw1Y-zhQURU,17.14.11,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,,2017-11-10T07:38:29.000Z,"christmas|""john lewis christmas""|""john lewis""|...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False,Click here to continue the story and make your...,True,False
1,1,3s1rvMFUweQ,17.14.11,Taylor Swift: ‚Ä¶Ready for It? (Live) - SNL,Saturday Night Live,,2017-11-12T06:24:44.000Z,"SNL|""Saturday Night Live""|""SNL Season 43""|""Epi...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False,Musical guest Taylor Swift performs ‚Ä¶Ready f...,True,False
2,2,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyonc√©,EminemVEVO,,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787420,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyonc√© ...,True,False
3,3,PUTEiSjKwJU,17.14.11,Goals from Salford City vs Class of 92 and Fri...,Salford City Football Club,,2017-11-13T02:30:38.000Z,"Salford City FC|""Salford City""|""Salford""|""Clas...",27833,193,12,37,https://i.ytimg.com/vi/PUTEiSjKwJU/default.jpg,False,False,False,Salford drew 4-4 against the Class of 92 and F...,True,False
4,4,rHwDegptbI4,17.14.11,Dashcam captures truck's near miss with child ...,Cute Girl Videos,,2017-11-13T01:45:13.000Z,[none],9815,30,2,30,https://i.ytimg.com/vi/rHwDegptbI4/default.jpg,False,False,False,Dashcam captures truck's near miss with child ...,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73201,40759,Gi56dSh8Fq8,18.14.06,Gourmet Chef Makes A Big Mac Super Fancy,BuzzFeedVideo,,2018-06-13T18:00:32.000Z,"mcdonalds|""big mac""|""fancy""|""fast food""|""jacqu...",402418,10070,3303,2142,https://i.ytimg.com/vi/Gi56dSh8Fq8/default.jpg,False,False,False,"It's your good ol' McDonald's Big Mac, but lik...",False,True
73202,40760,dS5Thrl-4Kc,18.14.06,CRAYOLA MAKEUP | HIT OR MISS?,Laura Lee,,2018-06-12T18:55:26.000Z,"Laura88Lee|""crayola""|""crayon makeup""|""crayola ...",607422,26166,895,3517,https://i.ytimg.com/vi/dS5Thrl-4Kc/default.jpg,False,False,False,"Hey Larlees, todays video is me testing Crayol...",False,True
73203,40761,JGm9Y_hFqNk,18.14.06,First Take reacts: Kyrie Irving says contract ...,ESPN,,2018-06-13T15:06:08.000Z,"espn|""espn live""|""boston celtics""|""kyrie irvin...",812832,7701,320,2505,https://i.ytimg.com/vi/JGm9Y_hFqNk/default.jpg,False,False,False,First Take's Stephen A. Smith and Max Kellerma...,False,True
73205,40764,mpnshdmtE2Y,18.14.06,Carla Makes BA Smashburgers | From the Test Ki...,Bon App√©tit,,2018-06-12T16:03:58.000Z,"bon appetit|""burgers""|""cheeseburgers""|""how to ...",540149,14206,693,1211,https://i.ytimg.com/vi/mpnshdmtE2Y/default.jpg,False,False,False,"Ground chuck is a great all-purpose, buy-it-an...",False,True


In [3]:
# create a brand new dataframe for hand-crafted features
features = pd.DataFrame(data={'video_id': vids['video_id'], 'category_id': vids['category_id']})
features

Unnamed: 0,video_id,category_id
0,Jw1Y-zhQURU,
1,3s1rvMFUweQ,
2,n1WpP7iowLc,
3,PUTEiSjKwJU,
4,rHwDegptbI4,
...,...,...
73201,Gi56dSh8Fq8,
73202,dS5Thrl-4Kc,
73203,JGm9Y_hFqNk,
73205,mpnshdmtE2Y,


In [4]:
publish_times = pd.to_datetime(vids['publish_time'], utc=True)
features['publish_weekday'] = publish_times.apply(lambda x: x.weekday)
features['publish_hour'] = publish_times.apply(lambda x: x.hour)

In [5]:
likes = vids[vids['ratings_disabled'] == False]['likes']
dislikes = vids[vids['ratings_disabled'] == False]['dislikes']

features['like_ratio'] = likes / (likes + dislikes)
features['likes_per_view'] = likes / vids['views']
features['dislikes_per_view'] = dislikes / vids['views']
features['comments_per_view'] = vids['comment_count'] / vids['views']

In [6]:
import re
from nltk.tokenize.casual import casual_tokenize

def num_chars(text):
    return len(text)

def num_words(text):
    return len(casual_tokenize(text))

def num_uppercase_words(text):
    pattern = r'\b[A-Z]{3,}\b'
    return len(re.findall(pattern, text))

def num_repeated_letters(text):
    pattern = r'\b\w*(\w)\1{2,}\w*\b'
    return len(re.findall(pattern, text))

def num_question_marks(text):
    return text.count('?')

def num_exclamation_marks(text):
    return text.count('!')

def num_periods(text):
    return text.count('.')

def num_numbers(text):
    pattern = '\d+'
    return len(re.findall(pattern, text))

text_features = [
    num_chars, num_words, num_uppercase_words,
    num_question_marks, num_exclamation_marks, num_periods, num_numbers
]

for func in text_features:
    features[f"title_{func.__name__}"] = vids['title'].apply(func)

In [7]:
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import ngrams

stops = set(stopwords.words('english'))
stops.update(string.punctuation)
stemmer = PorterStemmer()


def most_frequent_words(phrases, ngram_size=1, num_most_common=5):
    counter = Counter()
    for phrase in phrases:
        phrase = phrase.lower()
        tokens = casual_tokenize(phrase)
        tokens = [
            stemmer.stem(token)
            for token in tokens
            if token not in stops
        ]
        if ngram_size > 1:
            tokens = ngrams(tokens, ngram_size)
        counter.update(tokens)
    return counter.most_common(num_most_common)

categories = vids['category_id'].dropna().unique()
meaningful_tokens = set()
for ctg in categories:
    titles_in_category = vids[vids['category_id'] == ctg]['title']
    for word, _count in most_frequent_words(titles_in_category):
        if re.match('[a-z0-9]+', word):
            meaningful_tokens.add(word)

for token in meaningful_tokens:
    features[f"title__{token}"] = vids['title'].apply(lambda title: int(token in title.lower()))

In [8]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

features['title_sentiment'] = vids['title'].apply(lambda title: sid.polarity_scores(title)['compound'])

In [9]:
for func in text_features:
    features[f"description_{func.__name__}"] = vids['description'].dropna().apply(func)

In [10]:
features['description_sentiment'] = vids['description'].dropna().apply(lambda desc: sid.polarity_scores(desc)['compound'])

In [11]:
meaningful_tokens = set()
for ctg in categories:
    descriptions_in_category = vids[vids['category_id'] == ctg]['description']
    for word, _count in most_frequent_words(descriptions_in_category.dropna(), 1, 5):
        if re.match('[a-z0-9]+', word) and len(word) > 1 and word != 'video':
            meaningful_tokens.add(word)

for token in meaningful_tokens:
    features[f"description__{token}"] = vids['description'].apply(lambda desc: int(token in str(desc).lower()))

In [12]:
def parse_tags(value):
    if value == '[none]':
        return set()
    return set(tag.strip('"').lower() for tag in value.split('|'))

tags = vids['tags'].apply(parse_tags)
meaningful_tags = set()
for ctg in categories:
    tags_counter = Counter()
    for t in tags[vids['category_id'] == ctg]:
        tags_counter.update(t)
    for word, _count in tags_counter.most_common(4):
        meaningful_tags.add(word)

for tag in meaningful_tags:
    features[f"tag__{tag}"] = tags.apply(lambda tags: int(tag in tags))

In [13]:
import PIL
import cv2
import colorgram

def extract_palette(bgr):
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    img = PIL.Image.fromarray(rgb)
    return colorgram.extract(img, 4)

def hsl_attributes(row):
    try:
        im = load_thumbnail(row['video_id'])
        for i, color in enumerate(extract_palette(im)):
            row[f"h{i}"] = int(color.hsl.h / 255 * 360)
            row[f"s{i}"] = int(color.hsl.s / 255 * 100)
            row[f"l{i}"] = int(color.hsl.l / 255 * 100)
    except TypeError:
        pass
    return row

hsl_zeros = { f"{x}{i}": 0 for i in range(4) for x in ['h', 's', 'l'] }
features = features.assign(**hsl_zeros)
features = features.progress_apply(hsl_attributes, axis=1)

HBox(children=(FloatProgress(value=0.0, max=8607.0), HTML(value='')))




In [14]:
features['has_image'] = (features['h1'] > 0) | (features['h2'] > 0) | (features['s1'] > 0)

In [15]:
face_cascade = cv2.CascadeClassifier('../youtube_data/haarcascade_frontalface_default.xml')

def detect_faces(row):
    if not row['has_image']:
        row['_faces'] = []
    else:
        img = load_thumbnail(row['video_id'])
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.1, 4)
        row['_faces'] = list(faces)
    return row

features = features.progress_apply(detect_faces, axis=1)
features['faces_total'] = features['_faces'].apply(len)

HBox(children=(FloatProgress(value=0.0, max=8607.0), HTML(value='')))




In [16]:
# features = pd.read_csv('../youtube_data/features.csv')
features

Unnamed: 0,video_id,category_id,publish_weekday,publish_hour,like_ratio,likes_per_view,dislikes_per_view,comments_per_view,title_num_chars,title_num_words,...,l1,h2,s2,l2,h3,s3,l3,has_image,_faces,faces_total
0,Jw1Y-zhQURU,,4,7,0.844573,0.007707,0.001418,0.001312,45,7,...,0,0,0,0,0,0,0,False,[],0
1,3s1rvMFUweQ,,6,6,0.917645,0.024260,0.002177,0.002617,43,15,...,0,0,0,0,0,0,0,False,[],0
2,n1WpP7iowLc,,4,17,0.947740,0.045891,0.002531,0.007336,43,13,...,11,214,17,37,0,0,36,True,[],0
3,PUTEiSjKwJU,,0,2,0.941463,0.006934,0.000431,0.001329,76,15,...,59,29,21,36,213,8,60,True,[],0
4,rHwDegptbI4,,0,1,0.937500,0.003057,0.000204,0.003057,55,9,...,0,0,0,0,0,0,0,False,[],0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73201,Gi56dSh8Fq8,,2,18,0.753010,0.025024,0.008208,0.005323,40,8,...,92,26,60,62,141,56,97,True,"[[48, 18, 25, 25]]",1
73202,dS5Thrl-4Kc,,1,18,0.966927,0.043077,0.001473,0.005790,29,7,...,62,25,58,85,24,38,38,True,"[[43, 4, 44, 44]]",1
73203,JGm9Y_hFqNk,,2,15,0.960105,0.009474,0.000394,0.003082,94,17,...,33,314,9,16,193,20,34,True,"[[18, 6, 27, 27]]",1
73205,mpnshdmtE2Y,,1,16,0.953487,0.026300,0.001283,0.002242,66,15,...,61,221,13,89,32,33,15,True,[],0


In [17]:
from collections import Counter
import numpy as np
from keras.models import load_model
from keras.preprocessing.image import img_to_array

classifier = load_model('../youtube_data/model_v6_23.hdf5')
class_labels = {0: 'angry', 1: 'disgust', 2: 'fear', 3: 'happy', 4: 'neutral', 5: 'sad', 6: 'surprise'}

for label in class_labels.values():
    features[f"faces_{label}"] = 0

def detect_emotions(row):
    if not row['has_image']:
        return row
    
    img = load_thumbnail(row['video_id'])
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    rois = []
    for face in row['_faces']:
        (x,y,w,h) = face
        roi_gray = gray[y:y+h, x:x+w]
        
        try:
            roi_gray = cv2.resize(roi_gray, (48, 48), interpolation = cv2.INTER_AREA)
        except:
            roi_gray = np.zeros((48,48), np.uint8)
        
        rois.append(roi_gray)
        
    emotions = Counter()
    for roi in rois:
#         if np.sum([roi]) == 0.0:
#             print("Incorrect ROI for id:" + row['video_id'])
#             continue
        roi = roi.astype("float")
        roi = img_to_array(roi)
        roi = np.expand_dims(roi, axis=0)
        
        preds = classifier.predict(roi)[0]
        label = class_labels[preds.argmax()]
        
        emotions[label] += 1
    for label, count in emotions.items():
        row[f"faces_{label}"] = count
    
    return row

features = features.progress_apply(detect_emotions, axis=1)

Using TensorFlow backend.


HBox(children=(FloatProgress(value=0.0, max=8607.0), HTML(value='')))




In [18]:
from text_detection import find_text
from scipy import ndimage

features['text_area'] = 0
features['text_centroid_x'], features['text_centroid_y'] = None, None

def boxes_to_bitmap(image, boxes):
    w, h, _ = image.shape
    bitmap = np.zeros((w, h), dtype=bool)
    for (startX, startY, endX, endY) in boxes:
        bitmap[startY:endY, startX:endX] = True
    return bitmap

def detect_text(row):
    if not row['has_image']:
        return row
    img = load_thumbnail(row['video_id'])
    boxes = find_text(img)
    if len(boxes):
        boxes_bmp = boxes_to_bitmap(img, boxes)
        row['text_area'] = boxes_bmp.mean()
        row['text_centroid_x'], row['text_centroid_y'] = ndimage.measurements.center_of_mass(boxes_bmp)
    return row

features = features.progress_apply(detect_text, axis=1)

HBox(children=(FloatProgress(value=0.0, max=8607.0), HTML(value='')))

  for dir in range(input.ndim)]





In [19]:
features = features.drop(columns=['_faces'])
features.to_csv('../youtube_data/features.csv')
features

Unnamed: 0,video_id,category_id,publish_weekday,publish_hour,like_ratio,likes_per_view,dislikes_per_view,comments_per_view,title_num_chars,title_num_words,...,faces_angry,faces_disgust,faces_fear,faces_happy,faces_neutral,faces_sad,faces_surprise,text_area,text_centroid_x,text_centroid_y
0,Jw1Y-zhQURU,,4,7,0.844573,0.007707,0.001418,0.001312,45,7,...,0,0,0,0,0,0,0,0.000000,,
1,3s1rvMFUweQ,,6,6,0.917645,0.024260,0.002177,0.002617,43,15,...,0,0,0,0,0,0,0,0.000000,,
2,n1WpP7iowLc,,4,17,0.947740,0.045891,0.002531,0.007336,43,13,...,0,0,0,0,0,0,0,0.419853,25.809107,55.878284
3,PUTEiSjKwJU,,0,2,0.941463,0.006934,0.000431,0.001329,76,15,...,0,0,0,0,0,0,0,0.000000,,
4,rHwDegptbI4,,0,1,0.937500,0.003057,0.000204,0.003057,55,9,...,0,0,0,0,0,0,0,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73201,Gi56dSh8Fq8,,2,18,0.753010,0.025024,0.008208,0.005323,40,8,...,0,0,0,0,0,0,1,0.000000,,
73202,dS5Thrl-4Kc,,1,18,0.966927,0.043077,0.001473,0.005790,29,7,...,0,0,0,0,0,0,1,0.000000,,
73203,JGm9Y_hFqNk,,2,15,0.960105,0.009474,0.000394,0.003082,94,17,...,0,0,1,0,0,0,0,0.000000,,
73205,mpnshdmtE2Y,,1,16,0.953487,0.026300,0.001283,0.002242,66,15,...,0,0,0,0,0,0,0,0.046569,58.500000,26.500000
