# Data Processing

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import time
import bs4 as bs4
import requests as rq
import json
import os 
import tqdm
import glob

In [2]:
pd.set_option("display.max_columns", 180)

## Parsed Video DataFrame

In [3]:
df = pd.read_json('parsed_videos.json', lines= True)
df.head()

Unnamed: 0,link,title,query
0,/watch?v=ofYIsToVnT0,Azure Databricks = Azure + Spark + Machine Lea...,machine+learnin
1,/watch?v=AmZRpyFJDx8,Machine Learning Tool KNN (K- Nearest Neighbou...,machine+learnin
2,/watch?v=5ygSefB6YGc,How deep learning is predicting extreme weathe...,machine+learnin
3,/watch?v=kk2kMpk9Q7g,Decision Tree in Machine Learning,machine+learnin
4,/watch?v=KReZjW052YU,Numpy Complete Tutorial For Machine Learning |...,machine+learnin


Some of the links are duplicated

In [6]:
# Unique links
links_list = df.link.unique()
print('Unique links: {}'.format(len(df.link.unique())))

Unique links: 1489


In [26]:
links_list[-4]

'/watch?v=elojMnjn4kk'

## Concatenate url and video identifier 

In [7]:
# concatenate the url and video identifier and collecting video pages

url = "https://www.youtube.com{link}"

for link in links_list:
    urll = url.format(link = link)
    print(urll)
    response = rq.get(urll)
    
    # Add video identifier 
    # regular expression RE to get strings that match the condigion "v=(.*)
    link_name = re.search("v=(.*)",link).group(1)
    
    # save page source 
    with open("./raw_data/video_{}.html".format(link_name), "w+") as output:
        output.write(response.text)
        
    time.sleep(2)

https://www.youtube.com/watch?v=ofYIsToVnT0
https://www.youtube.com/watch?v=AmZRpyFJDx8
https://www.youtube.com/watch?v=5ygSefB6YGc
https://www.youtube.com/watch?v=kk2kMpk9Q7g
https://www.youtube.com/watch?v=KReZjW052YU
https://www.youtube.com/watch?v=-HsCoeAl40Q
https://www.youtube.com/watch?v=GIHEzQDBF-k
https://www.youtube.com/watch?v=9mB_d6c4jvE
https://www.youtube.com/watch?v=o6gmRKqLF6c
https://www.youtube.com/watch?v=zT_IrXrIAKs
https://www.youtube.com/watch?v=0VA8NVr9pYo
https://www.youtube.com/watch?v=tD2YGm2rbPw
https://www.youtube.com/watch?v=HQvz4P1cUCo
https://www.youtube.com/watch?v=mMDjn3ivhxM
https://www.youtube.com/watch?v=DwgqcMhtaL4
https://www.youtube.com/watch?v=qBAoCS-ksHo
https://www.youtube.com/watch?v=a22er1gj7c0
https://www.youtube.com/watch?v=mP92fhHjFmQ
https://www.youtube.com/watch?v=8Hy_m5fHPBA
https://www.youtube.com/watch?v=3Vzh85R_U5M
https://www.youtube.com/watch?v=8Wqe27I-k-s
https://www.youtube.com/watch?v=fJj6ixgAiBQ
https://www.youtube.com/watch?v=

AttributeError: 'NoneType' object has no attribute 'group'

## Collecting page information 

In [8]:
with open("parsed_video_info.json", 'w+') as output:
    for video_file in tqdm.tqdm_notebook(sorted(glob.glob("./raw_data/video*"))):
        with open(video_file, 'r+') as inp:
            page_html = inp.read()
            parsed = bs4.BeautifulSoup(page_html, 'html.parser')

            class_watch = parsed.find_all(attrs={"class":re.compile(r"watch")})
            id_watch = parsed.find_all(attrs={"id":re.compile(r"watch")})
            channel = parsed.find_all("a", attrs={"href":re.compile(r"channel")})
            meta = parsed.find_all("meta")


            data = dict()

            for e in class_watch:
                colname = "_".join(e['class'])
                if "clearfix" in colname:
                    continue
                data[colname] = e.text.strip()

            for e in id_watch:
                colname = e['id']
                #if colname in output:
                #    print(colname)
                data[colname] = e.text.strip()

            for e in meta:
                colname = e.get('property')
                if colname is not None:
                    data[colname] = e['content']

            for link_num, e in enumerate(channel):
                data["channel_link_{}".format(link_num)] = e['href']


            output.write("{}\n".format(json.dumps(data)))


HBox(children=(IntProgress(value=0, max=951), HTML(value='')))




In [27]:
df = pd.read_json("parsed_video_info.json", lines = True)
df.shape


(951, 172)

In [28]:
df.head(1)

Unnamed: 0,content-alignment_watch-small,watch-playlist_player-height,watch-queue-header,watch-queue-info,watch-queue-info-icon,watch-queue-title,watch-queue-control-bar_control-bar-button,watch-queue-mole-info,watch-queue-control-bar-icon,watch-queue-icon_yt-sprite,watch-queue-title-container,watch-queue-count,watch-queue-menu_yt-uix-button-menu_yt-uix-button-menu-dark-overflow-action-menu_hid,watch-queue-menu-choice_overflow-menu-choice_yt-uix-button-menu-item,watch-queue-controls,yt-uix-button_yt-uix-button-size-default_yt-uix-button-empty_yt-uix-button-has-icon_control-bar-button_prev-watch-queue-button_yt-uix-button-opacity_yt-uix-tooltip_yt-uix-tooltip,yt-uix-button-icon_yt-uix-button-icon-watch-queue-prev_yt-sprite,yt-uix-button_yt-uix-button-size-default_yt-uix-button-empty_yt-uix-button-has-icon_control-bar-button_play-watch-queue-button_yt-uix-button-opacity_yt-uix-tooltip_yt-uix-tooltip,yt-uix-button-icon_yt-uix-button-icon-watch-queue-play_yt-sprite,yt-uix-button_yt-uix-button-size-default_yt-uix-button-empty_yt-uix-button-has-icon_control-bar-button_pause-watch-queue-button_yt-uix-button-opacity_yt-uix-tooltip_hid_yt-uix-tooltip,yt-uix-button-icon_yt-uix-button-icon-watch-queue-pause_yt-sprite,yt-uix-button_yt-uix-button-size-default_yt-uix-button-empty_yt-uix-button-has-icon_control-bar-button_next-watch-queue-button_yt-uix-button-opacity_yt-uix-tooltip_yt-uix-tooltip,yt-uix-button-icon_yt-uix-button-icon-watch-queue-next_yt-sprite,watch-queue-items-container_yt-scrollbar-dark_yt-scrollbar,watch-queue-items-list,content-alignment_watch-player-playlist,watch-main-col,watch-title-container,watch-title,watch-secondary-actions_yt-uix-button-group,watch-view-count,watch-action-panels_yt-uix-button-panel_hid_yt-card_yt-card-has-padding,watch-time-text,watch-extras-section,watch-meta-item_yt-uix-expander-body,content_watch-info-tag-list,watch-sidebar,watch-playlist_player-height_hid,watch-sidebar-gutter_yt-card_yt-card-has-padding_yt-uix-expander_yt-uix-expander-collapsed,watch-sidebar-section,watch-sidebar-head,watch-sidebar-body,watch-sidebar-separation-line,watch-queue-mole,watch-queue,watch-queue-title-msg,watch-queue-count-msg,watch-queue-loading-template,watch7-container,watch7-main-container,watch7-main,watch7-preview,watch7-content,watch-header,watch7-headline,watch-headline-title,watch7-user-header,watch7-subscription-container,watch8-action-buttons,watch8-secondary-actions,watch8-sentiment-actions,watch7-views-info,watch-action-panels,watch-actions-share-loading,watch-actions-share-panel,watch-actions-rental-required,watch-description,watch-description-content,watch-description-clip,watch-uploader-info,watch-description-text,watch-description-extras,watch-discussion,watch7-sidebar,watch7-sidebar-contents,watch7-sidebar-offer,watch7-sidebar-ads,watch-channel-brand-div,watch-channel-brand-div-text,watch7-sidebar-modules,watch-related,shared-addto-watch-later-login,og:site_name,og:url,og:title,og:image,og:image:width,og:image:height,og:description,al:ios:app_store_id,al:ios:app_name,al:ios:url,al:android:url,al:android:app_name,al:android:package,al:web:url,og:type,og:video:url,og:video:secure_url,og:video:type,og:video:width,og:video:height,og:video:tag,fb:app_id,channel_link_0,channel_link_1,yt-pl-watch-queue-overlay,watch-actions-transcript-loading,watch-actions-transcript,watch-transcript-container,watch-transcript-not-found,channel_link_2,watch-skeleton,watch-page-skeleton,watch-meta-item,watch-sidebar-discussion,channel_link_3,watch_history,watch_later,watch_related_mix,what_to_watch,ytd-watch-card-album-list-renderer,ytd-watch-card-collage-renderer,ytd-watch-card-single-image-renderer,ytd-watch-card-video-list-renderer,ytd-generic-watch-card,watch-card-header,watch-card-title,watch-card-labels,ytd-artist-watch-card-renderer,ytd-show-watch-card-renderer,ytd-watch-card-one-vs-one-event,ytd-watch-card-hero-one-vs-one-event-renderer,ytd-watch-card-hero-video-renderer,watch-card-endpoint,watch-card-subtitle,ytd-watch-card-rich-header-renderer,ytd-watch-card-compact-video-renderer,ytd-watch-card-one-vs-one-event-compact-video-renderer,ytd-vertical-watch-card-list-renderer,ytd-watch-card-section-dropdown-renderer,ytd-watch-card-section-sequence-renderer,ytd-universal-watch-card-renderer,ytd-video-game-watch-card-renderer,ytd-watch-card-renderer,ytd-player-legacy-desktop-watch-ads-renderer,ytd-watch-items,watch-checkout-offers,ytd-watch-next-secondary-results-renderer,ytd-watch-fixie,ytd-watch-flexy,watch-sidebar-live-chat,channel_link_4,watch-meta-item_has-image,content-alignment_watch-small_off-screen-trigger,channel_link_5,channel_link_6,channel_link_7,channel_link_8,channel_link_9,channel_link_10,channel_link_11,channel_link_12,channel_link_13,channel_link_14,channel_link_15,channel_link_16,channel_link_17,channel_link_18,channel_link_19,channel_link_20,channel_link_21
0,This video is unavailable.\n\n \n\n\n\n\n\n...,Watch QueueQueueWatch QueueQueue \nRemove allD...,Watch QueueQueueWatch QueueQueue \nRemove allD...,Watch QueueQueue,,Watch Queue,Watch QueueQueue \nRemove allDisconnect,Watch QueueQueue,,,Watch QueueQueue,,Remove allDisconnect,Disconnect,,,,,,,,,,Loading...,Loading...,,"{\n ""@context"": ""http://schema.org"",\n ""...",Machine Learning Course A To Z || Beginner to ...,Machine Learning Course A To Z || Beginner to ...,Add to\n\nWant to watch this again later?\n\n ...,"173,884 views",Loading...\n \n\n\n\n\n\n\n\n\n\n\nLoading....,"Published on Aug 10, 2018",Category\n \n\nEducation,Category\n \n\nEducation,Education,Advertisement\n \n\n\n\n\n\n\n\n\nAutopla...,,Advertisement\n \n\n\n\n\n\n\n\n\nAutopla...,Machine Learning Fundamentals: The Confusion M...,Up next,Machine Learning Fundamentals: The Confusion M...,,Watch QueueQueueWatch QueueQueue \nRemove allD...,Watch QueueQueueWatch QueueQueue \nRemove allD...,Watch Queue,__count__/__total__,,YouTube Premium\n \n\n\n\n\n\n\nLoading...\n ...,"{\n ""@context"": ""http://schema.org"",\n ""...","{\n ""@context"": ""http://schema.org"",\n ""...",,"{\n ""@context"": ""http://schema.org"",\n ""...",Machine Learning Course A To Z || Beginner to ...,Machine Learning Course A To Z || Beginner to ...,Machine Learning Course A To Z || Beginner to ...,Geek's Lesson\n\n\n\n\n\n\n\n\n\n\n\n\n\nLoadi...,Loading...\n \n\n\n\n\n\n\n\n Unsubs...,Add to\n\nWant to watch this again later?\n\n ...,Add to\n\nWant to watch this again later?\n\n ...,"173,884 views\n\n\n\n\n\n\n\n5,109\n\nLike thi...","173,884 views",Loading...\n \n\n\n\n\n\n\n\n\n\n\nLoading....,Loading...,,Rating is available when the video has been re...,"Published on Aug 10, 2018Welcome to this free ...","Published on Aug 10, 2018Welcome to this free ...","Published on Aug 10, 2018Welcome to this free ...","Published on Aug 10, 2018",Welcome to this free online class on machine l...,Category\n \n\nEducation,Loading...,Advertisement\n \n\n\n\n\n\n\n\n\nAutopla...,Advertisement\n \n\n\n\n\n\n\n\n\nAutopla...,,Advertisement,Advertisement,Advertisement,"Autoplay\n\n\nWhen autoplay is enabled, a sugg...",Machine Learning Fundamentals: The Confusion M...,Sign in to add this to Watch Later,YouTube,https://www.youtube.com/watch?v=-58kO_zYUGE,Machine Learning Course A To Z || Beginner to ...,https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau...,1280.0,720.0,Welcome to this free online class on machine l...,544007664.0,YouTube,vnd.youtube://www.youtube.com/watch?v=-58kO_zY...,vnd.youtube://www.youtube.com/watch?v=-58kO_zY...,YouTube,com.google.android.youtube,https://www.youtube.com/watch?v=-58kO_zYUGE&fe...,video.other,https://www.youtube.com/embed/-58kO_zYUGE,https://www.youtube.com/embed/-58kO_zYUGE,text/html,640.0,360.0,Ai and machine learning course,87741120000.0,/channel/UCKXx22vOENUyHrVAADq7Z_g,/channel/UCKXx22vOENUyHrVAADq7Z_g,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
# Columns selected for my baseline model
selected_columns = ['watch-title', 'watch-view-count', 'watch-time-text', 'content_watch-info-tag-list', 'watch7-headline',
                    'watch7-user-header', 'watch8-sentiment-actions', "og:image", 'og:image:width', 'og:image:height',
                    "og:description", "og:video:width", 'og:video:height', "og:video:tag", 'channel_link_0']

In [30]:
df[selected_columns].head()

Unnamed: 0,watch-title,watch-view-count,watch-time-text,content_watch-info-tag-list,watch7-headline,watch7-user-header,watch8-sentiment-actions,og:image,og:image:width,og:image:height,og:description,og:video:width,og:video:height,og:video:tag,channel_link_0
0,Machine Learning Course A To Z || Beginner to ...,"173,884 views","Published on Aug 10, 2018",Education,Machine Learning Course A To Z || Beginner to ...,Geek's Lesson\n\n\n\n\n\n\n\n\n\n\n\n\n\nLoadi...,"173,884 views\n\n\n\n\n\n\n\n5,109\n\nLike thi...",https://i.ytimg.com/vi/-58kO_zYUGE/maxresdefau...,1280.0,720.0,Welcome to this free online class on machine l...,640.0,360.0,Ai and machine learning course,/channel/UCKXx22vOENUyHrVAADq7Z_g
1,Python For Data Science Full Course - 9 Hours ...,"11,634 views","Published on Mar 15, 2020",Education,#edureka #PythonEdureka #pythonfordatasciencef...,edureka!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLoading....,"11,634 views\n\n\n\n\n\n\n\n579\n\nLike this v...",https://i.ytimg.com/vi/-6RqxhNO2yY/maxresdefau...,1280.0,720.0,🔥Edureka Python Certification Training: https:...,1280.0,720.0,edureka,/channel/UCkw4JCwteGrDHIsyIIKo4tQ
2,Machine Learning In 5 Minutes | Machine Learni...,"111,754 views","Published on Feb 19, 2019",Education,#MachineLearning #MachineLearningAlgorithms #W...,Simplilearn\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLoadi...,"111,754 views\n\n\n\n\n\n\n\n354\n\nLike this ...",https://i.ytimg.com/vi/-DEL6SVRPw0/maxresdefau...,1280.0,720.0,This Machine Learning basics video will help y...,1280.0,720.0,simplilearn,/channel/UCsvqVGtbbyHaMoevxPAq9Fg
3,Data Science Full Course - Learn Data Science ...,"520,299 views","Published on Aug 18, 2019",Education,Data Science Full Course - Learn Data Science ...,edureka!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nLoading....,"520,299 views\n\n\n\n\n\n\n\n13,494\n\nLike th...",https://i.ytimg.com/vi/-ETQ97mXXF0/maxresdefau...,1280.0,720.0,🔥 Data Science Master Program: https://www.edu...,1280.0,720.0,edureka data science,/channel/UCkw4JCwteGrDHIsyIIKo4tQ
4,Machine Learning - Features Engineering (Part 1),161 views,"Published on Mar 28, 2020",Education,Machine Learning - Features Engineering (Part 1),habib benlahmar\n\n\n\n\n\n\n\n\n\n\n\n\n\nLoa...,161 views\n\n\n\n\n\n\n\n13\n\nLike this video...,https://i.ytimg.com/vi/-HsCoeAl40Q/hqdefault.jpg,480.0,360.0,cours et ateliers Features Engineering,640.0,360.0,,/channel/UC1BkKcPvBMX4lqSG_pv21ng


In [31]:
# save df in feather format
df[selected_columns].to_feather("raw_data.feather")

In [32]:
#Save df in csv file for labeliing 
df[selected_columns].to_csv('raw_data_no_labels.csv')