In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

# Web Scraper

In [41]:
def yt_video_summary_to_df(url:str):
    """Scrape a YouTube channel's video grid into a DataFrame.

    Opens ``{url}/videos?view=0&sort=p&flow=grid`` in a Chrome WebDriver
    session, parses the rendered page source with BeautifulSoup and builds
    one row per ``video-title`` element found on the page.

    Input:
    url: (str) base URL of the channel page; a trailing slash is tolerated.

    Returns:
    pandas.DataFrame with the columns ``title``, ``views``, ``duration`` and
    ``date``, each holding the raw text of the matched HTML elements.

    Raises:
    IndexError if the page contains fewer metadata or duration elements than
    the number of titles requires.
    """

    driver = webdriver.Chrome()
    try:
        # rstrip avoids a double slash when the caller passes '.../channel/ID/'.
        driver.get(f"{url.rstrip('/')}/videos?view=0&sort=p&flow=grid")
        content = driver.page_source.encode('utf-8').strip()
    finally:
        # Always shut the browser down, even if navigation fails; otherwise
        # every call leaves a Chrome process behind.
        driver.quit()

    soup = BeautifulSoup(content, 'lxml')
    titles_html = soup.find_all('yt-formatted-string', id='video-title')
    views_html = soup.find_all('span', class_='inline-metadata-item style-scope ytd-video-meta-block')
    duration_html = soup.find_all('span', id='text', class_='style-scope ytd-thumbnail-overlay-time-status-renderer')
    title, views, duration, date = [], [], [], []

    for i in range(len(titles_html)):
        title.append(titles_html[i].text)
        # Metadata spans are read in pairs: even index -> views, odd -> date.
        views.append(views_html[i*2].text)
        date.append(views_html[i*2+1].text)
        # Only every second duration span is used.
        # NOTE(review): presumably each thumbnail renders the span twice - confirm.
        duration.append(duration_html[i*2].text)

    return pd.DataFrame({
        'title': title,
        'views':views,
        'duration':duration,
        'date':date})

In [42]:
# Smoke-test the scraper on a single channel and preview the first rows.
# Known limitation: only 30 videos are being pulled.
# NOTE(review): presumably because the page source is read right after load,
# without scrolling to trigger further results - confirm.
url = 'https://www.youtube.com/channel/UC0RhatS1pyxInC00YKjjBqQ'
geek_df = yt_video_summary_to_df(url)
geek_df.head()

Unnamed: 0,title,views,duration,date
0,Ace your DevOps interview | GeeksforGeeks,358 views,\n 0:44\n,2 days ago
1,How to Get Free Google Cloud Qwiklabs Credits ...,3.2K views,\n 1:08\n,2 weeks ago
2,"GeeksforGeeks Community – Connect, Ask, Learn ...",724 views,\n 0:52\n,2 weeks ago
3,Become a SDET Expert with GeeksforGeeks,8.3K views,\n 0:33\n,2 weeks ago
4,Geeks Premier League 2023 | India's Biggest Co...,1.6K views,\n 0:53\n,3 weeks ago


# ETL Pipeline

In [5]:
# Convert views to numeric form
def view_transform(views_word:str):
    """Convert a scraped view-count label such as '3.2K views' to an int.

    Only the first whitespace-separated token is interpreted; the rest
    (e.g. 'views') is ignored.

    Input:
    views_word: (str) label text, e.g. '358 views', '3.2K views',
        '1,234 views' or 'No views'.

    Returns:
    int number of views. 'No ...' labels map to 0.

    Raises:
    ValueError if the label is empty or its first token is not a number.
    """
    conv = {'K': 1000, 'M': 1000000, 'B': 1000000000}
    tokens = views_word.split()
    if not tokens:
        raise ValueError(f'cannot parse view count from {views_word!r}')

    # Drop thousands separators so '1,234' parses like '1234'.
    count = tokens[0].replace(',', '')
    if count.lower() == 'no':
        # A zero-view label reads 'No views' rather than '0 views'.
        return 0

    suff = count[-1].upper()
    if suff in conv:
        # round() instead of int() so float error in the multiplication
        # cannot truncate the result one below the intended value.
        return int(round(float(count[:-1]) * conv[suff]))
    return int(count)


In [35]:
# convert duration to minutes
# raw extraction is: \n 0:44\n
# can come in multiple forms - H:MM:SS, MM:SS, 0:SS
# strategy: use the number of colon-separated fields to tell them apart
def duration_to_minutes(dur:str):
    """Convert a scraped duration label to minutes, rounded to 2 decimals.

    Input:
    dur: (str) duration text, optionally padded with whitespace/newlines,
        in the form 'H:MM:SS', 'MM:SS' or 'SS'.

    Returns:
    float duration in minutes, rounded to 2 decimal places for every form.

    Raises:
    ValueError if a field is not numeric or there are more than 3 fields.
    """
    units = [float(part) for part in dur.strip().split(':')]
    if len(units) > 3:
        raise ValueError(f'unrecognised duration format: {dur!r}')

    # Left-pad with zeros so the fields always unpack as hours, minutes, seconds.
    hours, minutes, seconds = [0.0] * (3 - len(units)) + units
    return round(hours * 60 + minutes + seconds / 60, 2)

# Sanity check on a raw scraped value: 44 seconds should come out as 0.73 minutes.
duration_to_minutes('\n    0:44\n')

0.73

In [43]:
# Replace the scraped view labels with integer counts.
# Overwrites the column in place, so this cell is not re-runnable:
# view_transform calls .split() on each value and fails once they are ints.
geek_df['views'] = geek_df['views'].transform(view_transform)

In [44]:
# Replace the scraped duration labels with minutes as floats.
# Overwrites the column in place, so this cell is not re-runnable:
# duration_to_minutes calls .strip() on each value and fails once they are floats.
geek_df['duration'] = geek_df['duration'].transform(duration_to_minutes)

In [45]:
# Preview the frame after the views and duration columns were converted.
geek_df.head()

Unnamed: 0,title,views,duration,date
0,Ace your DevOps interview | GeeksforGeeks,358,0.73,2 days ago
1,How to Get Free Google Cloud Qwiklabs Credits ...,3200,1.13,2 weeks ago
2,"GeeksforGeeks Community – Connect, Ask, Learn ...",724,0.87,2 weeks ago
3,Become a SDET Expert with GeeksforGeeks,8300,0.55,2 weeks ago
4,Geeks Premier League 2023 | India's Biggest Co...,1600,0.88,3 weeks ago
