In [1]:
import requests
from bs4 import BeautifulSoup as bs
import json
import re
import splinter
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)




In [3]:
# visit wikipedia page to find top 50 Youtubers
url = "https://en.wikipedia.org/wiki/List_of_most-subscribed_YouTube_channels"
browser.visit(url)

In [4]:
# use Beautiful soup to parse html
html = browser.html
soup = bs(html, 'html.parser')

In [5]:
# we want first table of class "wikitable", so use find function
table = soup.find('table', class_="wikitable")

In [6]:
# list of top 50 Youtube channels
channels = []

# list of all rows in table
rows = table.find('tbody').find_all('tr')

# for each row, add the text in the first cell to channels
for row in rows:
    channel = row.find('td').text #first cell holds the channel name
    channels.append(channel.strip()) #call strip because of \n at the end of each text

In [7]:
# find all the tags that contain a link to Youtube page
tags_with_links = table.find_all('a', class_='external text')

In [8]:
# make list of YouTube links
links = []

for tag in tags_with_links:
    links.append(tag.get('href'))


In [9]:
# the final link is a dud, so we splice that out
links = links[0:len(links)-1]

In [10]:
# check that links variable is working properly
for link in links:
    print(link)

https://www.youtube.com/channel/UCq-Fj5jknLsUf-MWSy4_brA
https://www.youtube.com/channel/UCbCmjCuTUZos6Inko4u57UQ
https://www.youtube.com/channel/UCpEhnqL0y41EpW2TvWAHD7Q
https://youtube.com/user/PewDiePie
https://www.youtube.com/channel/UCX6OQ3DkcsbYNE6H8uQQuVA
https://www.youtube.com/channel/UCk8GzjMOrta8yxDcKfylJYw
https://www.youtube.com/channel/UCJplp5SjeGSdVdwsfb9Q7lQ
https://www.youtube.com/channel/UCJ5v_MCY6GNUBTO8-D3XoAg
https://www.youtube.com/channel/UCFFbwnve3yF62-tVXkTyHqg
https://www.youtube.com/channel/UCvlE5gTbOvjiolFlEm-c_Ow
https://www.youtube.com/channel/UCOmHUn--16B90oW2L6FRR3A
https://www.youtube.com/channel/UC295-Dw_tDNtZXFeAPAW6Aw
https://www.youtube.com/channel/UCyoXW-Dse7fURq30EWl_CUA
https://www.youtube.com/channel/UC6-F5tO8uklgE9Zy8IvbdFw
https://www.youtube.com/channel/UCLkAepWjdylmXSltofFvsYQ
https://www.youtube.com/channel/UCIwFjwMjI0y7PDBVEO9-bkQ
https://www.youtube.com/channel/UC3IZKseVpdzPSBaWxBxundA
https://www.youtube.com/channel/UCffDXn7ycAzwL2LDlbyW

In [11]:
# make dictionary that pairs each channel with its web link
channel_and_link = dict(zip(channels,links))

In [12]:
# list holding all of the video information, where each element is a dictionary holding info for one video
video_info_list = []

# loop through each channel and go to webapge to scrape data
for channel, link in channel_and_link.items():
    
    
    # visit each page
    browser.visit(link+"/videos")
    
    # get html
    html = browser.html
    
    # use Beautiful Soup to parse html
    soup = bs(html, 'html.parser')
    
    # search html for "a" tag with video-title as the id - this provides the tag holding the title
    titles = soup.find_all("a", id='video-title')
           
    # search html for "span" tag of specified class - this provides the tags holding 
    # the number of views and the time the video was posted
    views_and_time = soup.find_all("span", class_="style-scope ytd-grid-video-renderer")
    
    # hold all video titles
    video_titles = []
    
    for title in titles:
        
        title_as_text = title.text
        
        video_titles.append(title_as_text)
    
    
    
    start_index = 0
    
    finish_index = len(views_and_time)
    
    # holds all of the views
    views = []
    
    # holds all of the post times
    post_time = []
    
    for index in range(start_index, finish_index):
        
        # this is the element we will add to a list
        target_element = views_and_time[index].text
        
        # if the index is even, then it must be views information
        if (index % 2 == 0):
            views.append(target_element)
    
        # otherwise, index is odd, meaning it is post-time information
        else:
            post_time.append(target_element)
    
    print("Titles Length:",len(video_titles))
    print("Views Length",len(views))
    print("Post-Time Length:",len(post_time))
    
    
    # with all the information needed, loop through index of the lists, where each index represents
    # a video
    for i in range(0, len(video_titles)):
       
        # indexError is likely because of the variability in the html pages 
        try: 
            
            # sometimes no information is scraped, so make sure that there is at least one video to add
            if ( ( len(video_titles[i]) > 0 ) & ( len(views[i]) > 0 ) & ( len(post_time[i]) > 0 ) ):
                
                video_info = {
                    "Channel": channel,
                    "Title": video_titles[i],
                    "Views": views[i],
                    "Post-Time": post_time[i]
                }

                # add video to list of videos
                video_info_list.append(video_info)
        
        # catch IndexErrors and continue 
        except IndexError:
            print("Expected Error Occured. Moving on...")
        




Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 0
Views Length 0
Post-Time Length: 0
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length: 30
Views Length 30
Post-Time Length: 30
Titles Length

In [13]:
video_info_list

[{'Channel': 'T-Series',
  'Title': '#TujheBhulaDiya 😭#RanbirKapoor 👨\u200d🎤 #SadSongStatus 💔 #PriyankaChopra 👩\u200d🎤 Sad Status #shorts',
  'Views': '11K views',
  'Post-Time': '4 hours ago'},
 {'Channel': 'T-Series',
  'Title': 'MASTI TAKITA DHUM (Full Vudio) Shabaash Mithu | Taapsee P, Lagnajita, Anusha Mani, Amit T | M Kumar',
  'Views': '22K views',
  'Post-Time': '6 hours ago'},
 {'Channel': 'T-Series',
  'Title': 'Mere Dil Gaaye Ja (Zooby Zooby) Dhokha | R. Madhavan Khushalii K|Tanishk Kumaar Zahrah Yash Bhushan',
  'Views': '798K views',
  'Post-Time': '9 hours ago'},
 {'Channel': 'T-Series',
  'Title': 'Ganpati Bappa Morya (Video Jukebox) Filmy Ganpati Utsav | Ganpati Festival 2022 | Ganpati Songs',
  'Views': '94K views',
  'Post-Time': '17 hours ago'},
 {'Channel': 'T-Series',
  'Title': 'Sabri Brothers: Teri Jawani Badi Mast Mast Hai (Full Song) | Pyar Kiya To Darna Kya | Dance Song',
  'Views': '78K views',
  'Post-Time': '1 day ago'},
 {'Channel': 'T-Series',
  'Title': 

In [15]:
# use pandas to convert list of dictionaries to dataframe
video_df = pd.DataFrame.from_dict(video_info_list)
video_df.head()

Unnamed: 0,Channel,Title,Views,Post-Time
0,T-Series,#TujheBhulaDiya 😭#RanbirKapoor 👨‍🎤 #SadSongSta...,11K views,4 hours ago
1,T-Series,MASTI TAKITA DHUM (Full Vudio) Shabaash Mithu ...,22K views,6 hours ago
2,T-Series,Mere Dil Gaaye Ja (Zooby Zooby) Dhokha | R. Ma...,798K views,9 hours ago
3,T-Series,Ganpati Bappa Morya (Video Jukebox) Filmy Ganp...,94K views,17 hours ago
4,T-Series,Sabri Brothers: Teri Jawani Badi Mast Mast Hai...,78K views,1 day ago
...,...,...,...,...
1298,Katy Perry,"Katy Perry, Darius Rucker - Only Love (Live Fr...",1.2M views,1 year ago
1299,Katy Perry,Katy Perry - Never Really Over/Not The End Of ...,2.3M views,1 year ago
1300,Katy Perry,"Katy Perry, Tiësto, Aitana - Resilient (Tiësto...",9.6M views,1 year ago
1301,Katy Perry,"Katy Perry, Tiësto - Resilient (ft. Aitana) (T...",2.8M views,1 year ago


In [14]:
browser.quit()