In [1]:
'''
IMPORT ALL DEPENDENCIES
--------------------------
Beautiful Soup: Parse HTML objects from web pages
Pymongo: Read and write to MongoDB
Splinter: Automating browser actions to interact with HTML elements
DateTime: Convert dates
re: Hack to remove 'rd', 'th', 'st' from date strings
'''

from bs4 import BeautifulSoup as bs
import pandas as pd
import pymongo
from pprint import pprint
import requests
from splinter import Browser
from datetime import datetime
from dateutil.parser import parse
import time
import re
import lxml

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Define database and collection
db = client.youtube_stats
summary_stats = db.summary_stats
views_stats=db.views_stats
ranking_stats=db.ranking_stats
earning_stats=db.earning_stats
timeline_stats=db.timeline_stats

In [4]:
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
url = 'https://socialblade.com/youtube/top/5000'
browser.visit(url)

In [5]:
html = browser.html
soup = bs(html, 'lxml')

summary = []
ranking = []
viewcount = []
earnings = []
timeline = []

def solve(s):                                             
    return re.sub(r'(\d)(st|nd|rd|th)', r'\1', s)

youtuber = soup.find_all('div', style='float: right; width: 900px;')
for i in youtuber:
    try:
        link = i.find_all('a')[1:2000]

        for i in link:
            try:
                youtuber_link = i['href']
                browser.click_link_by_href(youtuber_link)
                html = browser.html
                soup = bs(html, 'lxml')

                if 'YouTube search results found' in soup.text or 'YouTube search result found' in soup.text:
                    browser.back()
                else:
                    name = soup.find_all('h1', style='float: left; font-size: 1.4em; font-weight: bold; color:#333; margin: 0px; padding: 0px; margin-right: 5px;')[0].text
                    primary_stats = soup.find_all('div', id='YouTubeUserTopInfoBlock')

                    for i in primary_stats:
                        uploads = i.find_all('span', style='font-weight: bold;')[0].text.replace(',', '')
                        subscribers = i.find_all('span', style='font-weight: bold;')[1].text.replace(',', '')
                        views = i.find_all('span', style='font-weight: bold;')[2].text.replace(',', '')
                        category = i.find_all('span', style='font-weight: bold;')[4].text.replace(',', '')
                        created = i.find_all('span', style='font-weight: bold;')[5].text.replace(',', '')
                        if created == '--':
                            created = '1900-01-01'
                        else:
                            created = datetime.strptime(solve(created), "%b %d %Y").strftime('%Y-%m-%d')

                        summary.append({'name': name,
                                        'uploads': uploads,
                                        'subscribers': subscribers,
                                        'views': views,
                                        'category': category,
                                        'created': created})

                    ranking_stats = soup.find_all('div', style='height: 100px; width: 860px; border-bottom: 1px solid #eee;')
                    for i in ranking_stats:
                        grade = i.find_all('p')[0].text
                        subscriber_rank = i.find_all('p')[2].text[:-2].replace(',', '')
                        view_rank = i.find_all('p')[4].text[:-2].replace(',', '')
                        socialblade_rank = i.find_all('p')[6].text[:-2].replace(',', '')

                        ranking.append({'name': name,
                                        'category': category,
                                        'grade': grade,
                                        'subscriber_rank': subscriber_rank,
                                        'view_rank': view_rank,
                                        'socialblade_rank': socialblade_rank})

                    views_stats = soup.find_all('div', style='height: 70px; width: 860px; border-bottom: 1px solid #eee;')
                    for i in views_stats:
                        views_last30d_count = i.find_all('span', id='afd-header-views-30d')[0].text.replace(',', '').replace('\n', '')
                        subs_last30d_count = i.find_all('span', id='afd-header-subs-30d')[0].text.replace(',', '').replace('\n', '')

                        viewcount.append({'name': name,
                                          'category': category,
                                          'views_last30d_count': views_last30d_count,
                                          'subs_last30d_count': subs_last30d_count})

                    earnings_stats = soup.find_all('div', style='height: 80px; width: 860px;')
                    for i in earnings_stats:
                        min_monthly_earnings = i.find_all('p', style='font-size: 1.4em; color:#41a200; font-weight: 600; padding-top: 20px;')[0].text.split('-')[0].replace('$', '').replace('.', '').replace('K', '00').replace('M', '00000').strip()
                        max_monthly_earnings = i.find_all('p', style='font-size: 1.4em; color:#41a200; font-weight: 600; padding-top: 20px;')[0].text.split('-')[1].replace('$', '').replace('.', '').replace('K', '00').replace('M', '00000').strip()
                        min_annual_earnings = i.find_all('p', style='font-size: 1.4em; color:#41a200; font-weight: 600; padding-top: 20px;')[1].text.split('-')[0].replace('$', '').replace('.', '').replace('K', '00').replace('M', '00000').strip()
                        max_annual_earnings = i.find_all('p', style='font-size: 1.4em; color:#41a200; font-weight: 600; padding-top: 20px;')[1].text.split('-')[1].replace('$', '').replace('.', '').replace('K', '00').replace('M', '00000').strip()

                        earnings.append({'name': name,
                                         'category': category,
                                         'min_monthly_earnings': min_monthly_earnings,
                                         'max_monthly_earnings': max_monthly_earnings,
                                         'min_annual_earnings': min_annual_earnings,
                                         'max_annual_earnings': max_annual_earnings})

                    timeline_statsA = soup.find_all('div', style='width: 860px; height: 32px; line-height: 32px; background: #f8f8f8;; padding: 0px 20px; color:#444; font-size: 9pt; border-bottom: 1px solid #eee;')
                    for i in timeline_statsA:
                        timeline_dateA = i.find_all('div', style='float: left; width: 95px;')[0].text.replace('\n', '')
                        timeline_subsA = i.find_all('div', style='width: 140px; float: left;')[0].text.replace('\n', '').replace(',', '')
                        timeline_viewsA = i.find_all('div', style='width: 140px; float: left;')[1].text.replace('\n', '').replace(',', '')

                        timeline_min_earningsA = i.find_all('div', style='float: left; width: 165px; height: 30px;')[0].text.split('-')[0].replace('$', '').strip()
                        if '.' in timeline_min_earningsA:
                            timeline_min_earningsA = timeline_min_earningsA.replace('.', '').replace('K', '00').replace('M', '00000')
                        else:
                            timeline_min_earningsA = timeline_min_earningsA.replace('K', '000').replace('M', '000000')

                        timeline_max_earningsA = i.find_all('div', style='float: left; width: 165px; height: 30px;')[0].text.split('-')[1].replace('$', '').strip()
                        if '.' in timeline_max_earningsA:
                            timeline_max_earningsA = timeline_max_earningsA.replace('.', '').replace('K', '00').replace('M', '00000')
                        else:
                            timeline_max_earningsA = timeline_max_earningsA.replace('K', '000').replace('M', '000000')

                        timeline.append({'name': name,
                                         'category': category,
                                         'date': timeline_dateA,
                                         'subscribers': timeline_subsA,
                                         'views': timeline_viewsA,
                                         'min_earnings': timeline_min_earningsA,
                                         'max_earnings': timeline_max_earningsA})

                    timeline_statsB = soup.find_all('div', style='width: 860px; height: 32px; line-height: 32px; background: #fcfcfc; padding: 0px 20px; color:#444; font-size: 9pt; border-bottom: 1px solid #eee;')
                    for i in timeline_statsB:
                        timeline_dateB = i.find_all('div', style='float: left; width: 95px;')[0].text.replace('\n', '').strip()
                        timeline_subsB = i.find_all('div', style='width: 140px; float: left;')[0].text.replace(',', '').replace('\n', '').replace(' LIVE', '').strip()
                        timeline_viewsB = i.find_all('div', style='width: 140px; float: left;')[1].text.replace('\n', '').replace(',', '').strip()

                        timeline_min_earningsB = i.find_all('div', style='float: left; width: 165px; height: 30px;')[0].text.split('-')[0].replace('$', '').strip()
                        if '.' in timeline_min_earningsB:
                            timeline_min_earningsB = timeline_min_earningsB.replace('.', '').replace('K', '00').replace('M', '00000')
                        else:
                            timeline_min_earningsB = timeline_min_earningsB.replace('K', '000').replace('M', '000000')

                        timeline_max_earningsB = i.find_all('div', style='float: left; width: 165px; height: 30px;')[0].text.split('-')[1].replace('$', '').strip()
                        if '.' in timeline_max_earningsB:
                            timeline_max_earningsB = timeline_max_earningsB.replace('.', '').replace('K', '00').replace('M', '00000')
                        else:
                            timeline_max_earningsB = timeline_max_earningsB.replace('K', '000').replace('M', '000000')

                        timeline.append({'name': name,
                                         'category': category,
                                         'date': timeline_dateB,
                                         'subscribers': timeline_subsB,
                                         'views': timeline_viewsA,
                                         'min_earnings': timeline_min_earningsB,
                                         'max_earnings': timeline_max_earningsB})

            except:
                browser.back()

    except Exception as e:
        browser.back()


In [14]:
summary_df = pd.DataFrame(summary)
views_df = pd.DataFrame(viewcount)
ranking_df = pd.DataFrame(ranking)
earnings_df = pd.DataFrame(earnings)
timeline_df = pd.DataFrame(timeline)

summary_df.to_csv('summary_df.csv')
views_df.to_csv('views_df.csv')
ranking_df.to_csv('ranking_df.csv')
earnings_df.to_csv('earnings_df.csv')
timeline_df.to_csv('timeline_df.csv')

In [15]:
views_df.count()

category               1612
name                   1612
subs_last30d_count     1612
views_last30d_count    1612
dtype: int64

In [16]:
ranking_df.count()

category            1612
grade               1612
name                1612
socialblade_rank    1612
subscriber_rank     1612
view_rank           1612
dtype: int64

In [17]:
earnings_df.count()

category                1612
max_annual_earnings     1612
max_monthly_earnings    1612
min_annual_earnings     1612
min_monthly_earnings    1612
name                    1612
dtype: int64

In [18]:
timeline_df.count()

category        22540
date            22540
max_earnings    22540
min_earnings    22540
name            22540
subscribers     22540
views           22540
dtype: int64

In [19]:
db = client.youtube_stats
summary_stats = db.summary_stats
views_stats = db.views_stats
ranking_stats = db.ranking_stats
earning_stats = db.earning_stats
timeline_stats = db.timeline_stats

db.summary_stats.drop()
db.views_stats.drop()
db.ranking_stats.drop()
db.earning_stats.drop()
db.timeline_stats.drop()

summary_stats.insert_many(summary_df.to_dict('records'))
views_stats.insert_many(views_df.to_dict('records'))
ranking_stats.insert_many(ranking_df.to_dict('records'))
earning_stats.insert_many(earnings_df.to_dict('records'))
timeline_stats.insert_many(timeline_df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x22b806cfe88>