# YouTube Social Blade Scraper

This notebook scrapes data from the monthly statistics pages on Social Blade for YouTubers who posted apology videos prior to September 2019.

The data includes daily changes in subscriber counts and channel views, as well as total subscriber counts and channel views.

In [639]:
# Libraries for scraping Social Blade pages
import requests as req
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
from datetime import datetime

# Libraries for filtering data by date
from datetime import date
from dateutil.relativedelta import relativedelta

## The following functions have been adapted from Anjali Shrivastava's [Medium post](https://medium.com/swlh/how-to-scrape-socialblade-for-youtube-subscription-data-ec7c4bde6933) and her [Jupyter Notebooks](https://github.com/vastava/data-science-projects/tree/master/content%20cop).

In [603]:
# Function that scrapes data from Social Blade
def sub_scraper(url, dataType, timeout=60):
    """Scrape one chart's raw data series from a Social Blade page.

    The page is searched for the first <script type="text/javascript"> tag
    whose text contains "CSV"; the tag holding the requested series is taken
    at a fixed offset from that tag.

    Args:
        url: Address of the page to fetch.
        dataType: Which series to return:
            'count'     - daily change in subscribers
            'total'     - total amount of subscribers
            'views'     - daily change in channel views
            'views_tot' - total amount of channel views
        timeout: Seconds passed to requests as the request timeout, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The selected script tag's text split on '+', with each piece stripped
        of surrounding whitespace and with backslash-n-quote sequences and
        double quotes removed. An empty list is returned when the page has no
        "CSV" script tag or the expected tag is missing.

    Raises:
        ValueError: If dataType is not one of the four supported values.
    """
    # Position of each series' <script> tag relative to the first "CSV" tag
    offsets = {'count': 0, 'total': 1, 'views': 2, 'views_tot': 3}
    if dataType not in offsets:
        # Fail before touching the network instead of returning junk data
        raise ValueError(
            "dataType must be one of %s, got %r" % (sorted(offsets), dataType)
        )

    r = req.get(url, timeout=timeout)
    soup = bs(r.text, 'lxml')
    script_divs = soup.find_all('script', {'type': 'text/javascript'})

    # Index of the first script tag that embeds CSV chart data, if any
    first_csv = next(
        (i for i, tag in enumerate(script_divs) if "CSV" in str(tag)),
        None,
    )
    if first_csv is None:
        return []

    target = first_csv + offsets[dataType]
    if target >= len(script_divs):
        # Fewer script tags than expected follow the first CSV tag
        return []

    # Splits data gathered from url into a list
    return [
        piece.strip().replace('\\n"', '').replace('"', '')
        for piece in str(script_divs[target]).split('+')
    ]

In [604]:
# Function that transforms Social Blade data into a dataframe
def to_df(url, name, dataType):
    """Scrape one Social Blade series and return it as a DataFrame.

    Args:
        url: Page address, passed through to sub_scraper.
        name: Value stored in every row of the 'Name' column.
        dataType: Series selector, passed through to sub_scraper.

    Returns:
        A DataFrame with the columns 'Date', 'Subs' and 'Name'. 'Date' and
        'Subs' hold the first and second comma-separated fields of each
        scraped entry, kept as strings. The column is called 'Subs' for every
        dataType, including the view series. The frame is empty when the
        scraper returns fewer than three pieces.
    """
    # Gathers social blade data into a list
    lst = sub_scraper(url, dataType)

    # The first and last pieces are discarded; only the pieces between them
    # are used as rows. Each entry is split once and reused for both columns.
    rows = [entry.split(',') for entry in lst[1:-1]]

    # Creates empty dataframe and populates columns with the list data
    df = pd.DataFrame()
    df['Date'] = [fields[0] for fields in rows]
    df['Subs'] = [fields[1] for fields in rows]
    df['Name'] = name
    return df

In [638]:
# Function that checks whether a date falls close to a target date
def checkmonth(check, year, month, day):
    """Tell whether `check` lies within one month of the target date.

    Args:
        check: Date string in ISO year-month-day format (e.g. '2016-05-19').
        year, month, day: Components of the target date.

    Returns:
        True when `check` is between one month before and one month after
        the target date, both ends included; False otherwise.
    """
    anchor = date(year, month, day)
    candidate = date.fromisoformat(check)
    earliest = anchor + relativedelta(months=-1)
    latest = anchor + relativedelta(months=+1)
    return earliest <= candidate <= latest
    
# Function that filters the dataframe by date.
def filterdate(apology_date, df):
    """Keep only the rows of `df` dated close to `apology_date`.

    Each value in the 'Date' column is tested with checkmonth, so the result
    spans one month before through one month after `apology_date`.

    Args:
        apology_date: Object exposing .year, .month and .day attributes.
        df: DataFrame with a 'Date' column of ISO date strings.

    Returns:
        The rows of `df` for which checkmonth returns True.
    """
    target_parts = (apology_date.year, apology_date.month, apology_date.day)
    in_window = df['Date'].apply(checkmonth, args=target_parts)
    return df[in_window]

# Importing apology metadata

In [636]:
# Imports the CSV file with all the apology metadata
apology_info_df = pd.read_csv('80_YT_wayback.csv')

# Converts the 'Date Posted' values into datetime.date objects
apology_info_df['Date Posted'] = pd.to_datetime(apology_info_df['Date Posted']).dt.date

# Keeps only apologies whose Wayback Machine URL contains "web.archive".
# regex=False matches the dot literally; na=False treats a missing URL as
# "no match", so rows without a URL are dropped by the same mask.
has_wayback_url = apology_info_df['Wayback Machine URL'].str.contains(
    'web.archive', regex=False, na=False
)

# Resets the index of the remaining rows starting from 0
web_archive_df = apology_info_df[has_wayback_url].reset_index(drop=True)

In [606]:
# Displays the apology metadata rows that have a Wayback Machine URL
web_archive_df

Unnamed: 0,ID #,ID Name,ID,YouTuber,Channel,Title,Date Posted,Duration,Video source,Video likes percentage,...,1 day,1 wk,1 mos,6 mos,current,1 day.1,1 wk.1,1 mos.1,6 mos.1,current.1
0,2,SamNia,002_SamNia,"Samuel, NiaChel Rader",Sam and Nia,FORGIVEN,2015-08-21,0:02:38,https://www.youtube.com/watch?v=ilFuzKURvhk,21.08%,...,360609.0,364489.0,371862.0,411785.0,2540000,,,,,
1,3,Keem,003_Keem,Daniel Keem (Keemstar),DramaAlert,We got it wrong. - I'm sorry & will do better.,2016-01-09,0:05:57,https://www.youtube.com/watch?v=2JMZWMJVNe8,23.72%,...,,,,,5530000,,,,,
2,4,Fine,004_Fine,Benny and Rafi Fine,The Fine Bros,Update.,2016-01-31,0:03:06,https://www.youtube.com/watch?v=0t-vuI9vKfg,14.86%,...,13810000.0,13590000.0,13590000.0,14190000.0,20000000,,,,,
3,5,Pepper,005_Pepper,Sam Pepper,Sam Pepper,i'm sorry,2016-02-24,0:20:03,https://youtu.be/DPxb892BaV8,49.37%,...,2311330.0,,2329942.0,,2130000,,,,,
4,6,Leafy,006_Leafy,Calvin Vail,LeafyIsHere,Apology.,2016-03-21,0:03:05,https://www.youtube.com/watch?v=ShbBDjQ7zpA,76.37%,...,1630000.0,1680000.0,2270000.0,4820000.0,4910000,,,,,
5,7,Toby,007_Toby,Toby Turner (Tobuscus),TobyTurner,The Truth.,2016-04-11,0:01:01,https://www.youtube.com/watch?v=MiCJG8Q2Uds,78.71%,...,6390000.0,6390000.0,6390000.0,6370000.0,6200000,,,,,
6,8,Tmar,008_Tmar,Trevor Martin,TmarTn,I'm Sorry,2016-07-06,0:02:28,https://www.youtube.com/watch?v=gjyGQV2i9eU,36.42%,...,,,,,3280000,,,,,
7,9,Rice,009_Rice,Bryan Le,RiceLive,#ApologizeRiceGum I Am Sorry,2016-07-16,0:11:46,https://www.youtube.com/watch?v=UWvlmU7iKnU,90.90%,...,3000000.0,3170000.0,3710000.0,5150000.0,10000000,,,,,
8,10,Louis,010_Louis,Louis Cole,FunForLouis,MY RESPONSE...,2016-08-17,0:04:42,https://www.youtube.com/watch?v=8U1ZGMDlASA,85.74%,...,1860000.0,1870000.0,1870000.0,1890000.0,1980000,,,,,
9,11,JonTron,011_JonTron,Jon Jafari,JonTronShow,My Statement:,2017-03-19,0:04:27,https://www.youtube.com/watch?v=aIFf7qwlnSc,78.67%,...,3120000.0,3140000.0,3160000.0,3360000.0,6660000,,,,,


# Scraping data from Social Blade

In [656]:
apology_dates = web_archive_df['Date Posted']


def _scrape_month_window(data_type):
    """Scrape one Social Blade series for every channel in web_archive_df.

    Returns a list with one dataframe per row of web_archive_df, each
    filtered to a month before and a month after that row's apology date.
    """
    return [
        filterdate(posted, to_df(archive_url, channel, data_type))
        for posted, archive_url, channel in zip(
            apology_dates,
            web_archive_df['Wayback Machine URL'],
            web_archive_df['Channel'],
        )
    ]


# Retrieves subscriber changes by day data from Social Blade
# Filtered by a month before and a month after the apology post date
sub_month_change_dfs = _scrape_month_window('count')

# Retrieves total subscriber counts by day data from Social Blade
# Filtered by a month before and a month after the apology post date
sub_month_total = _scrape_month_window('total')

[           Date  Subs         Name
 495  2015-07-21   270  Sam and Nia
 496  2015-07-22   237  Sam and Nia
 497  2015-07-23    74  Sam and Nia
 498  2015-07-24   793  Sam and Nia
 499  2015-07-25   386  Sam and Nia
 ..          ...   ...          ...
 553  2015-09-17   279  Sam and Nia
 554  2015-09-18   193  Sam and Nia
 555  2015-09-19  1328  Sam and Nia
 556  2015-09-20  1135  Sam and Nia
 557  2015-09-21   216  Sam and Nia
 
 [63 rows x 3 columns],
            Date  Subs        Name
 38   2015-12-09  2922  DramaAlert
 39   2015-12-10  2126  DramaAlert
 40   2015-12-11  4009  DramaAlert
 41   2015-12-12  4280  DramaAlert
 42   2015-12-13  3825  DramaAlert
 ..          ...   ...         ...
 96   2016-02-05  3867  DramaAlert
 97   2016-02-06  4611  DramaAlert
 98   2016-02-07  4642  DramaAlert
 99   2016-02-08  8342  DramaAlert
 100  2016-02-09  8091  DramaAlert
 
 [63 rows x 3 columns],
             Date   Subs           Name
 1559  2015-12-31   9283  The Fine Bros
 1560  2016-01-0

In [647]:
# Writes each list of per-channel subscriber frames to its own CSV file
for frames, csv_name in (
    (sub_month_change_dfs, 'sub_month_change.csv'),
    (sub_month_total, 'sub_month_total.csv'),
):
    pd.concat(frames).to_csv(csv_name, index=False)

In [679]:
# Merges total subscriber data with subscriber change count data
# NOTE(review): the last 6 channels are left unmerged; the reason for the
# hard-coded 6 is not recorded in this notebook - confirm before reuse.
for i in range(len(web_archive_df) - 6):
    # .assign returns a new frame (values aligned on the row index) instead
    # of writing a column into the filtered slice produced by filterdate,
    # which can raise pandas' SettingWithCopyWarning.
    sub_month_change_dfs[i] = sub_month_change_dfs[i].assign(
        Total=sub_month_total[i]['Subs']
    )

In [680]:
# Stacks every channel's subscriber frame and writes the result to one CSV file
sub_data_combined = pd.concat(sub_month_change_dfs)
sub_data_combined.to_csv('sub_data_combined.csv', index=False)

# Channel Views

In [653]:
# Retrieves daily channel view changes from Social Blade
# Filtered by a month before and a month after the apology post date
# (to_df stores the scraped values in a column named 'Subs' for every series)
view_month_change_dfs = [
    filterdate(posted, to_df(archive_url, channel, 'views'))
    for posted, archive_url, channel in zip(
        apology_dates,
        web_archive_df['Wayback Machine URL'],
        web_archive_df['Channel'],
    )
]
view_month_change_dfs

[           Date    Subs         Name
 495  2015-07-21  160731  Sam and Nia
 496  2015-07-22   79328  Sam and Nia
 497  2015-07-23   80040  Sam and Nia
 498  2015-07-24   80902  Sam and Nia
 499  2015-07-25   84380  Sam and Nia
 ..          ...     ...          ...
 553  2015-09-17  116506  Sam and Nia
 554  2015-09-18  112328  Sam and Nia
 555  2015-09-19  127878  Sam and Nia
 556  2015-09-20  142344  Sam and Nia
 557  2015-09-21  288471  Sam and Nia
 
 [63 rows x 3 columns],
            Date     Subs        Name
 38   2015-12-09   767958  DramaAlert
 39   2015-12-10   677001  DramaAlert
 40   2015-12-11   385065  DramaAlert
 41   2015-12-12   474056  DramaAlert
 42   2015-12-13   485719  DramaAlert
 ..          ...      ...         ...
 96   2016-02-05   486448  DramaAlert
 97   2016-02-06   601680  DramaAlert
 98   2016-02-07   658848  DramaAlert
 99   2016-02-08        0  DramaAlert
 100  2016-02-09  1331137  DramaAlert
 
 [63 rows x 3 columns],
             Date     Subs          

In [654]:
# Retrieves total channel view count data from Social Blade
# Filtered by a month before and a month after the apology post date
# (to_df stores the scraped values in a column named 'Subs' for every series)
view_month_total = [
    filterdate(posted, to_df(archive_url, channel, 'views_tot'))
    for posted, archive_url, channel in zip(
        apology_dates,
        web_archive_df['Wayback Machine URL'],
        web_archive_df['Channel'],
    )
]
view_month_total

[           Date      Subs         Name
 495  2015-07-21  42089601  Sam and Nia
 496  2015-07-22  42250332  Sam and Nia
 497  2015-07-23  42329660  Sam and Nia
 498  2015-07-24  42409700  Sam and Nia
 499  2015-07-25  42490602  Sam and Nia
 ..          ...       ...          ...
 553  2015-09-17  77569190  Sam and Nia
 554  2015-09-18  77685696  Sam and Nia
 555  2015-09-19  77798024  Sam and Nia
 556  2015-09-20  77925902  Sam and Nia
 557  2015-09-21  78068246  Sam and Nia
 
 [63 rows x 3 columns],
            Date       Subs        Name
 38   2015-12-09   82124382  DramaAlert
 39   2015-12-10   82892340  DramaAlert
 40   2015-12-11   83569341  DramaAlert
 41   2015-12-12   83954406  DramaAlert
 42   2015-12-13   84428462  DramaAlert
 ..          ...        ...         ...
 96   2016-02-05  117807280  DramaAlert
 97   2016-02-06  118293728  DramaAlert
 98   2016-02-07  118895408  DramaAlert
 99   2016-02-08  119554256  DramaAlert
 100  2016-02-09  119554256  DramaAlert
 
 [63 rows x 

In [655]:
# Writes each list of per-channel view frames to its own CSV file
for frames, csv_name in (
    (view_month_change_dfs, 'view_month_change.csv'),
    (view_month_total, 'view_month_total.csv'),
):
    pd.concat(frames).to_csv(csv_name, index=False)

In [676]:
# Merges daily channel view changes and total channel views with the
# subscriber change count data
# NOTE(review): the last 6 channels are left unmerged; the reason for the
# hard-coded 6 is not recorded in this notebook - confirm before reuse.
for i in range(len(web_archive_df) - 6):
    # The view frames keep their values in a column named 'Subs' (see to_df).
    # .assign returns a new frame (values aligned on the row index) instead
    # of writing columns into the filtered slice produced by filterdate,
    # which can raise pandas' SettingWithCopyWarning.
    sub_month_change_dfs[i] = sub_month_change_dfs[i].assign(**{
        'Views': view_month_change_dfs[i]['Subs'],
        'Views Total': view_month_total[i]['Subs'],
    })

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35


[           Date  Subs         Name   Total   Views
 495  2015-07-21   270  Sam and Nia  152953  160731
 496  2015-07-22   237  Sam and Nia  153223   79328
 497  2015-07-23    74  Sam and Nia  153460   80040
 498  2015-07-24   793  Sam and Nia  153534   80902
 499  2015-07-25   386  Sam and Nia  154327   84380
 ..          ...   ...          ...     ...     ...
 553  2015-09-17   279  Sam and Nia  371386  116506
 554  2015-09-18   193  Sam and Nia  371665  112328
 555  2015-09-19  1328  Sam and Nia  371858  127878
 556  2015-09-20  1135  Sam and Nia  373186  142344
 557  2015-09-21   216  Sam and Nia  374321  288471
 
 [63 rows x 5 columns],
            Date  Subs        Name   Total    Views
 38   2015-12-09  2922  DramaAlert  674981   767958
 39   2015-12-10  2126  DramaAlert  677903   677001
 40   2015-12-11  4009  DramaAlert  680029   385065
 41   2015-12-12  4280  DramaAlert  684038   474056
 42   2015-12-13  3825  DramaAlert  688318   485719
 ..          ...   ...         ...    

In [678]:
# Stacks every channel's merged frame and writes the result to one CSV file
social_blade_combined = pd.concat(sub_month_change_dfs)
social_blade_combined.to_csv('social_blade_combined.csv', index=False)