# Requires reference list dataset to iterate through channel IDs

dataset = https://www.kaggle.com/datasets/bhavyadhingra00020/top-100-social-media-influencers-2024-countrywise?resource=download

dataset_path = [insert here path of dataset reference]

For reference, the current dataset_path points to the top 100 influencers' data from the dataset linked above.

In [150]:
## IMPORT FUNCTIONS

import os
import pandas as pd
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
from IPython.display import JSON
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

In [151]:
## GLOBAL VARIABLES HERE
## NOTE(review): the `{...}` values below look like template placeholders. Replace each
## one with a quoted string before running; as written, Python parses them as set literals.

api_key = {API_KEY}  # developer key passed to googleapiclient.discovery.build
output_path = {output_path}  # destination of the final CSV written by to_csv
chromedriver_path = {chromedriver_path}  # chromedriver executable used by get_channel_id
dataset_path = {dataset_path}  # reference CSV read with pd.read_csv (must have a NAME column)

# Selenium - Channel Username -> Channel ID converter

URL: https://www.streamweasels.com/tools/youtube-channel-id-and-user-id-convertor/
*All source code for this tool belongs to StreamWeasels; I only accessed the site using Selenium automation.


This was necessary because of a new limitation (or bug) in the YouTube API v3: for custom URLs (e.g. @xxx), the API is unable to locate the items when iterating, which leads to runtime errors. Do note, however, that this is not foolproof, as some channels are deleted or unsearchable, so it is only a workaround for automation.

In [130]:
## FUNCTION USES STREAMWEASEL'S SITE (WHICH USES YOUTUBE API) TO OBTAIN CHANNEL ID
def get_channel_id(username):
    """Convert a custom channel username into a channel ID via the StreamWeasels tool.

    A fresh Chrome session is started for every call, the username is typed
    into the converter form, and the text of the result element is read back
    after a fixed delay.

    Args:
        username: custom channel username to convert.

    Returns:
        The text of the result element, or the sentinel string ``'pass'`` when
        the page still shows ``'-'`` (the page default, which also means the ID
        was not found or the account is deleted).
    """
    cService = webdriver.ChromeService(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=cService)

    # The original closed the driver *after* the return statements, so the
    # cleanup never ran and every call leaked a browser session. try/finally
    # guarantees the session is torn down on success and on error alike.
    try:
        driver.get("https://www.streamweasels.com/tools/youtube-channel-id-and-user-id-convertor/")  # access the site

        channel_name = driver.find_element(By.CLASS_NAME, "cp-youtube-to-id__target")  # channel name input
        channel_name.send_keys(username)                                               # type the custom channel username
        convert = driver.find_element(By.CLASS_NAME, "cp-youtube-to-id__submit")       # form submission button
        convert.click()                                                                # convert to ID
        time.sleep(5)  # give the form 5s to load, otherwise the result is still '-'

        result_text = driver.find_element(By.CLASS_NAME, "cp-youtube-to-id__result").text
        # '-' is the default; it also means ID not found or account deleted.
        # Callers treat 'pass' as "skip this username".
        return 'pass' if result_text == '-' else result_text
    finally:
        driver.quit()  # end the whole session (browser + chromedriver) to avoid overlap

In [131]:
## INITIALIZE API

# Name and version of the Google API service to build a client for.
api_service_name, api_version = "youtube", "v3"

# Client object used by get_channel_stats; authenticated with the developer key only.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=api_key,
)

In [136]:
## define functions

def to_csv(df, output_path):
    """Save *df* as a CSV file at *output_path*, writing the index as the first column."""
    df.to_csv(path_or_buf=output_path, index=True)

all_data = []  # staging list of per-channel dicts, later turned into a dataframe


def get_channel_stats(youtube, channel_ids):
    """Fetch channel statistics from the YouTube API and append them to ``all_data``.

    Args:
        youtube: API client exposing ``channels().list(...).execute()``.
        channel_ids: value passed as the ``id`` filter of the request
            (the notebook passes one channel ID per call).

    Returns:
        An empty tuple, kept for compatibility with the original function.
        The useful output is the side effect on the module-level ``all_data``.
    """
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id=channel_ids,
    )
    response = request.execute()

    # A response without 'items' (nothing matched the id) is treated as
    # "no channels" instead of raising KeyError.
    for item in response.get('items', []):
        stats = item['statistics']
        all_data.append({
            'channelName': item['snippet']['title'],
            # .get() keeps a channel whose statistics lack one of these fields
            # (NOTE(review): e.g. a hidden subscriber count - confirm against
            # the API reference); the missing value is recorded as None.
            'subscribers': stats.get('subscriberCount'),
            'views': stats.get('viewCount'),
            'totalVideos': stats.get('videoCount'),
            'playlistId': item['contentDetails']['relatedPlaylists']['uploads'],
        })

    # The original returned from inside the loop, so only the first item of a
    # response was ever recorded; returning here processes every item.
    return ()

In [133]:
## EXTRACT THE CHANNEL IDS FROM THE REFERENCE DATASET.
## channel_id_list is then passed on to the get_channel_stats function through iteration to append each channel id's stats into staging list

channel_id_list = []                            # resolved channel IDs, in dataset order

df = pd.read_csv(dataset_path)
# Keep the text after the last '@' of each NAME, trim stray whitespace, and
# drop missing names so the string checks below never see NaN.
channel_ids = df['NAME'].str.split('@').str[-1].str.strip().dropna().unique()

for i in channel_ids:
    # The channel IDs produced by this notebook are 24 characters starting with
    # 'UC'. Checking only for a leading 'U' (as before) misclassified any
    # username that happens to start with 'U' as an ID and skipped conversion.
    if i.startswith('UC') and len(i) == 24:
        channel_id_list.append(i)
        continue

    temp_result = get_channel_id(i)
    # 'pass' means the username couldn't be converted to a channel ID (it might
    # be deleted or unsearchable), so nothing is added for it.
    if temp_result != 'pass':
        channel_id_list.append(temp_result)

['UCq-Fj5jknLsUf-MWSy4_brA', 'UCbCmjCuTUZos6Inko4u57UQ', 'UCpEhnqL0y41EpW2TvWAHD7Q', 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'UCX6OQ3DkcsbYNE6H8uQQuVA', 'UCJplp5SjeGSdVdwsfb9Q7lQ', 'UCk8GzjMOrta8yxDcKfylJYw', 'UCFFbwnve3yF62-tVXkTyHqg', 'UCJ5v_MCY6GNUBTO8-D3XoAg', 'UCvlE5gTbOvjiolFlEm-c_Ow', 'UCOmHUn--16B90oW2L6FRR3A', 'UCyoXW-Dse7fURq30EWl_CUA', 'UCIwFjwMjI0y7PDBVEO9-bkQ', 'UC6-F5tO8uklgE9Zy8IvbdFw', 'UCLkAepWjdylmXSltofFvsYQ', 'UC3IZKseVpdzPSBaWxBxundA', 'UCffDXn7ycAzwL2LDlbyWOTw', 'UCppHT7SZKKvar4Oc9J4oljQ', 'UCP6uH_XlsxrXwZQ4DlqbqPg', 'UC55IWqFLDH1Xp7iu1_xknRA', 'UCBnZ16ahKA2DZ_T5W0FPUXg', 'UCRijo3ddMTht_IHyNSNXpNQ', 'UC3gNmTGu-TTbFPpfSs5kNkg', 'UCcdwLMPsaU2ezNSJU1nFoBQ', 'UCJrDMFOdv1I2k8n9oK_V21w', 'UCEdvpU2pFRCVqU6yIPyTpMQ', 'UC56gTxNs4f9xZ7Pa2i5xNzg', 'UCt4t-jeY85JegMlZ-E5UWtA', 'UCK1i2UviaXLUNrZlAFpw_jA', 'UC0C-w0YjGpqDXGB8IHb662A', 'UC9CoOnJkIBMdeijd9qYoT_g', 'UCfM3zsQsOnfWNUppiycmBuw', 'UCaayLD9i5x4MmIoVZxXSv_g', 'UC4NALVCmcmL5ntpV0thoH6w', 'UCbTLwN10NoCU4WDzLf1JMOA', 'UCqECaJ8Gagnn7YCbP

In [138]:
## iterates through channel_id_list and retrieves stats for each
## One get_channel_stats call (one API request) is made per channel ID; the results
## are not returned here but accumulate in the module-level all_data list.
for channel_id in channel_id_list:
    get_channel_stats(youtube, channel_id)

In [144]:
## transforms the staging list into a dataframe
## Each dict in all_data becomes one row; the dict keys (channelName, subscribers,
## views, totalVideos, playlistId) become the columns.
df1 = pd.DataFrame(all_data)

In [148]:
## shifts every index label up by one so the index can be used as a primary key for SQL querying
df1.index += 1

In [149]:
# Bare expression: shows the dataframe as this notebook cell's output.
df1

Unnamed: 0,channelName,subscribers,views,totalVideos,playlistId
1,T-Series,272000000,264367318565,21599,UUq-Fj5jknLsUf-MWSy4_brA
2,Cocomelon - Nursery Rhymes,181000000,186629925185,1251,UUbCmjCuTUZos6Inko4u57UQ
3,SET India,177000000,169185607582,143344,UUpEhnqL0y41EpW2TvWAHD7Q
4,PewDiePie,111000000,29384161123,4782,UU-lHJZR3Gqxm24_Vd_AJ5Yw
5,MrBeast,312000000,57598316688,812,UUX6OQ3DkcsbYNE6H8uQQuVA
...,...,...,...,...,...
94,Like Nastya ESP,40900000,20374474891,881,UUpEJRZdSpdVZ8vh63T9I2KQ
95,La Granja de Zenón,40900000,32327424301,787,UUwpcLKMwiuPg4aqImpGk6Ew
96,Alfredo Larin,41800000,34008037143,1836,UUd5ApCORQsMOZZz5E9oVeFA
97,Mikecrack,51900000,19035459543,2049,UUqJ5zFEED1hWs0KNQCQuYdQ


In [None]:
## save to csv with index=True
## The shifted index of df1 is written as the first CSV column.
to_csv(df1, output_path)

In [None]:
# Completion marker printed when this final cell runs.
print("---- ALL DONE ------")