### YouTuberme channel Crawling

##### Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import glob
import os

##### Parameters

In [27]:
today_date = datetime.today().strftime('%Y-%m-%d')
file_path = f"DB/Youtube_Data_{today_date}.xlsx"

# List all files that match the pattern "Youtube_Data_*.xlsx"
files = glob.glob("DB/Youtube_Data_*.xlsx")

# Extract dates from filenames and find the latest date
dates = [os.path.splitext(os.path.basename(file))[0].replace("Youtube_Data_", "") for file in files]
dates = sorted(dates, reverse=True)

# Define the file name
file_name = "DB/country_category_url.xlsx"

# start url(30 countries): top 1000
country_dic = {"United States": "https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states",
               "South Korea": "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of",
               "Germany":"https://us.youtubers.me/germany/all/top-1000-youtube-channels-in-germany",
               "United Kingdom":"https://us.youtubers.me/united-kingdom/all/top-1000-youtube-channels-in-united-kingdom",
               "Brazil": "https://us.youtubers.me/brazil/all/top-1000-youtube-channels-in-brazil",
               "Mexico" : "https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico",
               "Spain": "https://us.youtubers.me/spain/all/top-1000-youtube-channels-in-spain",
               "Italy" : "https://us.youtubers.me/italy/all/top-1000-youtube-channels-in-italy",
               "Czech Republic": "https://us.youtubers.me/czech-republic/all/top-1000-youtube-channels-in-czech-republic",
               "Russia":"https://us.youtubers.me/russian-federation/all/top-1000-youtube-channels-in-russian-federation",
               "India" : "https://us.youtubers.me/india/all/top-1000-youtube-channels-in-india",
               "France": "https://us.youtubers.me/france/all/top-1000-youtube-channels-in-france",
               "Japan" : "https://us.youtubers.me/japan/all/top-1000-youtube-channels-in-japan",
               "Turkey": "https://us.youtubers.me/turkey/all/top-1000-youtube-channels-in-turkey",
               "Poland": "https://us.youtubers.me/poland/all/top-1000-youtube-channels-in-poland",
               "Canada" : "https://us.youtubers.me/canada/all/top-1000-youtube-channels-in-canada",
               "Vietnam" : "https://us.youtubers.me/viet-nam/all/top-1000-youtube-channels-in-viet-nam",
               "Thailand" : "https://us.youtubers.me/thailand/all/top-1000-youtube-channels-in-thailand",
               "Indonesia" : "https://us.youtubers.me/indonesia/all/top-1000-youtube-channels-in-indonesia",
               "Ukraine" : "https://us.youtubers.me/ukraine/all/top-1000-youtube-channels-in-ukraine",
               "Morocco" : "https://us.youtubers.me/morocco/all/top-1000-youtube-channels-in-morocco",
               "Argentina" : "https://us.youtubers.me/argentina/all/top-1000-youtube-channels-in-argentina",
               "Saudi Arabia": "https://us.youtubers.me/saudi-arabia/all/top-1000-youtube-channels-in-saudi-arabia",
               "Netherlands": "https://us.youtubers.me/netherlands/all/top-1000-youtube-channels-in-netherlands",
               "Egypt": "https://us.youtubers.me/egypt/all/top-1000-youtube-channels-in-egypt",
               "Taiwan": "https://us.youtubers.me/taiwan/all/top-1000-youtube-channels-in-taiwan",
               "Australia": "https://us.youtubers.me/australia/all/top-1000-youtube-channels-in-australia",
               "Greece": "https://us.youtubers.me/greece/all/top-1000-youtube-channels-in-greece",
               "Colombia": "https://us.youtubers.me/colombia/all/top-1000-youtube-channels-in-colombia",
               "Romania" : "https://us.youtubers.me/romania/all/top-1000-youtube-channels-in-romania"
              }

##### Functions

In [None]:
# url 포함 youtuberme df 생성
def collect_youtuberme_url(category_url):
    page = requests.get(category_url)
    # Parse the HTML content
    soup = BeautifulSoup(page.content, "html.parser")

    # Find the table with class "top-charts"
    table = soup.find("table", class_="top-charts")

    # Find all <a> tags within the table
    href_list = []
    if table:
        for a_tag in table.find_all("a"):
            href_value = a_tag.get("href")
            if href_value.endswith("/youtuber-stats"):
                href_list.append("https://us.youtubers.me/" + href_value)
    return href_list
# 여러 카테고리 확인
def crawl_table_urls_extracted(start_url, table_class='top-charts', depth=1):
    visited_urls = set()
    excluded_urls = []
    extracted_strings = {}

    def extract_string_between_substrings(url, start_substring, end_substring):
        start_index = url.find(start_substring)
        end_index = url.find(end_substring, start_index + len(start_substring))
        if start_index != -1 and end_index != -1:
            return url[start_index + len(start_substring):end_index]
        return None

    def recursive_crawl(url, current_depth):
        if current_depth > depth:
            return

        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find the table with the specified class
                table = soup.find('table', class_=table_class)

                if table:
                    # Extract href attributes from anchor tags within the table
                    for row in table.find_all('tr'):
                        columns = row.find_all('td')
                        for col_index, col in enumerate(columns):
                            # Exclude href attributes from the column with the name "category"
                            if col.get_text(strip=True).lower() == 'category':
                                continue

                            hrefs = [a.get('href') for a in col.find_all('a', href=True)]

                            # Process the hrefs
                            for href in hrefs:
                                absolute_url = urljoin(url, href)

                                # Save URLs without "korea-republic-of"
                                if start_url[-8:] not in absolute_url.lower():
                                    excluded_urls.append(absolute_url)
                                else:
                                    visited_urls.add(absolute_url)

                                    # Extract and save strings between specified substrings
                                    extracted_string = extract_string_between_substrings(
                                        absolute_url,
                                        start_url[:28],
                                        '/top-1000-'
                                    )
                                    if extracted_string:
                                        # Create a set for each visited URL to remove duplicates
                                        extracted_strings.setdefault(absolute_url, set()).add(extracted_string)

                                    # Avoid revisiting the same URL
                                    if absolute_url not in visited_urls:
                                        # Recursively crawl the next level
                                        recursive_crawl(absolute_url, current_depth + 1)

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    recursive_crawl(start_url, 1)

    # Convert sets to lists
    excluded_urls = list(set(excluded_urls))
    visited_urls = list(set(visited_urls))

    # Convert sets to lists within the extracted_strings dictionary
    for url, strings_set in extracted_strings.items():
        extracted_strings[url] = list(strings_set)
    updated_data = {}
    for key, value in extracted_strings.items():
        split_value = value[0].split('/')
        if len(split_value) >= 2:
            updated_value = split_value[1]
            updated_data[key] = [updated_value]
    return updated_data

    
# category 통일(union 편하게)
def category_preprocessing(df):
    df['category'] = df['category'].replace('nan', 'all')

    # Mapping of old categories to new categories
    category_mapping = {'Film & Animation': 'film-animation',
                        'Autos & Vehicles': 'autos-vehicles',
                        'Music': 'music',
                        'Movies': 'movies',
                        'Pets & Animals': 'pets-animals',
                        'Sports': 'sports',
                        'Travel & Events': 'travel-events',
                        'Gaming': 'gaming',
                        'People & Blogs': 'people-blogs',
                        'Comedy': 'comedy',
                        'Entertainment': 'entertainment',
                        'News & Politics': 'news-politics',
                        'Howto & Style': 'howto-style',
                        'Education': 'education',
                        'Science & Technology': 'science-technology',
                        'Shows': 'shows',
                        'Nonprofits & Activism': 'nonprofits-activism',
                        'all': 'all'}

    # Map the old categories to the new categories
    df['category'] = df['category'].map(category_mapping)
    return df

# category df 생성
def create_dataframe(extracted_strings, country):
    data = {'url': [], 'category': [], 'country' : []}
    for url, categories in extracted_strings.items():
        for category in categories:
            data['url'].append(url)
            data['category'].append(category)
            data['country'].append(country)
    df = pd.DataFrame(data)
    return df

# youtuberme df 생성
def collect_youtuberme_basic(url, country):
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'lxml')

    # 현재 페이지에서 table 태그 모두 선택하기
    table1 = soup.select('table')

    # 하나의 테이블 태그 선택하기
    table = table1[0]

    df_top1000 = pd.read_html(str(table))[0]
    href_list = collect_youtuberme_url(url)
    df_top1000['url'] = href_list
    df_top1000['country'] = [country for x in range(len(df_top1000))]
    # Check if the first category value is NaN, and assign the second value if it is
    if pd.isna(df_top1000['category'].iloc[0]):
        df_top1000['category'].iloc[0] = df_top1000['category'].iloc[1]
    # Now, you can assign the value to the entire column as per your requirement
    df_top1000['category'] = df_top1000['category'].iloc[0]    
    # print("url_list: ", len(href_list))
    df_top1000 = category_preprocessing(df_top1000)
    return df_top1000

# 기존 + 신규 데이터 추가
def update_channels(past_df, df):
    combined_df = pd.concat([past_df, df])
    # Drop duplicates based on 'channel_name' and keep only the unique entries
    combined_df = combined_df.drop_duplicates(subset=['channel_name'])
    return combined_df

# excute whole process
def excute_youtuberme_crawling(county_category_data, past_df):#,country_url_df
    # 전체 카테고리 별 채널 수집
    # country_category_df = pd.read_excel("country_category_url.xlsx")
    # url 돌면서 필요한 데이터 
    country_url_df = []
    for url, country in zip(county_category_data['url'].to_list(), county_category_data['country'].to_list()):    
        df_new = collect_youtuberme_basic(url, country)
        country_url_df.append(df_new)
        df = pd.concat(country_url_df, axis=0, ignore_index=True)
    # 중복 제거
    df = df.drop_duplicates('Youtuber')
    df = df.drop("rank", axis=1)
    # YouTube URL 저장
    df['youtube url'] = [url[:-7] for url in  df['url'].to_list()]
    df['yt_fixed_url'] = ''
    df['data_yn'] = 'y'    
    # column명 변경
    df = df.rename(columns={
        'url': 'youtuberme_url',
        'Youtuber': 'channel_name',
        'video views': 'total_video_views',
        'video count': 'total_video_count',
        'youtube url': 'yt_url'
    })
    # subscribers, total_video_count가 0인 경우 채널이 삭제된 경우이므로 제거
    df = df[(df['subscribers'] != 0) & (df['subscribers'].notna())]
    df = update_channels(df, past_df)
    return df


##### Load Data

In [None]:
if dates:
    # Load the latest file based on the date
    latest_file = f"DB/Youtube_Data_{dates[0]}.xlsx"
    previous_df = pd.read_excel(latest_file)
    print(f"Loaded file: {latest_file}")
else:
    print("No files found.")

# Check if the file exists
if os.path.exists(file_name):
    # Load the Excel file into a DataFrame
    country_category_df = pd.read_excel(file_name)
else:
    print(f"{file_name} does not exist. The code will not run.")
    # category별 url 리스트 합치기
    dfs = []
    for country, url in country_dic.items():
        print(country, url)
        extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
        df_category = create_dataframe(extracted_strings, country)
        dfs.append(df_category)
        country_category_df = pd.concat(dfs, axis=0, ignore_index=True)
        print(country_category_df)
        # 전체 URL 저장
        country_category_df.to_excel("DB/country_category_url.xlsx", index=False)

# 우선은 한국 미국만
country_category_df = country_category_df[country_category_df['country'].isin(['South Korea', 'United States'])]


Loaded file: DB/Youtube_Data_2024-10-09.xlsx


##### YouTuberme Crawling

In [23]:
result_df = excute_youtuberme_crawling(country_category_df)

  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top1000 = pd.read_html(str(table))[0]
  df_top100

##### Save Data

In [40]:
result_df.to_excel(file_path, index=False)