### 채널 수집 (youtubers.me 기반)

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [2]:
# Collect the per-channel stats URLs listed on a youtubers.me chart page
def collect_youtuberme_url(category_url):
    """Return the channel "youtuber-stats" links found on a chart page.

    Args:
        category_url: URL of the chart page to download.

    Returns:
        A list of URLs, one per anchor inside the ``top-charts`` table whose
        ``href`` ends with ``/youtuber-stats``, in document order. Each href
        is prefixed with ``https://us.youtubers.me/``. The list is empty when
        the page has no such table.
    """
    # Fetch the URL that was passed in, not a module-level `url` variable.
    page = requests.get(category_url)
    soup = BeautifulSoup(page.content, "html.parser")

    table = soup.find("table", class_="top-charts")
    if table is None:
        return []

    href_list = []
    for a_tag in table.find_all("a"):
        href_value = a_tag.get("href")
        # Anchors without an href yield None; skip them instead of crashing.
        if href_value and href_value.endswith("/youtuber-stats"):
            href_list.append("https://us.youtubers.me/" + href_value)
    return href_list

In [70]:
# Discover the per-category chart pages linked from a country's chart page
def crawl_table_urls_extracted(start_url, table_class='top-charts', depth=1):
    """Crawl chart tables and map each same-country chart URL to its category.

    Starting at ``start_url``, every anchor inside the ``<td>`` cells of the
    table with CSS class ``table_class`` is resolved to an absolute URL.
    Links whose lower-cased URL does not contain the last 8 characters of
    ``start_url`` are ignored. For the remaining links, the text between
    ``start_url[:28]`` and ``'/top-1000-'`` is extracted and its second
    ``'/'``-separated piece is recorded as the category, e.g.
    ``.../korea-republic-of/music/top-1000-music-...`` -> ``'music'``.

    Args:
        start_url: Chart page to start from.
        table_class: CSS class of the table whose links are followed.
        depth: Maximum crawl depth; ``1`` fetches only ``start_url``.

    Returns:
        Dict mapping each matching absolute URL to a one-element list
        holding its category string. URLs whose extracted text has no
        ``'/'`` are omitted.

    Request and parsing errors are printed and the affected page is skipped.
    """
    # The tail of the start URL serves as the "same country" marker.
    country_marker = start_url[-8:]
    # For the youtubers.me URLs used in this notebook this is the 24-character
    # site root plus the first 4 characters of the country slug, so the
    # extracted text looks like '<rest-of-slug>/<category>'.
    url_prefix = start_url[:28]

    # Seed with the start page so a self-link is never crawled a second time.
    visited_urls = {start_url}
    extracted_strings = {}

    def extract_string_between_substrings(url, start_substring, end_substring):
        """Return the text between the two substrings, or None if either is missing."""
        start_index = url.find(start_substring)
        if start_index == -1:
            return None
        content_start = start_index + len(start_substring)
        end_index = url.find(end_substring, content_start)
        if end_index == -1:
            return None
        return url[content_start:end_index]

    def recursive_crawl(url, current_depth):
        if current_depth > depth:
            return

        try:
            response = requests.get(url)
            if response.status_code != 200:
                return

            soup = BeautifulSoup(response.text, 'html.parser')
            table = soup.find('table', class_=table_class)
            if table is None:
                return

            for row in table.find_all('tr'):
                for col in row.find_all('td'):
                    # Skip a cell whose whole text is "category".
                    if col.get_text(strip=True).lower() == 'category':
                        continue

                    for a_tag in col.find_all('a', href=True):
                        absolute_url = urljoin(url, a_tag.get('href'))

                        # Only keep links that belong to the same country.
                        if country_marker not in absolute_url.lower():
                            continue

                        extracted_string = extract_string_between_substrings(
                            absolute_url, url_prefix, '/top-1000-'
                        )
                        if extracted_string:
                            extracted_strings[absolute_url] = extracted_string

                        # Test membership BEFORE marking the URL as visited;
                        # marking first would make this branch unreachable.
                        if absolute_url not in visited_urls:
                            visited_urls.add(absolute_url)
                            recursive_crawl(absolute_url, current_depth + 1)

        except Exception as e:
            # Best effort: report the failure and carry on with other pages.
            print(f"Error crawling {url}: {e}")

    recursive_crawl(start_url, 1)

    # Keep only the category piece: '<rest-of-slug>/<category>' -> '<category>'.
    updated_data = {}
    for page_url, extracted_string in extracted_strings.items():
        pieces = extracted_string.split('/')
        if len(pieces) >= 2:
            updated_data[page_url] = [pieces[1]]
    return updated_data

In [52]:
# Start URLs: the "top 1000 channels" chart page of each country (30 countries).
# Every URL follows the same template, so only the site's country slug is listed.
country_dic = {
    name: f"https://us.youtubers.me/{slug}/all/top-1000-youtube-channels-in-{slug}"
    for name, slug in (
        ("United States", "united-states"),
        ("Germany", "germany"),
        ("United Kingdom", "united-kingdom"),
        ("Brazil", "brazil"),
        ("Mexico", "mexico"),
        ("Spain", "spain"),
        ("Italy", "italy"),
        ("Czech Republic", "czech-republic"),
        ("Russia", "russian-federation"),
        ("India", "india"),
        ("France", "france"),
        ("Japan", "japan"),
        ("Turkey", "turkey"),
        ("South Korea", "korea-republic-of"),
        ("Poland", "poland"),
        ("Canada", "canada"),
        ("Vietnam", "viet-nam"),
        ("Thailand", "thailand"),
        ("Indonesia", "indonesia"),
        ("Ukraine", "ukraine"),
        ("Morocco", "morocco"),
        ("Argentina", "argentina"),
        ("Saudi Arabia", "saudi-arabia"),
        ("Netherlands", "netherlands"),
        ("Egypt", "egypt"),
        ("Taiwan", "taiwan"),
        ("Australia", "australia"),
        ("Greece", "greece"),
        ("Colombia", "colombia"),
        ("Romania", "romania"),
    )
}

In [53]:
# Normalize category names to URL-style slugs (makes later unions easier)
def category_preprocessing(df):
    """Rewrite ``df['category']`` in place using the site's URL slugs.

    Display names such as ``'People & Blogs'`` become ``'people-blogs'``.
    Missing categories, whether a real NaN/None or the literal string
    ``'nan'``, become ``'all'``. Values with no mapping entry (including
    slugs that are already normalized) are kept unchanged, so calling the
    function twice is harmless.

    Args:
        df: DataFrame with a ``category`` column; modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # Mapping of display names to slug names
    category_mapping = {'Gaming': 'gaming',
                        'Movies': 'movies',
                        'Music': 'music',
                        'Comedy': 'comedy',
                        'People & Blogs': 'people-blogs',
                        'Pets & Animals': 'pets-animals',
                        'Autos & Vehicles': 'autos-vehicles',
                        'Entertainment': 'entertainment',
                        'News & Politics': 'news-politics',
                        'Film & Animation': 'film-animation',
                        'Sports': 'sports',
                        'Science & Technology': 'science-technology',
                        'Shows': 'shows',
                        'Education': 'education',
                        'Travel & Events': 'travel-events',
                        'Howto & Style': 'howto-style',
                        'Nonprofits & Activism': 'nonprofits-activism',
                        'all': 'all'}

    def normalize(name):
        # A plain dict-based .map() would turn every unmapped value into NaN,
        # and .replace('nan', ...) does not match a real NaN.
        if pd.isna(name) or name == 'nan':
            return 'all'
        return category_mapping.get(name, name)

    df['category'] = df['category'].map(normalize)
    return df

# Build the category DataFrame
def create_dataframe(extracted_strings, country):
    """Flatten a ``{url: [category, ...]}`` mapping into a DataFrame.

    Args:
        extracted_strings: Dict mapping a URL to an iterable of categories.
        country: Value stored in the ``country`` column of every row.

    Returns:
        DataFrame with columns ``url``, ``category`` and ``country``, holding
        one row per (url, category) pair in the mapping's iteration order.
    """
    pairs = [
        (page_url, category)
        for page_url, categories in extracted_strings.items()
        for category in categories
    ]
    columns = {
        'url': [page_url for page_url, _ in pairs],
        'category': [category for _, category in pairs],
        'country': [country] * len(pairs),
    }
    return pd.DataFrame(columns)


In [54]:
# Build the channel DataFrame for one youtubers.me chart page
def collect_youtuberme_basic(url, country):
    """Scrape one chart page into a DataFrame of channels.

    The first ``<table>`` on the page is parsed with ``pandas.read_html``.
    Three things are then applied to it: a ``url`` column from
    ``collect_youtuberme_url(url)``, a constant ``country`` column, and
    category normalization via ``category_preprocessing``.

    Args:
        url: Chart page to download.
        country: Value stored in the ``country`` column of every row.

    Returns:
        The parsed table with the extra ``url`` and ``country`` columns.

    Raises:
        IndexError: If the page contains no ``<table>``.
        ValueError: If the number of collected links differs from the number
            of table rows (raised by the column assignment).
    """
    # Local import keeps this function self-contained within the notebook.
    from io import StringIO

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'lxml')

    # Select every table on the page and keep the first one.
    tables = soup.select('table')
    table = tables[0]

    # read_html expects a path/URL or file-like object; passing literal HTML
    # as a string is deprecated, so wrap it in StringIO.
    df_top1000 = pd.read_html(StringIO(str(table)))[0]
    print("df1000:", len(df_top1000))

    href_list = collect_youtuberme_url(url)
    df_top1000['url'] = href_list
    df_top1000['country'] = [country] * len(df_top1000)
    print("url_list: ", len(href_list))

    df_top1000 = category_preprocessing(df_top1000)
    return df_top1000

In [71]:
# Smoke test: crawl the category links of a single country (South Korea)
url = "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of"
# NOTE(review): start_url_us holds the Mexico URL and is not used by any visible cell.
start_url_us = 'https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico'
extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
print(extracted_strings)
df_category = create_dataframe(extracted_strings, "korea")
# Display the DataFrame
print(df_category)

{'https://us.youtubers.me/korea-republic-of/people-blogs/top-1000-people-blogs-youtube-channels-in-korea-republic-of': ['people-blogs'], 'https://us.youtubers.me/korea-republic-of/music/top-1000-music-youtube-channels-in-korea-republic-of': ['music'], 'https://us.youtubers.me/korea-republic-of/entertainment/top-1000-entertainment-youtube-channels-in-korea-republic-of': ['entertainment'], 'https://us.youtubers.me/korea-republic-of/gaming/top-1000-gaming-youtube-channels-in-korea-republic-of': ['gaming'], 'https://us.youtubers.me/korea-republic-of/news-politics/top-1000-news-politics-youtube-channels-in-korea-republic-of': ['news-politics'], 'https://us.youtubers.me/korea-republic-of/sports/top-1000-sports-youtube-channels-in-korea-republic-of': ['sports'], 'https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of': ['all'], 'https://us.youtubers.me/korea-republic-of/film-animation/top-1000-film-animation-youtube-channels-in-korea-republic-of': ['film-

### 각 유명 채널별 카테고리 링크 저장
- 나라별 17개 카테고리 URL 저장

In [73]:
# Combine the per-category URL lists of every country into one DataFrame
dfs = []
for country, url in country_dic.items():
    print(country, url)
    extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
    df_category = create_dataframe(extracted_strings, country)
    dfs.append(df_category)
    # Re-concatenated on every iteration so the running total can be printed.
    country_category_df = pd.concat(dfs, axis=0, ignore_index=True)
    print(country_category_df)
# Save all URLs
country_category_df.to_excel("country_category_url.xlsx", index=False)
country_category_df

United States https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states
                                                  url            category  \
0   https://us.youtubers.me/united-states/educatio...           education   
1   https://us.youtubers.me/united-states/people-b...        people-blogs   
2   https://us.youtubers.me/united-states/sports/t...              sports   
3   https://us.youtubers.me/united-states/entertai...       entertainment   
4   https://us.youtubers.me/united-states/film-ani...      film-animation   
5   https://us.youtubers.me/united-states/comedy/t...              comedy   
6   https://us.youtubers.me/united-states/music/to...               music   
7   https://us.youtubers.me/united-states/gaming/t...              gaming   
8   https://us.youtubers.me/united-states/pets-ani...        pets-animals   
9   https://us.youtubers.me/united-states/all/top-...                 all   
10  https://us.youtubers.me/united-states/news-pol... 

                                                  url        category  \
0   https://us.youtubers.me/united-states/educatio...       education   
1   https://us.youtubers.me/united-states/people-b...    people-blogs   
2   https://us.youtubers.me/united-states/sports/t...          sports   
3   https://us.youtubers.me/united-states/entertai...   entertainment   
4   https://us.youtubers.me/united-states/film-ani...  film-animation   
..                                                ...             ...   
62  https://us.youtubers.me/brazil/howto-style/top...     howto-style   
63  https://us.youtubers.me/brazil/pets-animals/to...    pets-animals   
64  https://us.youtubers.me/brazil/news-politics/t...   news-politics   
65  https://us.youtubers.me/brazil/travel-events/t...   travel-events   
66  https://us.youtubers.me/brazil/autos-vehicles/...  autos-vehicles   

          country  
0   United States  
1   United States  
2   United States  
3   United States  
4   United States  
.. 

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
180  https://us.youtubers.me/france/pets-animals/to...         pets-animals   
181  https://us.youtubers.me/france/science-technol...   science-technology   
182  https://us.youtubers.me/france/nonprofits-acti...  nonprofits-activism   
183  https://us.youtubers.me/france/travel-events/t...        travel-events   
184  https://us.youtubers.me/france/movies/top-1000...               movies   

           country  
0    United States  
1    Unit

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
294  https://us.youtubers.me/thailand/autos-vehicle...       autos-vehicles   
295  https://us.youtubers.me/thailand/education/top...            education   
296  https://us.youtubers.me/thailand/film-animatio...       film-animation   
297  https://us.youtubers.me/thailand/pets-animals/...         pets-animals   
298  https://us.youtubers.me/thailand/nonprofits-ac...  nonprofits-activism   

           country  
0    United States  
1    Unit

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
409  https://us.youtubers.me/egypt/sports/top-1000-...               sports   
410  https://us.youtubers.me/egypt/autos-vehicles/t...       autos-vehicles   
411  https://us.youtubers.me/egypt/nonprofits-activ...  nonprofits-activism   
412  https://us.youtubers.me/egypt/pets-animals/top...         pets-animals   
413  https://us.youtubers.me/egypt/travel-events/to...        travel-events   

           country  
0    United States  
1    Unit

Unnamed: 0,url,category,country
0,https://us.youtubers.me/united-states/educatio...,education,United States
1,https://us.youtubers.me/united-states/people-b...,people-blogs,United States
2,https://us.youtubers.me/united-states/sports/t...,sports,United States
3,https://us.youtubers.me/united-states/entertai...,entertainment,United States
4,https://us.youtubers.me/united-states/film-ani...,film-animation,United States
...,...,...,...
492,https://us.youtubers.me/romania/autos-vehicles...,autos-vehicles,Romania
493,https://us.youtubers.me/romania/science-techno...,science-technology,Romania
494,https://us.youtubers.me/romania/travel-events/...,travel-events,Romania
495,https://us.youtubers.me/romania/shows/top-1000...,shows,Romania


### YouTuberme Dataframe

In [77]:
# Collect the channels listed on every country/category chart page
country_category_df = pd.read_excel("country_category_url.xlsx")

# Use a fresh list here. The `dfs` list built by the category-URL cell holds
# URL frames, and appending to it would mix those rows into the channel result.
channel_dfs = []
# Visit each chart URL and scrape its channel table
for url, country in zip(country_category_df['url'], country_category_df['country']):
    channel_dfs.append(collect_youtuberme_basic(url, country))
# Concatenate once after the loop instead of rebuilding the frame per page.
result_df = pd.concat(channel_dfs, axis=0, ignore_index=True)

# Remove duplicates: keep the first row seen for each channel name
result_df = result_df.drop_duplicates('Youtuber')
result_df = result_df.drop("rank", axis=1)
# Store the YouTube URL: the stats URL minus its last 7 characters
# ("r-stats"), presumably leaving the site's ".../youtube" link (TODO confirm).
# The .str accessor leaves a missing url as NaN instead of raising.
result_df['youtube url'] = result_df['url'].str[:-7]

In [79]:
from datetime import datetime
# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')
file_path = f"Youtube_Data_{today_date}.xlsx"
# Save the DataFrame to Excel with today's date in the filename
# NOTE(review): new_df is not defined in any visible cell (presumably result_df); confirm.
new_df.to_excel(file_path, index=False)


### 수집 채널 URL 추가

In [9]:
import schedule
import time
from sqlalchemy import create_engine

# Initial load: append the collected channels to the DB
def append_channel(dataframe):
    """Append *dataframe* to the ``channel`` table of the local ``yt_db``.

    Args:
        dataframe: Rows to insert; the DataFrame index is not written.

    pandas opens and closes its own connection when given an engine, so no
    separate connection is held. The engine's pool is disposed even when the
    insert fails.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    engine = create_engine("mysql+pymysql://root:2000@127.0.0.1/yt_db")
    try:
        dataframe.to_sql(name='channel', con=engine, if_exists='append', index=False)
    finally:
        engine.dispose()

In [11]:
# Save the collected channels to the DB
append_channel(result_df) 

In [19]:
# Drop rows without a url (in place), then export the remaining channels.
result_df.dropna(subset=['url'], inplace=True)
# NOTE(review): the file name is spelled "yotube" (possibly a typo); confirm before renaming.
result_df.to_excel("yotube_list.xlsx")

In [17]:
# Weekly refresh: merge freshly collected channels into the DB
def update_channel(dataframe):
    """Merge *dataframe* into the ``channel`` table, keyed on ``Youtuber``.

    ``DataFrame.to_sql`` only accepts ``if_exists`` values of ``'fail'``,
    ``'replace'`` or ``'append'``, so the merge is done in pandas. Stored
    rows whose ``Youtuber`` appears in *dataframe* are dropped, the rows of
    *dataframe* are added, and the table is rewritten with the result.

    Args:
        dataframe: Fresh channel rows; must contain a ``Youtuber`` column.
            The DataFrame index is not written.
    """
    # NOTE(review): credentials are hard-coded; consider loading them from
    # configuration or the environment.
    engine = create_engine("mysql+pymysql://root:2000@127.0.0.1/yt_db")
    try:
        existing = pd.read_sql('SELECT * FROM channel', engine)
        # Stored rows for channels present in the new data are superseded.
        superseded = existing['Youtuber'].isin(dataframe['Youtuber'])
        merged = pd.concat([existing[~superseded], dataframe], ignore_index=True)
        merged.to_sql(name='channel', con=engine, if_exists='replace', index=False)
    finally:
        engine.dispose()

### Shorts 수집

In [27]:
# NOTE(review): refine_df is not defined in any visible cell; this cell raised NameError when run.
refine_df

NameError: name 'refine_df' is not defined

In [39]:
# Refresh rows that already exist in the DB, then write everything back
# NOTE(review): new_data is not defined in any visible cell; confirm its source.
engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
existing_data = pd.read_sql('SELECT * FROM channel', engine)

# Existing rows whose 'Youtuber' also appears in the new data
duplicates = existing_data[existing_data['Youtuber'].isin(new_data['Youtuber'])]

# Update existing rows with new data. DataFrame.update aligns on the index,
# so both frames are keyed by 'Youtuber' first. Updating the raw frames would
# overwrite rows by index label rather than by channel.
column_order = list(existing_data.columns)
keyed_existing = existing_data.set_index('Youtuber')
# update() needs unique labels on the incoming side; keep the first occurrence.
keyed_new = new_data.drop_duplicates('Youtuber').set_index('Youtuber')
keyed_existing.update(keyed_new)
existing_data = keyed_existing.reset_index()[column_order]

# Filter out rows that are duplicates
new_rows = new_data[~new_data['Youtuber'].isin(duplicates['Youtuber'])]

# Append new rows to the existing data
merged_data = pd.concat([existing_data, new_rows], ignore_index=True)

# Write the merged data back to the database
merged_data.to_sql('channel', engine, if_exists='replace', index=False)

Unnamed: 0,rank,Youtuber,subscribers,video views,video count,category,started,url,category_ranking
0,1,한국고전영화 Korean Classic Film,844000.0,338119500,370,Movies,2011,,
1,1,김프로KIMPRO,28400000.0,29282621107,2233,people-blogs,2017,https://us.youtubers.me/5f2ac6ed-7607-4084-8d9...,NAN
2,2,TwinRoozi 쌍둥이 루지,10600000.0,6676946688,553,people-blogs,2018,https://us.youtubers.me/skt-t1/youtuber-stats,NAN
3,3,Byungari 병아리언니,7540000.0,6372469710,772,people-blogs,2020,https://us.youtubers.me/805407e2-3ce6-4a72-a61...,NAN
4,4,팀일루션 노성율 - TEAM1LLUSION,5790000.0,6263576833,828,people-blogs,2010,https://us.youtubers.me/heykin-couple/youtuber...,NAN
...,...,...,...,...,...,...,...,...,...
7811,49,Anonymous Messengers,493.0,2525,6,nonprofits-activism,2016,https://us.youtubers.me/anonymous-e144fd85-358...,NAN
7812,50,네더고래의 대.단.한 채널.,5980.0,536,1,nonprofits-activism,2012,https://us.youtubers.me/wildbreeze/youtuber-stats,NAN
7813,51,오렌지LAB,8020.0,0,0,nonprofits-activism,2018,https://us.youtubers.me/650d2f3e-90b8-48ee-9ce...,NAN
7814,52,Álan V,959.0,0,0,nonprofits-activism,2016,https://us.youtubers.me/tv-825cb6e4-9590-4f10-...,NAN
