### 채널 수집 (youtubersme로 기본)

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import glob
import os

In [57]:
# url 포함 youtuberme df 생성
def collect_youtuberme_url(category_url):
    page = requests.get(url, verify=False)
    # Parse the HTML content
    soup = BeautifulSoup(page.content, "html.parser")

    # Find the table with class "top-charts"
    table = soup.find("table", class_="top-charts")

    # Find all <a> tags within the table
    href_list = []
    if table:
        for a_tag in table.find_all("a"):
            href_value = a_tag.get("href")
            if href_value.endswith("/youtuber-stats"):
                href_list.append("https://us.youtubers.me/" + href_value)
    return href_list

In [59]:
# 여러 카테고리 확인
def crawl_table_urls_extracted(start_url, table_class='top-charts', depth=1):
    visited_urls = set()
    excluded_urls = []
    extracted_strings = {}

    def extract_string_between_substrings(url, start_substring, end_substring):
        start_index = url.find(start_substring)
        end_index = url.find(end_substring, start_index + len(start_substring))
        if start_index != -1 and end_index != -1:
            return url[start_index + len(start_substring):end_index]
        return None

    def recursive_crawl(url, current_depth):
        if current_depth > depth:
            return

        try:
            response = requests.get(url, verify=False)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find the table with the specified class
                table = soup.find('table', class_=table_class)

                if table:
                    # Extract href attributes from anchor tags within the table
                    for row in table.find_all('tr'):
                        columns = row.find_all('td')
                        for col_index, col in enumerate(columns):
                            # Exclude href attributes from the column with the name "category"
                            if col.get_text(strip=True).lower() == 'category':
                                continue

                            hrefs = [a.get('href') for a in col.find_all('a', href=True)]

                            # Process the hrefs
                            for href in hrefs:
                                absolute_url = urljoin(url, href)

                                # Save URLs without "korea-republic-of"
                                if start_url[-8:] not in absolute_url.lower():
                                    excluded_urls.append(absolute_url)
                                else:
                                    visited_urls.add(absolute_url)

                                    # Extract and save strings between specified substrings
                                    extracted_string = extract_string_between_substrings(
                                        absolute_url,
                                        start_url[:28],
                                        '/top-1000-'
                                    )
                                    if extracted_string:
                                        # Create a set for each visited URL to remove duplicates
                                        extracted_strings.setdefault(absolute_url, set()).add(extracted_string)

                                    # Avoid revisiting the same URL
                                    if absolute_url not in visited_urls:
                                        # Recursively crawl the next level
                                        recursive_crawl(absolute_url, current_depth + 1)

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    recursive_crawl(start_url, 1)

    # Convert sets to lists
    excluded_urls = list(set(excluded_urls))
    visited_urls = list(set(visited_urls))

    # Convert sets to lists within the extracted_strings dictionary
    for url, strings_set in extracted_strings.items():
        extracted_strings[url] = list(strings_set)
    updated_data = {}
    for key, value in extracted_strings.items():
        split_value = value[0].split('/')
        if len(split_value) >= 2:
            updated_value = split_value[1]
            updated_data[key] = [updated_value]
    return updated_data

In [5]:
# start url: 나라별 top 1000
# 30 개국
country_dic = {"United States": "https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states",
               "South Korea": "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of",
               "Germany":"https://us.youtubers.me/germany/all/top-1000-youtube-channels-in-germany",
               "United Kingdom":"https://us.youtubers.me/united-kingdom/all/top-1000-youtube-channels-in-united-kingdom",
               "Brazil": "https://us.youtubers.me/brazil/all/top-1000-youtube-channels-in-brazil",
               "Mexico" : "https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico",
               "Spain": "https://us.youtubers.me/spain/all/top-1000-youtube-channels-in-spain",
               "Italy" : "https://us.youtubers.me/italy/all/top-1000-youtube-channels-in-italy",
               "Czech Republic": "https://us.youtubers.me/czech-republic/all/top-1000-youtube-channels-in-czech-republic",
               "Russia":"https://us.youtubers.me/russian-federation/all/top-1000-youtube-channels-in-russian-federation",
               "India" : "https://us.youtubers.me/india/all/top-1000-youtube-channels-in-india",
               "France": "https://us.youtubers.me/france/all/top-1000-youtube-channels-in-france",
               "Japan" : "https://us.youtubers.me/japan/all/top-1000-youtube-channels-in-japan",
               "Turkey": "https://us.youtubers.me/turkey/all/top-1000-youtube-channels-in-turkey",
               "Poland": "https://us.youtubers.me/poland/all/top-1000-youtube-channels-in-poland",
               "Canada" : "https://us.youtubers.me/canada/all/top-1000-youtube-channels-in-canada",
               "Vietnam" : "https://us.youtubers.me/viet-nam/all/top-1000-youtube-channels-in-viet-nam",
               "Thailand" : "https://us.youtubers.me/thailand/all/top-1000-youtube-channels-in-thailand",
               "Indonesia" : "https://us.youtubers.me/indonesia/all/top-1000-youtube-channels-in-indonesia",
               "Ukraine" : "https://us.youtubers.me/ukraine/all/top-1000-youtube-channels-in-ukraine",
               "Morocco" : "https://us.youtubers.me/morocco/all/top-1000-youtube-channels-in-morocco",
               "Argentina" : "https://us.youtubers.me/argentina/all/top-1000-youtube-channels-in-argentina",
               "Saudi Arabia": "https://us.youtubers.me/saudi-arabia/all/top-1000-youtube-channels-in-saudi-arabia",
               "Netherlands": "https://us.youtubers.me/netherlands/all/top-1000-youtube-channels-in-netherlands",
               "Egypt": "https://us.youtubers.me/egypt/all/top-1000-youtube-channels-in-egypt",
               "Taiwan": "https://us.youtubers.me/taiwan/all/top-1000-youtube-channels-in-taiwan",
               "Australia": "https://us.youtubers.me/australia/all/top-1000-youtube-channels-in-australia",
               "Greece": "https://us.youtubers.me/greece/all/top-1000-youtube-channels-in-greece",
               "Colombia": "https://us.youtubers.me/colombia/all/top-1000-youtube-channels-in-colombia",
               "Romania" : "https://us.youtubers.me/romania/all/top-1000-youtube-channels-in-romania"
              }

In [6]:
# category 통일(union 편하게)
def category_preprocessing(df):
    df['category'] = df['category'].replace('nan', 'all')

    # Mapping of old categories to new categories
    category_mapping = {'Film & Animation': 'film-animation',
                        'Autos & Vehicles': 'autos-vehicles',
                        'Music': 'music',
                        'Movies': 'movies',
                        'Pets & Animals': 'pets-animals',
                        'Sports': 'sports',
                        'Travel & Events': 'travel-events',
                        'Gaming': 'gaming',
                        'People & Blogs': 'people-blogs',
                        'Comedy': 'comedy',
                        'Entertainment': 'entertainment',
                        'News & Politics': 'news-politics',
                        'Howto & Style': 'howto-style',
                        'Education': 'education',
                        'Science & Technology': 'science-technology',
                        'Shows': 'shows',
                        'Nonprofits & Activism': 'nonprofits-activism',
                        'all': 'all'}

    # Map the old categories to the new categories
    df['category'] = df['category'].map(category_mapping)
    return df

# category df 생성
def create_dataframe(extracted_strings, country):
    data = {'url': [], 'category': [], 'country' : []}
    for url, categories in extracted_strings.items():
        for category in categories:
            data['url'].append(url)
            data['category'].append(category)
            data['country'].append(country)
    df = pd.DataFrame(data)
    return df


In [67]:
# youtuberme df 생성
def collect_youtuberme_basic(url, country):
    page = requests.get(url, verify=False)

    soup = BeautifulSoup(page.content, 'lxml')

    # 현재 페이지에서 table 태그 모두 선택하기
    table1 = soup.select('table')
    try:
        # 하나의 테이블 태그 선택하기
        table = table1[0]
    
        df_top1000 = pd.read_html(str(table))[0]
        print("df1000:", len(df_top1000))
        href_list = collect_youtuberme_url(url)
        df_top1000['url'] = href_list
        df_top1000['country'] = [country for x in range(len(df_top1000))]
        print("url_list: ", len(href_list))
        df_top1000 = category_preprocessing(df_top1000)
    except Exception as err:
        print(err)
        pass
    return df_top1000

In [63]:
df_new = collect_youtuberme_basic("https://us.youtubers.me/united-states/education/top-1000-education-youtube-channels-in-united-states", "US")

  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


In [8]:
# test
url = "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of"
start_url_us = 'https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico'
extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
print(extracted_strings)
df_category = create_dataframe(extracted_strings, "korea")
# Display the DataFrame
df_category

{}


Unnamed: 0,url,category,country


### 기존 수집했던 데이터 불러오기
- 날짜 가장 빠른 순으로 호출

In [10]:
# List all files that match the pattern "Youtube_Data_*.xlsx"
files = glob.glob("Youtube_Data_*.xlsx")

# Extract dates from filenames and find the latest date
dates = [os.path.splitext(os.path.basename(file))[0].replace("Youtube_Data_", "") for file in files]
dates = sorted(dates, reverse=True)

if dates:
    # Load the latest file based on the date
    latest_file = f"Youtube_Data_{dates[0]}.xlsx"
    previous_df = pd.read_excel(latest_file)
    print(f"Loaded file: {latest_file}")
else:
    print("No files found.")

# Display the dataframe
previous_df.head()

Loaded file: Youtube_Data_2024-05-14.xlsx


Unnamed: 0,youtuberme_url,category,country,channel_name,subscribers,total_video_views,total_video_count,started,yt_url
0,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,80400000.0,48184301971,3220,2011,https://us.youtubers.me//pinkfong-kids-songs-s...
1,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,40200000.0,30124608781,636,2016,https://us.youtubers.me//genevieve-s-playhouse...
2,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,35700000.0,28252093300,2405,2016,https://us.youtubers.me//babybus-kids-tv-songs...
3,https://us.youtubers.me//blippi/youtuber-stats,education,United States,Blippi - Educational Videos for Kids,19800000.0,16246480827,928,2014,https://us.youtubers.me//blippi/youtube
4,https://us.youtubers.me//dave-and-ava-nursery-...,education,United States,Dave and Ava - Nursery Rhymes and Baby Songs,15600000.0,11355289210,975,2014,https://us.youtubers.me//dave-and-ava-nursery-...


### 각 유명 채널별 카테고리 링크 저장
- 나라별 17개 카테고리 URL 저장 (1개는 all)

In [12]:
print("a")

a


In [13]:
# Define the file name
file_name = "country_category_url.xlsx"

# Check if the file exists
if os.path.exists(file_name):
    # Load the Excel file into a DataFrame
    country_category_df = pd.read_excel(file_name)
else:
    print(f"{file_name} does not exist. The code will not run.")
    # category별 url 리스트 합치기
    dfs = []
    for country, url in country_dic.items():
        print(country, url)
        extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
        df_category = create_dataframe(extracted_strings, country)
        dfs.append(df_category)
        country_category_df = pd.concat(dfs, axis=0, ignore_index=True)
        print(country_category_df)
    # 전체 URL 저장
    country_category_df.to_excel("country_category_url.xlsx", index=False)
    country_category_df

In [14]:
country_dic

{'United States': 'https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states',
 'South Korea': 'https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of',
 'Germany': 'https://us.youtubers.me/germany/all/top-1000-youtube-channels-in-germany',
 'United Kingdom': 'https://us.youtubers.me/united-kingdom/all/top-1000-youtube-channels-in-united-kingdom',
 'Brazil': 'https://us.youtubers.me/brazil/all/top-1000-youtube-channels-in-brazil',
 'Mexico': 'https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico',
 'Spain': 'https://us.youtubers.me/spain/all/top-1000-youtube-channels-in-spain',
 'Italy': 'https://us.youtubers.me/italy/all/top-1000-youtube-channels-in-italy',
 'Czech Republic': 'https://us.youtubers.me/czech-republic/all/top-1000-youtube-channels-in-czech-republic',
 'Russia': 'https://us.youtubers.me/russian-federation/all/top-1000-youtube-channels-in-russian-federation',
 'India': 'https://us.youtubers.m

In [15]:
create_dataframe(crawl_table_urls_extracted( "https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico", table_class='top-charts', depth=1), "Mexico")

Unnamed: 0,url,category,country
0,https://us.youtubers.me/mexico/entertainment/t...,entertainment,Mexico
1,https://us.youtubers.me/mexico/news-politics/t...,news-politics,Mexico
2,https://us.youtubers.me/mexico/film-animation/...,film-animation,Mexico
3,https://us.youtubers.me/mexico/education/top-1...,education,Mexico
4,https://us.youtubers.me/mexico/people-blogs/to...,people-blogs,Mexico
5,https://us.youtubers.me/mexico/music/top-1000-...,music,Mexico
6,https://us.youtubers.me/mexico/comedy/top-1000...,comedy,Mexico
7,https://us.youtubers.me/mexico/gaming/top-1000...,gaming,Mexico
8,https://us.youtubers.me/mexico/shows/top-1000-...,shows,Mexico
9,https://us.youtubers.me/mexico/all/top-1000-yo...,all,Mexico


In [16]:
# category별 url 리스트 합치기
dfs = []

# 데이터프레임 생성 및 리스트에 추가
for country, url in country_dic.items():
    print("나라 url: ", country, url)
    extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
    df_category = create_dataframe(extracted_strings, country)
    dfs.append(df_category)

# 반복문이 끝난 후에 데이터프레임들을 한 번에 결합
country_category_df = pd.concat(dfs, axis=0, ignore_index=True)

# 결과 출력 및 엑셀 저장
print(country_category_df)
country_category_df.to_excel("country_category_url.xlsx", index=False)
country_category_df

나라 url:  United States https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states
나라 url:  South Korea https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of
나라 url:  Germany https://us.youtubers.me/germany/all/top-1000-youtube-channels-in-germany
나라 url:  United Kingdom https://us.youtubers.me/united-kingdom/all/top-1000-youtube-channels-in-united-kingdom
나라 url:  Brazil https://us.youtubers.me/brazil/all/top-1000-youtube-channels-in-brazil
나라 url:  Mexico https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico
나라 url:  Spain https://us.youtubers.me/spain/all/top-1000-youtube-channels-in-spain
나라 url:  Italy https://us.youtubers.me/italy/all/top-1000-youtube-channels-in-italy
나라 url:  Czech Republic https://us.youtubers.me/czech-republic/all/top-1000-youtube-channels-in-czech-republic
나라 url:  Russia https://us.youtubers.me/russian-federation/all/top-1000-youtube-channels-in-russian-federation
나라 url:  Indi

Unnamed: 0,url,category,country
0,https://us.youtubers.me/united-states/educatio...,education,United States
1,https://us.youtubers.me/united-states/people-b...,people-blogs,United States
2,https://us.youtubers.me/united-states/sports/t...,sports,United States
3,https://us.youtubers.me/united-states/entertai...,entertainment,United States
4,https://us.youtubers.me/united-states/film-ani...,film-animation,United States
...,...,...,...
341,https://us.youtubers.me/romania/autos-vehicles...,autos-vehicles,Romania
342,https://us.youtubers.me/romania/science-techno...,science-technology,Romania
343,https://us.youtubers.me/romania/travel-events/...,travel-events,Romania
344,https://us.youtubers.me/romania/shows/top-1000...,shows,Romania


### YouTuberme Dataframe Final

In [73]:
# 전체 카테고리 별 채널 수집
country_category_df = pd.read_excel("country_category_url.xlsx")
print(country_category_df)
# url 돌면서 필요한 데이터 
for url, country in zip(country_category_df['url'].to_list(), country_category_df['country'].to_list()):    
    try:
        df_new = collect_youtuberme_basic(url, country)
        dfs.append(df_new)
        result_df = pd.concat(dfs, axis=0, ignore_index=True)
    except Exception as err:
        print(err, url)
        pass

# 중복 제거
result_df = result_df.drop_duplicates('Youtuber')
result_df = result_df.drop("rank", axis=1)
# YouTube URL 저장
result_df['youtube url'] = [url[:-7] for url in  result_df['url'].to_list()]

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
341  https://us.youtubers.me/romania/autos-vehicles...       autos-vehicles   
342  https://us.youtubers.me/romania/science-techno...   science-technology   
343  https://us.youtubers.me/romania/travel-events/...        travel-events   
344  https://us.youtubers.me/romania/shows/top-1000...                shows   
345  https://us.youtubers.me/romania/nonprofits-act...  nonprofits-activism   

           country  
0    United States  
1    Unit

  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 4
url_list:  4


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 95
url_list:  95


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 284




url_list:  284


  df_top1000 = pd.read_html(str(table))[0]


df1000: 701




url_list:  701


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 633




url_list:  633


  df_top1000 = pd.read_html(str(table))[0]


df1000: 174




url_list:  174


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 529




url_list:  529


  df_top1000 = pd.read_html(str(table))[0]


df1000: 473




url_list:  473


  df_top1000 = pd.read_html(str(table))[0]


df1000: 513




url_list:  513


  df_top1000 = pd.read_html(str(table))[0]


df1000: 662




url_list:  662


  df_top1000 = pd.read_html(str(table))[0]


df1000: 7
url_list:  7


  df_top1000 = pd.read_html(str(table))[0]


df1000: 674




url_list:  674


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 91
url_list:  91


  df_top1000 = pd.read_html(str(table))[0]


df1000: 228
url_list:  228


  df_top1000 = pd.read_html(str(table))[0]


df1000: 666




url_list:  666


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 934




url_list:  934


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 519




url_list:  519


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


  df_top1000 = pd.read_html(str(table))[0]


df1000: 156
url_list:  156


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 397




url_list:  397


  df_top1000 = pd.read_html(str(table))[0]


df1000: 402




url_list:  402


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 314




url_list:  314


  df_top1000 = pd.read_html(str(table))[0]


df1000: 490




url_list:  490


  df_top1000 = pd.read_html(str(table))[0]


df1000: 895




url_list:  895


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 987




url_list:  987


  df_top1000 = pd.read_html(str(table))[0]


df1000: 4
url_list:  4


  df_top1000 = pd.read_html(str(table))[0]


df1000: 722




url_list:  722


  df_top1000 = pd.read_html(str(table))[0]


df1000: 246




url_list:  246


  df_top1000 = pd.read_html(str(table))[0]


df1000: 236




url_list:  236




('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) https://us.youtubers.me/brazil/education/top-1000-education-youtube-channels-in-brazil


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 522




url_list:  522


  df_top1000 = pd.read_html(str(table))[0]


df1000: 274




url_list:  274


  df_top1000 = pd.read_html(str(table))[0]


df1000: 862




url_list:  862


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 632




url_list:  632


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 576




url_list:  576


  df_top1000 = pd.read_html(str(table))[0]


df1000: 454




url_list:  454


  df_top1000 = pd.read_html(str(table))[0]


df1000: 271




url_list:  271


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 384




url_list:  384


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 11
url_list:  11


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 591




url_list:  591


  df_top1000 = pd.read_html(str(table))[0]


df1000: 416




url_list:  416


  df_top1000 = pd.read_html(str(table))[0]


df1000: 111
url_list:  111


  df_top1000 = pd.read_html(str(table))[0]


df1000: 59
url_list:  59


  df_top1000 = pd.read_html(str(table))[0]


df1000: 221




url_list:  221


  df_top1000 = pd.read_html(str(table))[0]


df1000: 42
url_list:  42


  df_top1000 = pd.read_html(str(table))[0]


df1000: 165




url_list:  165


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000




('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')) https://us.youtubers.me/spain/gaming/top-1000-gaming-youtube-channels-in-spain


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 480




url_list:  480


  df_top1000 = pd.read_html(str(table))[0]


df1000: 405




url_list:  405


  df_top1000 = pd.read_html(str(table))[0]


df1000: 601




url_list:  601


  df_top1000 = pd.read_html(str(table))[0]


df1000: 333
url_list:  333


  df_top1000 = pd.read_html(str(table))[0]


df1000: 367




url_list:  367


  df_top1000 = pd.read_html(str(table))[0]


df1000: 717




url_list:  717


  df_top1000 = pd.read_html(str(table))[0]


df1000: 114
url_list:  114


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 370




url_list:  370


  df_top1000 = pd.read_html(str(table))[0]


df1000: 196




url_list:  196


  df_top1000 = pd.read_html(str(table))[0]


df1000: 287




url_list:  287


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 340




url_list:  340


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 383




url_list:  383


  df_top1000 = pd.read_html(str(table))[0]


df1000: 667




url_list:  667


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 214




url_list:  214


  df_top1000 = pd.read_html(str(table))[0]


df1000: 230
url_list:  230


  df_top1000 = pd.read_html(str(table))[0]


df1000: 395




url_list:  395


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 315
url_list:  315


  df_top1000 = pd.read_html(str(table))[0]


df1000: 111
url_list:  111


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 268




url_list:  268


  df_top1000 = pd.read_html(str(table))[0]


df1000: 55
url_list:  55


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 970




url_list:  970


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 33




url_list:  33


  df_top1000 = pd.read_html(str(table))[0]


df1000: 911




url_list:  911


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 728




url_list:  728


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 790




url_list:  790


  df_top1000 = pd.read_html(str(table))[0]


df1000: 240




url_list:  240


  df_top1000 = pd.read_html(str(table))[0]


df1000: 379




url_list:  379


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 969




url_list:  969


  df_top1000 = pd.read_html(str(table))[0]


df1000: 431




url_list:  431


  df_top1000 = pd.read_html(str(table))[0]


df1000: 773




url_list:  773


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 761




url_list:  761


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 485




url_list:  485


  df_top1000 = pd.read_html(str(table))[0]


df1000: 568




url_list:  568


  df_top1000 = pd.read_html(str(table))[0]


df1000: 437
url_list:  437


  df_top1000 = pd.read_html(str(table))[0]


df1000: 855




url_list:  855


  df_top1000 = pd.read_html(str(table))[0]


df1000: 839




url_list:  839


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 144
url_list:  144


  df_top1000 = pd.read_html(str(table))[0]


df1000: 403




url_list:  403


  df_top1000 = pd.read_html(str(table))[0]


df1000: 49
url_list:  49


  df_top1000 = pd.read_html(str(table))[0]


df1000: 182




url_list:  182


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 409




url_list:  409


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 786




url_list:  786


  df_top1000 = pd.read_html(str(table))[0]


df1000: 941




url_list:  941


  df_top1000 = pd.read_html(str(table))[0]


df1000: 277




url_list:  277


  df_top1000 = pd.read_html(str(table))[0]


df1000: 656




url_list:  656


  df_top1000 = pd.read_html(str(table))[0]


df1000: 724




url_list:  724


  df_top1000 = pd.read_html(str(table))[0]


df1000: 822




url_list:  822


  df_top1000 = pd.read_html(str(table))[0]


df1000: 504




url_list:  504


  df_top1000 = pd.read_html(str(table))[0]


df1000: 974




url_list:  974


  df_top1000 = pd.read_html(str(table))[0]


df1000: 263




url_list:  263


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 137
url_list:  137


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 468




url_list:  468


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 599




url_list:  599


  df_top1000 = pd.read_html(str(table))[0]


df1000: 309




url_list:  309


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 530




url_list:  530


  df_top1000 = pd.read_html(str(table))[0]


df1000: 140




url_list:  140


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 556




url_list:  556


  df_top1000 = pd.read_html(str(table))[0]


df1000: 286
url_list:  286


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 132
url_list:  132


  df_top1000 = pd.read_html(str(table))[0]


df1000: 62
url_list:  62


  df_top1000 = pd.read_html(str(table))[0]


df1000: 275




url_list:  275


  df_top1000 = pd.read_html(str(table))[0]


df1000: 439




url_list:  439


  df_top1000 = pd.read_html(str(table))[0]


df1000: 343




url_list:  343


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 981




url_list:  981


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 240




url_list:  240


  df_top1000 = pd.read_html(str(table))[0]


df1000: 495
url_list:  495


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 152




url_list:  152


  df_top1000 = pd.read_html(str(table))[0]


df1000: 615




url_list:  615


  df_top1000 = pd.read_html(str(table))[0]


df1000: 205




url_list:  205


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 220




url_list:  220


  df_top1000 = pd.read_html(str(table))[0]


df1000: 321




url_list:  321


  df_top1000 = pd.read_html(str(table))[0]


df1000: 102
url_list:  102


  df_top1000 = pd.read_html(str(table))[0]


df1000: 52
url_list:  52


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 528




url_list:  528


  df_top1000 = pd.read_html(str(table))[0]


df1000: 124
url_list:  124


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 76




url_list:  76


  df_top1000 = pd.read_html(str(table))[0]


df1000: 144
url_list:  144


  df_top1000 = pd.read_html(str(table))[0]


df1000: 99
url_list:  99


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 240




url_list:  240


  df_top1000 = pd.read_html(str(table))[0]


df1000: 222
url_list:  222


  df_top1000 = pd.read_html(str(table))[0]


df1000: 557




url_list:  557


  df_top1000 = pd.read_html(str(table))[0]


df1000: 263




url_list:  263


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 104
url_list:  104


  df_top1000 = pd.read_html(str(table))[0]


df1000: 77
url_list:  77


  df_top1000 = pd.read_html(str(table))[0]


df1000: 392




url_list:  392


  df_top1000 = pd.read_html(str(table))[0]


df1000: 400




url_list:  400


  df_top1000 = pd.read_html(str(table))[0]


df1000: 196
url_list:  196


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 99




url_list:  99


  df_top1000 = pd.read_html(str(table))[0]


df1000: 230




url_list:  230


  df_top1000 = pd.read_html(str(table))[0]


df1000: 208




url_list:  208


  df_top1000 = pd.read_html(str(table))[0]


df1000: 576




url_list:  576


  df_top1000 = pd.read_html(str(table))[0]


df1000: 78
url_list:  78


  df_top1000 = pd.read_html(str(table))[0]


df1000: 120




url_list:  120


  df_top1000 = pd.read_html(str(table))[0]


df1000: 187
url_list:  187


  df_top1000 = pd.read_html(str(table))[0]


df1000: 353




url_list:  353


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 269
url_list:  269


  df_top1000 = pd.read_html(str(table))[0]


df1000: 159
url_list:  159


  df_top1000 = pd.read_html(str(table))[0]


df1000: 80




url_list:  80


  df_top1000 = pd.read_html(str(table))[0]


df1000: 522




url_list:  522


  df_top1000 = pd.read_html(str(table))[0]


df1000: 414




url_list:  414


  df_top1000 = pd.read_html(str(table))[0]


df1000: 377




url_list:  377


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 139
url_list:  139


  df_top1000 = pd.read_html(str(table))[0]


df1000: 49
url_list:  49


  df_top1000 = pd.read_html(str(table))[0]


df1000: 149
url_list:  149


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 89
url_list:  89


  df_top1000 = pd.read_html(str(table))[0]


df1000: 149
url_list:  149


  df_top1000 = pd.read_html(str(table))[0]


df1000: 347




url_list:  347


  df_top1000 = pd.read_html(str(table))[0]


df1000: 170
url_list:  170


  df_top1000 = pd.read_html(str(table))[0]


df1000: 174
url_list:  174


  df_top1000 = pd.read_html(str(table))[0]


df1000: 33
url_list:  33


  df_top1000 = pd.read_html(str(table))[0]


df1000: 5
url_list:  5


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 914




url_list:  914


  df_top1000 = pd.read_html(str(table))[0]


df1000: 198
url_list:  198


  df_top1000 = pd.read_html(str(table))[0]


df1000: 982




url_list:  982


  df_top1000 = pd.read_html(str(table))[0]


df1000: 212
url_list:  212


  df_top1000 = pd.read_html(str(table))[0]


df1000: 116




url_list:  116


  df_top1000 = pd.read_html(str(table))[0]


df1000: 95
url_list:  95


  df_top1000 = pd.read_html(str(table))[0]


df1000: 653




url_list:  653


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 190
url_list:  190


  df_top1000 = pd.read_html(str(table))[0]


df1000: 365




url_list:  365


  df_top1000 = pd.read_html(str(table))[0]


df1000: 88
url_list:  88


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 28




url_list:  28


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 66
url_list:  66


  df_top1000 = pd.read_html(str(table))[0]


df1000: 9
url_list:  9


  df_top1000 = pd.read_html(str(table))[0]


df1000: 224




url_list:  224


  df_top1000 = pd.read_html(str(table))[0]


df1000: 965




url_list:  965


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 473




url_list:  473


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 524




url_list:  524


  df_top1000 = pd.read_html(str(table))[0]


df1000: 167




url_list:  167


  df_top1000 = pd.read_html(str(table))[0]


df1000: 112




url_list:  112


  df_top1000 = pd.read_html(str(table))[0]


df1000: 154
url_list:  154


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 159
url_list:  159


  df_top1000 = pd.read_html(str(table))[0]


df1000: 75
url_list:  75


  df_top1000 = pd.read_html(str(table))[0]


df1000: 125
url_list:  125


  df_top1000 = pd.read_html(str(table))[0]


df1000: 65




url_list:  65


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 30
url_list:  30


  df_top1000 = pd.read_html(str(table))[0]


df1000: 34
url_list:  34


  df_top1000 = pd.read_html(str(table))[0]


df1000: 647
url_list:  647


  df_top1000 = pd.read_html(str(table))[0]


df1000: 152
url_list:  152


  df_top1000 = pd.read_html(str(table))[0]


df1000: 611




url_list:  611


  df_top1000 = pd.read_html(str(table))[0]


df1000: 670




url_list:  670


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 179




url_list:  179


  df_top1000 = pd.read_html(str(table))[0]


df1000: 155




url_list:  155


  df_top1000 = pd.read_html(str(table))[0]


df1000: 134




url_list:  134


  df_top1000 = pd.read_html(str(table))[0]


df1000: 669




url_list:  669


  df_top1000 = pd.read_html(str(table))[0]


df1000: 191




url_list:  191


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 74




url_list:  74


  df_top1000 = pd.read_html(str(table))[0]


df1000: 87
url_list:  87


  df_top1000 = pd.read_html(str(table))[0]


df1000: 69
url_list:  69


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 496




url_list:  496


  df_top1000 = pd.read_html(str(table))[0]


df1000: 280




url_list:  280


  df_top1000 = pd.read_html(str(table))[0]


df1000: 122
url_list:  122


  df_top1000 = pd.read_html(str(table))[0]


df1000: 288




url_list:  288


  df_top1000 = pd.read_html(str(table))[0]


df1000: 373




url_list:  373


  df_top1000 = pd.read_html(str(table))[0]


df1000: 148
url_list:  148


  df_top1000 = pd.read_html(str(table))[0]


df1000: 97
url_list:  97


  df_top1000 = pd.read_html(str(table))[0]


df1000: 65




url_list:  65


  df_top1000 = pd.read_html(str(table))[0]


df1000: 114




url_list:  114


  df_top1000 = pd.read_html(str(table))[0]


df1000: 108




url_list:  108


  df_top1000 = pd.read_html(str(table))[0]


df1000: 62




url_list:  62


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 41




url_list:  41


  df_top1000 = pd.read_html(str(table))[0]


df1000: 61




url_list:  61


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 840




url_list:  840


  df_top1000 = pd.read_html(str(table))[0]


df1000: 751




url_list:  751


  df_top1000 = pd.read_html(str(table))[0]


df1000: 751




url_list:  751


  df_top1000 = pd.read_html(str(table))[0]


df1000: 22
url_list:  22


  df_top1000 = pd.read_html(str(table))[0]


df1000: 388




url_list:  388


  df_top1000 = pd.read_html(str(table))[0]


df1000: 161
url_list:  161


  df_top1000 = pd.read_html(str(table))[0]


df1000: 147
url_list:  147


  df_top1000 = pd.read_html(str(table))[0]


df1000: 122




url_list:  122


  df_top1000 = pd.read_html(str(table))[0]


df1000: 233




url_list:  233


  df_top1000 = pd.read_html(str(table))[0]


df1000: 106
url_list:  106


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 145




url_list:  145


  df_top1000 = pd.read_html(str(table))[0]


df1000: 48
url_list:  48


  df_top1000 = pd.read_html(str(table))[0]


df1000: 56
url_list:  56


  df_top1000 = pd.read_html(str(table))[0]


df1000: 88
url_list:  88


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15




url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 649




url_list:  649


  df_top1000 = pd.read_html(str(table))[0]


df1000: 89




url_list:  89


  df_top1000 = pd.read_html(str(table))[0]


df1000: 585




url_list:  585


  df_top1000 = pd.read_html(str(table))[0]


df1000: 104
url_list:  104


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000




url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 441




url_list:  441


  df_top1000 = pd.read_html(str(table))[0]


df1000: 512




url_list:  512


  df_top1000 = pd.read_html(str(table))[0]


df1000: 19
url_list:  19


  df_top1000 = pd.read_html(str(table))[0]


df1000: 80




url_list:  80


  df_top1000 = pd.read_html(str(table))[0]


df1000: 74
url_list:  74


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 93
url_list:  93


  df_top1000 = pd.read_html(str(table))[0]


df1000: 61




url_list:  61


  df_top1000 = pd.read_html(str(table))[0]


df1000: 34




url_list:  34


  df_top1000 = pd.read_html(str(table))[0]


df1000: 27
url_list:  27


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


In [75]:
# column명 변경
result_df = result_df.rename(columns={
    'url': 'youtuberme_url',
    'Youtuber': 'channel_name',
    'video views': 'total_video_views',
    'video count': 'total_video_count',
    'youtube url': 'yt_url'
})

In [76]:
# subscribers, total_video_count가 0인 경우 채널이 삭제된 경우이므로 제거
result_df = result_df[result_df['subscribers'] != 0]
result_df = result_df[result_df['total_video_count'] != 0]

In [79]:
result_df.columns

Index(['youtuberme_url', 'category', 'country', 'channel_name', 'subscribers',
       'total_video_views', 'total_video_count', 'started', 'yt_url'],
      dtype='object')

In [81]:
combined_df = pd.concat([previous_df, result_df])

# Drop duplicates based on 'channel_name' and keep only the unique entries
added_df = combined_df.drop_duplicates(subset=['channel_name'], keep=False)

# Display the new DataFrame with unique channel names
print(added_df)
unique_df = combined_df.drop_duplicates(subset=['channel_name'])
unique_df

                                           youtuberme_url            category  \
125     https://us.youtubers.me//edu-car-toy-school/yo...           education   
175     https://us.youtubers.me//theliberty/youtuber-s...           education   
261     https://us.youtubers.me//factsjunkie/youtuber-...           education   
262     https://us.youtubers.me//b-n-linh-dan-ong/yout...           education   
271       https://us.youtubers.me//shaddyz/youtuber-stats           education   
...                                                   ...                 ...   
328834  https://us.youtubers.me//6d4af462-1fe0-4cea-b3...           education   
328845  https://us.youtubers.me//top5quantum/youtuber-...           education   
328926  https://us.youtubers.me//wunder-haff/youtuber-...  science-technology   
328938     https://us.youtubers.me//idei3d/youtuber-stats  science-technology   
328940  https://us.youtubers.me//helium-network-projec...  science-technology   

              country     c

Unnamed: 0,youtuberme_url,category,country,channel_name,subscribers,total_video_views,total_video_count,started,yt_url
0,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,80400000.0,4.818430e+10,3220.0,2011.0,https://us.youtubers.me//pinkfong-kids-songs-s...
1,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,40200000.0,3.012461e+10,636.0,2016.0,https://us.youtubers.me//genevieve-s-playhouse...
2,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,35700000.0,2.825209e+10,2405.0,2016.0,https://us.youtubers.me//babybus-kids-tv-songs...
3,https://us.youtubers.me//blippi/youtuber-stats,education,United States,Blippi - Educational Videos for Kids,19800000.0,1.624648e+10,928.0,2014.0,https://us.youtubers.me//blippi/youtube
4,https://us.youtubers.me//dave-and-ava-nursery-...,education,United States,Dave and Ava - Nursery Rhymes and Baby Songs,15600000.0,1.135529e+10,975.0,2014.0,https://us.youtubers.me//dave-and-ava-nursery-...
...,...,...,...,...,...,...,...,...,...
328834,https://us.youtubers.me//6d4af462-1fe0-4cea-b3...,education,Romania,Fire Media,4320.0,3.007470e+05,220.0,2014.0,https://us.youtubers.me//6d4af462-1fe0-4cea-b3...
328845,https://us.youtubers.me//top5quantum/youtuber-...,education,Romania,Azad Unleashed,60000.0,4.549000e+03,28.0,2014.0,https://us.youtubers.me//top5quantum/youtube
328926,https://us.youtubers.me//wunder-haff/youtuber-...,science-technology,Romania,AtelierulTauro,9560.0,9.808417e+06,876.0,2010.0,https://us.youtubers.me//wunder-haff/youtube
328938,https://us.youtubers.me//idei3d/youtuber-stats,science-technology,Romania,idei3D,20800.0,9.819300e+05,72.0,2015.0,https://us.youtubers.me//idei3d/youtube


In [123]:
# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')
file_path = f"Youtube_Data_{today_date}.xlsx"
if pd.isna(result_df['channel_name'].iloc[0]):
    # Drop the first row
    unique_df = unique_df.drop(result_df.index[0])

# 쓸모 없는 채널 제거
unique_df = unique_df[~((unique_df['subscribers'] <= 3000) & (unique_df['started'] <= 2013))]
unique_df = unique_df[(unique_df['subscribers'] >= 3000) & unique_df['total_video_count'] > 10]
# Save the DataFrame to Excel with today's date in the filename
updated_df.to_excel(file_path, index=False)



In [113]:
#past_df = pd.read_excel("Youtube_Data_2024-05-14.xlsx")
# past_df에 없는 unique_df의 데이터를 channel_name 기준으로 찾는 코드
new_data = unique_df[~unique_df['channel_name'].isin(past_df['channel_name'])]

updated_df = pd.concat([past_df, new_data], ignore_index=True)

# 새로 추가된 데이터를 확인
print("Newly added rows based on 'channel_name':")
print(new_data)

Newly added rows based on 'channel_name':
                                           youtuberme_url            category  \
346     https://us.youtubers.me//abckidtv-nursery-rhym...           education   
394     https://us.youtubers.me//chiki-toonz-musica-in...           education   
448     https://us.youtubers.me//como-kids-tv-cartoon-...           education   
515     https://us.youtubers.me//theliberty/youtuber-s...           education   
522     https://us.youtubers.me//doctor-er/youtuber-stats           education   
...                                                   ...                 ...   
328834  https://us.youtubers.me//6d4af462-1fe0-4cea-b3...           education   
328845  https://us.youtubers.me//top5quantum/youtuber-...           education   
328926  https://us.youtubers.me//wunder-haff/youtuber-...  science-technology   
328938     https://us.youtubers.me//idei3d/youtuber-stats  science-technology   
328940  https://us.youtubers.me//helium-network-projec...  science-

In [119]:
# 쓸모 없는 채널 제거
updated_df = updated_df[~((updated_df['subscribers'] <= 3000) & (updated_df['started'] <= 2013))]
updated_df = updated_df[updated_df['total_video_count'] > 10]
updated_df

Unnamed: 0,youtuberme_url,category,country,channel_name,subscribers,total_video_views,total_video_count,started,yt_url
0,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,80400000.0,4.818430e+10,3220.0,2011.0,https://us.youtubers.me//pinkfong-kids-songs-s...
1,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,40200000.0,3.012461e+10,636.0,2016.0,https://us.youtubers.me//genevieve-s-playhouse...
2,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,35700000.0,2.825209e+10,2405.0,2016.0,https://us.youtubers.me//babybus-kids-tv-songs...
3,https://us.youtubers.me//blippi/youtuber-stats,education,United States,Blippi - Educational Videos for Kids,19800000.0,1.624648e+10,928.0,2014.0,https://us.youtubers.me//blippi/youtube
4,https://us.youtubers.me//dave-and-ava-nursery-...,education,United States,Dave and Ava - Nursery Rhymes and Baby Songs,15600000.0,1.135529e+10,975.0,2014.0,https://us.youtubers.me//dave-and-ava-nursery-...
...,...,...,...,...,...,...,...,...,...
200761,https://us.youtubers.me//6d4af462-1fe0-4cea-b3...,education,Romania,Fire Media,4320.0,3.007470e+05,220.0,2014.0,https://us.youtubers.me//6d4af462-1fe0-4cea-b3...
200762,https://us.youtubers.me//top5quantum/youtuber-...,education,Romania,Azad Unleashed,60000.0,4.549000e+03,28.0,2014.0,https://us.youtubers.me//top5quantum/youtube
200763,https://us.youtubers.me//wunder-haff/youtuber-...,science-technology,Romania,AtelierulTauro,9560.0,9.808417e+06,876.0,2010.0,https://us.youtubers.me//wunder-haff/youtube
200764,https://us.youtubers.me//idei3d/youtuber-stats,science-technology,Romania,idei3D,20800.0,9.819300e+05,72.0,2015.0,https://us.youtubers.me//idei3d/youtube


### 수집 채널 URL 추가----------------------------------------------- 여긴 무시

In [None]:
import schedule
import time
from sqlalchemy import create_engine

# 처음 DB에 추가할때
def append_channel(dataframe):
    engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
    conn = engine.connect()

    dataframe.to_sql(name='channel', con=engine, if_exists='append', index=False)
    conn.close()

In [None]:
# db 저장
append_channel(result_df) 

In [None]:
result_df.dropna(subset=['url'], inplace=True)
result_df.to_excel("yotube_list.xlsx")

In [None]:
# weekly 업로드 진행
def update_channel(dataframe):
    engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
    conn = engine.connect()
    sql_query = 'SELECT * FROM channel'
    df = pd.read_sql(sql_query, engine)
    
    dataframe.to_sql(name='channel', con=engine, if_exists='update', index=False)
    conn.close()

### Shorts 수집

In [None]:
refine_df

In [None]:
# 기존에 존재하다던 데이터 update 해서 다시 데이터 넣기
engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
existing_data = pd.read_sql('SELECT * FROM channel', engine)

# Check for duplicates based on the 'ID' column
duplicates = existing_data[existing_data['Youtuber'].isin(new_data['Youtuber'])]

# Update existing rows with new data
existing_data.update(new_data)

# Filter out rows that are duplicates
new_rows = new_data[~new_data['Youtuber'].isin(duplicates['Youtuber'])]

# Append new rows to the existing data
merged_data = pd.concat([existing_data, new_rows], ignore_index=True)

# Write the merged data back to the database
merged_data.to_sql('channel', engine, if_exists='replace', index=False)