### 채널 수집 (youtubersme로 기본)

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [3]:
# url 포함 youtuberme df 생성
def collect_youtuberme_url(category_url):
    page = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(page.content, "html.parser")

    # Find the table with class "top-charts"
    table = soup.find("table", class_="top-charts")

    # Find all <a> tags within the table
    href_list = []
    if table:
        for a_tag in table.find_all("a"):
            href_value = a_tag.get("href")
            if href_value.endswith("/youtuber-stats"):
                href_list.append("https://us.youtubers.me/" + href_value)
    return href_list

In [4]:
# 여러 카테고리 확인
def crawl_table_urls_extracted(start_url, table_class='top-charts', depth=1):
    visited_urls = set()
    excluded_urls = []
    extracted_strings = {}

    def extract_string_between_substrings(url, start_substring, end_substring):
        start_index = url.find(start_substring)
        end_index = url.find(end_substring, start_index + len(start_substring))
        if start_index != -1 and end_index != -1:
            return url[start_index + len(start_substring):end_index]
        return None

    def recursive_crawl(url, current_depth):
        if current_depth > depth:
            return

        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find the table with the specified class
                table = soup.find('table', class_=table_class)

                if table:
                    # Extract href attributes from anchor tags within the table
                    for row in table.find_all('tr'):
                        columns = row.find_all('td')
                        for col_index, col in enumerate(columns):
                            # Exclude href attributes from the column with the name "category"
                            if col.get_text(strip=True).lower() == 'category':
                                continue

                            hrefs = [a.get('href') for a in col.find_all('a', href=True)]

                            # Process the hrefs
                            for href in hrefs:
                                absolute_url = urljoin(url, href)

                                # Save URLs without "korea-republic-of"
                                if start_url[-8:] not in absolute_url.lower():
                                    excluded_urls.append(absolute_url)
                                else:
                                    visited_urls.add(absolute_url)

                                    # Extract and save strings between specified substrings
                                    extracted_string = extract_string_between_substrings(
                                        absolute_url,
                                        start_url[:28],
                                        '/top-1000-'
                                    )
                                    if extracted_string:
                                        # Create a set for each visited URL to remove duplicates
                                        extracted_strings.setdefault(absolute_url, set()).add(extracted_string)

                                    # Avoid revisiting the same URL
                                    if absolute_url not in visited_urls:
                                        # Recursively crawl the next level
                                        recursive_crawl(absolute_url, current_depth + 1)

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    recursive_crawl(start_url, 1)

    # Convert sets to lists
    excluded_urls = list(set(excluded_urls))
    visited_urls = list(set(visited_urls))

    # Convert sets to lists within the extracted_strings dictionary
    for url, strings_set in extracted_strings.items():
        extracted_strings[url] = list(strings_set)
    updated_data = {}
    for key, value in extracted_strings.items():
        split_value = value[0].split('/')
        if len(split_value) >= 2:
            updated_value = split_value[1]
            updated_data[key] = [updated_value]
    return updated_data

In [5]:
# start url: 나라별 top 1000
# 30 개국
country_dic = {"United States": "https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states",
               "Germany":"https://us.youtubers.me/germany/all/top-1000-youtube-channels-in-germany",
               "United Kingdom":"https://us.youtubers.me/united-kingdom/all/top-1000-youtube-channels-in-united-kingdom",
               "Brazil": "https://us.youtubers.me/brazil/all/top-1000-youtube-channels-in-brazil",
               "Mexico" : "https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico",
               "Spain": "https://us.youtubers.me/spain/all/top-1000-youtube-channels-in-spain",
               "Italy" : "https://us.youtubers.me/italy/all/top-1000-youtube-channels-in-italy",
               "Czech Republic": "https://us.youtubers.me/czech-republic/all/top-1000-youtube-channels-in-czech-republic",
               "Russia":"https://us.youtubers.me/russian-federation/all/top-1000-youtube-channels-in-russian-federation",
               "India" : "https://us.youtubers.me/india/all/top-1000-youtube-channels-in-india",
               "France": "https://us.youtubers.me/france/all/top-1000-youtube-channels-in-france",
               "Japan" : "https://us.youtubers.me/japan/all/top-1000-youtube-channels-in-japan",
               "Turkey": "https://us.youtubers.me/turkey/all/top-1000-youtube-channels-in-turkey",
               "South Korea": "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of",
               "Poland": "https://us.youtubers.me/poland/all/top-1000-youtube-channels-in-poland",
               "Canada" : "https://us.youtubers.me/canada/all/top-1000-youtube-channels-in-canada",
               "Vietnam" : "https://us.youtubers.me/viet-nam/all/top-1000-youtube-channels-in-viet-nam",
               "Thailand" : "https://us.youtubers.me/thailand/all/top-1000-youtube-channels-in-thailand",
               "Indonesia" : "https://us.youtubers.me/indonesia/all/top-1000-youtube-channels-in-indonesia",
               "Ukraine" : "https://us.youtubers.me/ukraine/all/top-1000-youtube-channels-in-ukraine",
               "Morocco" : "https://us.youtubers.me/morocco/all/top-1000-youtube-channels-in-morocco",
               "Argentina" : "https://us.youtubers.me/argentina/all/top-1000-youtube-channels-in-argentina",
               "Saudi Arabia": "https://us.youtubers.me/saudi-arabia/all/top-1000-youtube-channels-in-saudi-arabia",
               "Netherlands": "https://us.youtubers.me/netherlands/all/top-1000-youtube-channels-in-netherlands",
               "Egypt": "https://us.youtubers.me/egypt/all/top-1000-youtube-channels-in-egypt",
               "Taiwan": "https://us.youtubers.me/taiwan/all/top-1000-youtube-channels-in-taiwan",
               "Australia": "https://us.youtubers.me/australia/all/top-1000-youtube-channels-in-australia",
               "Greece": "https://us.youtubers.me/greece/all/top-1000-youtube-channels-in-greece",
               "Colombia": "https://us.youtubers.me/colombia/all/top-1000-youtube-channels-in-colombia",
               "Romania" : "https://us.youtubers.me/romania/all/top-1000-youtube-channels-in-romania"
              }

In [6]:
# category 통일(union 편하게)
def category_preprocessing(df):
    df['category'] = df['category'].replace('nan', 'all')

    # Mapping of old categories to new categories
    category_mapping = {'Film & Animation': 'film-animation',
                        'Autos & Vehicles': 'autos-vehicles',
                        'Music': 'music',
                        'Movies': 'movies',
                        'Pets & Animals': 'pets-animals',
                        'Sports': 'sports',
                        'Travel & Events': 'travel-events',
                        'Gaming': 'gaming',
                        'People & Blogs': 'people-blogs',
                        'Comedy': 'comedy',
                        'Entertainment': 'entertainment',
                        'News & Politics': 'news-politics',
                        'Howto & Style': 'howto-style',
                        'Education': 'education',
                        'Science & Technology': 'science-technology',
                        'Shows': 'shows',
                        'Nonprofits & Activism': 'nonprofits-activism',
                        'all': 'all'}

    # Map the old categories to the new categories
    df['category'] = df['category'].map(category_mapping)
    return df

# category df 생성
def create_dataframe(extracted_strings, country):
    data = {'url': [], 'category': [], 'country' : []}
    for url, categories in extracted_strings.items():
        for category in categories:
            data['url'].append(url)
            data['category'].append(category)
            data['country'].append(country)
    df = pd.DataFrame(data)
    return df


In [7]:
# youtuberme df 생성
def collect_youtuberme_basic(url, country):
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'lxml')

    # 현재 페이지에서 table 태그 모두 선택하기
    table1 = soup.select('table')

    # 하나의 테이블 태그 선택하기
    table = table1[0]

    df_top1000 = pd.read_html(str(table))[0]
    print("df1000:", len(df_top1000))
    href_list = collect_youtuberme_url(url)
    df_top1000['url'] = href_list
    df_top1000['country'] = [country for x in range(len(df_top1000))]
    print("url_list: ", len(href_list))
    df_top1000 = category_preprocessing(df_top1000)
    return df_top1000

In [8]:
# test
url = "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of"
start_url_us = 'https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico'
extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
print(extracted_strings)
df_category = create_dataframe(extracted_strings, "korea")
# Display the DataFrame
print(df_category)

{'https://us.youtubers.me/korea-republic-of/people-blogs/top-1000-people-blogs-youtube-channels-in-korea-republic-of': ['people-blogs'], 'https://us.youtubers.me/korea-republic-of/music/top-1000-music-youtube-channels-in-korea-republic-of': ['music'], 'https://us.youtubers.me/korea-republic-of/entertainment/top-1000-entertainment-youtube-channels-in-korea-republic-of': ['entertainment'], 'https://us.youtubers.me/korea-republic-of/gaming/top-1000-gaming-youtube-channels-in-korea-republic-of': ['gaming'], 'https://us.youtubers.me/korea-republic-of/news-politics/top-1000-news-politics-youtube-channels-in-korea-republic-of': ['news-politics'], 'https://us.youtubers.me/korea-republic-of/sports/top-1000-sports-youtube-channels-in-korea-republic-of': ['sports'], 'https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of': ['all'], 'https://us.youtubers.me/korea-republic-of/science-technology/top-1000-science-technology-youtube-channels-in-korea-republic-of':

### 각 유명 채널별 카테고리 링크 저장
- 나라별 17개 카테고리 URL 저장 (1개는 all)

In [9]:
# category별 url 리스트 합치기
dfs = []
for country, url in country_dic.items():
    print(country, url)
    extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
    df_category = create_dataframe(extracted_strings, country)
    dfs.append(df_category)
    country_category_df = pd.concat(dfs, axis=0, ignore_index=True)
    print(country_category_df)
# 전체 URL 저장
country_category_df.to_excel("country_category_url.xlsx", index=False)
country_category_df

United States https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states
                                                  url            category  \
0   https://us.youtubers.me/united-states/educatio...           education   
1   https://us.youtubers.me/united-states/people-b...        people-blogs   
2   https://us.youtubers.me/united-states/sports/t...              sports   
3   https://us.youtubers.me/united-states/entertai...       entertainment   
4   https://us.youtubers.me/united-states/film-ani...      film-animation   
5   https://us.youtubers.me/united-states/comedy/t...              comedy   
6   https://us.youtubers.me/united-states/music/to...               music   
7   https://us.youtubers.me/united-states/gaming/t...              gaming   
8   https://us.youtubers.me/united-states/pets-ani...        pets-animals   
9   https://us.youtubers.me/united-states/all/top-...                 all   
10  https://us.youtubers.me/united-states/news-pol... 

                                                  url        category  \
0   https://us.youtubers.me/united-states/educatio...       education   
1   https://us.youtubers.me/united-states/people-b...    people-blogs   
2   https://us.youtubers.me/united-states/sports/t...          sports   
3   https://us.youtubers.me/united-states/entertai...   entertainment   
4   https://us.youtubers.me/united-states/film-ani...  film-animation   
..                                                ...             ...   
62  https://us.youtubers.me/brazil/howto-style/top...     howto-style   
63  https://us.youtubers.me/brazil/pets-animals/to...    pets-animals   
64  https://us.youtubers.me/brazil/news-politics/t...   news-politics   
65  https://us.youtubers.me/brazil/travel-events/t...   travel-events   
66  https://us.youtubers.me/brazil/autos-vehicles/...  autos-vehicles   

          country  
0   United States  
1   United States  
2   United States  
3   United States  
4   United States  
.. 

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
180  https://us.youtubers.me/france/pets-animals/to...         pets-animals   
181  https://us.youtubers.me/france/science-technol...   science-technology   
182  https://us.youtubers.me/france/nonprofits-acti...  nonprofits-activism   
183  https://us.youtubers.me/france/travel-events/t...        travel-events   
184  https://us.youtubers.me/france/movies/top-1000...               movies   

           country  
0    United States  
1    Unit

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
294  https://us.youtubers.me/thailand/film-animatio...       film-animation   
295  https://us.youtubers.me/thailand/autos-vehicle...       autos-vehicles   
296  https://us.youtubers.me/thailand/education/top...            education   
297  https://us.youtubers.me/thailand/pets-animals/...         pets-animals   
298  https://us.youtubers.me/thailand/nonprofits-ac...  nonprofits-activism   

           country  
0    United States  
1    Unit

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
409  https://us.youtubers.me/egypt/sports/top-1000-...               sports   
410  https://us.youtubers.me/egypt/autos-vehicles/t...       autos-vehicles   
411  https://us.youtubers.me/egypt/nonprofits-activ...  nonprofits-activism   
412  https://us.youtubers.me/egypt/pets-animals/top...         pets-animals   
413  https://us.youtubers.me/egypt/travel-events/to...        travel-events   

           country  
0    United States  
1    Unit

Unnamed: 0,url,category,country
0,https://us.youtubers.me/united-states/educatio...,education,United States
1,https://us.youtubers.me/united-states/people-b...,people-blogs,United States
2,https://us.youtubers.me/united-states/sports/t...,sports,United States
3,https://us.youtubers.me/united-states/entertai...,entertainment,United States
4,https://us.youtubers.me/united-states/film-ani...,film-animation,United States
...,...,...,...
492,https://us.youtubers.me/romania/autos-vehicles...,autos-vehicles,Romania
493,https://us.youtubers.me/romania/science-techno...,science-technology,Romania
494,https://us.youtubers.me/romania/travel-events/...,travel-events,Romania
495,https://us.youtubers.me/romania/shows/top-1000...,shows,Romania


### YouTuberme Dataframe Final

In [10]:
# 전체 카테고리 별 채널 수집
country_category_df = pd.read_excel("country_category_url.xlsx")
# url 돌면서 필요한 데이터 
for url, country in zip(country_category_df['url'].to_list(), country_category_df['country'].to_list()):    
    df_new = collect_youtuberme_basic(url, country)
    dfs.append(df_new)
    result_df = pd.concat(dfs, axis=0, ignore_index=True)

# 중복 제거
result_df = result_df.drop_duplicates('Youtuber')
result_df = result_df.drop("rank", axis=1)
# YouTube URL 저장
result_df['youtube url'] = [url[:-7] for url in  result_df['url'].to_list()]

  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 4
url_list:  4


  df_top1000 = pd.read_html(str(table))[0]


df1000: 95
url_list:  95


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 266
url_list:  266


  df_top1000 = pd.read_html(str(table))[0]


df1000: 668
url_list:  668


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 616
url_list:  616


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 163
url_list:  163


  df_top1000 = pd.read_html(str(table))[0]


df1000: 508
url_list:  508


  df_top1000 = pd.read_html(str(table))[0]


df1000: 436
url_list:  436


  df_top1000 = pd.read_html(str(table))[0]


df1000: 493
url_list:  493


  df_top1000 = pd.read_html(str(table))[0]


df1000: 649
url_list:  649


  df_top1000 = pd.read_html(str(table))[0]


df1000: 7
url_list:  7


  df_top1000 = pd.read_html(str(table))[0]


df1000: 650
url_list:  650


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 91
url_list:  91


  df_top1000 = pd.read_html(str(table))[0]


df1000: 219
url_list:  219


  df_top1000 = pd.read_html(str(table))[0]


df1000: 653
url_list:  653


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 923
url_list:  923


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 514
url_list:  514


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


  df_top1000 = pd.read_html(str(table))[0]


df1000: 144
url_list:  144


  df_top1000 = pd.read_html(str(table))[0]


df1000: 365
url_list:  365


  df_top1000 = pd.read_html(str(table))[0]


df1000: 389
url_list:  389


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 300
url_list:  300


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 468
url_list:  468


  df_top1000 = pd.read_html(str(table))[0]


df1000: 882
url_list:  882


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 922
url_list:  922


  df_top1000 = pd.read_html(str(table))[0]


df1000: 4
url_list:  4


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 667
url_list:  667


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 227
url_list:  227


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 501
url_list:  501


  df_top1000 = pd.read_html(str(table))[0]


df1000: 242
url_list:  242


  df_top1000 = pd.read_html(str(table))[0]


df1000: 783
url_list:  783


  df_top1000 = pd.read_html(str(table))[0]


df1000: 220
url_list:  220


  df_top1000 = pd.read_html(str(table))[0]


df1000: 572
url_list:  572


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 514
url_list:  514


  df_top1000 = pd.read_html(str(table))[0]


df1000: 446
url_list:  446


  df_top1000 = pd.read_html(str(table))[0]


df1000: 253
url_list:  253


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 369
url_list:  369


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 11
url_list:  11


  df_top1000 = pd.read_html(str(table))[0]


df1000: 564
url_list:  564


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 406
url_list:  406


  df_top1000 = pd.read_html(str(table))[0]


df1000: 104
url_list:  104


  df_top1000 = pd.read_html(str(table))[0]


df1000: 54
url_list:  54


  df_top1000 = pd.read_html(str(table))[0]


df1000: 203
url_list:  203


  df_top1000 = pd.read_html(str(table))[0]


df1000: 39
url_list:  39


  df_top1000 = pd.read_html(str(table))[0]


df1000: 145
url_list:  145


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 379
url_list:  379


  df_top1000 = pd.read_html(str(table))[0]


df1000: 462
url_list:  462


  df_top1000 = pd.read_html(str(table))[0]


df1000: 586
url_list:  586


  df_top1000 = pd.read_html(str(table))[0]


df1000: 320
url_list:  320


  df_top1000 = pd.read_html(str(table))[0]


df1000: 351
url_list:  351


  df_top1000 = pd.read_html(str(table))[0]


df1000: 665
url_list:  665


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 107
url_list:  107


  df_top1000 = pd.read_html(str(table))[0]


df1000: 335
url_list:  335


  df_top1000 = pd.read_html(str(table))[0]


df1000: 189
url_list:  189


  df_top1000 = pd.read_html(str(table))[0]


df1000: 261
url_list:  261


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 329
url_list:  329


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 371
url_list:  371


  df_top1000 = pd.read_html(str(table))[0]


df1000: 633
url_list:  633


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 201
url_list:  201


  df_top1000 = pd.read_html(str(table))[0]


df1000: 386
url_list:  386


  df_top1000 = pd.read_html(str(table))[0]


df1000: 216
url_list:  216


  df_top1000 = pd.read_html(str(table))[0]


df1000: 291
url_list:  291


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 102
url_list:  102


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 251
url_list:  251


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 494
url_list:  494


  df_top1000 = pd.read_html(str(table))[0]


df1000: 655
url_list:  655


  df_top1000 = pd.read_html(str(table))[0]


df1000: 113
url_list:  113


  df_top1000 = pd.read_html(str(table))[0]


df1000: 21
url_list:  21


  df_top1000 = pd.read_html(str(table))[0]


df1000: 635
url_list:  635


  df_top1000 = pd.read_html(str(table))[0]


df1000: 363
url_list:  363


  df_top1000 = pd.read_html(str(table))[0]


df1000: 54
url_list:  54


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 113
url_list:  113


  df_top1000 = pd.read_html(str(table))[0]


df1000: 52
url_list:  52


  df_top1000 = pd.read_html(str(table))[0]


df1000: 107
url_list:  107


  df_top1000 = pd.read_html(str(table))[0]


df1000: 123
url_list:  123


  df_top1000 = pd.read_html(str(table))[0]


df1000: 53
url_list:  53


  df_top1000 = pd.read_html(str(table))[0]


df1000: 116
url_list:  116


  df_top1000 = pd.read_html(str(table))[0]


df1000: 10
url_list:  10


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 51
url_list:  51


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 925
url_list:  925


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 859
url_list:  859


  df_top1000 = pd.read_html(str(table))[0]


df1000: 33
url_list:  33


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 696
url_list:  696


  df_top1000 = pd.read_html(str(table))[0]


df1000: 744
url_list:  744


  df_top1000 = pd.read_html(str(table))[0]


df1000: 959
url_list:  959


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 350
url_list:  350


  df_top1000 = pd.read_html(str(table))[0]


df1000: 220
url_list:  220


  df_top1000 = pd.read_html(str(table))[0]


df1000: 916
url_list:  916


  df_top1000 = pd.read_html(str(table))[0]


df1000: 417
url_list:  417


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 22
url_list:  22


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 825
url_list:  825


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 21
url_list:  21


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 437
url_list:  437


  df_top1000 = pd.read_html(str(table))[0]


df1000: 920
url_list:  920


  df_top1000 = pd.read_html(str(table))[0]


df1000: 155
url_list:  155


  df_top1000 = pd.read_html(str(table))[0]


df1000: 305
url_list:  305


  df_top1000 = pd.read_html(str(table))[0]


df1000: 339
url_list:  339


  df_top1000 = pd.read_html(str(table))[0]


df1000: 746
url_list:  746


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 732
url_list:  732


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 452
url_list:  452


  df_top1000 = pd.read_html(str(table))[0]


df1000: 534
url_list:  534


  df_top1000 = pd.read_html(str(table))[0]


df1000: 829
url_list:  829


  df_top1000 = pd.read_html(str(table))[0]


df1000: 793
url_list:  793


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 407
url_list:  407


  df_top1000 = pd.read_html(str(table))[0]


df1000: 131
url_list:  131


  df_top1000 = pd.read_html(str(table))[0]


df1000: 388
url_list:  388


  df_top1000 = pd.read_html(str(table))[0]


df1000: 47
url_list:  47


  df_top1000 = pd.read_html(str(table))[0]


df1000: 175
url_list:  175


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 371
url_list:  371


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 707
url_list:  707


  df_top1000 = pd.read_html(str(table))[0]


df1000: 908
url_list:  908


  df_top1000 = pd.read_html(str(table))[0]


df1000: 257
url_list:  257


  df_top1000 = pd.read_html(str(table))[0]


df1000: 574
url_list:  574


  df_top1000 = pd.read_html(str(table))[0]


df1000: 683
url_list:  683


  df_top1000 = pd.read_html(str(table))[0]


df1000: 747
url_list:  747


  df_top1000 = pd.read_html(str(table))[0]


df1000: 453
url_list:  453


  df_top1000 = pd.read_html(str(table))[0]


df1000: 876
url_list:  876


  df_top1000 = pd.read_html(str(table))[0]


df1000: 249
url_list:  249


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 129
url_list:  129


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 436
url_list:  436


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 544
url_list:  544


  df_top1000 = pd.read_html(str(table))[0]


df1000: 284
url_list:  284


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 500
url_list:  500


  df_top1000 = pd.read_html(str(table))[0]


df1000: 117
url_list:  117


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 256
url_list:  256


  df_top1000 = pd.read_html(str(table))[0]


df1000: 501
url_list:  501


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 107
url_list:  107


  df_top1000 = pd.read_html(str(table))[0]


df1000: 58
url_list:  58


  df_top1000 = pd.read_html(str(table))[0]


df1000: 243
url_list:  243


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 753
url_list:  753


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 584
url_list:  584


  df_top1000 = pd.read_html(str(table))[0]


df1000: 578
url_list:  578


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 172
url_list:  172


  df_top1000 = pd.read_html(str(table))[0]


df1000: 736
url_list:  736


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 228
url_list:  228


  df_top1000 = pd.read_html(str(table))[0]


df1000: 418
url_list:  418


  df_top1000 = pd.read_html(str(table))[0]


df1000: 321
url_list:  321


  df_top1000 = pd.read_html(str(table))[0]


df1000: 567
url_list:  567


  df_top1000 = pd.read_html(str(table))[0]


df1000: 336
url_list:  336


  df_top1000 = pd.read_html(str(table))[0]


df1000: 286
url_list:  286


  df_top1000 = pd.read_html(str(table))[0]


df1000: 54
url_list:  54


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 877
url_list:  877


  df_top1000 = pd.read_html(str(table))[0]


df1000: 420
url_list:  420


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 323
url_list:  323


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 239
url_list:  239


  df_top1000 = pd.read_html(str(table))[0]


df1000: 80
url_list:  80


  df_top1000 = pd.read_html(str(table))[0]


df1000: 237
url_list:  237


  df_top1000 = pd.read_html(str(table))[0]


df1000: 427
url_list:  427


  df_top1000 = pd.read_html(str(table))[0]


df1000: 303
url_list:  303


  df_top1000 = pd.read_html(str(table))[0]


df1000: 207
url_list:  207


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 311
url_list:  311


  df_top1000 = pd.read_html(str(table))[0]


df1000: 146
url_list:  146


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 423
url_list:  423


  df_top1000 = pd.read_html(str(table))[0]


df1000: 968
url_list:  968


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 334
url_list:  334


  df_top1000 = pd.read_html(str(table))[0]


df1000: 226
url_list:  226


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 485
url_list:  485


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 599
url_list:  599


  df_top1000 = pd.read_html(str(table))[0]


df1000: 197
url_list:  197


  df_top1000 = pd.read_html(str(table))[0]


df1000: 211
url_list:  211


  df_top1000 = pd.read_html(str(table))[0]


df1000: 307
url_list:  307


  df_top1000 = pd.read_html(str(table))[0]


df1000: 145
url_list:  145


  df_top1000 = pd.read_html(str(table))[0]


df1000: 95
url_list:  95


  df_top1000 = pd.read_html(str(table))[0]


df1000: 52
url_list:  52


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 521
url_list:  521


  df_top1000 = pd.read_html(str(table))[0]


df1000: 71
url_list:  71


  df_top1000 = pd.read_html(str(table))[0]


df1000: 140
url_list:  140


  df_top1000 = pd.read_html(str(table))[0]


df1000: 116
url_list:  116


  df_top1000 = pd.read_html(str(table))[0]


df1000: 96
url_list:  96


  df_top1000 = pd.read_html(str(table))[0]


df1000: 210
url_list:  210


  df_top1000 = pd.read_html(str(table))[0]


df1000: 237
url_list:  237


  df_top1000 = pd.read_html(str(table))[0]


df1000: 546
url_list:  546


  df_top1000 = pd.read_html(str(table))[0]


df1000: 253
url_list:  253


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 101
url_list:  101


  df_top1000 = pd.read_html(str(table))[0]


df1000: 73
url_list:  73


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 766
url_list:  766


  df_top1000 = pd.read_html(str(table))[0]


df1000: 215
url_list:  215


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 736
url_list:  736


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 216
url_list:  216


  df_top1000 = pd.read_html(str(table))[0]


df1000: 159
url_list:  159


  df_top1000 = pd.read_html(str(table))[0]


df1000: 209
url_list:  209


  df_top1000 = pd.read_html(str(table))[0]


df1000: 71
url_list:  71


  df_top1000 = pd.read_html(str(table))[0]


df1000: 147
url_list:  147


  df_top1000 = pd.read_html(str(table))[0]


df1000: 295
url_list:  295


  df_top1000 = pd.read_html(str(table))[0]


df1000: 90
url_list:  90


  df_top1000 = pd.read_html(str(table))[0]


df1000: 133
url_list:  133


  df_top1000 = pd.read_html(str(table))[0]


df1000: 56
url_list:  56


  df_top1000 = pd.read_html(str(table))[0]


df1000: 17
url_list:  17


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 663
url_list:  663


  df_top1000 = pd.read_html(str(table))[0]


df1000: 464
url_list:  464


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 403
url_list:  403


  df_top1000 = pd.read_html(str(table))[0]


df1000: 203
url_list:  203


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 474
url_list:  474


  df_top1000 = pd.read_html(str(table))[0]


df1000: 194
url_list:  194


  df_top1000 = pd.read_html(str(table))[0]


df1000: 522
url_list:  522


  df_top1000 = pd.read_html(str(table))[0]


df1000: 230
url_list:  230


  df_top1000 = pd.read_html(str(table))[0]


df1000: 257
url_list:  257


  df_top1000 = pd.read_html(str(table))[0]


df1000: 129
url_list:  129


  df_top1000 = pd.read_html(str(table))[0]


df1000: 386
url_list:  386


  df_top1000 = pd.read_html(str(table))[0]


df1000: 368
url_list:  368


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 188
url_list:  188


  df_top1000 = pd.read_html(str(table))[0]


df1000: 98
url_list:  98


  df_top1000 = pd.read_html(str(table))[0]


df1000: 215
url_list:  215


  df_top1000 = pd.read_html(str(table))[0]


df1000: 203
url_list:  203


  df_top1000 = pd.read_html(str(table))[0]


df1000: 552
url_list:  552


  df_top1000 = pd.read_html(str(table))[0]


df1000: 73
url_list:  73


  df_top1000 = pd.read_html(str(table))[0]


df1000: 111
url_list:  111


  df_top1000 = pd.read_html(str(table))[0]


df1000: 344
url_list:  344


  df_top1000 = pd.read_html(str(table))[0]


df1000: 177
url_list:  177


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 264
url_list:  264


  df_top1000 = pd.read_html(str(table))[0]


df1000: 149
url_list:  149


  df_top1000 = pd.read_html(str(table))[0]


df1000: 77
url_list:  77


  df_top1000 = pd.read_html(str(table))[0]


df1000: 499
url_list:  499


  df_top1000 = pd.read_html(str(table))[0]


df1000: 408
url_list:  408


  df_top1000 = pd.read_html(str(table))[0]


df1000: 353
url_list:  353


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 134
url_list:  134


  df_top1000 = pd.read_html(str(table))[0]


df1000: 43
url_list:  43


  df_top1000 = pd.read_html(str(table))[0]


df1000: 148
url_list:  148


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 85
url_list:  85


  df_top1000 = pd.read_html(str(table))[0]


df1000: 327
url_list:  327


  df_top1000 = pd.read_html(str(table))[0]


df1000: 168
url_list:  168


  df_top1000 = pd.read_html(str(table))[0]


df1000: 146
url_list:  146


  df_top1000 = pd.read_html(str(table))[0]


df1000: 167
url_list:  167


  df_top1000 = pd.read_html(str(table))[0]


df1000: 30
url_list:  30


  df_top1000 = pd.read_html(str(table))[0]


df1000: 5
url_list:  5


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


  df_top1000 = pd.read_html(str(table))[0]


df1000: 904
url_list:  904


  df_top1000 = pd.read_html(str(table))[0]


df1000: 190
url_list:  190


  df_top1000 = pd.read_html(str(table))[0]


df1000: 951
url_list:  951


  df_top1000 = pd.read_html(str(table))[0]


df1000: 207
url_list:  207


  df_top1000 = pd.read_html(str(table))[0]


df1000: 114
url_list:  114


  df_top1000 = pd.read_html(str(table))[0]


df1000: 88
url_list:  88


  df_top1000 = pd.read_html(str(table))[0]


df1000: 637
url_list:  637


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 183
url_list:  183


  df_top1000 = pd.read_html(str(table))[0]


df1000: 337
url_list:  337


  df_top1000 = pd.read_html(str(table))[0]


df1000: 86
url_list:  86


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 28
url_list:  28


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 65
url_list:  65


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 221
url_list:  221


  df_top1000 = pd.read_html(str(table))[0]


df1000: 953
url_list:  953


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 471
url_list:  471


  df_top1000 = pd.read_html(str(table))[0]


df1000: 517
url_list:  517


  df_top1000 = pd.read_html(str(table))[0]


df1000: 166
url_list:  166


  df_top1000 = pd.read_html(str(table))[0]


df1000: 111
url_list:  111


  df_top1000 = pd.read_html(str(table))[0]


df1000: 154
url_list:  154


  df_top1000 = pd.read_html(str(table))[0]


df1000: 17
url_list:  17


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 154
url_list:  154


  df_top1000 = pd.read_html(str(table))[0]


df1000: 123
url_list:  123


  df_top1000 = pd.read_html(str(table))[0]


df1000: 61
url_list:  61


  df_top1000 = pd.read_html(str(table))[0]


df1000: 75
url_list:  75


  df_top1000 = pd.read_html(str(table))[0]


df1000: 43
url_list:  43


  df_top1000 = pd.read_html(str(table))[0]


df1000: 29
url_list:  29


  df_top1000 = pd.read_html(str(table))[0]


df1000: 34
url_list:  34


  df_top1000 = pd.read_html(str(table))[0]


df1000: 638
url_list:  638


  df_top1000 = pd.read_html(str(table))[0]


df1000: 150
url_list:  150


  df_top1000 = pd.read_html(str(table))[0]


df1000: 601
url_list:  601


  df_top1000 = pd.read_html(str(table))[0]


df1000: 668
url_list:  668


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 153
url_list:  153


  df_top1000 = pd.read_html(str(table))[0]


df1000: 131
url_list:  131


  df_top1000 = pd.read_html(str(table))[0]


df1000: 175
url_list:  175


  df_top1000 = pd.read_html(str(table))[0]


df1000: 655
url_list:  655


  df_top1000 = pd.read_html(str(table))[0]


df1000: 189
url_list:  189


  df_top1000 = pd.read_html(str(table))[0]


df1000: 99
url_list:  99


  df_top1000 = pd.read_html(str(table))[0]


df1000: 85
url_list:  85


  df_top1000 = pd.read_html(str(table))[0]


df1000: 71
url_list:  71


  df_top1000 = pd.read_html(str(table))[0]


df1000: 69
url_list:  69


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 615
url_list:  615


  df_top1000 = pd.read_html(str(table))[0]


df1000: 171
url_list:  171


  df_top1000 = pd.read_html(str(table))[0]


df1000: 140
url_list:  140


  df_top1000 = pd.read_html(str(table))[0]


df1000: 327
url_list:  327


  df_top1000 = pd.read_html(str(table))[0]


df1000: 103
url_list:  103


  df_top1000 = pd.read_html(str(table))[0]


df1000: 221
url_list:  221


  df_top1000 = pd.read_html(str(table))[0]


df1000: 127
url_list:  127


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 190
url_list:  190


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 72
url_list:  72


  df_top1000 = pd.read_html(str(table))[0]


df1000: 386
url_list:  386


  df_top1000 = pd.read_html(str(table))[0]


df1000: 17
url_list:  17


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 26
url_list:  26


  df_top1000 = pd.read_html(str(table))[0]


df1000: 17
url_list:  17


  df_top1000 = pd.read_html(str(table))[0]


df1000: 182
url_list:  182


  df_top1000 = pd.read_html(str(table))[0]


df1000: 353
url_list:  353


  df_top1000 = pd.read_html(str(table))[0]


df1000: 524
url_list:  524


  df_top1000 = pd.read_html(str(table))[0]


df1000: 739
url_list:  739


  df_top1000 = pd.read_html(str(table))[0]


df1000: 220
url_list:  220


  df_top1000 = pd.read_html(str(table))[0]


df1000: 86
url_list:  86


  df_top1000 = pd.read_html(str(table))[0]


df1000: 48
url_list:  48


  df_top1000 = pd.read_html(str(table))[0]


df1000: 98
url_list:  98


  df_top1000 = pd.read_html(str(table))[0]


df1000: 464
url_list:  464


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 83
url_list:  83


  df_top1000 = pd.read_html(str(table))[0]


df1000: 92
url_list:  92


  df_top1000 = pd.read_html(str(table))[0]


df1000: 103
url_list:  103


  df_top1000 = pd.read_html(str(table))[0]


df1000: 36
url_list:  36


  df_top1000 = pd.read_html(str(table))[0]


df1000: 50
url_list:  50


  df_top1000 = pd.read_html(str(table))[0]


df1000: 26
url_list:  26


  df_top1000 = pd.read_html(str(table))[0]


df1000: 543
url_list:  543


  df_top1000 = pd.read_html(str(table))[0]


df1000: 240
url_list:  240


  df_top1000 = pd.read_html(str(table))[0]


df1000: 840
url_list:  840


  df_top1000 = pd.read_html(str(table))[0]


df1000: 758
url_list:  758


  df_top1000 = pd.read_html(str(table))[0]


df1000: 681
url_list:  681


  df_top1000 = pd.read_html(str(table))[0]


df1000: 89
url_list:  89


  df_top1000 = pd.read_html(str(table))[0]


df1000: 266
url_list:  266


  df_top1000 = pd.read_html(str(table))[0]


df1000: 206
url_list:  206


  df_top1000 = pd.read_html(str(table))[0]


df1000: 177
url_list:  177


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 279
url_list:  279


  df_top1000 = pd.read_html(str(table))[0]


df1000: 45
url_list:  45


  df_top1000 = pd.read_html(str(table))[0]


df1000: 103
url_list:  103


  df_top1000 = pd.read_html(str(table))[0]


df1000: 153
url_list:  153


  df_top1000 = pd.read_html(str(table))[0]


df1000: 126
url_list:  126


  df_top1000 = pd.read_html(str(table))[0]


df1000: 5
url_list:  5


  df_top1000 = pd.read_html(str(table))[0]


df1000: 25
url_list:  25


  df_top1000 = pd.read_html(str(table))[0]


df1000: 496
url_list:  496


  df_top1000 = pd.read_html(str(table))[0]


df1000: 279
url_list:  279


  df_top1000 = pd.read_html(str(table))[0]


df1000: 285
url_list:  285


  df_top1000 = pd.read_html(str(table))[0]


df1000: 122
url_list:  122


  df_top1000 = pd.read_html(str(table))[0]


df1000: 368
url_list:  368


  df_top1000 = pd.read_html(str(table))[0]


df1000: 148
url_list:  148


  df_top1000 = pd.read_html(str(table))[0]


df1000: 96
url_list:  96


  df_top1000 = pd.read_html(str(table))[0]


df1000: 114
url_list:  114


  df_top1000 = pd.read_html(str(table))[0]


df1000: 106
url_list:  106


  df_top1000 = pd.read_html(str(table))[0]


df1000: 62
url_list:  62


  df_top1000 = pd.read_html(str(table))[0]


df1000: 65
url_list:  65


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 41
url_list:  41


  df_top1000 = pd.read_html(str(table))[0]


df1000: 60
url_list:  60


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 801
url_list:  801


  df_top1000 = pd.read_html(str(table))[0]


df1000: 735
url_list:  735


  df_top1000 = pd.read_html(str(table))[0]


df1000: 718
url_list:  718


  df_top1000 = pd.read_html(str(table))[0]


df1000: 370
url_list:  370


  df_top1000 = pd.read_html(str(table))[0]


df1000: 19
url_list:  19


  df_top1000 = pd.read_html(str(table))[0]


df1000: 150
url_list:  150


  df_top1000 = pd.read_html(str(table))[0]


df1000: 141
url_list:  141


  df_top1000 = pd.read_html(str(table))[0]


df1000: 116
url_list:  116


  df_top1000 = pd.read_html(str(table))[0]


df1000: 224
url_list:  224


  df_top1000 = pd.read_html(str(table))[0]


df1000: 106
url_list:  106


  df_top1000 = pd.read_html(str(table))[0]


df1000: 136
url_list:  136


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 45
url_list:  45


  df_top1000 = pd.read_html(str(table))[0]


df1000: 49
url_list:  49


  df_top1000 = pd.read_html(str(table))[0]


df1000: 86
url_list:  86


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 644
url_list:  644


  df_top1000 = pd.read_html(str(table))[0]


df1000: 577
url_list:  577


  df_top1000 = pd.read_html(str(table))[0]


df1000: 104
url_list:  104


  df_top1000 = pd.read_html(str(table))[0]


df1000: 83
url_list:  83


  df_top1000 = pd.read_html(str(table))[0]


df1000: 438
url_list:  438


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 507
url_list:  507


  df_top1000 = pd.read_html(str(table))[0]


df1000: 80
url_list:  80


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 73
url_list:  73


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 90
url_list:  90


  df_top1000 = pd.read_html(str(table))[0]


df1000: 59
url_list:  59


  df_top1000 = pd.read_html(str(table))[0]


df1000: 34
url_list:  34


  df_top1000 = pd.read_html(str(table))[0]


df1000: 26
url_list:  26


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


In [18]:
# column명 변경
result_df = result_df.rename(columns={
    'url': 'youtuberme_url',
    'Youtuber': 'channel_name',
    'video views': 'total_video_views',
    'video count': 'total_video_count',
    'youtube url': 'yt_url'
})

In [28]:
# subscribers가 0인 경우 채널이 삭제된 경우이므로 제거
result_df = result_df[result_df['subscribers'] != 0]

Unnamed: 0,youtuberme_url,category,country,channel_name,subscribers,video_views,video_count,started,youtube_url
0,https://us.youtubers.me//abckidtv-nursery-rhym...,education,United States,Cocomelon - Nursery Rhymes,174000000.0,181065509035,1157,2006,https://us.youtubers.me//abckidtv-nursery-rhym...
1,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,80400000.0,48184301971,3220,2011,https://us.youtubers.me//pinkfong-kids-songs-s...
2,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,40200000.0,30124608781,636,2016,https://us.youtubers.me//genevieve-s-playhouse...
3,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,35700000.0,28252093300,2405,2016,https://us.youtubers.me//babybus-kids-tv-songs...
4,https://us.youtubers.me//blippi/youtuber-stats,education,United States,Blippi - Educational Videos for Kids,19800000.0,16246480827,928,2014,https://us.youtubers.me//blippi/youtube
...,...,...,...,...,...,...,...,...,...
196538,https://us.youtubers.me//betania/youtuber-stats,nonprofits-activism,Romania,Betania Oradea,7430.0,1779858,832,2009,https://us.youtubers.me//betania/youtube
196539,https://us.youtubers.me//sergiu-brega/youtuber...,nonprofits-activism,Romania,Sergiu Brega,2850.0,1561907,259,2011,https://us.youtubers.me//sergiu-brega/youtube
196540,https://us.youtubers.me//craiova-cetatea-banil...,nonprofits-activism,Romania,Craiova Cetatea Banilor Las Vegas-ul Romaniei,1010.0,801704,435,2016,https://us.youtubers.me//craiova-cetatea-banil...
196541,https://us.youtubers.me//speran-a-in-iisus-abo...,nonprofits-activism,Romania,Harul TV,59000.0,188709,1227,2012,https://us.youtubers.me//speran-a-in-iisus-abo...


In [22]:
from datetime import datetime
# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')
file_path = f"Youtube_Data_{today_date}.xlsx"
if pd.isna(result_df['channel_name'].iloc[0]):
    # Drop the first row
    result_df = result_df.drop(result_df.index[0])
# Save the DataFrame to Excel with today's date in the filename
result_df.to_excel(file_path, index=False)


### 수집 채널 URL 추가----------------------------------------------- 여긴 무시

In [9]:
import schedule
import time
from sqlalchemy import create_engine

# 처음 DB에 추가할때
def append_channel(dataframe):
    engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
    conn = engine.connect()

    dataframe.to_sql(name='channel', con=engine, if_exists='append', index=False)
    conn.close()

In [11]:
# db 저장
append_channel(result_df) 

In [19]:
result_df.dropna(subset=['url'], inplace=True)
result_df.to_excel("yotube_list.xlsx")

In [17]:
# weekly 업로드 진행
def update_channel(dataframe):
    engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
    conn = engine.connect()
    sql_query = 'SELECT * FROM channel'
    df = pd.read_sql(sql_query, engine)
    
    dataframe.to_sql(name='channel', con=engine, if_exists='update', index=False)
    conn.close()

### Shorts 수집

In [27]:
refine_df

NameError: name 'refine_df' is not defined

In [39]:
# 기존에 존재하다던 데이터 update 해서 다시 데이터 넣기
engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
existing_data = pd.read_sql('SELECT * FROM channel', engine)

# Check for duplicates based on the 'ID' column
duplicates = existing_data[existing_data['Youtuber'].isin(new_data['Youtuber'])]

# Update existing rows with new data
existing_data.update(new_data)

# Filter out rows that are duplicates
new_rows = new_data[~new_data['Youtuber'].isin(duplicates['Youtuber'])]

# Append new rows to the existing data
merged_data = pd.concat([existing_data, new_rows], ignore_index=True)

# Write the merged data back to the database
merged_data.to_sql('channel', engine, if_exists='replace', index=False)

Unnamed: 0,rank,Youtuber,subscribers,video views,video count,category,started,url,category_ranking
0,1,한국고전영화 Korean Classic Film,844000.0,338119500,370,Movies,2011,,
1,1,김프로KIMPRO,28400000.0,29282621107,2233,people-blogs,2017,https://us.youtubers.me/5f2ac6ed-7607-4084-8d9...,NAN
2,2,TwinRoozi 쌍둥이 루지,10600000.0,6676946688,553,people-blogs,2018,https://us.youtubers.me/skt-t1/youtuber-stats,NAN
3,3,Byungari 병아리언니,7540000.0,6372469710,772,people-blogs,2020,https://us.youtubers.me/805407e2-3ce6-4a72-a61...,NAN
4,4,팀일루션 노성율 - TEAM1LLUSION,5790000.0,6263576833,828,people-blogs,2010,https://us.youtubers.me/heykin-couple/youtuber...,NAN
...,...,...,...,...,...,...,...,...,...
7811,49,Anonymous Messengers,493.0,2525,6,nonprofits-activism,2016,https://us.youtubers.me/anonymous-e144fd85-358...,NAN
7812,50,네더고래의 대.단.한 채널.,5980.0,536,1,nonprofits-activism,2012,https://us.youtubers.me/wildbreeze/youtuber-stats,NAN
7813,51,오렌지LAB,8020.0,0,0,nonprofits-activism,2018,https://us.youtubers.me/650d2f3e-90b8-48ee-9ce...,NAN
7814,52,Álan V,959.0,0,0,nonprofits-activism,2016,https://us.youtubers.me/tv-825cb6e4-9590-4f10-...,NAN
