### 채널 수집 (youtubersme로 기본)

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime
import glob
import os

In [2]:
# url 포함 youtuberme df 생성
def collect_youtuberme_url(category_url):
    page = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(page.content, "html.parser")

    # Find the table with class "top-charts"
    table = soup.find("table", class_="top-charts")

    # Find all <a> tags within the table
    href_list = []
    if table:
        for a_tag in table.find_all("a"):
            href_value = a_tag.get("href")
            if href_value.endswith("/youtuber-stats"):
                href_list.append("https://us.youtubers.me/" + href_value)
    return href_list

In [3]:
# 여러 카테고리 확인
def crawl_table_urls_extracted(start_url, table_class='top-charts', depth=1):
    visited_urls = set()
    excluded_urls = []
    extracted_strings = {}

    def extract_string_between_substrings(url, start_substring, end_substring):
        start_index = url.find(start_substring)
        end_index = url.find(end_substring, start_index + len(start_substring))
        if start_index != -1 and end_index != -1:
            return url[start_index + len(start_substring):end_index]
        return None

    def recursive_crawl(url, current_depth):
        if current_depth > depth:
            return

        try:
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find the table with the specified class
                table = soup.find('table', class_=table_class)

                if table:
                    # Extract href attributes from anchor tags within the table
                    for row in table.find_all('tr'):
                        columns = row.find_all('td')
                        for col_index, col in enumerate(columns):
                            # Exclude href attributes from the column with the name "category"
                            if col.get_text(strip=True).lower() == 'category':
                                continue

                            hrefs = [a.get('href') for a in col.find_all('a', href=True)]

                            # Process the hrefs
                            for href in hrefs:
                                absolute_url = urljoin(url, href)

                                # Save URLs without "korea-republic-of"
                                if start_url[-8:] not in absolute_url.lower():
                                    excluded_urls.append(absolute_url)
                                else:
                                    visited_urls.add(absolute_url)

                                    # Extract and save strings between specified substrings
                                    extracted_string = extract_string_between_substrings(
                                        absolute_url,
                                        start_url[:28],
                                        '/top-1000-'
                                    )
                                    if extracted_string:
                                        # Create a set for each visited URL to remove duplicates
                                        extracted_strings.setdefault(absolute_url, set()).add(extracted_string)

                                    # Avoid revisiting the same URL
                                    if absolute_url not in visited_urls:
                                        # Recursively crawl the next level
                                        recursive_crawl(absolute_url, current_depth + 1)

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    recursive_crawl(start_url, 1)

    # Convert sets to lists
    excluded_urls = list(set(excluded_urls))
    visited_urls = list(set(visited_urls))

    # Convert sets to lists within the extracted_strings dictionary
    for url, strings_set in extracted_strings.items():
        extracted_strings[url] = list(strings_set)
    updated_data = {}
    for key, value in extracted_strings.items():
        split_value = value[0].split('/')
        if len(split_value) >= 2:
            updated_value = split_value[1]
            updated_data[key] = [updated_value]
    return updated_data

In [4]:
# start url: 나라별 top 1000
# 30 개국
country_dic = {"United States": "https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states",
               "Germany":"https://us.youtubers.me/germany/all/top-1000-youtube-channels-in-germany",
               "United Kingdom":"https://us.youtubers.me/united-kingdom/all/top-1000-youtube-channels-in-united-kingdom",
               "Brazil": "https://us.youtubers.me/brazil/all/top-1000-youtube-channels-in-brazil",
               "Mexico" : "https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico",
               "Spain": "https://us.youtubers.me/spain/all/top-1000-youtube-channels-in-spain",
               "Italy" : "https://us.youtubers.me/italy/all/top-1000-youtube-channels-in-italy",
               "Czech Republic": "https://us.youtubers.me/czech-republic/all/top-1000-youtube-channels-in-czech-republic",
               "Russia":"https://us.youtubers.me/russian-federation/all/top-1000-youtube-channels-in-russian-federation",
               "India" : "https://us.youtubers.me/india/all/top-1000-youtube-channels-in-india",
               "France": "https://us.youtubers.me/france/all/top-1000-youtube-channels-in-france",
               "Japan" : "https://us.youtubers.me/japan/all/top-1000-youtube-channels-in-japan",
               "Turkey": "https://us.youtubers.me/turkey/all/top-1000-youtube-channels-in-turkey",
               "South Korea": "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of",
               "Poland": "https://us.youtubers.me/poland/all/top-1000-youtube-channels-in-poland",
               "Canada" : "https://us.youtubers.me/canada/all/top-1000-youtube-channels-in-canada",
               "Vietnam" : "https://us.youtubers.me/viet-nam/all/top-1000-youtube-channels-in-viet-nam",
               "Thailand" : "https://us.youtubers.me/thailand/all/top-1000-youtube-channels-in-thailand",
               "Indonesia" : "https://us.youtubers.me/indonesia/all/top-1000-youtube-channels-in-indonesia",
               "Ukraine" : "https://us.youtubers.me/ukraine/all/top-1000-youtube-channels-in-ukraine",
               "Morocco" : "https://us.youtubers.me/morocco/all/top-1000-youtube-channels-in-morocco",
               "Argentina" : "https://us.youtubers.me/argentina/all/top-1000-youtube-channels-in-argentina",
               "Saudi Arabia": "https://us.youtubers.me/saudi-arabia/all/top-1000-youtube-channels-in-saudi-arabia",
               "Netherlands": "https://us.youtubers.me/netherlands/all/top-1000-youtube-channels-in-netherlands",
               "Egypt": "https://us.youtubers.me/egypt/all/top-1000-youtube-channels-in-egypt",
               "Taiwan": "https://us.youtubers.me/taiwan/all/top-1000-youtube-channels-in-taiwan",
               "Australia": "https://us.youtubers.me/australia/all/top-1000-youtube-channels-in-australia",
               "Greece": "https://us.youtubers.me/greece/all/top-1000-youtube-channels-in-greece",
               "Colombia": "https://us.youtubers.me/colombia/all/top-1000-youtube-channels-in-colombia",
               "Romania" : "https://us.youtubers.me/romania/all/top-1000-youtube-channels-in-romania"
              }

In [5]:
# category 통일(union 편하게)
def category_preprocessing(df):
    df['category'] = df['category'].replace('nan', 'all')

    # Mapping of old categories to new categories
    category_mapping = {'Film & Animation': 'film-animation',
                        'Autos & Vehicles': 'autos-vehicles',
                        'Music': 'music',
                        'Movies': 'movies',
                        'Pets & Animals': 'pets-animals',
                        'Sports': 'sports',
                        'Travel & Events': 'travel-events',
                        'Gaming': 'gaming',
                        'People & Blogs': 'people-blogs',
                        'Comedy': 'comedy',
                        'Entertainment': 'entertainment',
                        'News & Politics': 'news-politics',
                        'Howto & Style': 'howto-style',
                        'Education': 'education',
                        'Science & Technology': 'science-technology',
                        'Shows': 'shows',
                        'Nonprofits & Activism': 'nonprofits-activism',
                        'all': 'all'}

    # Map the old categories to the new categories
    df['category'] = df['category'].map(category_mapping)
    return df

# category df 생성
def create_dataframe(extracted_strings, country):
    data = {'url': [], 'category': [], 'country' : []}
    for url, categories in extracted_strings.items():
        for category in categories:
            data['url'].append(url)
            data['category'].append(category)
            data['country'].append(country)
    df = pd.DataFrame(data)
    return df


In [6]:
# youtuberme df 생성
def collect_youtuberme_basic(url, country):
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'lxml')

    # 현재 페이지에서 table 태그 모두 선택하기
    table1 = soup.select('table')

    # 하나의 테이블 태그 선택하기
    table = table1[0]

    df_top1000 = pd.read_html(str(table))[0]
    print("df1000:", len(df_top1000))
    href_list = collect_youtuberme_url(url)
    df_top1000['url'] = href_list
    df_top1000['country'] = [country for x in range(len(df_top1000))]
    print("url_list: ", len(href_list))
    df_top1000 = category_preprocessing(df_top1000)
    return df_top1000

In [7]:
# test
url = "https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of"
start_url_us = 'https://us.youtubers.me/mexico/all/top-1000-youtube-channels-in-mexico'
extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
print(extracted_strings)
df_category = create_dataframe(extracted_strings, "korea")
# Display the DataFrame
df_category

{'https://us.youtubers.me/korea-republic-of/people-blogs/top-1000-people-blogs-youtube-channels-in-korea-republic-of': ['people-blogs'], 'https://us.youtubers.me/korea-republic-of/music/top-1000-music-youtube-channels-in-korea-republic-of': ['music'], 'https://us.youtubers.me/korea-republic-of/gaming/top-1000-gaming-youtube-channels-in-korea-republic-of': ['gaming'], 'https://us.youtubers.me/korea-republic-of/entertainment/top-1000-entertainment-youtube-channels-in-korea-republic-of': ['entertainment'], 'https://us.youtubers.me/korea-republic-of/news-politics/top-1000-news-politics-youtube-channels-in-korea-republic-of': ['news-politics'], 'https://us.youtubers.me/korea-republic-of/sports/top-1000-sports-youtube-channels-in-korea-republic-of': ['sports'], 'https://us.youtubers.me/korea-republic-of/all/top-1000-youtube-channels-in-korea-republic-of': ['all'], 'https://us.youtubers.me/korea-republic-of/film-animation/top-1000-film-animation-youtube-channels-in-korea-republic-of': ['film-

Unnamed: 0,url,category,country
0,https://us.youtubers.me/korea-republic-of/peop...,people-blogs,korea
1,https://us.youtubers.me/korea-republic-of/musi...,music,korea
2,https://us.youtubers.me/korea-republic-of/gami...,gaming,korea
3,https://us.youtubers.me/korea-republic-of/ente...,entertainment,korea
4,https://us.youtubers.me/korea-republic-of/news...,news-politics,korea
5,https://us.youtubers.me/korea-republic-of/spor...,sports,korea
6,https://us.youtubers.me/korea-republic-of/all/...,all,korea
7,https://us.youtubers.me/korea-republic-of/film...,film-animation,korea
8,https://us.youtubers.me/korea-republic-of/scie...,science-technology,korea
9,https://us.youtubers.me/korea-republic-of/come...,comedy,korea


### 기존 수집했던 데이터 불러오기
- 날짜 가장 빠른 순으로 호출

In [8]:
# List all files that match the pattern "Youtube_Data_*.xlsx"
files = glob.glob("Youtube_Data_*.xlsx")

# Extract dates from filenames and find the latest date
dates = [os.path.splitext(os.path.basename(file))[0].replace("Youtube_Data_", "") for file in files]
dates = sorted(dates, reverse=True)

if dates:
    # Load the latest file based on the date
    latest_file = f"Youtube_Data_{dates[0]}.xlsx"
    previous_df = pd.read_excel(latest_file)
    print(f"Loaded file: {latest_file}")
else:
    print("No files found.")

# Display the dataframe
previous_df.head()

Loaded file: Youtube_Data_2024-05-14.xlsx


Unnamed: 0,youtuberme_url,category,country,channel_name,subscribers,total_video_views,total_video_count,started,yt_url
0,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,80400000.0,48184301971,3220,2011,https://us.youtubers.me//pinkfong-kids-songs-s...
1,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,40200000.0,30124608781,636,2016,https://us.youtubers.me//genevieve-s-playhouse...
2,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,35700000.0,28252093300,2405,2016,https://us.youtubers.me//babybus-kids-tv-songs...
3,https://us.youtubers.me//blippi/youtuber-stats,education,United States,Blippi - Educational Videos for Kids,19800000.0,16246480827,928,2014,https://us.youtubers.me//blippi/youtube
4,https://us.youtubers.me//dave-and-ava-nursery-...,education,United States,Dave and Ava - Nursery Rhymes and Baby Songs,15600000.0,11355289210,975,2014,https://us.youtubers.me//dave-and-ava-nursery-...


### 각 유명 채널별 카테고리 링크 저장
- 나라별 17개 카테고리 URL 저장 (1개는 all)

In [9]:
# Define the file name
file_name = "country_category_url.xlsx"

# Check if the file exists
if os.path.exists(file_name):
    # Load the Excel file into a DataFrame
    country_category_df = pd.read_excel(file_name)
else:
    print(f"{file_name} does not exist. The code will not run.")
    # category별 url 리스트 합치기
    dfs = []
    for country, url in country_dic.items():
        print(country, url)
        extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
        df_category = create_dataframe(extracted_strings, country)
        dfs.append(df_category)
        country_category_df = pd.concat(dfs, axis=0, ignore_index=True)
        print(country_category_df)
    # 전체 URL 저장
    country_category_df.to_excel("country_category_url.xlsx", index=False)
    country_category_df

In [10]:
# category별 url 리스트 합치기
dfs = []
for country, url in country_dic.items():
    print(country, url)
    extracted_strings = crawl_table_urls_extracted(url, table_class='top-charts', depth=1)
    df_category = create_dataframe(extracted_strings, country)
    dfs.append(df_category)
    country_category_df = pd.concat(dfs, axis=0, ignore_index=True)
    print(country_category_df)
# 전체 URL 저장
country_category_df.to_excel("country_category_url.xlsx", index=False)
country_category_df

United States https://us.youtubers.me/united-states/all/top-1000-youtube-channels-in-united-states
                                                  url            category  \
0   https://us.youtubers.me/united-states/educatio...           education   
1   https://us.youtubers.me/united-states/people-b...        people-blogs   
2   https://us.youtubers.me/united-states/sports/t...              sports   
3   https://us.youtubers.me/united-states/entertai...       entertainment   
4   https://us.youtubers.me/united-states/film-ani...      film-animation   
5   https://us.youtubers.me/united-states/comedy/t...              comedy   
6   https://us.youtubers.me/united-states/music/to...               music   
7   https://us.youtubers.me/united-states/gaming/t...              gaming   
8   https://us.youtubers.me/united-states/science-...  science-technology   
9   https://us.youtubers.me/united-states/pets-ani...        pets-animals   
10  https://us.youtubers.me/united-states/all/top-... 

                                                  url        category  \
0   https://us.youtubers.me/united-states/educatio...       education   
1   https://us.youtubers.me/united-states/people-b...    people-blogs   
2   https://us.youtubers.me/united-states/sports/t...          sports   
3   https://us.youtubers.me/united-states/entertai...   entertainment   
4   https://us.youtubers.me/united-states/film-ani...  film-animation   
..                                                ...             ...   
62  https://us.youtubers.me/brazil/howto-style/top...     howto-style   
63  https://us.youtubers.me/brazil/pets-animals/to...    pets-animals   
64  https://us.youtubers.me/brazil/news-politics/t...   news-politics   
65  https://us.youtubers.me/brazil/all/top-1000-yo...             all   
66  https://us.youtubers.me/brazil/autos-vehicles/...  autos-vehicles   

          country  
0   United States  
1   United States  
2   United States  
3   United States  
4   United States  
.. 

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
179  https://us.youtubers.me/france/pets-animals/to...         pets-animals   
180  https://us.youtubers.me/france/science-technol...   science-technology   
181  https://us.youtubers.me/france/nonprofits-acti...  nonprofits-activism   
182  https://us.youtubers.me/france/travel-events/t...        travel-events   
183  https://us.youtubers.me/france/movies/top-1000...               movies   

           country  
0    United States  
1    Unit

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
293  https://us.youtubers.me/thailand/travel-events...        travel-events   
294  https://us.youtubers.me/thailand/autos-vehicle...       autos-vehicles   
295  https://us.youtubers.me/thailand/education/top...            education   
296  https://us.youtubers.me/thailand/pets-animals/...         pets-animals   
297  https://us.youtubers.me/thailand/nonprofits-ac...  nonprofits-activism   

           country  
0    United States  
1    Unit

                                                   url             category  \
0    https://us.youtubers.me/united-states/educatio...            education   
1    https://us.youtubers.me/united-states/people-b...         people-blogs   
2    https://us.youtubers.me/united-states/sports/t...               sports   
3    https://us.youtubers.me/united-states/entertai...        entertainment   
4    https://us.youtubers.me/united-states/film-ani...       film-animation   
..                                                 ...                  ...   
408  https://us.youtubers.me/egypt/science-technolo...   science-technology   
409  https://us.youtubers.me/egypt/autos-vehicles/t...       autos-vehicles   
410  https://us.youtubers.me/egypt/nonprofits-activ...  nonprofits-activism   
411  https://us.youtubers.me/egypt/pets-animals/top...         pets-animals   
412  https://us.youtubers.me/egypt/travel-events/to...        travel-events   

           country  
0    United States  
1    Unit

Unnamed: 0,url,category,country
0,https://us.youtubers.me/united-states/educatio...,education,United States
1,https://us.youtubers.me/united-states/people-b...,people-blogs,United States
2,https://us.youtubers.me/united-states/sports/t...,sports,United States
3,https://us.youtubers.me/united-states/entertai...,entertainment,United States
4,https://us.youtubers.me/united-states/film-ani...,film-animation,United States
...,...,...,...
491,https://us.youtubers.me/romania/autos-vehicles...,autos-vehicles,Romania
492,https://us.youtubers.me/romania/science-techno...,science-technology,Romania
493,https://us.youtubers.me/romania/travel-events/...,travel-events,Romania
494,https://us.youtubers.me/romania/shows/top-1000...,shows,Romania


In [11]:
country_category_df

Unnamed: 0,url,category,country
0,https://us.youtubers.me/united-states/educatio...,education,United States
1,https://us.youtubers.me/united-states/people-b...,people-blogs,United States
2,https://us.youtubers.me/united-states/sports/t...,sports,United States
3,https://us.youtubers.me/united-states/entertai...,entertainment,United States
4,https://us.youtubers.me/united-states/film-ani...,film-animation,United States
...,...,...,...
491,https://us.youtubers.me/romania/autos-vehicles...,autos-vehicles,Romania
492,https://us.youtubers.me/romania/science-techno...,science-technology,Romania
493,https://us.youtubers.me/romania/travel-events/...,travel-events,Romania
494,https://us.youtubers.me/romania/shows/top-1000...,shows,Romania


In [12]:
dfs

[                                                  url            category  \
 0   https://us.youtubers.me/united-states/educatio...           education   
 1   https://us.youtubers.me/united-states/people-b...        people-blogs   
 2   https://us.youtubers.me/united-states/sports/t...              sports   
 3   https://us.youtubers.me/united-states/entertai...       entertainment   
 4   https://us.youtubers.me/united-states/film-ani...      film-animation   
 5   https://us.youtubers.me/united-states/comedy/t...              comedy   
 6   https://us.youtubers.me/united-states/music/to...               music   
 7   https://us.youtubers.me/united-states/gaming/t...              gaming   
 8   https://us.youtubers.me/united-states/science-...  science-technology   
 9   https://us.youtubers.me/united-states/pets-ani...        pets-animals   
 10  https://us.youtubers.me/united-states/all/top-...                 all   
 11  https://us.youtubers.me/united-states/news-pol...       new

### YouTuberme Dataframe Final

In [13]:
# 전체 카테고리 별 채널 수집
country_category_df = pd.read_excel("country_category_url.xlsx")
# url 돌면서 필요한 데이터 
for url, country in zip(country_category_df['url'].to_list(), country_category_df['country'].to_list()):    
    df_new = collect_youtuberme_basic(url, country)
    dfs.append(df_new)
    result_df = pd.concat(dfs, axis=0, ignore_index=True)

# 중복 제거
result_df = result_df.drop_duplicates('Youtuber')
result_df = result_df.drop("rank", axis=1)
# YouTube URL 저장
result_df['youtube url'] = [url[:-7] for url in  result_df['url'].to_list()]

  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 4
url_list:  4


  df_top1000 = pd.read_html(str(table))[0]


df1000: 95
url_list:  95


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 290
url_list:  290


  df_top1000 = pd.read_html(str(table))[0]


df1000: 711
url_list:  711


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 639
url_list:  639


  df_top1000 = pd.read_html(str(table))[0]


df1000: 177
url_list:  177


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 534
url_list:  534


  df_top1000 = pd.read_html(str(table))[0]


df1000: 481
url_list:  481


  df_top1000 = pd.read_html(str(table))[0]


df1000: 522
url_list:  522


  df_top1000 = pd.read_html(str(table))[0]


df1000: 666
url_list:  666


  df_top1000 = pd.read_html(str(table))[0]


df1000: 7
url_list:  7


  df_top1000 = pd.read_html(str(table))[0]


df1000: 679
url_list:  679


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 91
url_list:  91


  df_top1000 = pd.read_html(str(table))[0]


df1000: 230
url_list:  230


  df_top1000 = pd.read_html(str(table))[0]


df1000: 668
url_list:  668


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 937
url_list:  937


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 519
url_list:  519


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


  df_top1000 = pd.read_html(str(table))[0]


df1000: 159
url_list:  159


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 404
url_list:  404


  df_top1000 = pd.read_html(str(table))[0]


df1000: 405
url_list:  405


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 317
url_list:  317


  df_top1000 = pd.read_html(str(table))[0]


df1000: 492
url_list:  492


  df_top1000 = pd.read_html(str(table))[0]


df1000: 898
url_list:  898


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 4
url_list:  4


  df_top1000 = pd.read_html(str(table))[0]


df1000: 731
url_list:  731


  df_top1000 = pd.read_html(str(table))[0]


df1000: 249
url_list:  249


  df_top1000 = pd.read_html(str(table))[0]


df1000: 236
url_list:  236


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 525
url_list:  525


  df_top1000 = pd.read_html(str(table))[0]


df1000: 287
url_list:  287


  df_top1000 = pd.read_html(str(table))[0]


df1000: 897
url_list:  897


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 650
url_list:  650


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 461
url_list:  461


  df_top1000 = pd.read_html(str(table))[0]


df1000: 586
url_list:  586


  df_top1000 = pd.read_html(str(table))[0]


df1000: 274
url_list:  274


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 386
url_list:  386


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 11
url_list:  11


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 597
url_list:  597


  df_top1000 = pd.read_html(str(table))[0]


df1000: 418
url_list:  418


  df_top1000 = pd.read_html(str(table))[0]


df1000: 111
url_list:  111


  df_top1000 = pd.read_html(str(table))[0]


df1000: 62
url_list:  62


  df_top1000 = pd.read_html(str(table))[0]


df1000: 222
url_list:  222


  df_top1000 = pd.read_html(str(table))[0]


df1000: 42
url_list:  42


  df_top1000 = pd.read_html(str(table))[0]


df1000: 169
url_list:  169


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 487
url_list:  487


  df_top1000 = pd.read_html(str(table))[0]


df1000: 409
url_list:  409


  df_top1000 = pd.read_html(str(table))[0]


df1000: 603
url_list:  603


  df_top1000 = pd.read_html(str(table))[0]


df1000: 335
url_list:  335


  df_top1000 = pd.read_html(str(table))[0]


df1000: 116
url_list:  116


  df_top1000 = pd.read_html(str(table))[0]


df1000: 370
url_list:  370


  df_top1000 = pd.read_html(str(table))[0]


df1000: 730
url_list:  730


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 374
url_list:  374


  df_top1000 = pd.read_html(str(table))[0]


df1000: 197
url_list:  197


  df_top1000 = pd.read_html(str(table))[0]


df1000: 290
url_list:  290


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 342
url_list:  342


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 384
url_list:  384


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 672
url_list:  672


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 218
url_list:  218


  df_top1000 = pd.read_html(str(table))[0]


df1000: 235
url_list:  235


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 396
url_list:  396


  df_top1000 = pd.read_html(str(table))[0]


df1000: 324
url_list:  324


  df_top1000 = pd.read_html(str(table))[0]


df1000: 114
url_list:  114


  df_top1000 = pd.read_html(str(table))[0]


df1000: 57
url_list:  57


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 271
url_list:  271


  df_top1000 = pd.read_html(str(table))[0]


df1000: 696
url_list:  696


  df_top1000 = pd.read_html(str(table))[0]


df1000: 523
url_list:  523


  df_top1000 = pd.read_html(str(table))[0]


df1000: 117
url_list:  117


  df_top1000 = pd.read_html(str(table))[0]


df1000: 23
url_list:  23


  df_top1000 = pd.read_html(str(table))[0]


df1000: 652
url_list:  652


  df_top1000 = pd.read_html(str(table))[0]


df1000: 365
url_list:  365


  df_top1000 = pd.read_html(str(table))[0]


df1000: 56
url_list:  56


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 58
url_list:  58


  df_top1000 = pd.read_html(str(table))[0]


df1000: 116
url_list:  116


  df_top1000 = pd.read_html(str(table))[0]


df1000: 115
url_list:  115


  df_top1000 = pd.read_html(str(table))[0]


df1000: 130
url_list:  130


  df_top1000 = pd.read_html(str(table))[0]


df1000: 55
url_list:  55


  df_top1000 = pd.read_html(str(table))[0]


df1000: 118
url_list:  118


  df_top1000 = pd.read_html(str(table))[0]


df1000: 11
url_list:  11


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 53
url_list:  53


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 972
url_list:  972


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 33
url_list:  33


  df_top1000 = pd.read_html(str(table))[0]


df1000: 916
url_list:  916


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 733
url_list:  733


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 800
url_list:  800


  df_top1000 = pd.read_html(str(table))[0]


df1000: 247
url_list:  247


  df_top1000 = pd.read_html(str(table))[0]


df1000: 382
url_list:  382


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 976
url_list:  976


  df_top1000 = pd.read_html(str(table))[0]


df1000: 435
url_list:  435


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 22
url_list:  22


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 877
url_list:  877


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 21
url_list:  21


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 505
url_list:  505


  df_top1000 = pd.read_html(str(table))[0]


df1000: 971
url_list:  971


  df_top1000 = pd.read_html(str(table))[0]


df1000: 211
url_list:  211


  df_top1000 = pd.read_html(str(table))[0]


df1000: 345
url_list:  345


  df_top1000 = pd.read_html(str(table))[0]


df1000: 777
url_list:  777


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 766
url_list:  766


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 496
url_list:  496


  df_top1000 = pd.read_html(str(table))[0]


df1000: 846
url_list:  846


  df_top1000 = pd.read_html(str(table))[0]


df1000: 572
url_list:  572


  df_top1000 = pd.read_html(str(table))[0]


df1000: 441
url_list:  441


  df_top1000 = pd.read_html(str(table))[0]


df1000: 861
url_list:  861


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 147
url_list:  147


  df_top1000 = pd.read_html(str(table))[0]


df1000: 406
url_list:  406


  df_top1000 = pd.read_html(str(table))[0]


df1000: 49
url_list:  49


  df_top1000 = pd.read_html(str(table))[0]


df1000: 184
url_list:  184


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 419
url_list:  419


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 807
url_list:  807


  df_top1000 = pd.read_html(str(table))[0]


df1000: 952
url_list:  952


  df_top1000 = pd.read_html(str(table))[0]


df1000: 282
url_list:  282


  df_top1000 = pd.read_html(str(table))[0]


df1000: 732
url_list:  732


  df_top1000 = pd.read_html(str(table))[0]


df1000: 672
url_list:  672


  df_top1000 = pd.read_html(str(table))[0]


df1000: 841
url_list:  841


  df_top1000 = pd.read_html(str(table))[0]


df1000: 510
url_list:  510


  df_top1000 = pd.read_html(str(table))[0]


df1000: 999
url_list:  999


  df_top1000 = pd.read_html(str(table))[0]


df1000: 267
url_list:  267


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 140
url_list:  140


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 471
url_list:  471


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 609
url_list:  609


  df_top1000 = pd.read_html(str(table))[0]


df1000: 317
url_list:  317


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 540
url_list:  540


  df_top1000 = pd.read_html(str(table))[0]


df1000: 147
url_list:  147


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 562
url_list:  562


  df_top1000 = pd.read_html(str(table))[0]


df1000: 291
url_list:  291


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 137
url_list:  137


  df_top1000 = pd.read_html(str(table))[0]


df1000: 65
url_list:  65


  df_top1000 = pd.read_html(str(table))[0]


df1000: 282
url_list:  282


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 797
url_list:  797


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 652
url_list:  652


  df_top1000 = pd.read_html(str(table))[0]


df1000: 729
url_list:  729


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 855
url_list:  855


  df_top1000 = pd.read_html(str(table))[0]


df1000: 192
url_list:  192


  df_top1000 = pd.read_html(str(table))[0]


df1000: 505
url_list:  505


  df_top1000 = pd.read_html(str(table))[0]


df1000: 270
url_list:  270


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 407
url_list:  407


  df_top1000 = pd.read_html(str(table))[0]


df1000: 665
url_list:  665


  df_top1000 = pd.read_html(str(table))[0]


df1000: 362
url_list:  362


  df_top1000 = pd.read_html(str(table))[0]


df1000: 341
url_list:  341


  df_top1000 = pd.read_html(str(table))[0]


df1000: 57
url_list:  57


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 926
url_list:  926


  df_top1000 = pd.read_html(str(table))[0]


df1000: 429
url_list:  429


  df_top1000 = pd.read_html(str(table))[0]


df1000: 327
url_list:  327


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 249
url_list:  249


  df_top1000 = pd.read_html(str(table))[0]


df1000: 245
url_list:  245


  df_top1000 = pd.read_html(str(table))[0]


df1000: 83
url_list:  83


  df_top1000 = pd.read_html(str(table))[0]


df1000: 437
url_list:  437


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 319
url_list:  319


  df_top1000 = pd.read_html(str(table))[0]


df1000: 216
url_list:  216


  df_top1000 = pd.read_html(str(table))[0]


df1000: 331
url_list:  331


  df_top1000 = pd.read_html(str(table))[0]


df1000: 150
url_list:  150


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 441
url_list:  441


  df_top1000 = pd.read_html(str(table))[0]


df1000: 345
url_list:  345


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 983
url_list:  983


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 241
url_list:  241


  df_top1000 = pd.read_html(str(table))[0]


df1000: 495
url_list:  495


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 154
url_list:  154


  df_top1000 = pd.read_html(str(table))[0]


df1000: 619
url_list:  619


  df_top1000 = pd.read_html(str(table))[0]


df1000: 206
url_list:  206


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 221
url_list:  221


  df_top1000 = pd.read_html(str(table))[0]


df1000: 323
url_list:  323


  df_top1000 = pd.read_html(str(table))[0]


df1000: 103
url_list:  103


  df_top1000 = pd.read_html(str(table))[0]


df1000: 52
url_list:  52


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 528
url_list:  528


  df_top1000 = pd.read_html(str(table))[0]


df1000: 125
url_list:  125


  df_top1000 = pd.read_html(str(table))[0]


df1000: 76
url_list:  76


  df_top1000 = pd.read_html(str(table))[0]


df1000: 144
url_list:  144


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 101
url_list:  101


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 240
url_list:  240


  df_top1000 = pd.read_html(str(table))[0]


df1000: 225
url_list:  225


  df_top1000 = pd.read_html(str(table))[0]


df1000: 573
url_list:  573


  df_top1000 = pd.read_html(str(table))[0]


df1000: 263
url_list:  263


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 105
url_list:  105


  df_top1000 = pd.read_html(str(table))[0]


df1000: 77
url_list:  77


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 785
url_list:  785


  df_top1000 = pd.read_html(str(table))[0]


df1000: 223
url_list:  223


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 745
url_list:  745


  df_top1000 = pd.read_html(str(table))[0]


df1000: 223
url_list:  223


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 165
url_list:  165


  df_top1000 = pd.read_html(str(table))[0]


df1000: 216
url_list:  216


  df_top1000 = pd.read_html(str(table))[0]


df1000: 71
url_list:  71


  df_top1000 = pd.read_html(str(table))[0]


df1000: 304
url_list:  304


  df_top1000 = pd.read_html(str(table))[0]


df1000: 149
url_list:  149


  df_top1000 = pd.read_html(str(table))[0]


df1000: 93
url_list:  93


  df_top1000 = pd.read_html(str(table))[0]


df1000: 137
url_list:  137


  df_top1000 = pd.read_html(str(table))[0]


df1000: 61
url_list:  61


  df_top1000 = pd.read_html(str(table))[0]


df1000: 17
url_list:  17


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 468
url_list:  468


  df_top1000 = pd.read_html(str(table))[0]


df1000: 664
url_list:  664


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 418
url_list:  418


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 211
url_list:  211


  df_top1000 = pd.read_html(str(table))[0]


df1000: 487
url_list:  487


  df_top1000 = pd.read_html(str(table))[0]


df1000: 196
url_list:  196


  df_top1000 = pd.read_html(str(table))[0]


df1000: 231
url_list:  231


  df_top1000 = pd.read_html(str(table))[0]


df1000: 151
url_list:  151


  df_top1000 = pd.read_html(str(table))[0]


df1000: 264
url_list:  264


  df_top1000 = pd.read_html(str(table))[0]


df1000: 535
url_list:  535


  df_top1000 = pd.read_html(str(table))[0]


df1000: 395
url_list:  395


  df_top1000 = pd.read_html(str(table))[0]


df1000: 402
url_list:  402


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 197
url_list:  197


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 231
url_list:  231


  df_top1000 = pd.read_html(str(table))[0]


df1000: 209
url_list:  209


  df_top1000 = pd.read_html(str(table))[0]


df1000: 580
url_list:  580


  df_top1000 = pd.read_html(str(table))[0]


df1000: 79
url_list:  79


  df_top1000 = pd.read_html(str(table))[0]


df1000: 121
url_list:  121


  df_top1000 = pd.read_html(str(table))[0]


df1000: 187
url_list:  187


  df_top1000 = pd.read_html(str(table))[0]


df1000: 356
url_list:  356


  df_top1000 = pd.read_html(str(table))[0]


df1000: 271
url_list:  271


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 161
url_list:  161


  df_top1000 = pd.read_html(str(table))[0]


df1000: 80
url_list:  80


  df_top1000 = pd.read_html(str(table))[0]


df1000: 527
url_list:  527


  df_top1000 = pd.read_html(str(table))[0]


df1000: 425
url_list:  425


  df_top1000 = pd.read_html(str(table))[0]


df1000: 381
url_list:  381


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 140
url_list:  140


  df_top1000 = pd.read_html(str(table))[0]


df1000: 51
url_list:  51


  df_top1000 = pd.read_html(str(table))[0]


df1000: 149
url_list:  149


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 89
url_list:  89


  df_top1000 = pd.read_html(str(table))[0]


df1000: 149
url_list:  149


  df_top1000 = pd.read_html(str(table))[0]


df1000: 350
url_list:  350


  df_top1000 = pd.read_html(str(table))[0]


df1000: 170
url_list:  170


  df_top1000 = pd.read_html(str(table))[0]


df1000: 175
url_list:  175


  df_top1000 = pd.read_html(str(table))[0]


df1000: 33
url_list:  33


  df_top1000 = pd.read_html(str(table))[0]


df1000: 5
url_list:  5


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 915
url_list:  915


  df_top1000 = pd.read_html(str(table))[0]


df1000: 199
url_list:  199


  df_top1000 = pd.read_html(str(table))[0]


df1000: 990
url_list:  990


  df_top1000 = pd.read_html(str(table))[0]


df1000: 213
url_list:  213


  df_top1000 = pd.read_html(str(table))[0]


df1000: 117
url_list:  117


  df_top1000 = pd.read_html(str(table))[0]


df1000: 96
url_list:  96


  df_top1000 = pd.read_html(str(table))[0]


df1000: 656
url_list:  656


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 191
url_list:  191


  df_top1000 = pd.read_html(str(table))[0]


df1000: 367
url_list:  367


  df_top1000 = pd.read_html(str(table))[0]


df1000: 91
url_list:  91


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 28
url_list:  28


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 67
url_list:  67


  df_top1000 = pd.read_html(str(table))[0]


df1000: 10
url_list:  10


  df_top1000 = pd.read_html(str(table))[0]


df1000: 225
url_list:  225


  df_top1000 = pd.read_html(str(table))[0]


df1000: 966
url_list:  966


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 474
url_list:  474


  df_top1000 = pd.read_html(str(table))[0]


df1000: 526
url_list:  526


  df_top1000 = pd.read_html(str(table))[0]


df1000: 167
url_list:  167


  df_top1000 = pd.read_html(str(table))[0]


df1000: 112
url_list:  112


  df_top1000 = pd.read_html(str(table))[0]


df1000: 154
url_list:  154


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 159
url_list:  159


  df_top1000 = pd.read_html(str(table))[0]


df1000: 75
url_list:  75


  df_top1000 = pd.read_html(str(table))[0]


df1000: 129
url_list:  129


  df_top1000 = pd.read_html(str(table))[0]


df1000: 66
url_list:  66


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 30
url_list:  30


  df_top1000 = pd.read_html(str(table))[0]


df1000: 34
url_list:  34


  df_top1000 = pd.read_html(str(table))[0]


df1000: 653
url_list:  653


  df_top1000 = pd.read_html(str(table))[0]


df1000: 152
url_list:  152


  df_top1000 = pd.read_html(str(table))[0]


df1000: 611
url_list:  611


  df_top1000 = pd.read_html(str(table))[0]


df1000: 670
url_list:  670


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 179
url_list:  179


  df_top1000 = pd.read_html(str(table))[0]


df1000: 156
url_list:  156


  df_top1000 = pd.read_html(str(table))[0]


df1000: 135
url_list:  135


  df_top1000 = pd.read_html(str(table))[0]


df1000: 673
url_list:  673


  df_top1000 = pd.read_html(str(table))[0]


df1000: 193
url_list:  193


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 75
url_list:  75


  df_top1000 = pd.read_html(str(table))[0]


df1000: 87
url_list:  87


  df_top1000 = pd.read_html(str(table))[0]


df1000: 69
url_list:  69


  df_top1000 = pd.read_html(str(table))[0]


df1000: 46
url_list:  46


  df_top1000 = pd.read_html(str(table))[0]


df1000: 16
url_list:  16


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 635
url_list:  635


  df_top1000 = pd.read_html(str(table))[0]


df1000: 176
url_list:  176


  df_top1000 = pd.read_html(str(table))[0]


df1000: 110
url_list:  110


  df_top1000 = pd.read_html(str(table))[0]


df1000: 142
url_list:  142


  df_top1000 = pd.read_html(str(table))[0]


df1000: 230
url_list:  230


  df_top1000 = pd.read_html(str(table))[0]


df1000: 333
url_list:  333


  df_top1000 = pd.read_html(str(table))[0]


df1000: 129
url_list:  129


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 200
url_list:  200


  df_top1000 = pd.read_html(str(table))[0]


df1000: 3
url_list:  3


  df_top1000 = pd.read_html(str(table))[0]


df1000: 405
url_list:  405


  df_top1000 = pd.read_html(str(table))[0]


df1000: 75
url_list:  75


  df_top1000 = pd.read_html(str(table))[0]


df1000: 19
url_list:  19


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 31
url_list:  31


  df_top1000 = pd.read_html(str(table))[0]


df1000: 18
url_list:  18


  df_top1000 = pd.read_html(str(table))[0]


df1000: 184
url_list:  184


  df_top1000 = pd.read_html(str(table))[0]


df1000: 358
url_list:  358


  df_top1000 = pd.read_html(str(table))[0]


df1000: 758
url_list:  758


  df_top1000 = pd.read_html(str(table))[0]


df1000: 530
url_list:  530


  df_top1000 = pd.read_html(str(table))[0]


df1000: 222
url_list:  222


  df_top1000 = pd.read_html(str(table))[0]


df1000: 87
url_list:  87


  df_top1000 = pd.read_html(str(table))[0]


df1000: 51
url_list:  51


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 465
url_list:  465


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 84
url_list:  84


  df_top1000 = pd.read_html(str(table))[0]


df1000: 93
url_list:  93


  df_top1000 = pd.read_html(str(table))[0]


df1000: 45
url_list:  45


  df_top1000 = pd.read_html(str(table))[0]


df1000: 104
url_list:  104


  df_top1000 = pd.read_html(str(table))[0]


df1000: 50
url_list:  50


  df_top1000 = pd.read_html(str(table))[0]


df1000: 26
url_list:  26


  df_top1000 = pd.read_html(str(table))[0]


df1000: 548
url_list:  548


  df_top1000 = pd.read_html(str(table))[0]


df1000: 244
url_list:  244


  df_top1000 = pd.read_html(str(table))[0]


df1000: 799
url_list:  799


  df_top1000 = pd.read_html(str(table))[0]


df1000: 851
url_list:  851


  df_top1000 = pd.read_html(str(table))[0]


df1000: 699
url_list:  699


  df_top1000 = pd.read_html(str(table))[0]


df1000: 94
url_list:  94


  df_top1000 = pd.read_html(str(table))[0]


df1000: 269
url_list:  269


  df_top1000 = pd.read_html(str(table))[0]


df1000: 209
url_list:  209


  df_top1000 = pd.read_html(str(table))[0]


df1000: 181
url_list:  181


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 282
url_list:  282


  df_top1000 = pd.read_html(str(table))[0]


df1000: 49
url_list:  49


  df_top1000 = pd.read_html(str(table))[0]


df1000: 107
url_list:  107


  df_top1000 = pd.read_html(str(table))[0]


df1000: 155
url_list:  155


  df_top1000 = pd.read_html(str(table))[0]


df1000: 129
url_list:  129


  df_top1000 = pd.read_html(str(table))[0]


df1000: 5
url_list:  5


  df_top1000 = pd.read_html(str(table))[0]


df1000: 25
url_list:  25


  df_top1000 = pd.read_html(str(table))[0]


df1000: 496
url_list:  496


  df_top1000 = pd.read_html(str(table))[0]


df1000: 281
url_list:  281


  df_top1000 = pd.read_html(str(table))[0]


df1000: 122
url_list:  122


  df_top1000 = pd.read_html(str(table))[0]


df1000: 288
url_list:  288


  df_top1000 = pd.read_html(str(table))[0]


df1000: 373
url_list:  373


  df_top1000 = pd.read_html(str(table))[0]


df1000: 148
url_list:  148


  df_top1000 = pd.read_html(str(table))[0]


df1000: 97
url_list:  97


  df_top1000 = pd.read_html(str(table))[0]


df1000: 65
url_list:  65


  df_top1000 = pd.read_html(str(table))[0]


df1000: 114
url_list:  114


  df_top1000 = pd.read_html(str(table))[0]


df1000: 108
url_list:  108


  df_top1000 = pd.read_html(str(table))[0]


df1000: 62
url_list:  62


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 41
url_list:  41


  df_top1000 = pd.read_html(str(table))[0]


df1000: 61
url_list:  61


  df_top1000 = pd.read_html(str(table))[0]


df1000: 6
url_list:  6


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1
url_list:  1


  df_top1000 = pd.read_html(str(table))[0]


df1000: 848
url_list:  848


  df_top1000 = pd.read_html(str(table))[0]


df1000: 751
url_list:  751


  df_top1000 = pd.read_html(str(table))[0]


df1000: 758
url_list:  758


  df_top1000 = pd.read_html(str(table))[0]


df1000: 22
url_list:  22


  df_top1000 = pd.read_html(str(table))[0]


df1000: 389
url_list:  389


  df_top1000 = pd.read_html(str(table))[0]


df1000: 162
url_list:  162


  df_top1000 = pd.read_html(str(table))[0]


df1000: 150
url_list:  150


  df_top1000 = pd.read_html(str(table))[0]


df1000: 123
url_list:  123


  df_top1000 = pd.read_html(str(table))[0]


df1000: 238
url_list:  238


  df_top1000 = pd.read_html(str(table))[0]


df1000: 107
url_list:  107


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 146
url_list:  146


  df_top1000 = pd.read_html(str(table))[0]


df1000: 48
url_list:  48


  df_top1000 = pd.read_html(str(table))[0]


df1000: 57
url_list:  57


  df_top1000 = pd.read_html(str(table))[0]


df1000: 89
url_list:  89


  df_top1000 = pd.read_html(str(table))[0]


df1000: 15
url_list:  15


  df_top1000 = pd.read_html(str(table))[0]


df1000: 649
url_list:  649


  df_top1000 = pd.read_html(str(table))[0]


df1000: 89
url_list:  89


  df_top1000 = pd.read_html(str(table))[0]


df1000: 585
url_list:  585


  df_top1000 = pd.read_html(str(table))[0]


df1000: 104
url_list:  104


  df_top1000 = pd.read_html(str(table))[0]


df1000: 1000
url_list:  1000


  df_top1000 = pd.read_html(str(table))[0]


df1000: 441
url_list:  441


  df_top1000 = pd.read_html(str(table))[0]


df1000: 514
url_list:  514


  df_top1000 = pd.read_html(str(table))[0]


df1000: 19
url_list:  19


  df_top1000 = pd.read_html(str(table))[0]


df1000: 80
url_list:  80


  df_top1000 = pd.read_html(str(table))[0]


df1000: 74
url_list:  74


  df_top1000 = pd.read_html(str(table))[0]


df1000: 100
url_list:  100


  df_top1000 = pd.read_html(str(table))[0]


df1000: 93
url_list:  93


  df_top1000 = pd.read_html(str(table))[0]


df1000: 62
url_list:  62


  df_top1000 = pd.read_html(str(table))[0]


df1000: 34
url_list:  34


  df_top1000 = pd.read_html(str(table))[0]


df1000: 27
url_list:  27


  df_top1000 = pd.read_html(str(table))[0]


df1000: 2
url_list:  2


  df_top1000 = pd.read_html(str(table))[0]


df1000: 13
url_list:  13


In [14]:
result_df

Unnamed: 0,url,category,country,Youtuber,subscribers,video views,video count,started,youtube url
0,https://us.youtubers.me/united-states/educatio...,education,United States,,,,,,https://us.youtubers.me/united-states/educatio...
496,https://us.youtubers.me//abckidtv-nursery-rhym...,education,United States,Cocomelon - Nursery Rhymes,183000000.0,1.885494e+11,1285.0,2006.0,https://us.youtubers.me//abckidtv-nursery-rhym...
497,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,77900000.0,4.608556e+10,3322.0,2011.0,https://us.youtubers.me//pinkfong-kids-songs-s...
498,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,42600000.0,3.245692e+10,650.0,2016.0,https://us.youtubers.me//genevieve-s-playhouse...
499,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,38700000.0,2.991610e+10,2538.0,2016.0,https://us.youtubers.me//babybus-kids-tv-songs...
...,...,...,...,...,...,...,...,...,...
232855,https://us.youtubers.me//sergiu-brega/youtuber...,nonprofits-activism,Romania,Sergiu Brega,2860.0,1.570858e+06,259.0,2011.0,https://us.youtubers.me//sergiu-brega/youtube
232856,https://us.youtubers.me//craiova-cetatea-banil...,nonprofits-activism,Romania,Craiova Cetatea Banilor Las Vegas-ul Romaniei,1030.0,8.153100e+05,435.0,2016.0,https://us.youtubers.me//craiova-cetatea-banil...
232857,https://us.youtubers.me//golden-era/youtuber-s...,nonprofits-activism,Romania,Golden Era,41900.0,2.708490e+05,430.0,2010.0,https://us.youtubers.me//golden-era/youtube
232859,https://us.youtubers.me//speran-a-in-iisus-abo...,nonprofits-activism,Romania,Harul TV,58900.0,2.202790e+05,1639.0,2012.0,https://us.youtubers.me//speran-a-in-iisus-abo...


In [15]:
# column명 변경
result_df = result_df.rename(columns={
    'url': 'youtuberme_url',
    'Youtuber': 'channel_name',
    'video views': 'total_video_views',
    'video count': 'total_video_count',
    'youtube url': 'yt_url'
})

In [23]:
# subscribers, total_video_count가 0인 경우 채널이 삭제된 경우이므로 제거
result_df = result_df[(result_df['subscribers'] != 0) & (result_df['subscribers'].notna())]
# result_df = result_df[result_df['total_video_count'] != 0& (result_df['total_video_count'].notna())]

In [24]:
result_df.columns

Index(['youtuberme_url', 'category', 'country', 'channel_name', 'subscribers',
       'total_video_views', 'total_video_count', 'started', 'yt_url'],
      dtype='object')

In [25]:
combined_df = pd.concat([previous_df, result_df])

# Drop duplicates based on 'channel_name' and keep only the unique entries
added_df = combined_df.drop_duplicates(subset=['channel_name'], keep=False)

# Display the new DataFrame with unique channel names
print(added_df)
unique_df = combined_df.drop_duplicates(subset=['channel_name'])
unique_df

                                           youtuberme_url  \
125     https://us.youtubers.me//edu-car-toy-school/yo...   
172     https://us.youtubers.me//shawn-woods/youtuber-...   
175     https://us.youtubers.me//theliberty/youtuber-s...   
261     https://us.youtubers.me//factsjunkie/youtuber-...   
262     https://us.youtubers.me//b-n-linh-dan-ong/yout...   
...                                                   ...   
232775  https://us.youtubers.me//speedseekersitr/youtu...   
232799  https://us.youtubers.me//wunder-haff/youtuber-...   
232811     https://us.youtubers.me//idei3d/youtuber-stats   
232813  https://us.youtubers.me//helium-network-projec...   
232852  https://us.youtubers.me//muntele-sionului/yout...   

                   category        country  \
125               education  United States   
172               education  United States   
175               education  United States   
261               education  United States   
262               education  United S

Unnamed: 0,youtuberme_url,category,country,channel_name,subscribers,total_video_views,total_video_count,started,yt_url
0,https://us.youtubers.me//pinkfong-kids-songs-s...,education,United States,Baby Shark - Pinkfong Kids’ Songs & Stories,80400000.0,4.818430e+10,3220.0,2011.0,https://us.youtubers.me//pinkfong-kids-songs-s...
1,https://us.youtubers.me//genevieve-s-playhouse...,education,United States,Genevieve's Playhouse - Learning Videos for Kids,40200000.0,3.012461e+10,636.0,2016.0,https://us.youtubers.me//genevieve-s-playhouse...
2,https://us.youtubers.me//babybus-kids-tv-songs...,education,United States,BabyBus - Kids Songs and Cartoons,35700000.0,2.825209e+10,2405.0,2016.0,https://us.youtubers.me//babybus-kids-tv-songs...
3,https://us.youtubers.me//blippi/youtuber-stats,education,United States,Blippi - Educational Videos for Kids,19800000.0,1.624648e+10,928.0,2014.0,https://us.youtubers.me//blippi/youtube
4,https://us.youtubers.me//dave-and-ava-nursery-...,education,United States,Dave and Ava - Nursery Rhymes and Baby Songs,15600000.0,1.135529e+10,975.0,2014.0,https://us.youtubers.me//dave-and-ava-nursery-...
...,...,...,...,...,...,...,...,...,...
232775,https://us.youtubers.me//speedseekersitr/youtu...,autos-vehicles,Romania,SpeedSeekersITR,26200.0,6.963130e+05,39.0,2018.0,https://us.youtubers.me//speedseekersitr/youtube
232799,https://us.youtubers.me//wunder-haff/youtuber-...,science-technology,Romania,AtelierulTauro,9600.0,9.836875e+06,895.0,2010.0,https://us.youtubers.me//wunder-haff/youtube
232811,https://us.youtubers.me//idei3d/youtuber-stats,science-technology,Romania,idei3D,20800.0,9.821320e+05,72.0,2015.0,https://us.youtubers.me//idei3d/youtube
232813,https://us.youtubers.me//helium-network-projec...,science-technology,Romania,Paul & Alex,2930.0,4.955580e+05,213.0,2017.0,https://us.youtubers.me//helium-network-projec...


In [19]:
# Get today's date
today_date = datetime.today().strftime('%Y-%m-%d')
file_path = f"Youtube_Data_{today_date}.xlsx"
if pd.isna(result_df['channel_name'].iloc[0]):
    # Drop the first row
    unique_df = unique_df.drop(result_df.index[0])
# Save the DataFrame to Excel with today's date in the filename
unique_df.to_excel(file_path, index=False)



### 기존 URL에 신규 채널 추가 작업 (없앨 목록 체크 필요)
- 날짜로 최신 데이터 확인

In [32]:
import pandas as pd
import os
import re
from datetime import datetime

# Use the current working directory
directory = os.getcwd()

def get_latest_files(directory, prefix="Youtube_Data_", extension=".xlsx"):
    """Retrieve the latest and second latest files in the directory."""
    files = [f for f in os.listdir(directory) if f.startswith(prefix) and f.endswith(extension)]
    date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})')
    dated_files = []

    for file in files:
        match = date_pattern.search(file)
        if match:
            date_str = match.group(1)
            try:
                date_obj = datetime.strptime(date_str, "%Y-%m-%d")
                dated_files.append((file, date_obj))
            except ValueError:
                continue

    # Sort files by date in descending order
    dated_files.sort(key=lambda x: x[1], reverse=True)

    # Return the latest and second latest files if available
    if len(dated_files) >= 2:
        return dated_files[0][0], dated_files[1][0]
    elif len(dated_files) == 1:
        return dated_files[0][0], None
    else:
        return None, None

def update_latest_file(directory, latest_file, second_latest_file):
    """Compare the latest and second latest files, update and save new data."""
    # Read the latest and second latest Excel files into DataFrames
    latest_df = pd.read_excel(os.path.join(directory, latest_file))
    second_latest_df = pd.read_excel(os.path.join(directory, second_latest_file))

    # Identify new channels in the latest file that are not in the second latest file
    new_channels = latest_df[~latest_df['channel_name'].isin(second_latest_df['channel_name'])]

    # Identify channels in the second latest file that are not in the latest file
    missing_channels = second_latest_df[~second_latest_df['channel_name'].isin(latest_df['channel_name'])]

    # Add missing channels to the latest DataFrame
    if not missing_channels.empty:
        updated_latest_df = pd.concat([latest_df, missing_channels], ignore_index=True)
    else:
        updated_latest_df = latest_df

    # Save the new channels to a separate Excel file
    if not new_channels.empty:
        new_filename = f"Youtube_Data_added_{datetime.today().strftime('%Y-%m-%d')}.xlsx"
        new_channels.to_excel(os.path.join(directory, new_filename), index=False)
        print(f"New channels saved to: {new_filename}")
    else:
        print("No new channels found in the latest file.")

    # Save the updated latest file
    updated_filename = f"Updated_{latest_file}"
    updated_latest_df.to_excel(os.path.join(directory, updated_filename), index=False)
    print(f"Updated latest file saved as: {updated_filename}")

    # Return the names of the newly added channels
    return missing_channels['channel_name'].tolist() if not missing_channels.empty else []

# Step 1: Retrieve the latest and second latest Excel files
latest_file, second_latest_file = get_latest_files(directory)

if latest_file and second_latest_file:
    print(f"Latest file: {latest_file}")
    print(f"Second latest file: {second_latest_file}")

    # Step 2: Compare the files and update the latest file
    newly_added_channels = update_latest_file(directory, latest_file, second_latest_file)

    if newly_added_channels:
        print("The following channels were added to the latest file:")
        for channel in newly_added_channels:
            print(channel)
else:
    print("Not enough files to perform the comparison. At least two files are required.")


Latest file: Youtube_Data_2024-10-09.xlsx
Second latest file: Youtube_Data_2024-05-14.xlsx
New channels saved to: Youtube_Data_added_2024-10-09.xlsx
Updated latest file saved as: Updated_Youtube_Data_2024-10-09.xlsx
The following channels were added to the latest file:
Baby Shark - Pinkfong Kids’ Songs & Stories


### 수집 채널 URL 추가----------------------------------------------- 여긴 무시

In [20]:
import schedule
import time
from sqlalchemy import create_engine

# 처음 DB에 추가할때
def append_channel(dataframe):
    engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
    conn = engine.connect()

    dataframe.to_sql(name='channel', con=engine, if_exists='append', index=False)
    conn.close()

In [21]:
# db 저장
append_channel(result_df) 

In [22]:
result_df.dropna(subset=['url'], inplace=True)
result_df.to_excel("yotube_list.xlsx")

KeyError: ['url']

In [None]:
# weekly 업로드 진행
def update_channel(dataframe):
    engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
    conn = engine.connect()
    sql_query = 'SELECT * FROM channel'
    df = pd.read_sql(sql_query, engine)
    
    dataframe.to_sql(name='channel', con=engine, if_exists='update', index=False)
    conn.close()

### Shorts 수집

In [None]:
refine_df

In [None]:
# 기존에 존재하다던 데이터 update 해서 다시 데이터 넣기
engine = create_engine("mysql+pymysql://root:"+"2000"+"@127.0.0.1" + "/yt_db")
existing_data = pd.read_sql('SELECT * FROM channel', engine)

# Check for duplicates based on the 'ID' column
duplicates = existing_data[existing_data['Youtuber'].isin(new_data['Youtuber'])]

# Update existing rows with new data
existing_data.update(new_data)

# Filter out rows that are duplicates
new_rows = new_data[~new_data['Youtuber'].isin(duplicates['Youtuber'])]

# Append new rows to the existing data
merged_data = pd.concat([existing_data, new_rows], ignore_index=True)

# Write the merged data back to the database
merged_data.to_sql('channel', engine, if_exists='replace', index=False)