## Please read README.md first

In [21]:
import requests
from bs4 import BeautifulSoup
import csv
import re
import json


# Fetch the first listing page once, only to discover how many pages exist.
url = 'https://huggingface.co/models?sort=trending'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
html_soup = BeautifulSoup(response.text, 'html.parser')

# Pagination links of the listing; the last one is read as the total page count.
total_pages = html_soup.find_all('a', class_='rounded-lg px-2.5 py-1 hover:bg-gray-50 dark:hover:bg-gray-800')

# Default to a single page so that a failed request or a markup change never
# leaves page_number as a non-integer (or crashes on an empty link list).
page_number = 1
if total_pages:
    last_page_link = str(total_pages[-1])
    match = re.search(r'>([\d,]+)<', last_page_link)
    if match:
        # The count may be rendered with thousands separators, e.g. "1,234".
        digits = match.group(1).replace(',', '')
        if digits:
            page_number = int(digits)

# Comment out the following line to scrape every page (tens of thousands of
# models) instead of only the first 10.
page_number = 10


base_url = "https://huggingface.co/models?p="
prefix = "https://api-inference.huggingface.co/models/"

# CSS class strings used to locate each model's name and its detail line.
MODEL_NAME_CLASS = 'text-md truncate font-mono text-black dark:group-hover/repo:text-yellow-500 group-hover/repo:text-indigo-600 text-smd'
MODEL_DETAIL_CLASS = 'mr-1 flex items-center overflow-hidden whitespace-nowrap text-sm leading-tight text-gray-400'


def _parse_model_detail(detail_text):
    """Split one detail line into ``[task, update time, downloads, likes]``.

    The line is split on ``' • '``. A piece containing ``'Updated'`` is the
    update time, a piece starting with a digit is a count, and anything else
    is the task. With two or more counts the first is the download count and
    the last is the like count; a single count is treated as the like count.
    Missing fields are ``None``.
    """
    task = None
    updated = None
    counts = []

    for item in detail_text.split(' • '):
        if 'Updated' in item:
            updated = item
        elif re.match(r'\d', item):
            # Anchored at the start so task labels that merely contain a
            # digit (e.g. "Text2Text Generation") are not taken for a count.
            counts.append(item)
        else:
            task = item

    if not counts:
        downloads, likes = None, None
    elif len(counts) == 1:
        downloads, likes = None, counts[0]
    else:
        downloads, likes = counts[0], counts[-1]

    return [task, updated, downloads, likes]


# NOTE(review): pages are requested as p=1..page_number; if the site's `p`
# query parameter is zero-based this skips the first page — confirm.
for page in range(1, page_number + 1):
    url = base_url + str(page) + "&sort=trending"

    try:
        # A timeout keeps one stalled request from blocking the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as error:
        print(f"Failed to retrieve page {page}: {error}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        continue

    html_soup = BeautifulSoup(response.text, 'html.parser')

    h4_tags = html_soup.find_all('h4', class_=MODEL_NAME_CLASS)
    model_detail = html_soup.find_all('div', class_=MODEL_DETAIL_CLASS)

    h4_texts = [tag.get_text() for tag in h4_tags]
    data = [
        _parse_model_detail(div.get_text(separator=' ', strip=True))
        for div in model_detail
    ]

    # Names and detail rows are paired by position, so a count mismatch means
    # the pairing cannot be trusted; skip the page rather than write wrong or
    # stale rows.
    if len(data) != len(h4_texts):
        print(
            f"Skipping page {page}: found {len(h4_texts)} model names "
            f"but {len(data)} detail rows"
        )
        continue

    # Map each model name to [task, update time, downloads, likes, API URL].
    resulting_dict = {
        name: row + [f"{prefix}{name}"]
        for name, row in zip(h4_texts, data)
    }

    # Explicit UTF-8 so the CSV matches the JSON output and does not depend on
    # the platform's default encoding.
    with open(f'LLMs_page_{page}.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Model', 'Task', 'Update Time', 'Download', 'Like', 'API URL'])
        for key, values in resulting_dict.items():
            writer.writerow([key] + values)

    with open(f'LLMs_page_{page}.json', 'w', encoding='utf-8') as file:
        json.dump(resulting_dict, file, ensure_ascii=False, indent=4)