In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import importlib

import Python_scripts.check_404_error as check_404_error
importlib.reload(check_404_error)

# Setting up Chrome
# One module-level WebDriver is created here; collect_model_links() reads this
# `driver` global and the script calls driver.quit() once scraping is done.
chrome_options = Options()
# Run Chrome without a visible window; remove this argument to watch the
# browser while debugging.
chrome_options.add_argument("--headless")  # Turn Off to debug
# Relative path, so chromedriver is looked up from the current working directory.
service = Service(executable_path='Dependencies//chromedriver.exe') 
driver = webdriver.Chrome(service=service, options=chrome_options)

# Maps each Hugging Face model-listing URL (one per `pipeline_tag`, sorted by
# trending) to the label that is stored next to every link collected from it.
# collect_model_links() appends "&p=<page>" to these URLs, so each one must
# already carry a query string.
base_urls_tags = {
    "https://huggingface.co/models?pipeline_tag=text-classification&sort=trending": "Text Classification",
    "https://huggingface.co/models?pipeline_tag=token-classification&sort=trending": "Token Classification",
    "https://huggingface.co/models?pipeline_tag=table-question-answering&sort=trending": "Table Question Answering",
    "https://huggingface.co/models?pipeline_tag=question-answering&sort=trending": "Question Answering",    
    "https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=trending": "Zero-Shot Classification",
    "https://huggingface.co/models?pipeline_tag=translation&sort=trending": "Translation",
    "https://huggingface.co/models?pipeline_tag=summarization&sort=trending": "Summarization",
    "https://huggingface.co/models?pipeline_tag=feature-extraction&sort=trending": "Feature Extraction", 
    "https://huggingface.co/models?pipeline_tag=text-generation&sort=trending": "Text Generation",
    "https://huggingface.co/models?pipeline_tag=text2text-generation&sort=trending": "Text2Text Generation",
    "https://huggingface.co/models?pipeline_tag=fill-mask&sort=trending": "Fill-Mask",
    "https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=trending": "Sentence Similarity"
}

def collect_model_links(base_url, tag, timeout=2):
    """Collect the unique model links from every result page of a listing.

    Loads ``base_url`` with the module-level ``driver``, reads the page count
    from the pagination entry just before the "Next" link, then visits
    ``f"{base_url}&p={page}"`` for each page and gathers the ``href`` of every
    anchor inside an ``article.overview-card-wrapper`` element.

    Args:
        base_url: Listing URL. ``&p=<page>`` is appended to it, so it must
            already contain a query string.
        tag: Label stored next to every link found under ``base_url``.
        timeout: Seconds to wait for the pagination control and for the model
            cards on each page. Defaults to the 2 seconds previously
            hard-coded.

    Returns:
        A list of ``(link, tag)`` tuples in discovery order with no duplicate
        links. If an unexpected error occurs it is printed and the links
        gathered so far are returned.
    """
    # Local import keeps this function self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    model_links = []
    unique_links = set()
    skipped_pages = 0

    try:
        driver.get(base_url)
        print(f"Collecting links from {base_url}...")

        # The last page number is the pagination entry right before "Next".
        try:
            last_page_link = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, "//li[a[contains(text(),'Next')]]/preceding-sibling::li[1]/a"))
            )
        except TimeoutException:
            # No pagination control appeared - presumably the listing fits on
            # a single page. Scrape that page instead of dropping the listing.
            num_pages = 1
        else:
            num_pages = int(last_page_link.text.replace(',', ''))
        print(f"Number of pages: {num_pages}")

        # Pages are addressed with a zero-based "p" query parameter.
        for page in range(num_pages):
            driver.get(f"{base_url}&p={page}")

            try:
                model_elements = WebDriverWait(driver, timeout).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.overview-card-wrapper a"))
                )
            except WebDriverException:
                # No model cards within the timeout (e.g. an error page), or
                # the browser reported a failure: skip this page, keep going.
                # Catching only WebDriver errors lets Ctrl+C stop a long crawl.
                skipped_pages += 1
                continue

            # Extracting href attributes
            for element in model_elements:
                link = element.get_attribute('href')
                # Anchors without an href yield None; do not record those.
                if link and link not in unique_links:
                    model_links.append((link, tag))
                    unique_links.add(link)

        if skipped_pages:
            print(f"Skipped {skipped_pages} page(s) of {base_url} that did not load.")
        print(f"Found {len(model_links)} links from {base_url}.")

    except Exception as e:
        # Best effort: report the failure and return what was collected.
        print(f"An error occurred: {e}")

    return model_links

# Accumulates the (link, tag) pairs gathered from every listing URL.
all_model_links = []

try:
    for url, tag in base_urls_tags.items():
        model_links = collect_model_links(url, tag)
        all_model_links += model_links
except Exception as e:
    print("An error occurred:", e, sep="\n")
finally:
    # Shut the browser down whether or not the crawl succeeded.
    driver.quit()

# Tabulate the collected pairs, show them, and save them as CSV.
df = pd.DataFrame(data=all_model_links, columns=['Model Link', 'Tag'])
print(df)
df.to_csv(path_or_buf='Outputs//model_links-NLP.csv', index=False)


Collecting links from https://huggingface.co/models?pipeline_tag=text-classification&sort=trending...
Number of pages: 2235
Found 65156 links from https://huggingface.co/models?pipeline_tag=text-classification&sort=trending.
Collecting links from https://huggingface.co/models?pipeline_tag=token-classification&sort=trending...
Number of pages: 642
Found 18026 links from https://huggingface.co/models?pipeline_tag=token-classification&sort=trending.
Collecting links from https://huggingface.co/models?pipeline_tag=table-question-answering&sort=trending...
Number of pages: 4
Found 102 links from https://huggingface.co/models?pipeline_tag=table-question-answering&sort=trending.
Collecting links from https://huggingface.co/models?pipeline_tag=question-answering&sort=trending...
Number of pages: 402
Found 11060 links from https://huggingface.co/models?pipeline_tag=question-answering&sort=trending.
Collecting links from https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=tr