In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import re

# Base URL of the organization listing; the page number is appended to it
base_url = "https://huggingface.co/organizations?p="

# Inclusive range of listing pages to scrape.
# NOTE: despite its name, num_pages is the number of the LAST page fetched,
# not a count of pages (the name is kept for backward compatibility).
first_page = 3193
num_pages = 3196

# Seconds to wait on the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Leading run of letters/hyphens in the type text, i.e. a single (possibly
# hyphenated) token such as "non-profit". Compiled once, outside the loops.
_ORG_TYPE_PATTERN = re.compile(r'^[a-zA-Z-]+')

# Accumulates one [organization id, organization name, organization type]
# row per organization card found.
data = []


def _extract_org_row(article):
    """Return ``[org_id, org_name, org_type]`` for one organization card.

    ``article`` is the ``<article>`` element of a single card. Any piece
    that cannot be located is replaced by a placeholder string; a missing
    or unparsable organization type falls back to ``"Enterprise"``.
    """
    # Organization ID: the link target with every "/" removed
    link = article.find("a", class_="flex w-max flex-1 items-center overflow-hidden p-3")
    # .get() avoids a KeyError when the anchor carries no href attribute
    href = link.get("href") if link is not None else None
    org_id = href.replace('/', '') if href is not None else "No href found"

    # Organization name: text of the card heading
    heading = article.find("h4", class_="truncate font-semibold md:text-lg")
    if heading is not None:
        org_name = heading.get_text(strip=True)
    else:
        org_name = "No organization name found"

    # Organization type: first letters/hyphens token of the subtitle line
    org_type = "Enterprise"
    subtitle = article.find("div", class_="truncate text-sm text-gray-400")
    if subtitle is not None:
        type_match = _ORG_TYPE_PATTERN.search(subtitle.get_text(strip=True))
        if type_match:
            org_type = type_match.group(0)

    return [org_id, org_name, org_type]


# A single session reuses the underlying connection across page requests.
with requests.Session() as session:
    for page in range(first_page, num_pages + 1):
        url = f"{base_url}{page}"
        response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on HTTP errors instead of parsing an error page and
        # silently collecting zero rows for it.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # NOTE(review): a class_ string containing spaces is matched against
        # the exact class attribute value, so any change in the site's class
        # list or ordering makes this return nothing - confirm the markup.
        organization_articles = soup.find_all(
            "article",
            class_="overview-card-wrapper group items-center sm:flex overflow-hidden",
        )

        data.extend(_extract_org_row(article) for article in organization_articles)

# Column titles for the output file, in the same order as each row of `data`
header_row = ["Organization ID", "Organization Name", "Organization Type"]

# Dump the header followed by every collected row into a UTF-8 CSV file.
# newline="" lets the csv module control line endings itself.
with open("organizations.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv.writer(csv_file).writerows([header_row, *data])

print("Data has been written to organizations.csv")


Data has been written to organizations.csv
