In [None]:
import requests
import pandas as pd
import csv

# Endpoint that lists models on the Hugging Face Hub
BASE_URL = "https://huggingface.co/api/models"

# Lookup tables read from the Dependencies folder:
#   tag_list.csv      -> the "Model Tags" column, kept as a plain Python list
#   organizations.csv -> one row per organization, kept as a DataFrame
tag_df = pd.read_csv("Dependencies//tag_list.csv")
tag_list = tag_df["Model Tags"].tolist()
org_df = pd.read_csv("Dependencies//organizations.csv")

def _build_model_record(model, known_tags, org_lookup):
    """Turn one model entry from the API into the flat record kept per model.

    Args:
        model: Dict for a single model as returned by the models endpoint;
            must contain 'modelId', may contain 'tags'.
        known_tags: Set of tags that count as a "link type".
        org_lookup: Dict mapping organization id -> (name, type).

    Returns:
        Dict with keys 'model_link', 'arxiv_tags', 'number_of_papers',
        'link_type', 'org_id', 'org_name' and 'org_type'.
    """
    model_id = model['modelId']
    # Text before the first '/' (the whole id when there is no '/')
    org_id = model_id.split('/')[0]

    # A missing or null 'tags' field is treated as "no tags"
    tags = model.get('tags') or []
    arxiv_tags = [tag for tag in tags if 'arxiv' in tag.lower()]
    matched_tags = [tag for tag in tags if tag in known_tags]

    # Organizations absent from the lookup table are reported as 'NA'
    org_name, org_type = org_lookup.get(org_id, ('NA', 'NA'))

    return {
        'model_link': f"https://huggingface.co/{model_id}",
        'arxiv_tags': arxiv_tags,
        'number_of_papers': len(arxiv_tags),
        'link_type': ', '.join(matched_tags) if matched_tags else 'None',
        'org_id': org_id,
        'org_name': org_name,
        'org_type': org_type,
    }


def get_model_links_and_arxiv(limit):
    """Collect link, arXiv-tag and organization info for up to `limit` models.

    Requests the models endpoint at BASE_URL 100 entries at a time and stops
    when `limit` models have been recorded, when the API returns an empty
    page, or when a request fails (network error or non-200 status). In the
    failure cases a message is printed and the records gathered so far are
    returned.

    Uses the module-level `tag_list` (tags that count as a link type) and
    `org_df` (columns 'Organization ID', 'Organization Name',
    'Organization Type').

    Args:
        limit: Maximum number of models to record.

    Returns:
        List of dicts, one per model, with keys 'model_link', 'arxiv_tags'
        (list of tags containing "arxiv"), 'number_of_papers', 'link_type'
        (comma-joined matching tags or 'None'), 'org_id', 'org_name' and
        'org_type' ('NA' when the organization is not in `org_df`).
    """
    page_size = 100
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    # Build the lookups once instead of scanning tag_list / org_df per model.
    known_tags = set(tag_list)
    org_lookup = {}
    org_rows = zip(
        org_df['Organization ID'],
        org_df['Organization Name'],
        org_df['Organization Type'],
    )
    for known_org_id, known_org_name, known_org_type in org_rows:
        # setdefault keeps the first row for a duplicated organization id
        org_lookup.setdefault(known_org_id, (known_org_name, known_org_type))

    model_data_list = []
    page = 0
    total_fetched = 0
    url = f"{BASE_URL}?limit={page_size}&offset=0"

    while total_fetched < limit:
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as exc:
            # Keep what was collected so far, same as for a bad status code
            print(f"Error: Unable to fetch models ({exc})")
            break

        if response.status_code != 200:
            print(f"Error: Unable to fetch models (status code: {response.status_code})")
            break

        models_data = response.json()
        if not models_data:
            # Stop if there are no more models returned
            print(f"No more models to fetch. Total fetched: {total_fetched}")
            break

        for model in models_data:
            if total_fetched >= limit:
                break

            model_data_list.append(
                _build_model_record(model, known_tags, org_lookup)
            )

            total_fetched += 1
            # Print progress after every 100000 models fetched
            if total_fetched % 100000 == 0:
                print(f"Progress: {total_fetched} models processed")

        # Move to the next page: follow the rel="next" Link header when the
        # server sends one, otherwise fall back to the offset-based URL.
        page += 1
        next_url = response.links.get('next', {}).get('url')
        url = next_url or f"{BASE_URL}?limit={page_size}&offset={page * page_size}"

    return model_data_list

# Upper bound on how many models to collect (1.5 million)
limit = 1_500_000

# Run the crawl; each entry describes one model (link, arXiv tags, org info)
model_data = get_model_links_and_arxiv(limit=limit)

# Save the data to a CSV file with the additional columns for organization name and type
csv_file = "model_links_arxiv_link_type_org_info.csv"
# Write UTF-8 explicitly: organization names and tags may contain non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
# newline='' lets the csv module control line endings itself.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(['Model Link', 'ArXiv Tags', 'Number of Papers', 'Link Type', 'Org ID', 'Organization Name', 'Organization Type'])

    # Write the data rows; the arXiv tag list is flattened to a comma-separated
    # string, with 'None' standing in for an empty list
    writer.writerows(
        [
            data['model_link'],
            ', '.join(data['arxiv_tags']) if data['arxiv_tags'] else 'None',
            data['number_of_papers'],
            data['link_type'],
            data['org_id'],
            data['org_name'],
            data['org_type'],
        ]
        for data in model_data
    )

print(f"Data saved to {csv_file}")


Progress: 100000 models processed
Progress: 200000 models processed
Progress: 300000 models processed
Progress: 400000 models processed
Progress: 500000 models processed
Progress: 600000 models processed
Progress: 700000 models processed
Progress: 800000 models processed
Progress: 900000 models processed
Progress: 1000000 models processed
