In [25]:
# Libraries
import logging
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
from huggingface_hub import HfApi, ModelCard

# Raise the huggingface_hub logger threshold so only errors are emitted
hub_logger = logging.getLogger("huggingface_hub")
hub_logger.setLevel(logging.ERROR)

In [26]:


# Hub API client shared by the query cells below
api = HfApi()

# Search for models related to software engineering

## Search by 'search' function
### Software Engineering

In [27]:
# Free-text Hub search for "software engineering", sorted by downloads (descending)
software_engineering_models = api.list_models(
    search="software engineering",
    cardData=True,
    sort='downloads',
    direction=-1,
)

# Example for analyzing model cards: https://github.com/Weixin-Liang/AI-model-card-analysis-HuggingFace/blob/main/api_example.ipynb

# Look up each hit with model_info and print its id, download counts and tags
for model in software_engineering_models:
    model_info = api.model_info(model.id)
    print(
        f"Model name: {model_info.id}",
        f"Downloads: {model_info.downloads}",
        f"Downloads all time: {model_info.downloads_all_time}",
        sep="\n",
    )
    # The trailing "\n" argument leaves a blank line between models
    print(f"Model tags: {model_info.tags}", "\n")

Model name: justinlamlamlam/softwareengineering
Downloads: 6
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'text-classification', 'arxiv:1910.09700', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: burakkececi/bert-turkish-software-engineering
Downloads: 4
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'fill-mask', 'tr', 'license:mit', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: burakkececi/bert-software-engineering
Downloads: 2
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'fill-mask', 'en', 'license:mit', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 



### Software Development

In [28]:
# Free-text Hub search for "software development", sorted by downloads (descending)
software_development_models = api.list_models(
    search="software development",
    cardData=True,
    sort='downloads',
    direction=-1,
)

# Look up each hit with model_info and print its id, download counts and tags
for model in software_development_models:
    model_info = api.model_info(model.id)
    print(
        f"Model name: {model_info.id}",
        f"Downloads: {model_info.downloads}",
        f"Downloads all time: {model_info.downloads_all_time}",
        sep="\n",
    )
    # The trailing "\n" argument leaves a blank line between models
    print(f"Model tags: {model_info.tags}","\n")

Model name: Jayian/software-development
Downloads: 0
Downloads all time: None
Model tags: ['license:creativeml-openrail-m', 'region:us'] 



## Search by tags
### Software Engineering

In [29]:
# Hub query filtered on the "software engineering" tag, sorted by downloads (descending)
software_engineering_models_tags = api.list_models(
    tags=["software engineering"],
    cardData=True,
    sort='downloads',
    direction=-1,
)

# Look up each hit with model_info and print its id, download counts and tags
for model in software_engineering_models_tags:
    model_info = api.model_info(model.id)
    print(
        f"Model name: {model_info.id}",
        f"Downloads: {model_info.downloads}",
        f"Downloads all time: {model_info.downloads_all_time}",
        sep="\n",
    )
    # The trailing "\n" argument leaves a blank line between models
    print(f"Model tags: {model_info.tags}", "\n")

Model name: taidng/wikiser-bert-large
Downloads: 21
Downloads all time: None
Model tags: ['transformers', 'pytorch', 'safetensors', 'bert', 'token-classification', 'software engineering', 'ner', 'named-entity recognition', 'en', 'dataset:wikiser', 'arxiv:2308.10564', 'license:apache-2.0', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: taidng/wikiser-bert-base
Downloads: 7
Downloads all time: None
Model tags: ['transformers', 'pytorch', 'safetensors', 'bert', 'token-classification', 'software engineering', 'ner', 'named-entity recognition', 'en', 'dataset:wikiser', 'arxiv:2308.10564', 'license:apache-2.0', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: thearod5/nl-bert
Downloads: 4
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'text-classification', 'software engineering', 'software traceability', 'arxiv:2207.01084', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 



## Search by cards

In [None]:
# Request at most MODEL_LIMIT models with card data, sorted by downloads (descending)
MODEL_LIMIT = 10000
models = api.list_models(
    cardData=True,
    sort='downloads',
    direction=-1,
    limit=MODEL_LIMIT,
)

# Keywords looked for in each model card (order is significant to the matcher)
words = [
    "software engineering",
    "software development",
    "software",
    "engineering",
    "development",
    "programming",
    "developer",
    "software developer",
    "software engineer",
    "web development",
    "web developer",
    "web",
    "web design",
    "web designer",
    "code",
    "coding",
    "MLOps",
    "DevOps",
]

# Accumulates one record per matching model; turned into a DataFrame later
data = []

# Keyword scan of a single model's card
def process_model(model, keywords=None):
    """Return a match record when the model's card mentions a relevant keyword.

    Matching is case-insensitive and anchored at the start of a word, so
    "Software Engineers" matches "software engineer" while "decoder" does not
    match "code". Longer keywords are tried first so that a specific phrase
    ("software engineer") is reported in preference to a generic word that
    it contains ("software").

    Args:
        model: Object exposing ``id`` and ``downloads`` attributes, such as
            the items yielded by ``api.list_models`` in this notebook.
        keywords: Optional iterable of keyword strings. Defaults to the
            notebook-level ``words`` list.

    Returns:
        A dict with the keys "Model Name", "Matching Word" and "Downloads"
        for the most specific keyword found, or None when the model has 0
        downloads, its card cannot be loaded, or no keyword occurs in it.
    """
    try:
        # Skip models with 0 downloads
        if model.downloads == 0:
            return None

        # Load the model card text
        card_text = ModelCard.load(model.id).content
    except Exception as exc:
        # Best effort: a model whose card cannot be loaded is left out of the
        # results instead of aborting the whole scan; the reason is kept at
        # debug level so it can still be inspected.
        logging.getLogger(__name__).debug(
            "Skipping %s: %s", getattr(model, "id", model), exc
        )
        return None

    if not card_text:
        return None

    candidates = words if keywords is None else keywords

    # Longest keyword first: otherwise "software" would always shadow
    # "software developer", "web" would shadow "web design", and so on.
    for word in sorted(candidates, key=len, reverse=True):
        # (?<!\w) requires the keyword to start a word ("decoder" != "code")
        # while still allowing suffixes such as plurals ("developers").
        if re.search(r"(?<!\w)" + re.escape(word), card_text, flags=re.IGNORECASE):
            return {
                "Model Name": model.id,
                "Matching Word": word,
                "Downloads": model.downloads
            }

    return None

# Run process_model over all models on a pool of 12 worker threads.
# executor.map yields results in submission order, so the rows follow the
# order of `models` (requested sorted by downloads, descending) instead of
# the order in which threads happen to finish; this keeps the exported
# file's row order stable across runs.
with ThreadPoolExecutor(max_workers=12) as executor:
    # process_model returns None for models that should be left out
    data.extend(result for result in executor.map(process_model, models) if result)

# Create a pandas DataFrame from the extracted data
df = pd.DataFrame(data, columns=["Model Name", "Matching Word", "Downloads"])

# Save DataFrame to Excel (no index column)
output_file = "software_engineering_models.xlsx"
df.to_excel(output_file, index=False)

print(f"Data saved to {output_file}")