In [1]:
# Libraries
from huggingface_hub import HfApi, ModelCard
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
import pandas as pd

# Suppress the logging output for huggingface_hub
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

In [2]:
api = HfApi()

# Search for notebooks related to software engineering

## Search by 'search' function
### Software Engineering

In [3]:
# Search for models related to software engineering
software_engineering_models = api.list_models(search="software engineering",cardData=True, sort='downloads', direction=-1)

# Example for analyzing model cards: https://github.com/Weixin-Liang/AI-model-card-analysis-HuggingFace/blob/main/api_example.ipynb

for model in software_engineering_models:
    model_info = api.model_info(model.id)
    print(f"Model name: {model_info.id}")
    print(f"Downloads: {model_info.downloads}")
    print(f"Downloads all time: {model_info.downloads_all_time}")
    print(f"Model tags: {model_info.tags}", "\n")

Model name: justinlamlamlam/softwareengineering
Downloads: 2
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'text-classification', 'arxiv:1910.09700', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: burakkececi/bert-turkish-software-engineering
Downloads: 0
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'fill-mask', 'tr', 'license:mit', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: burakkececi/bert-software-engineering
Downloads: 0
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'fill-mask', 'en', 'license:mit', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 



### Software Development

In [4]:
software_development_models = api.list_models(search="software development",cardData=True, sort='downloads', direction=-1)

for model in software_development_models:
    model_info = api.model_info(model.id)
    print(f"Model name: {model_info.id}")
    print(f"Downloads: {model_info.downloads}")
    print(f"Downloads all time: {model_info.downloads_all_time}")
    print(f"Model tags: {model_info.tags}","\n")

Model name: Jayian/software-development
Downloads: 0
Downloads all time: None
Model tags: ['license:creativeml-openrail-m', 'region:us'] 



## Search by tags
### Software Engineering

In [5]:
software_engineering_models_tags = api.list_models(tags=["software engineering"], cardData=True, sort='downloads', direction=-1)


for model in software_engineering_models_tags:
    model_info = api.model_info(model.id)
    print(f"Model name: {model_info.id}")
    print(f"Downloads: {model_info.downloads}")
    print(f"Downloads all time: {model_info.downloads_all_time}")
    print(f"Model tags: {model_info.tags}", "\n")

Model name: taidng/wikiser-bert-base
Downloads: 15
Downloads all time: None
Model tags: ['transformers', 'pytorch', 'safetensors', 'bert', 'token-classification', 'software engineering', 'ner', 'named-entity recognition', 'en', 'dataset:wikiser', 'arxiv:2308.10564', 'license:apache-2.0', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: thearod5/nl-bert
Downloads: 12
Downloads all time: None
Model tags: ['transformers', 'safetensors', 'bert', 'text-classification', 'software engineering', 'software traceability', 'arxiv:2207.01084', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 

Model name: taidng/wikiser-bert-large
Downloads: 11
Downloads all time: None
Model tags: ['transformers', 'pytorch', 'safetensors', 'bert', 'token-classification', 'software engineering', 'ner', 'named-entity recognition', 'en', 'dataset:wikiser', 'arxiv:2308.10564', 'license:apache-2.0', 'autotrain_compatible', 'endpoints_compatible', 'region:us'] 



In [6]:
import pandas as pd
import re

df = pd.read_excel('./software_engineering_models.xlsx')

def extract_links_or_names(text):
    # Ensure the input is a string, otherwise return an empty string
    if isinstance(text, str):
        # Use regex to find full links starting with https://arxiv.org/abs/ or https://arxiv.org/pdf/
        links_or_names = re.findall(r'(https://arxiv.org/(abs|pdf)/\d+\.\d+(?:v\d+)?)', text)
        # Extract the full match from the tuple and return
        return ' '.join([link[0] for link in links_or_names])
    else:
        return pd.NA


# Apply the function to the 'paper_code' column
df['paper_code_clean'] = df['paper_code'].apply(extract_links_or_names)

df.to_excel('software_engineering_models_clean.xlsx', index=False)
