In [1]:
from huggingface_hub import HfApi, ModelCard
import json, string, time
from datetime import datetime
import re
import traceback
import google.generativeai as genai
import pandas as pd

In [2]:
# Authentication
HFTOKEN = "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"
GTOKEN = "AIzaSyB8HMqIGvscURWPF75CwnZlXnFFsGh0Vlg"

In [3]:
# Helper functions
def clean_identifier(text):
    """Clean text for URI safety"""
    if not text:
        return "unknown"
    return re.sub(r'[^a-zA-Z0-9-]', '', str(text).replace(' ', '-'))[:50]

def get_dynamic_mapping(entity_key):
    """Generate predicate and class URIs dynamically"""
    clean_key = ''.join([w.capitalize() for w in re.split('[^a-zA-Z0-9]', entity_key)])
    return {
        "class_uri": f"{clean_key}",
        "predicate_uri": f"has{clean_key}"
    }

In [4]:
def extract_hf_entities(text):
    """Open-ended entity extraction using LLMs"""
    try:
        prompt = f"""Extract ALL technical metadata from this model card as JSON.
Include: architectures, licenses, datasets, metrics, hardware, training details,
evaluation scores, libraries used, and any other technical specifications.
Use simple key-value pairs. Example:
{{
  "license": "MIT",
  "architecture": "Transformer",
  "trainingData": ["Common Crawl", "Wikipedia"]
}}

Model card text:
{text[:10000]}"""
        genai.configure(api_key=GTOKEN)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        json_str = response.text.replace("```json", "").replace("```", "").strip()
        return json.loads(json_str)
    
    except Exception as e:
        print(f"Extraction error: {str(e)}")
        return {}

In [5]:
def generate_hf_triples(model_data):
    triples = []
    model_uri = f"{clean_identifier(model_data['id'])}"
    
    # Base type assertion
    triples.append({"s": model_uri, "p": "rdf:type", "o": "Model"})

    if "entities" in model_data:
        for key, value in model_data["entities"].items():
            if not value or key.lower() in ['id', 'model']:
                continue

            # Get dynamic ontology mapping
            mapping = get_dynamic_mapping(key)
            values = value if isinstance(value, list) else [value]

            for val in values:
                if pd.isna(val) or str(val).lower() in ['none', 'null', 'nan']:
                    continue

                # Create unique entity URI
                entity_uri = f"{model_uri}-{clean_identifier(key)}-{clean_identifier(str(val))}"
                
                # Add triples
                triples.extend([
                    {"s": model_uri, "p": mapping['predicate_uri'], "o": entity_uri},
                    {"s": entity_uri, "p": "rdf:type", "o": mapping['class_uri']},
                    {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(val)},
                    {"s": mapping['class_uri'], "p": "rdfs:subClassOf", "o": "Component"}
                ])
    
    return triples

In [6]:
def process_huggingface_models(limit=20):
    api = HfApi(token=HFTOKEN)
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model in enumerate(models):
        try:
            # Get model card
            card = ModelCard.load(model.modelId, token=HFTOKEN)
            entities = extract_hf_entities(card.text)
            
            if not entities:
                print(f"Skipping {model.modelId} - no entities found")
                continue

            # Generate triples
            model_data = {"id": model.modelId, "entities": entities}
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)

            # Progress reporting
            if idx % 5 == 0:
                print(f"Processed {idx+1}/{len(models)}: {model.modelId}")
                print(f"Generated {len(triples)} triples from {len(entities)} entities")

        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")

    # Save results
    with open("dynamic_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)

    # Print statistics
    print("\n=== STATISTICS ===")
    print(f"Total models processed: {len(models)}")
    print(f"Total triples generated: {len(all_triples)}")
    
    return all_triples

In [7]:
if __name__ == "__main__":
    print("=== DYNAMIC TRIPLE EXTRACTION STARTED ===")
    triples = process_huggingface_models(limit=50)
    print("\n=== COMPLETED ===")
    print(f"Final triple count: {len(triples)}")

=== DYNAMIC TRIPLE EXTRACTION STARTED ===
Processed 1/50: timm/mobilenetv3_small_100.lamb_in1k
Generated 69 triples from 15 entities
Processed 6/50: google-bert/bert-base-uncased
Generated 89 triples from 11 entities
Processed 11/50: Bingsu/adetailer
Generated 73 triples from 6 entities
Processed 16/50: pyannote/wespeaker-voxceleb-resnet34-LM
Generated 53 triples from 13 entities
Processed 21/50: distilbert/distilbert-base-uncased
Generated 105 triples from 18 entities
Extraction error: Extra data: line 4 column 1 (char 5)
Skipping unslothai/1 - no entities found
Processed 31/50: kresnik/wav2vec2-large-xlsr-korean
Generated 57 triples from 11 entities
Processed 36/50: google/vit-base-patch16-224
Generated 77 triples from 17 entities
Processed 41/50: google-t5/t5-base
Generated 153 triples from 13 entities
Extraction error: Expecting value: line 14 column 6 (char 1332)
Skipping bartowski/DeepSeek-R1-Distill-Qwen-14B-GGUF - no entities found
Processed 46/50: facebook/contriever-msmarco
G