In [1]:
from huggingface_hub import HfApi, ModelCard
import json, re, time
import pandas as pd
import google.generativeai as genai
import owlready2
from rdflib import Graph, URIRef
from owlready2 import get_ontology, default_world


# Authentication
HFTOKEN = "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"
GTOKEN = "AIzaSyB8HMqIGvscURWPF75CwnZlXnFFsGh0Vlg"
onto = get_ontology("http://purl.obolibrary.org/obo/mcro.owl").load()
ONTOLOGY_BASE = "http://purl.obolibrary.org/obo/mcro.owl#"

def clean_identifier(text):
    """Clean text for URI safety"""
    if not text:
        return "unknown"
    return re.sub(r'[^a-zA-Z0-9-]', '', str(text).replace(' ', '-'))[:50]

def get_ontology_mapping(entity_key):
    # Check if class exists in ontology
    clean_key = ''.join(w.capitalize() for w in re.split('[^a-zA-Z0-9]', entity_key))
    ontology_class = onto.search_one(iri=f"*{clean_key}")
    
    if ontology_class:
        return {
            "class_uri": ontology_class.iri,
            "predicate_uri": f"{ontology_class.namespace.base_iri}has{clean_key}"
        }
    else:
        # Fallback if class not found
        return {
            "class_uri": f"mcro:Custom{clean_key}",
            "predicate_uri": f"mcro:has{clean_key}"
        }

def extract_hf_entities(text):
    """Ontology-aware entity extraction with enhanced prompt"""
    try:
        prompt = f"""
** STRICT JSON EXTRACTION INSTRUCTIONS **

Extract technical metadata from the model card below, following the ontology: 
{ONTOLOGY_BASE}

** REQUIRED FIELDS (use ontology terms when possible): **
- License (license)
- Architecture (architecture)
- Training datasets (trainingData) [array]
- Evaluation metrics (metrics) [array]
- Hardware used (hardware) [array]
- Libraries/frameworks (libraries) [array]
- Model type (modelType)
- Languages supported (languages) [array]
- Pre-trained model (pretrainedModel)
- Publication citations (citation) [array]

** OUTPUT SPECIFICATIONS: **
1. Use STRICT JSON format (NO markdown)
2. ALL keys must be in double quotes
3. Strings with special characters must use escaped quotes (\")
4. URLs MUST be complete (http://... not "http:")
5. Arrays must have proper comma separation
6. Use ontology URIs when possible (e.g., "{ONTOLOGY_BASE}Transformer")
7. Omit fields with no valid data

** EXAMPLE FORMAT: **
{{
  "license": "MIT",
  "architecture": "{ONTOLOGY_BASE}Transformer",
  "trainingData": [
    "http://purl.obolibrary.org/obo/mcro.owl#ImageNet-1k",
    "Wikipedia"
  ],
  "metrics": {{
    "accuracy": 0.95,
    "f1": 0.93
  }},
  "libraries": ["PyTorch", "HuggingFace"]
}}

** MODEL CARD TEXT: **
{text[:10000]}
"""
        
        genai.configure(api_key=GTOKEN)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        json_str = response.text.strip()
        json_str = re.sub(r'^```json\s*|```.*$', '', json_str, flags=re.IGNORECASE)
        return json.loads(json_str)
    
    except Exception as e:
        print(f"Extraction error: {str(e)}")
        return {}

def generate_hf_triples(model_data):
    triples = []
    model_uri = f"{clean_identifier(model_data['id'])}"
    
    # Base type assertion using ontology
    triples.append({
        "s": model_uri,
        "p": "rdf:type",
        "o": f"{ONTOLOGY_BASE}Model"
    })

    if "entities" in model_data:
        for key, value in model_data["entities"].items():
            if not value or key.lower() in ['id', 'model']:
                continue

            mapping = get_ontology_mapping(key)
            values = value if isinstance(value, list) else [value]

            for val in values:
                if pd.isna(val) or str(val).lower() in ['none', 'null', 'nan']:
                    continue

                entity_uri = f"{model_uri}-{clean_identifier(key)}-{clean_identifier(str(val))}"
                
                triples.extend([
                    {"s": model_uri, "p": mapping['predicate_uri'], "o": entity_uri},
                    {"s": entity_uri, "p": "rdf:type", "o": mapping['class_uri']},
                    {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(val)},
                    {"s": mapping['class_uri'], "p": "rdfs:subClassOf", 
                     "o": f"{ONTOLOGY_BASE}Component"}
                ])
    
    return triples

def process_huggingface_models(limit=20):
    api = HfApi(token=HFTOKEN)
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model in enumerate(models):
        try:
            card = ModelCard.load(model.modelId, token=HFTOKEN)
            entities = extract_hf_entities(card.text)
            
            if not entities:
                print(f"Skipping {model.modelId} - no entities found")
                continue

            model_data = {"id": model.modelId, "entities": entities}
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)

            if idx % 5 == 0:
                print(f"Processed {idx+1}/{len(models)}: {model.modelId}")

        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")

    with open("ontology_aligned_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)

    print("\n=== STATISTICS ===")
    print(f"Total models processed: {len(models)}")
    print(f"Total triples generated: {len(all_triples)}")
    
    return all_triples

if __name__ == "__main__":
    print("=== ONTOLOGY-ALIGNED TRIPLE EXTRACTION STARTED ===")
    triples = process_huggingface_models(limit=50)
    print("\n=== COMPLETED ===")
    print(f"Final triple count: {len(triples)}")



=== ONTOLOGY-ALIGNED TRIPLE EXTRACTION STARTED ===
Processed 1/50: timm/mobilenetv3_small_100.lamb_in1k
Processed 6/50: openai/clip-vit-large-patch14
Processed 11/50: facebook/esmfold_v1
Processed 16/50: timm/resnet50.a1_in1k
Processed 21/50: distilbert/distilbert-base-uncased
Processed 26/50: CIDAS/clipseg-rd64-refined
Processed 31/50: facebook/opt-125m
Processed 36/50: google/siglip-so400m-patch14-384
Processed 41/50: jonatasgrosman/wav2vec2-large-xlsr-53-japanese
Processed 46/50: google-t5/t5-base

=== STATISTICS ===
Total models processed: 50
Total triples generated: 2614

=== COMPLETED ===
Final triple count: 2614
