In [38]:
import json
import os
import re

import google.generativeai as genai
from huggingface_hub import HfApi, ModelCard
from rdflib import Graph, Namespace, URIRef, Literal, RDF

# Configuration
# Credentials are read from the environment so that no secret is committed
# with the source. NOTE(review): the keys that used to be hard-coded here must
# be treated as leaked and revoked with the respective providers.
# If a variable is unset, None is passed through to the client libraries;
# presumably they then fall back to their own credential discovery -- confirm.
GTOKEN = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
HFTOKEN = os.environ.get("HF_TOKEN")
MCRO_TTL_PATH = "mcro.ttl"  # Turtle file uploaded to Gemini as the ontology
OUTPUT_JSON_PATH = "extracted_triples.json"  # where the collected triples are written

# Initialize Gemini
genai.configure(api_key=GTOKEN)
gemini_model = genai.GenerativeModel('gemini-2.0-flash')

def clean_identifier(text):
    """Turn any value into an identifier made only of ASCII letters and digits.

    The value is converted with ``str()``, spaces and every other character
    outside ``a-z``, ``A-Z`` and ``0-9`` are dropped, and the result is cut to
    at most 50 characters.
    """
    raw = str(text)
    without_spaces = raw.replace(" ", "")
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', without_spaces)
    return alnum_only[:50]

def upload_mcro_ontology():
    """Upload the ontology file at ``MCRO_TTL_PATH`` to Gemini.

    Returns:
        The object returned by ``genai.upload_file``.

    Raises:
        Exception: any error raised during the upload is reported on stdout
            and then re-raised unchanged.
    """
    try:
        print("Uploading MCRO ontology...")
        uploaded = genai.upload_file(path=MCRO_TTL_PATH)
        print(f"Ontology uploaded: {uploaded.name}")
    except Exception as upload_error:
        print(f"File upload failed: {upload_error}")
        raise
    else:
        return uploaded

def _extract_json_payload(raw_text):
    """Return the JSON portion of a model reply, without any Markdown fence."""
    stripped = raw_text.strip()
    # Accept ```json ... ```, a bare ``` ... ``` fence, and a fence that was
    # never closed (e.g. a truncated reply); otherwise use the text as-is.
    fenced = re.search(
        r"```(?:json)?\s*(.*?)(?:```|\Z)",
        stripped,
        re.DOTALL | re.IGNORECASE,
    )
    return fenced.group(1).strip() if fenced else stripped


def get_mapped_triples(model_card_text, mcro_file, model_id):
    """Ask Gemini to map one model card onto MCRO triples.

    Args:
        model_card_text: Text of the Hugging Face model card to analyse.
        mcro_file: Uploaded ontology file handle; its ``uri`` is quoted in the
            prompt and the handle itself is attached to the request.
        model_id: Hugging Face model id, used to derive the subject CURIE.

    Returns:
        The parsed JSON array of triples as a list, or an empty list when the
        request fails, the reply is not valid JSON, or the reply is not a
        JSON array.
    """

    prompt = f"""Using the attached MCRO ontology file ({mcro_file.uri}), analyze this Hugging Face model card and return:

1. All metadata fields (like license, description, tags, dataset, etc.)
2. Map each to appropriate MCRO ontology concepts using exact CURIE syntax
   - Example CURIE: mcro:HasLicense
3. Return ONLY a JSON array of triples in this format:
[
  {{
    "s": "mcro:{clean_identifier(model_id)}",
    "p": "rdf:type",
    "o": "mcro:Model"
  }},
  {{
    "s": "mcro:{clean_identifier(model_id)}",
    "p": "mcro:HasLicense",
    "o": "mcro:{clean_identifier(model_id)}-License"
  }},
  {{
    "s": "mcro:{clean_identifier(model_id)}-License",
    "p": "rdf:type",
    "o": "mcro:License"
  }},
  {{
    "s": "mcro:{clean_identifier(model_id)}-License",
    "p": "prov:hasTextValue",
    "o": "mit"
  }}
]
Important Rules:
- Only use terms from the ontology
- Use CURIE format (prefix:localname)
- Always link back to base namespace: http://sbmi.uth.edu/ontology/mcro#
- For literal values, use prov:hasTextValue
- No explanation or markdown
- Keep all responses strictly within JSON format"""

    # The prompt only *refers* to the model card, so the card itself has to be
    # sent as its own content part; without it the model has nothing to map.
    card_part = f"Model card for {model_id}:\n\n{model_card_text}"

    try:
        response = gemini_model.generate_content(
            contents=[prompt, mcro_file, card_part],
            request_options={"timeout": 60}
        )
        triples = json.loads(_extract_json_payload(response.text))
    except Exception as e:
        # Deliberate best effort: one failing model must not stop the batch.
        print(f"Gemini error: {e}")
        return []

    # Callers extend a list with the result, so anything but a JSON array
    # (e.g. a single object) would silently corrupt the collected triples.
    if not isinstance(triples, list):
        print(f"Gemini error: expected a JSON array of triples, got {type(triples).__name__}")
        return []

    return triples

def process_huggingface_models(limit=10):
    """Extract MCRO triples for the most-downloaded Hugging Face models.

    Uploads the ontology once, loads the model card of each listed model,
    asks Gemini for triples, and writes everything collected to
    ``OUTPUT_JSON_PATH`` as JSON.

    Args:
        limit: Maximum number of models requested from the Hub listing.

    Returns:
        The combined list of triples from every model that yielded any.

    Raises:
        Exception: propagated from ``upload_mcro_ontology`` or the Hub
            listing; failures on an individual model are only reported.
    """
    mcro_file = upload_mcro_ontology()
    api = HfApi(token=HFTOKEN)
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model_info in enumerate(models, start=1):
        try:
            card = ModelCard.load(model_info.id, token=HFTOKEN)
            # Use the full card (YAML front matter + body): the license, tags
            # and datasets the prompt asks for live in the front matter,
            # which card.text leaves out.
            triples = get_mapped_triples(card.content, mcro_file, model_info.id)
        except Exception as e:
            # Best effort per model: report and move on to the next one.
            print(f"Error processing {model_info.id}: {str(e)}")
            continue

        if not triples:
            print(f"No triples returned for {model_info.id}")
            continue

        all_triples.extend(triples)
        print(f"Processed {idx}/{len(models)}: {model_info.id}")
        print(f"Generated {len(triples)} triples")

    # Save results
    with open(OUTPUT_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(all_triples, f, indent=2)

    print(f"\n✅ Saved {len(all_triples)} triples to {OUTPUT_JSON_PATH}")
    return all_triples

def main():
    """Run the extraction pipeline for ten models, with start/end banners."""
    print("=== ONTOLOGY-AWARE TRIPLE GENERATION STARTED ===")
    process_huggingface_models(limit=10)
    print("\n=== COMPLETED ===")


# Only run the pipeline when executed directly, not when imported.
if __name__ == "__main__":
    main()

=== ONTOLOGY-AWARE TRIPLE GENERATION STARTED ===
Uploading MCRO ontology...
Ontology uploaded: files/qjwrxu4xzdae
Gemini error: 500 Internal error encountered.
No triples returned for timm/mobilenetv3_small_100.lamb_in1k
Processed 2/10: sentence-transformers/all-MiniLM-L6-v2
Generated 14 triples
Gemini error: 500 Internal error encountered.
No triples returned for Falconsai/nsfw_image_detection
Processed 4/10: dima806/fairface_age_image_detection
Generated 18 triples
Processed 5/10: google-bert/bert-base-uncased
Generated 10 triples
Processed 6/10: openai/clip-vit-large-patch14
Generated 20 triples
Gemini error: 500 Internal error encountered.
No triples returned for TheBloke/phi-2-GGUF
Processed 8/10: facebook/esmfold_v1
Generated 10 triples
Processed 9/10: amazon/chronos-t5-small
Generated 13 triples
Processed 10/10: Bingsu/adetailer
Generated 13 triples

✅ Saved 98 triples to extracted_triples.json

=== COMPLETED ===
