In [5]:
from huggingface_hub import HfApi, ModelCard
import json
import re
import google.generativeai as genai
from rdflib import Graph, Namespace, URIRef, Literal, RDF

# Configuration
GTOKEN = "AIzaSyCLwWkDW03zjzVKUQf3ui5wgcreVJdsMbw"
HFTOKEN = "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"
MCRO_TTL_PATH = "mcro.ttl"
OUTPUT_JSON_PATH = "extracted_triples.json"

# Initialize Gemini
genai.configure(api_key=GTOKEN)
gemini_model = genai.GenerativeModel('gemini-2.0-flash')

def clean_identifier(text):
    """Generate safe identifier"""
    return re.sub(r'[^a-zA-Z0-9]', '', str(text).replace(" ", ""))[:50]

def upload_mcro_ontology():
    """Upload .ttl ontology to Gemini"""
    try:
        print("Uploading MCRO ontology...")
        mcro_file = genai.upload_file(path=MCRO_TTL_PATH)
        print(f"Ontology uploaded: {mcro_file.name}")
        return mcro_file
    except Exception as e:
        print(f"File upload failed: {e}")
        raise

def get_mapped_triples(model_card_text, mcro_file, model_id):
    prompt = f"""Using the attached Model Card Ontology (MCRO) file ({mcro_file.uri}), analyze this Hugging Face model card text and return only RDF triples in JSON format. Follow these strict rules:

### 🎯 Rules for Mapping
1. Only use terms defined in the MCRO ontology.
2. Always map metadata fields to appropriate MCRO concepts **by their CURIEs**, such as:
   - license → mcro:LicenseInformationSection
   - dataset → mcro:DatasetInformationSection
   - model architecture → mcro:ModelArchitectureInformationSection
   - citation → mcro:CitationInformationSection
   - intended use case → mcro:UseCaseInformationSection
3. Use proper relationships:
   - `rdf:type` for types
   - `prov:hasTextValue` for textual values (like "mit", "CNN", "ImageNet")
   - Appropriate `mcro:hasX` properties for linking model to its sections
4. Never assign `rdf:type` to abstract IAO classes like `obo:IAO_*`.
5. Never directly type instances with `obo:MCRO_0000004`, `obo:MCRO_0000016`, etc. — always use CURIEs like `mcro:CitationInformationSection`, `mcro:LicenseInformationSection`.

### 📄 Sample Output Format:
[
  {{
    "s": "mcro:{clean_identifier(model_id)}",
    "p": "rdf:type",
    "o": "mcro:Model"
  }},
  {{
    "s": "mcro:{clean_identifier(model_id)}",
    "p": "mcro:hasLicense",
    "o": "mcro:{clean_identifier(model_id)}-License"
  }},
  {{
    "s": "mcro:{clean_identifier(model_id)}-License",
    "p": "rdf:type",
    "o": "mcro:LicenseInformationSection"
  }},
  {{
    "s": "mcro:{clean_identifier(model_id)}-License",
    "p": "prov:hasTextValue",
    "o": "mit"
  }}
]
Important: Return ONLY the JSON array. No explanation. No markdown.

### 📥 Input Text:
{model_card_text}
"""

    try:
        response = gemini_model.generate_content(
            contents=[prompt, mcro_file],
            request_options={"timeout": 60}
        )

        # Extract JSON
        json_str = response.text.strip()
        if "```json" in json_str:
            json_str = json_str.split("```json")[1].split("```")[0].strip()

        return json.loads(json_str)
    
    except Exception as e:
        print(f"Error: {e}")
        return []

def process_huggingface_models(limit=10):
    """Main pipeline with single-prompt mapping"""
    mcro_file = upload_mcro_ontology()
    api = HfApi(token=HFTOKEN)
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model_info in enumerate(models):
        try:
            card = ModelCard.load(model_info.id, token=HFTOKEN)
            triples = get_mapped_triples(card.text, mcro_file, model_info.id)
            
            if triples:
                all_triples.extend(triples)
                print(f"Processed {idx+1}/{len(models)}: {model_info.id}")
                print(f"Generated {len(triples)} triples")
            else:
                print(f"No triples returned for {model_info.id}")

        except Exception as e:
            print(f"Error processing {model_info.id}: {str(e)}")

    # Save results
    with open(OUTPUT_JSON_PATH, "w") as f:
        json.dump(all_triples, f, indent=2)

    print(f"\n✅ Saved {len(all_triples)} triples to {OUTPUT_JSON_PATH}")
    return all_triples

if __name__ == "__main__":
    print("=== ONTOLOGY-AWARE TRIPLE GENERATION STARTED ===")
    process_huggingface_models(limit=10)
    print("\n=== COMPLETED ===")

=== ONTOLOGY-AWARE TRIPLE GENERATION STARTED ===
Uploading MCRO ontology...
Ontology uploaded: files/n5h5y1vw8u7z
Processed 1/10: timm/mobilenetv3_small_100.lamb_in1k
Generated 16 triples
Processed 2/10: sentence-transformers/all-MiniLM-L6-v2
Generated 10 triples
Processed 3/10: Falconsai/nsfw_image_detection
Generated 19 triples
Processed 4/10: dima806/fairface_age_image_detection
Generated 10 triples
Error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.0-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.goo