In [None]:
import json
import os
import re

import google.generativeai as genai
import pandas as pd
from huggingface_hub import HfApi, ModelCard

# Authentication
# Credentials are read from the environment so that secrets never live in the
# source file. Any key that was previously committed here must be treated as
# compromised and revoked.
# A missing variable yields None; the client libraries then decide how to
# authenticate (or fail with their own error message).
HFTOKEN = os.environ.get("HF_TOKEN")
GTOKEN = os.environ.get("GOOGLE_API_KEY")

# Class URIs for which an rdfs:subClassOf triple has already been emitted, so
# the hierarchy triple is produced only once per class across all models.
processed_classes = set()

def clean_identifier(text):
    """Return a URI-safe identifier fragment derived from ``text``.

    Every character other than an ASCII letter or digit is dropped and the
    result is truncated to 50 characters.

    Args:
        text: Value to convert; it is passed through ``str()`` before cleaning.

    Returns:
        The cleaned fragment, or ``"unknown"`` when ``text`` is missing
        (``None``, NaN, or the strings ``"None"`` / ``"nan"``) or contains no
        ASCII alphanumeric characters at all.
    """
    # pd.isna() returns an array for list-like input, which cannot be used in
    # a boolean context, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return "unknown"
    if isinstance(text, str) and text in ('None', 'nan'):
        return "unknown"

    cleaned = re.sub(r'[^a-zA-Z0-9]', '', str(text))[:50]
    # An empty fragment would produce a URI that ends in a bare '#'.
    return cleaned or "unknown"

def get_dynamic_mapping(entity_key):
    """Build the ontology class and predicate URIs for an entity key.

    The key is split on every character that is not an ASCII letter or digit,
    each piece is passed through ``str.capitalize`` and the pieces are joined
    into a single name that is appended to the MCRO base URI.

    Args:
        entity_key: Entity name as a string, e.g. ``"training details"``.

    Returns:
        A dict with ``"class_uri"`` (base URI + name) and ``"predicate_uri"``
        (base URI + ``"has"`` + name).
    """
    pieces = re.split('[^a-zA-Z0-9]', entity_key)
    # str.capitalize lower-cases everything after the first character.
    name = ''.join(map(str.capitalize, pieces))

    mapping = {}
    mapping["class_uri"] = f"http://purl.obolibrary.org/obo/mcro.owl#{name}"
    mapping["predicate_uri"] = f"http://purl.obolibrary.org/obo/mcro.owl#has{name}"
    return mapping

def _parse_json_object(raw):
    """Parse the JSON object contained in an LLM reply; raise if there is none."""
    cleaned = raw.strip().replace("```json", "").replace("```", "")
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # The reply may carry prose around the payload; retry on the span
        # from the first '{' to the last '}'.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(cleaned[start:end + 1])
    if not isinstance(parsed, dict):
        # Callers iterate the result with .items(), so a list or scalar is
        # not usable even though it is valid JSON.
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def extract_hf_entities(text):
    """Extract technical metadata from model card text with an LLM.

    Only the first 10000 characters of ``text`` are sent to the model.

    Args:
        text: Model card text.

    Returns:
        A dict of extracted key/value pairs. An empty dict is returned when
        ``text`` is empty, when the API call fails, or when the reply does not
        contain a JSON object; failures are reported on stdout.
    """
    # Nothing to extract: avoid spending an API call on an empty card.
    if not text or (isinstance(text, str) and not text.strip()):
        return {}

    try:
        prompt = f"""Extract technical metadata as JSON key-value pairs:
Include: architecture, license, dataset, metrics, hardware, training details, 
evaluation scores, libraries, technical specifications and so on
Example format: {{"license": "MIT", "architecture": "Transformer"}}

Model card text:
{text[:10000]}"""

        genai.configure(api_key=GTOKEN)
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content(prompt)
        return _parse_json_object(response.text)

    # Deliberately broad: extraction is best-effort and one failing card must
    # not abort the whole run.
    except Exception as e:
        print(f"Extraction error: {str(e)}")
        return {}

def _is_missing_value(val):
    """Return True when ``val`` is a null marker that must not become a triple."""
    # pd.isna() returns an array for list-like input, which raises when used
    # as a condition, so only scalars are passed to it.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return True
    return str(val).lower() in {'none', 'null', 'nan'}


def generate_hf_triples(model_data):
    """Generate RDF triples with ontology alignment for one model.

    Args:
        model_data: Dict with an optional ``"id"`` (defaults to ``"unknown"``)
            and an optional ``"entities"`` dict mapping entity names to a
            value or a list of values.

    Returns:
        A list of dicts with keys ``"s"``, ``"p"`` and ``"o"``. The first
        entry always types the model as an MCRO ``Model``. For every usable
        entity there is one link triple and one type triple, followed by one
        ``dul:hasParameterDataValue`` triple per value.

    Side effects:
        Adds each newly seen class URI to the module-level
        ``processed_classes`` set, so the ``rdfs:subClassOf`` triple for a
        class is emitted only the first time that class is encountered.
    """
    global processed_classes
    triples = []
    model_id = model_data.get('id', 'unknown')
    model_uri = f"http://purl.obolibrary.org/obo/mcro.owl#{clean_identifier(model_id)}"

    # Base model type assertion
    triples.append({
        "s": model_uri,
        "p": "rdf:type",
        "o": "http://purl.obolibrary.org/obo/mcro.owl#Model"
    })

    entities = model_data.get("entities")
    # A missing or malformed entity container yields only the base triple.
    if not isinstance(entities, dict):
        return triples

    for key, value in entities.items():
        if not value or key.lower() in {'id', 'model'}:
            continue

        mapping = get_dynamic_mapping(key)
        values = value if isinstance(value, list) else [value]

        # Create entity URI based on key only: all values of a key share it.
        entity_uri = f"{model_uri}-{clean_identifier(key)}"
        linked = False

        for val in values:
            if _is_missing_value(val):
                continue

            # The link and type triples are identical for every value of the
            # key, so emit them once, and only when a usable value exists.
            if not linked:
                triples.extend([
                    {"s": model_uri, "p": mapping['predicate_uri'], "o": entity_uri},
                    {"s": entity_uri, "p": "rdf:type", "o": mapping['class_uri']},
                ])
                linked = True

            triples.append(
                {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(val)}
            )

            # Add class hierarchy once per class
            if mapping['class_uri'] not in processed_classes:
                triples.append({
                    "s": mapping['class_uri'],
                    "p": "rdfs:subClassOf",
                    "o": "http://purl.obolibrary.org/obo/mcro.owl#Component"
                })
                processed_classes.add(mapping['class_uri'])

    return triples

def process_huggingface_models(limit=10, output_path="extracted_triples.json"):
    """Process the most-downloaded Hub models and generate RDF triples.

    For each model the card is loaded, entities are extracted from its text
    and converted to triples. Models whose extraction yields nothing are
    skipped; any exception raised for a model is reported and the loop
    continues with the next one.

    Args:
        limit: Maximum number of models to request from the Hub.
        output_path: File the collected triples are written to as JSON.

    Returns:
        The list of all triples generated across the processed models.
    """
    api = HfApi(token=HFTOKEN)
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []
    # Counts only models that actually contributed triples, so the final
    # statistics do not include skipped or failed models.
    processed_count = 0

    for idx, model in enumerate(models):
        try:
            card = ModelCard.load(model.modelId, token=HFTOKEN)
            entities = extract_hf_entities(card.text)

            if not entities:
                print(f"Skipping {model.modelId} - no entities found")
                continue

            model_data = {"id": model.modelId, "entities": entities}
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)
            processed_count += 1

            # Progress is reported for every fifth model only.
            if idx % 5 == 0:
                print(f"Processed {idx+1}/{len(models)}: {model.modelId}")
                print(f"Generated {len(triples)} triples from {len(entities)} entities")

        # Deliberately broad: one broken model card must not abort the run.
        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_triples, f, indent=2)

    print("\n=== STATISTICS ===")
    print(f"Total models processed: {processed_count}")
    print(f"Models skipped or failed: {len(models) - processed_count}")
    print(f"Total triples generated: {len(all_triples)}")
    return all_triples

if __name__ == "__main__":
    # Script entry point: run the pipeline over 50 models and report the
    # number of triples it returned.
    print("=== TRIPLE GENERATION STARTED ===")
    triples = process_huggingface_models(limit=50)
    final_count = len(triples)
    print("\n=== COMPLETED ===")
    print(f"Final triple count: {final_count}")

=== TRIPLE GENERATION STARTED ===
Extraction error: 400 API key expired. Please renew the API key. [reason: "API_KEY_INVALID"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
, locale: "en-US"
message: "API key expired. Please renew the API key."
]
Skipping timm/mobilenetv3_small_100.lamb_in1k - no entities found
Extraction error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 48
}
]
Skipping sentence-transformers/all-MiniLM-L6-v2 - no entities found
Extraction error: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [vi