In [7]:
from huggingface_hub import HfApi, ModelCard
import json
import re
import pandas as pd
import google.generativeai as genai

# Authentication
HFTOKEN = "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"
GTOKEN = "AIzaSyCLwWkDW03zjzVKUQf3ui5wgcreVJdsMbw"

processed_classes = set()

def clean_identifier(text):
    """Generate safe URI component"""
    if pd.isna(text) or text in ['None', 'nan']:
        return "unknown"
    return re.sub(r'[^a-zA-Z0-9]', '', str(text).replace(' ', ''))[:50]

def get_dynamic_mapping(entity_key):
    """Generate ontology mappings with base URI"""
    clean_key = ''.join([w.capitalize() for w in re.split('[^a-zA-Z0-9]', entity_key)])
    return {
        "class_uri": f"http://purl.obolibrary.org/obo/mcro.owl#{clean_key}",
        "predicate_uri": f"http://purl.obolibrary.org/obo/mcro.owl#has{clean_key}"
    }

def extract_hf_entities(text):
    try:
        prompt = f"""Extract metadata strictly using terms defined in the MCRO Ontology:
        http://purl.obolibrary.org/obo/mcro.owl
        
Include only entities that clearly correspond to existing MCRO classes or properties.
Avoid adding new or ad-hoc terms. Use exact labels from the ontology where applicable.

Valid categories include: Model, Dataset, Metric, TrainingProcedure, EvaluationResult, Configuration, etc.

Return a valid JSON object ONLY, no explanation. Example:
{{"dataset": "Wikipedia", "architecture": "Transformer"}}

Model card text:
{text[:10000]}"""

        genai.configure(api_key=GTOKEN)
        model = genai.GenerativeModel('gemini-2.0-flash')
        response = model.generate_content(prompt)

        # Extract JSON block
        json_str = response.text.strip()
        if "```json" in json_str:
            json_str = json_str.split("```json")[1].split("```")[0].strip()
        
        # Basic JSON sanity check
        if not json_str.startswith("{") or not json_str.endswith("}"):
            raise ValueError("Invalid JSON structure")

        # Try parsing directly
        return json.loads(json_str)
    
    except json.JSONDecodeError as je:
        print(f"JSON Decode Error: {str(je)} - attempting repair")
        try:
            # Attempt to fix common JSON issues
            json_str = re.sub(r',\s*}', '}', json_str)  # remove trailing commas
            json_str = re.sub(r',\s*]', ']', json_str)
            return json.loads(json_str)
        except Exception as je2:
            print(f"Repair failed: {str(je2)}")
            return {}
    
    except Exception as e:
        print(f"Extraction error: {str(e)}")
        return {}

def generate_hf_triples(model_data):
    """Generate RDF triples with ontology alignment"""
    global processed_classes
    triples = []
    model_id = model_data.get('id', 'unknown')
    model_uri = f"http://purl.obolibrary.org/obo/mcro.owl#{clean_identifier(model_id)}"
    
    # Base model type assertion
    triples.append({
        "s": model_uri,
        "p": "rdf:type",
        "o": "http://purl.obolibrary.org/obo/mcro.owl#Model"
    })

    if "entities" not in model_data:
        return triples

    for key, value in model_data["entities"].items():
        if not value or key.lower() in {'id', 'model'}:
            continue

        mapping = get_dynamic_mapping(key)
        values = value if isinstance(value, list) else [value]

        for val in values:
            if pd.isna(val) or str(val).lower() in {'none', 'null', 'nan'}:
                continue

            # Create entity URI based on key only
            entity_uri = f"{model_uri}-{clean_identifier(key)}"
            
            # Add core triples
            triples.extend([
                {"s": model_uri, "p": mapping['predicate_uri'], "o": entity_uri},
                {"s": entity_uri, "p": "rdf:type", "o": mapping['class_uri']},
                {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(val)}
            ])
            
            # Add class hierarchy once per class
            if mapping['class_uri'] not in processed_classes:
                triples.append({
                    "s": mapping['class_uri'],
                    "p": "rdfs:subClassOf",
                    "o": "http://purl.obolibrary.org/obo/mcro.owl#Component"
                })
                processed_classes.add(mapping['class_uri'])

    return triples

def process_huggingface_models(limit=10):
    """Process models and generate RDF triples"""
    api = HfApi(token=HFTOKEN)
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model in enumerate(models):
        try:
            card = ModelCard.load(model.modelId, token=HFTOKEN)
            entities = extract_hf_entities(card.text)
            
            if not entities:
                print(f"Skipping {model.modelId} - no entities found")
                continue

            model_data = {"id": model.modelId, "entities": entities}
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)

            if idx % 5 == 0:
                print(f"Processed {idx+1}/{len(models)}: {model.modelId}")
                print(f"Generated {len(triples)} triples from {len(entities)} entities")

        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")

    with open("extracted_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)

    print("\n=== STATISTICS ===")
    print(f"Total models processed: {len(models)}")
    print(f"Total triples generated: {len(all_triples)}")
    return all_triples

if __name__ == "__main__":
    print("=== TRIPLE GENERATION STARTED ===")
    triples = process_huggingface_models(limit=10)
    print("\n=== COMPLETED ===")
    print(f"Final triple count: {len(triples)}")

=== TRIPLE GENERATION STARTED ===
Processed 1/10: timm/mobilenetv3_small_100.lamb_in1k
Generated 13 triples from 4 entities
Processed 6/10: openai/clip-vit-large-patch14
Generated 121 triples from 6 entities

=== STATISTICS ===
Total models processed: 10
Total triples generated: 392

=== COMPLETED ===
Final triple count: 392
