In [1]:
from huggingface_hub import HfApi, ModelCard
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch, json, string, time
from datetime import datetime
import re
import json
import traceback
import google.generativeai as genai

In [2]:
# 1. Authentication and Setup
import os

# Credentials are read from the environment so they are never stored in the
# notebook or its version history. Export HF_TOKEN and GOOGLE_API_KEY before
# starting the kernel. Any key that was previously committed in this cell must
# be treated as leaked and revoked.
# A missing variable yields None: downstream code skips Gemini when GTOKEN is
# falsy, and the Hugging Face calls receive token=None.
LTOKEN = os.environ.get("HF_TOKEN")
GTOKEN = os.environ.get("GOOGLE_API_KEY")
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

In [3]:
import google.generativeai as genai

genai.configure(api_key=GTOKEN)

# Connectivity check: list every model that advertises the
# 'generateContent' generation method.
print("Working Gemini Models:")
for model in genai.list_models():
    if 'generateContent' not in model.supported_generation_methods:
        continue
    print(f"- {model.name}")

Working Gemini Models:
- models/gemini-1.0-pro-vision-latest
- models/gemini-pro-vision
- models/gemini-1.5-pro-latest
- models/gemini-1.5-pro-001
- models/gemini-1.5-pro-002
- models/gemini-1.5-pro
- models/gemini-1.5-flash-latest
- models/gemini-1.5-flash-001
- models/gemini-1.5-flash-001-tuning
- models/gemini-1.5-flash
- models/gemini-1.5-flash-002
- models/gemini-1.5-flash-8b
- models/gemini-1.5-flash-8b-001
- models/gemini-1.5-flash-8b-latest
- models/gemini-1.5-flash-8b-exp-0827
- models/gemini-1.5-flash-8b-exp-0924
- models/gemini-2.0-flash-exp
- models/gemini-2.0-flash
- models/gemini-2.0-flash-001
- models/gemini-2.0-flash-lite-001
- models/gemini-2.0-flash-lite
- models/gemini-2.0-flash-lite-preview-02-05
- models/gemini-2.0-flash-lite-preview
- models/gemini-2.0-pro-exp
- models/gemini-2.0-pro-exp-02-05
- models/gemini-exp-1206
- models/gemini-2.0-flash-thinking-exp-01-21
- models/gemini-2.0-flash-thinking-exp
- models/gemini-2.0-flash-thinking-exp-1219
- models/learnlm-1.5

In [None]:
# 2. Load LLaMA Model
# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
# Reuse the end-of-sequence token for padding and pad on the left.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Full-precision weights, moved to the selected device after loading.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, token=LTOKEN, torch_dtype=torch.float32
)
model = model.to(DEVICE)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Ontology mappings.
# Keys are short field names used by this notebook; values are the ontology
# terms they map to. "ModelCard:*" values are used as classes and
# "modelcard:*" values as predicates in the generated triples.
HF_MAPPING = {
    # Core Metadata
    "model_name": "ModelCard:Model",
    "license": "ModelCard:License",
    "model_type": "ModelCard:ModelType",
    "architecture": "ModelCard:Architecture",
    "language": "ModelCard:Language",
    "base_model": "ModelCard:BaseModel",
    "pipeline_tag": "ModelCard:PipelineType",
    "library_name": "ModelCard:Library",
    "repo": "ModelCard:Repository",
    "paper": "ModelCard:Publication",
    "developers": "ModelCard:Author",

    # Environmental Impact
    "co2_emitted": "ModelCard:CarbonEmissions",
    "energy_usage_kwh": "ModelCard:EnergyUsage",
    "hardware_type": "ModelCard:Hardware",
    # Deprecated misspelling of "hardware_type", retained so that any existing
    # lookup using the old key keeps working.
    "hardwaare_type": "ModelCard:Hardware",
    "hours_used": "ModelCard:TrainingDuration",
    "cloud_provider": "ModelCard:CloudProvider",
    "cloud_region": "ModelCard:ComputeRegion",

    # Ethics/Bias
    "bias_risks_limitations": "ModelCard:BiasAnalysis",
    "ethical_considerations": "ModelCard:EthicalImpact",
    "recommendations": "ModelCard:RiskMitigation",
    "out_of_scope_use": "ModelCard:Limitation",

    # Training/Evaluation
    "training_data": "ModelCard:TrainingData",
    "training_regime": "ModelCard:TrainingHyperparameters",
    "testing_data": "ModelCard:TestingData",
    "testing_metrics": "ModelCard:EvaluationMetrics",
    "results": "ModelCard:PerformanceResults",

    # Predicates (Semantic Relationships)
    "license_predicate": "modelcard:hasLicense",
    "architecture_predicate": "modelcard:hasArchitecture",
    "environmental_impact_predicate": "modelcard:hasEnvironmentalImpact",
    "bias_predicate": "modelcard:hasBiasAnalysis",
    "ethics_predicate": "modelcard:hasEthicalConsideration",
    "training_data_predicate": "modelcard:usesTrainingData",
    "base_model_predicate": "modelcard:derivedFrom",
    # TODO: add predicates for the remaining fields
}

In [5]:
# Helper that turns arbitrary text into an identifier-safe token
def clean_identifier(text):
    """Reduce ``text`` to ASCII letters, digits and hyphens.

    Spaces are turned into hyphens first; every other character outside
    ``a-z``, ``A-Z``, ``0-9`` and ``-`` is dropped (not replaced). Falsy input,
    or input that has nothing left after stripping, yields ``"unknown"``.
    """
    if not text:
        return "unknown"
    hyphenated = str(text).replace(' ', '-')
    stripped = re.sub(r'[^a-zA-Z0-9-]', '', hyphenated)
    # Fall back to the placeholder when every character was removed
    return stripped or "unknown"



In [6]:
# Holds the fallback tokenizer/model after the first load so that repeated
# calls do not reload the checkpoint every time. The trade-off is that the
# weights stay in memory for the life of the kernel.
_LLAMA_CACHE = {}

# Token budget for the LLaMA fallback: prompt + generated tokens must fit in
# the model's context window (4096 tokens for Llama-2).
_LLAMA_CONTEXT_TOKENS = 4096
_LLAMA_MAX_NEW_TOKENS = 600


def _parse_json_object(raw_text):
    """Return the JSON object embedded in ``raw_text`` as a dict, else ``None``."""
    match = re.search(r'\{.*\}', raw_text, re.DOTALL)
    if match is None:
        return None
    candidate = match.group()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Repair only when strict parsing fails, so apostrophes and the words
        # None/True/False inside already-valid JSON strings are left alone.
        repaired = (candidate.replace("'", '"')
                             .replace("None", "null")
                             .replace("True", "true")
                             .replace("False", "false"))
        repaired = re.sub(r',\s*([}\]])', r'\1', repaired)  # drop trailing commas
        try:
            parsed = json.loads(repaired)
        except json.JSONDecodeError:
            return None
    return parsed if isinstance(parsed, dict) else None


def _extract_with_gemini(prompt):
    """Query Gemini; return a validated entity dict, or ``None`` to trigger the fallback."""
    try:
        genai.configure(api_key=GTOKEN)
        gemini_model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = gemini_model.generate_content(prompt)
        entities = _parse_json_object(response.text)
    except Exception as e:
        # API/network errors and unreadable responses must not prevent the
        # LLaMA fallback from running.
        print(f"Gemini request failed: {str(e)}")
        return None

    if entities is None:
        print("Gemini validation failed: response is not a JSON object")
        return None
    if not all(entities.get(k) for k in ["License type", "Architecture"]):
        print("Gemini validation failed: Missing required fields")
        return None
    return entities


def _extract_with_llama(prompt):
    """Run the prompt through the local LLaMA model and parse its completion."""
    if "model" not in _LLAMA_CACHE:
        llama_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
        llama_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=LTOKEN,
            torch_dtype=torch.float32,
        ).to(DEVICE)
        _LLAMA_CACHE["tokenizer"] = llama_tokenizer
        _LLAMA_CACHE["model"] = llama_model
    llama_tokenizer = _LLAMA_CACHE["tokenizer"]
    llama_model = _LLAMA_CACHE["model"]

    # Leave room for the generated tokens inside the context window.
    inputs = llama_tokenizer(
        prompt,
        return_tensors="pt",
        max_length=_LLAMA_CONTEXT_TOKENS - _LLAMA_MAX_NEW_TOKENS,
        truncation=True,
    ).to(DEVICE)
    outputs = llama_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=_LLAMA_MAX_NEW_TOKENS,
        temperature=0.1,
        do_sample=True,
        pad_token_id=llama_tokenizer.eos_token_id,
        eos_token_id=llama_tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated, otherwise the JSON template inside the prompt would be
    # matched instead of the model's answer.
    prompt_length = inputs.input_ids.shape[1]
    completion = llama_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    entities = _parse_json_object(completion)
    if entities is None:
        raise ValueError("LLaMA output did not contain a JSON object")
    if not entities.get("License type"):
        entities["License type"] = "unknown-license"
    return entities


def extract_hf_entities(text):
    """Extract model-card metadata with Gemini, falling back to LLaMA.

    Args:
        text: Model card text; only the first 10000 characters are sent.

    Returns:
        dict: The parsed metadata object. A Gemini result is accepted only if
        it has non-empty "License type" and "Architecture" values. A LLaMA
        result gets "License type" set to "unknown-license" when missing.
        An empty dict is returned if extraction fails altogether; the error
        is printed rather than raised.
    """
    try:
        unified_prompt = f"""Extract model metadata as JSON with VALID SPDX LICENSE IDS. Include all available fields:
{{
  "Model name": "string",
  "License type": "string (SPDX ID)",
  "Architecture": "string",
  "CO2 emitted": "float|null",
  "Training data sources": "string",
  "Datasets": ["string"],
  "Languages": ["string"],
  "Metrics": ["string"],
  "Base model": "string",
  "Pipeline tag": "string",
  "Library name": "string"
}}

Model card text:
{text[:10000]}"""

        # Try Gemini first (only when an API key is configured)
        if GTOKEN:
            entities = _extract_with_gemini(unified_prompt)
            if entities is not None:
                return entities

        # Fallback to LLaMA with the same unified prompt
        return _extract_with_llama(unified_prompt)

    except Exception as e:
        print(f"LLM extraction failed: {str(e)}")
        return {}

In [7]:
# Triple generation function
def _value_node_triples(model_uri, predicate, node_uri, class_uri, value):
    """Build the three triples that attach one extracted value node to a model."""
    return [
        {"s": model_uri, "p": predicate, "o": node_uri},
        {"s": node_uri, "p": "rdfs:subClassOf", "o": class_uri},
        {"s": node_uri, "p": "dul:hasParameterDataValue", "o": str(value)},
    ]


def generate_hf_triples(model_data):
    """Convert one model's extracted metadata into subject/predicate/object dicts.

    Args:
        model_data: dict with a required "id" and an optional "entities" dict.
            The entity keys read here are "License type", "Architecture",
            "CO2 emitted" and "Training data sources"; empty/falsy values are
            skipped.

    Returns:
        list[dict]: Triples with keys "s", "p" and "o". The first is always the
        model's ``rdf:type`` triple; each usable entity adds three more.

    Raises:
        KeyError: if ``model_data`` has no "id".
    """
    prefix = "hf:"

    # Base model triple
    model_uri = prefix + clean_identifier(model_data['id'])
    triples = [{
        "s": model_uri,
        "p": "rdf:type",
        "o": HF_MAPPING["model_name"]
    }]

    # A missing, None or non-dict "entities" entry means there is nothing to add.
    entities = model_data.get("entities")
    if not isinstance(entities, dict):
        return triples

    # License
    license_value = entities.get("License type")
    if license_value:
        triples.extend(_value_node_triples(
            model_uri,
            HF_MAPPING.get("license_predicate", "modelcard:hasLicense"),
            f"{prefix}license-{clean_identifier(license_value)}",
            HF_MAPPING["license"],
            license_value,
        ))

    # Architecture
    arch_value = entities.get("Architecture")
    if arch_value:
        triples.extend(_value_node_triples(
            model_uri,
            HF_MAPPING.get("architecture_predicate", "modelcard:hasArchitecture"),
            f"{prefix}Architecture-{clean_identifier(arch_value)}",
            HF_MAPPING["architecture"],
            arch_value,
        ))

    # CO2 emissions
    co2_raw = entities.get("CO2 emitted")
    if co2_raw:
        try:
            co2_value = float(co2_raw)
        except (ValueError, TypeError):
            print(f"Invalid CO2 value: {co2_raw}")
        else:
            # clean_identifier drops '.', which would give 10.5 and 1.05 the
            # same URI; encode the decimal point as '-' to keep them distinct.
            co2_id = clean_identifier(str(co2_value).replace(".", "-"))
            # "co2_predicate" is honoured if present; otherwise use the
            # environmental-impact predicate that the mapping defines.
            co2_predicate = HF_MAPPING.get("co2_predicate") or HF_MAPPING.get(
                "environmental_impact_predicate", "modelcard:hasEnvironmentalImpact"
            )
            triples.extend(_value_node_triples(
                model_uri,
                co2_predicate,
                f"{prefix}CO2-{co2_id}",
                HF_MAPPING["co2_emitted"],
                co2_value,
            ))

    # Training data (identifier part capped at 30 characters)
    data_value = entities.get("Training data sources")
    if data_value:
        triples.extend(_value_node_triples(
            model_uri,
            HF_MAPPING.get("training_data_predicate", "modelcard:hasTrainingData"),
            f"{prefix}TrainingData-{clean_identifier(data_value)[:30]}",
            HF_MAPPING["training_data"],
            data_value,
        ))

    return triples

In [8]:
# Processing pipeline
def process_huggingface_models(limit=20):  # Reduced default limit for testing
    """Generate triples for the most-downloaded Hub models and save them.

    For each of the top ``limit`` models (sorted by downloads) the model card
    is loaded, metadata is extracted with ``extract_hf_entities`` and turned
    into triples with ``generate_hf_triples``. A model whose card cannot be
    loaded still contributes its ``rdf:type`` triple; a model with no
    extracted entities is skipped. All triples are written to
    ``top_models_triples.json`` and per-predicate counts are printed.

    Returns:
        list[dict]: every triple that was collected.
    """
    hub_api = HfApi(token=LTOKEN)

    # Most-downloaded models first
    models = list(hub_api.list_models(
        sort="downloads",
        direction=-1,
        limit=limit,
        token=LTOKEN
    ))

    all_triples = []
    for idx, model in enumerate(models):
        try:
            # Brief pause on every tenth model to stay clear of rate limits
            if idx % 10 == 0:
                time.sleep(0.5)

            # Fetch the model card text
            try:
                card_text = ModelCard.load(model.modelId, token=LTOKEN).text
            except Exception as e:
                print(f"Error loading card for {model.modelId}: {str(e)}")
                # Without a card, record only the model's type triple
                all_triples.append({
                    "s": f"hf:{clean_identifier(model.modelId)}",
                    "p": "rdf:type",
                    "o": HF_MAPPING["model_name"]
                })
                continue

            # Extract entities from the card text via the LLM-based extractor
            extracted_entities = extract_hf_entities(card_text)
            if not extracted_entities:
                print(f"Skipping {model.modelId} - no entities extracted")
                continue

            # Build this model's triples and add them to the overall result
            model_triples = generate_hf_triples({
                "id": model.modelId,
                "entities": extracted_entities
            })
            all_triples.extend(model_triples)

            # Progress report on every fifth model
            if idx % 5 == 0:
                print(f"Processed {idx+1}/{len(models)}: {model.modelId}")
                print(f"Generated {len(model_triples)} triples for this model")
                # Show at most three sample triples, one per distinct predicate
                seen_predicates = set()
                for triple in model_triples:
                    if len(seen_predicates) >= 3:
                        break
                    if triple["p"] in seen_predicates:
                        continue
                    print(f"Sample triple: {triple}")
                    seen_predicates.add(triple["p"])

        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")
            traceback.print_exc()  # Full stack trace for debugging

    # Persist everything that was collected
    with open("top_models_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)

    print("\n=== STATISTICS ===")
    predicate_counts = {}
    for triple in all_triples:
        predicate_counts[triple["p"]] = predicate_counts.get(triple["p"], 0) + 1

    print("Predicate counts:")
    # Highest count first; ties keep first-seen order
    for predicate in sorted(predicate_counts, key=predicate_counts.get, reverse=True):
        print(f"  {predicate}: {predicate_counts[predicate]}")

    return all_triples

In [None]:
# Entry point: run the pipeline on a small batch and report the total
if __name__ == "__main__":
    print("=== STARTING PROCESS ===")
    # A limit of 20 keeps test runs short
    triples = process_huggingface_models(limit=20)

    print(
        "\n=== RESULTS SAVED TO FILE ===",
        f"Total triples generated: {len(triples)}",
        sep="\n",
    )

=== STARTING PROCESS ===
Processed 1/20: FacebookAI/xlm-roberta-large
Generated 10 triples for this model
Sample triple: {'s': 'hf:FacebookAIxlm-roberta-large', 'p': 'rdf:type', 'o': 'ModelCard:Model'}
Sample triple: {'s': 'hf:FacebookAIxlm-roberta-large', 'p': 'modelcard:hasLicense', 'o': 'hf:license-MIT'}
Sample triple: {'s': 'hf:license-MIT', 'p': 'rdfs:subClassOf', 'o': 'ModelCard:License'}
Processed 6/20: timm/mobilenetv3_small_100.lamb_in1k
Generated 10 triples for this model
Sample triple: {'s': 'hf:timmmobilenetv3small100lambin1k', 'p': 'rdf:type', 'o': 'ModelCard:Model'}
Sample triple: {'s': 'hf:timmmobilenetv3small100lambin1k', 'p': 'modelcard:hasLicense', 'o': 'hf:license-Apache-20'}
Sample triple: {'s': 'hf:license-Apache-20', 'p': 'rdfs:subClassOf', 'o': 'ModelCard:License'}
Processed 11/20: Bingsu/adetailer
Generated 10 triples for this model
Sample triple: {'s': 'hf:Bingsuadetailer', 'p': 'rdf:type', 'o': 'ModelCard:Model'}
Sample triple: {'s': 'hf:Bingsuadetailer', 'p':

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]