In [1]:
from huggingface_hub import HfApi, ModelCard
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import re
import time
import traceback
import google.generativeai as genai

In [2]:
# Configuration
LTOKEN = "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"
GTOKEN = "AIzaSyB8HMqIGvscURWPF75CwnZlXnFFsGh0Vlg"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
DEVICE = "cpu"

In [3]:
# Initialize APIs
genai.configure(api_key=GTOKEN)
api = HfApi(token=LTOKEN)

# Empty mapping that will be populated dy∂modenamically
HF_MAPPING = {}

In [4]:
# Extraction prompt template
EXTRACTION_PROMPT = """
Extract ONLY valid JSON without markdown or extra text:
{schema}
Text:
{text}
""".strip()

# JSON schema for extraction
EXTRACTION_SCHEMA = {
    "Model name": "string",
    "License": "SPDX ID",
    "Architecture": "string",
    "CO2 Emitted": "float|null",
    "Training Data": "string",
    "Languages": ["string"],
    "Base Model": "string",
    "Pipeline Tag": "string",
    "Bias Analysis": "string",
    "Ethical Considerations": "string"
}

In [5]:
def create_extraction_prompt(text):
    """Generate formatted extraction prompt"""
    return EXTRACTION_PROMPT.format(
        schema=json.dumps(EXTRACTION_SCHEMA, indent=2),
        text=text[:8192]
    ).replace('\\', '\\\\')

def clean_identifier(text):
    """Create safe URI components"""
    return re.sub(r'[^a-zA-Z0-9-]', '', str(text).replace(' ', '-').lower()[:64]) or "unknown"

In [6]:
def extract_hf_entities(text):
    """Extract metadata with strict JSON validation"""
    prompt = create_extraction_prompt(text)
    
    # Gemini extraction
    try:
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        json_str = response.text.strip()
        
        # Remove any markdown artifacts
        if '```json' in json_str:
            json_str = json_str.split('```json')[1].split('```')[0].strip()
            
        # Validate JSON structure
        json_data = json.loads(json_str)
        return json_data if isinstance(json_data, dict) else {}
    except Exception as e:
        print(f"Gemini failed: {str(e)}")

    # LLaMA fallback with improved parsing
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=LTOKEN,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True
        ).to(DEVICE)
        
        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True).to(DEVICE)
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=0.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract first valid JSON block
        json_start = raw.find('{')
        json_end = raw.rfind('}') + 1
        json_str = raw[json_start:json_end]
        
        # Fix common JSON issues
        json_str = (json_str
            .replace("'", '"')
            .replace('None', 'null')
            .replace('True', 'true')
            .replace('False', 'false'))
        
        return json.loads(json_str)
    except Exception as e:
        print(f"LLaMA failed: {str(e)}")
        return {}

In [7]:
def generate_hf_triples(model_data):
    """Generate RDF triples with dynamic mappings"""
    # Reset mapping for each model
    global HF_MAPPING
    HF_MAPPING = {
        "model_id": "ModelCard:Model"  # Base mapping always present
    }
    
    # Generate mappings based on extracted entities
    entities = model_data.get("entities", {})
    for key in entities:
        if key == "Model name":
            continue  # Handled separately as model ID
            
        # Create entity mapping
        entity_key = key.lower().replace(' ', '_')
        HF_MAPPING[entity_key] = f"ModelCard:{key}"
        
        # Create predicate mapping
        predicate_key = f"{entity_key}_predicate"
        HF_MAPPING[predicate_key] = f"modelcard:has{key.replace(' ', '')}"

    triples = []
    model_uri = f"hf:{clean_identifier(model_data['id'])}"
    triples.append({"s": model_uri, "p": "rdf:type", "o": HF_MAPPING["model_id"]})
    
    # Generate triples for all detected entities
    for key, value in entities.items():
        if key == "Model name":
            continue
            
        entity_key = key.lower().replace(' ', '_')
        predicate_key = f"{entity_key}_predicate"
        
        if predicate_key not in HF_MAPPING:
            continue  # Skip if mapping not created
            
        entity_uri = f"hf:{entity_key}-{clean_identifier(value)}"
        triples.extend([
            {"s": model_uri, "p": HF_MAPPING[predicate_key], "o": entity_uri},
            {"s": entity_uri, "p": "rdfs:subClassOf", "o": HF_MAPPING[entity_key]},
            {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(value)}
        ])

    return triples

In [8]:
def process_huggingface_models(limit=20):
    """Process models with enhanced error handling"""
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model in enumerate(models):
        try:
            # Load model card
            card = ModelCard.load(model.modelId, token=LTOKEN)
            card_text = card.text
            
            # Extract entities
            extracted = extract_hf_entities(card_text)
            if not extracted:
                print(f"Skipping {model.modelId} - no data")
                continue
            
            # Create data structure
            model_data = {
                "id": model.modelId,
                "entities": extracted
            }
            
            # Generate triples with dynamic mappings
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)
            
            # Progress report
            if idx % 5 == 0:
                print(f"Processed {idx}/{len(models)}: {model.modelId}")
                print(f"  Triples: {len(triples)}")

        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")
            traceback.print_exc()

    # Save results
    with open("model_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)
    
    print("\n=== STATISTICS ===")
    predicate_counts = {}
    for t in all_triples:
        predicate_counts[t['p']] = predicate_counts.get(t['p'], 0) + 1
    
    print("Predicate usage:")
    for p, count in sorted(predicate_counts.items(), 
                         key=lambda x: x[1], 
                         reverse=True):
        print(f"  {p}: {count}")
    
    return all_triples

if __name__ == "__main__":
    process_huggingface_models(limit=100)

Processed 0/100: FacebookAI/xlm-roberta-large
  Triples: 28
Processed 5/100: dima806/fairface_age_image_detection
  Triples: 28
Processed 10/100: google/electra-base-discriminator
  Triples: 28
Processed 15/100: FacebookAI/roberta-large
  Triples: 28
Processed 20/100: jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn
  Triples: 28
Processed 25/100: FacebookAI/roberta-base
  Triples: 28
Processed 30/100: jonatasgrosman/wav2vec2-large-xlsr-53-portuguese
  Triples: 28
Processed 35/100: cross-encoder/ms-marco-MiniLM-L6-v2
  Triples: 28
Processed 40/100: google/vit-base-patch16-224
  Triples: 28
Processed 45/100: google-t5/t5-small
  Triples: 28
Processed 50/100: facebook/contriever-msmarco
  Triples: 28
Processed 55/100: jonatasgrosman/wav2vec2-large-xlsr-53-dutch
  Triples: 28
Processed 60/100: openai/clip-vit-base-patch16
  Triples: 28
Processed 65/100: intfloat/multilingual-e5-small
  Triples: 28
Processed 70/100: Alibaba-NLP/gte-base-en-v1.5
  Triples: 28
Processed 75/100: unslothai/