In [1]:
from huggingface_hub import HfApi, ModelCard
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import re
import time
import traceback
import google.generativeai as genai
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Force CPU

In [2]:
# Configuration
LTOKEN = "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"
GTOKEN = "AIzaSyD-QYG2WyMAdM1uQZHMeiCqhJP61y8N9nw"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
DEVICE = "cpu"

In [3]:
# Initialize APIs
genai.configure(api_key=GTOKEN)
api = HfApi(token=LTOKEN)

# Empty mapping that will be populated dynamically
HF_MAPPING = {}

In [4]:
EXTRACTION_PROMPT = """
Extract **only the following fields** from the model card text below.  
Populate values **only if explicitly mentioned** in the text. Omit fields with no data.  
Return the result as a **strict JSON object** with **double quotes** and **no extra text**.

### Fields to Extract:
- Architecture (e.g., "RoBERTa", "BERT")
- License (e.g., "Apache-2.0", "MIT")
- Training Data (e.g., "BookCorpus, Wikipedia")
- Languages (array of strings, e.g., ["en", "fr"])
- Use Cases (array of tasks, e.g., ["text-classification"])
- Model Description (brief description)
- Dataset Construction (how data was compiled)
- Limitations (known issues)

### Rules:
1. **No extra fields**: Only include the listed fields if present in the text.
2. **Arrays**: Use square brackets for lists (e.g., ["en", "fr"]).
3. **Strings**: Use double quotes for all text values.
4. **No markdown**: Only raw JSON.

### Example Output:
{
  "Architecture": "RoBERTa",
  "License": "MIT",
  "Training Data": "BookCorpus, Wikipedia",
  "Languages": ["en"],
  "Use Cases": ["text-classification", "question-answering"],
  "Model Description": "RoBERTa-base model pre-trained by Facebook AI Research...",
  "Dataset Construction": "Combination of public datasets",
  "Limitations": "Limited support for rare languages"
}

### Text to Analyze:
{{text}}
""".strip()

In [5]:
def create_extraction_prompt(text):
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
    schema_str = schema_str.replace("{", "{{").replace("}", "}}")  # Escape schema braces
    return EXTRACTION_PROMPT.format(
        schema=schema_str,
        text=text[:8192]
    ).replace("\\", "\\\\")  # Keep existing backslash escaping

def clean_identifier(text):
    # Replace slashes and spaces with hyphens
    sanitized = re.sub(r'[^a-zA-Z0-9-]', '', 
                      str(text).replace('/', '-').replace(' ', '-').lower())
    # Collapse multiple hyphens
    sanitized = re.sub(r'-+', '-', sanitized)
    # Truncate to 64 characters
    return sanitized[:64] or "unknown"

In [6]:
def extract_hf_entities(text):
    prompt = create_extraction_prompt(text)
    
    # Gemini extraction
    try:
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
        response = model.generate_content(prompt)
        json_str = response.text.strip()
        
        # Extract valid JSON block
        match = re.search(r'\{.*\}', json_str, re.DOTALL)
        if match:
            json_str = match.group()
            try:
                return json.loads(json_str)
            except json.JSONDecodeError as e:
                print(f"Invalid JSON: {str(e)}. Raw output: {json_str[:200]}...")
                return {}
        else:
            print("No valid JSON found.")
            return {}
    except Exception as e:
        print(f"Gemini failed: {str(e)}")
        return {}

    # LLaMA fallback
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=LTOKEN,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            device_map="cpu"
        )

        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True)
        inputs = {k: v.to("cpu") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=500,
                temperature=0.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Extract JSON
        json_start = raw.find('{')
        json_end = raw.rfind('}') + 1
        json_str = raw[json_start:json_end].replace("'", '"')
        
        # Validate JSON
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            print(f"Invalid LLaMA JSON: {str(e)}")
            print(f"Raw output: {json_str[:100]}...")
            return {}
    except Exception as e:
        print(f"LLaMA failed: {str(e)}")
        return {}

In [7]:
def generate_hf_triples(model_data):
    global HF_MAPPING
    HF_MAPPING = {
        "model_id": "ModelCard:Model"  # Base mapping always present
    }
    
    # Generate mappings based on extracted entities
    entities = model_data.get("entities", {})
    for key in entities:
        if key == "Model name":
            continue  # Handled separately as model ID
            
        # Create entity mapping
        entity_key = key.lower().replace(' ', '_')
        HF_MAPPING[entity_key] = f"ModelCard:{key}"
        
        # Create predicate mapping
        predicate_key = f"{entity_key}_predicate"
        HF_MAPPING[predicate_key] = f"modelcard:has{key.replace(' ', '')}"

    triples = []
    model_id = model_data['id']
    # Remove redundant parts (e.g., "google-bert-bert-base-uncased" → "google-bert-base-uncased")
    if "/" in model_id:
        model_id = model_id.split("/")[-1]  # Take the last segment
    model_uri = f"hf:{clean_identifier(model_id)}"
    triples.append({"s": model_uri, "p": "rdf:type", "o": HF_MAPPING["model_id"]})
    
    # Generate triples for all detected entities
    for key, value in entities.items():
        if key == "Model name":
            continue
            
        entity_key = key.lower().replace(' ', '_')
        predicate_key = f"{entity_key}_predicate"
        
        if predicate_key not in HF_MAPPING:
            continue  # Skip if mapping not created
            
        if isinstance(value, list):
            for item in value:
                if not item:  # Skip empty strings
                    continue
                entity_uri = f"hf:{entity_key}-{clean_identifier(item)}"
                triples.extend([
                    {"s": model_uri, "p": HF_MAPPING[predicate_key], "o": entity_uri},
                    {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(item)}
                ])
        else:
            entity_uri = f"hf:{entity_key}-{clean_identifier(str(value))}"
            triples.extend([
                {"s": model_uri, "p": HF_MAPPING[predicate_key], "o": entity_uri},
                {"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(value)}
            ])

    return triples

In [8]:
def process_huggingface_models(limit=20):
    """Process models with enhanced error handling"""
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    for idx, model in enumerate(models):
        try:
            # Load model card
            card = ModelCard.load(model.modelId, token=LTOKEN)
            card_text = card.text
            
            # Extract entities
            extracted = extract_hf_entities(card_text)
            if not extracted:
                print(f"Skipping {model.modelId} - no data")
                continue
            
            # Create data structure
            model_data = {
                "id": model.modelId,
                "entities": extracted
            }
            
            # Generate triples with dynamic mappings
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)
            
            # Progress report
            if idx % 5 == 0:
                print(f"Processed {idx}/{len(models)}: {model.modelId}")
                print(f"  Triples: {len(triples)}")

        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")
            traceback.print_exc()

    # Save results
    with open("model_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)
    
    print("\n=== STATISTICS ===")
    predicate_counts = {}
    for t in all_triples:
        predicate_counts[t['p']] = predicate_counts.get(t['p'], 0) + 1
    
    print("Predicate usage:")
    for p, count in sorted(predicate_counts.items(), 
                         key=lambda x: x[1], 
                         reverse=True):
        print(f"  {p}: {count}")
    
    return all_triples

if __name__ == "__main__":
    process_huggingface_models(limit=20)

Error processing FacebookAI/xlm-roberta-large: name 'EXTRACTION_SCHEMA' is not defined
Error processing google-bert/bert-base-uncased: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing sentence-transformers/all-MiniLM-L6-v2: name 'EXTRACTION_SCHEMA' is not defined
Error processing Falconsai/nsfw_image_detection: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing dima806/fairface_age_image_detection: name 'EXTRACTION_SCHEMA' is not defined
Error processing timm/mobilenetv3_small_100.lamb_in1k: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing amazon/chronos-t5-small: name 'EXTRACTION_SCHEMA' is not defined
Error processing openai/clip-vit-large-patch14: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing sentence-transformers/all-mpnet-base-v2: name 'EXTRACTION_SCHEMA' is not defined
Error processing google/electra-base-discriminator: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing Bingsu/adetailer: name 'EXTRACTION_SCHEMA' is not defined
Error processing timm/resnet50.a1_in1k: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing sentence-transformers/multi-qa-MiniLM-L6-cos-v1: name 'EXTRACTION_SCHEMA' is not defined
Error processing openai-community/gpt2: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing openai/clip-vit-base-patch32: name 'EXTRACTION_SCHEMA' is not defined
Error processing jonatasgrosman/wav2vec2-large-xlsr-53-english: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn: name 'EXTRACTION_SCHEMA' is not defined
Error processing google/vit-base-patch16-224-in21k: name 'EXTRACTION_SCHEMA' is not defined


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289

Error processing sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2: name 'EXTRACTION_SCHEMA' is not defined
Error processing facebook/esmfold_v1: name 'EXTRACTION_SCHEMA' is not defined

=== STATISTICS ===
Predicate usage:


Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/2564728930.py", line 2, in extract_hf_entities
    prompt = create_extraction_prompt(text)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/1707082564.py", line 2, in create_extraction_prompt
    schema_str = json.dumps(EXTRACTION_SCHEMA, indent=2)
                            ^^^^^^^^^^^^^^^^^
NameError: name 'EXTRACTION_SCHEMA' is not defined
Traceback (most recent call last):
  File "/scratch-node/20230112.2294791/ipykernel_3411625/548826572.py", line 13, in process_huggingface_models
    extracted = extract_hf_entities(card_text)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch-node/20230112.2294791/ipykernel_3411625/25647289