In [1]:
import re
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import HfApi, ModelCard
import google.generativeai as genai
import os

# Configuration
# Credentials are read from the environment rather than written into the
# source, so no secret is committed or shared along with this file.
# Export HF_TOKEN / GEMINI_API_KEY before running.
LTOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token; None when unset
GTOKEN = os.environ.get("GEMINI_API_KEY")  # Gemini key; None skips Gemini entirely
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
DEVICE = "cpu"

# Initialize APIs
# Gemini is only configured when a key was actually supplied; a placeholder
# string would be truthy and trigger a failing request for every model card.
if GTOKEN:
    genai.configure(api_key=GTOKEN)
api = HfApi(token=LTOKEN)

# Schema-Free Prompt (no hardcoded fields)
EXTRACTION_PROMPT = """
Return **only a strict JSON object** with these rules:
1. Extract **every key-value pair** explicitly mentioned in the text (e.g., "Architecture", "License", "Training Data").
2. **Exact terms only**: Use phrases directly from the text (e.g., "BERT", "RoBERTa", "Apache-2.0").
3. **Architecture must be explicitly stated**:
   - Output "BERT" only if the text says "BERT".
   - Output "RoBERTa" only if the text says "RoBERTa".
4. Arrays: Use square brackets (e.g., ["en", "fr"]).
5. No markdown or extra text. Only JSON.
6. Double quotes only.

### Example for BERT:
{{
  "Architecture": "BERT",
  "License": "Apache-2.0",
  "Training Data": ["BooksCorpus", "Wikipedia"],
  "Languages": ["en"],
  "Use Cases": ["classification", "question-answering"]
}}

### Example for RoBERTa:
{{
  "Architecture": "RoBERTa",
  "License": "MIT",
  "Training Data": ["CCNet", "Books"],
  "Languages": ["en", "fr", "100+ others"],
  "Use Cases": ["translation", "summarization"]
}}

### Text to Analyze:
{{text}}
""".strip()

def create_extraction_prompt(text):
    max_chars = 16000  # Increased to capture more details
    sanitized_text = text[:max_chars].replace("{", "{{").replace("}", "}}")
    return EXTRACTION_PROMPT.format(text=sanitized_text)

def clean_identifier(text):
    """Turn an arbitrary value into a lowercase, URI-friendly identifier.

    Slashes and spaces become hyphens, every character other than ASCII
    letters, digits, ``-`` and ``_`` is dropped, and the result is lowercased
    and truncated to 64 characters.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The sanitized identifier (possibly empty).
    """
    # Separators must be mapped to hyphens *before* filtering: the character
    # filter removes "/", so replacing it afterwards never matched anything
    # and "org/name" silently collapsed to "orgname".
    hyphenated = str(text).replace('/', '-').replace(' ', '-')
    return re.sub(r'[^a-zA-Z0-9_-]', '', hyphenated).lower()[:64]

# Holds the loaded (tokenizer, model) pair so the LLaMA weights are loaded at
# most once per process instead of once per model card.
_LLAMA_CACHE = {}

def _parse_first_json_object(text):
    """Return the first JSON object embedded in *text* as a dict, or None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one complete JSON value starting at `start`
            # and ignores trailing text, so nested objects are handled.
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = text.find("{", start + 1)
    return None

def _repair_json_text(text):
    """Apply lenient clean-ups for almost-JSON model output."""
    repaired = text.replace("'", '"')
    repaired = re.sub(r",\s*}", "}", repaired)
    repaired = re.sub(r",\s*\]", "]", repaired)
    return re.sub(r'\n+', ' ', repaired)

def _load_llama():
    """Load (once) and return the CPU tokenizer/model pair for MODEL_NAME."""
    if "pair" not in _LLAMA_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=LTOKEN,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            device_map="cpu"
        )
        _LLAMA_CACHE["pair"] = (tokenizer, model)
    return _LLAMA_CACHE["pair"]

def extract_hf_entities(text):
    """Extract key/value entities from model-card text using an LLM.

    Gemini is tried first when ``GTOKEN`` is set; if it fails or returns no
    parseable JSON object, a local LLaMA model running on CPU is used.

    Args:
        text: Model-card text to analyse.

    Returns:
        A dict parsed from the model's JSON answer, or ``{}`` when neither
        backend produced a usable JSON object. Errors are printed, not raised.
    """
    prompt = create_extraction_prompt(text)

    # Try Gemini first
    if GTOKEN:
        try:
            model = genai.GenerativeModel('gemini-1.5-pro-latest')
            response = model.generate_content(prompt)
            parsed = _parse_first_json_object(response.text.strip())
            if parsed is not None:
                return parsed
            print("Gemini returned no valid JSON. Falling back to LLaMA.")
        except Exception as e:
            # Best-effort boundary: any Gemini failure falls through to LLaMA.
            print(f"Gemini failed: {str(e)}. Falling back to LLaMA.")

    # LLaMA fallback (CPU-only)
    try:
        tokenizer, model = _load_llama()

        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True)
        inputs = {k: v.to("cpu") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=500,
                do_sample=False,  # Greedy decoding
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

        # The generated sequence starts with the prompt tokens. Decode only
        # the continuation; otherwise the first "{...}" found is the BERT
        # example embedded in the prompt, not the model's answer.
        prompt_len = inputs["input_ids"].shape[1]
        raw = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        # Strict parse first so valid JSON (e.g. values containing
        # apostrophes) is never damaged by the repair heuristics.
        parsed = _parse_first_json_object(raw)
        if parsed is None:
            parsed = _parse_first_json_object(_repair_json_text(raw))
        if parsed is None:
            print(f"LLaMA returned no valid JSON. Raw output: {raw[:200]}... Skipping.")
            return {}
        return parsed
    except Exception as e:
        print(f"LLaMA failed: {str(e)}. Skipping.")
        return {}

def generate_hf_triples(model_data):
    """Convert extracted entities for one model into subject/predicate/object triples.

    Args:
        model_data: Dict with an ``"id"`` entry (the model identifier) and an
            optional ``"entities"`` dict mapping field names to a value or a
            list of values.

    Returns:
        A list of ``{"s", "p", "o"}`` dicts: one ``rdf:type`` triple for the
        model, then for every non-empty value a link triple from the model to
        an entity URI and a ``dul:hasParameterDataValue`` triple carrying the
        value as a string.

    Raises:
        KeyError: If ``model_data`` has no ``"id"`` entry.
    """
    model_uri = f"hf:{clean_identifier(model_data['id'])}"
    triples = [{"s": model_uri, "p": "rdf:type", "o": "ModelCard:Model"}]

    entities = model_data.get("entities") or {}
    for key, value in entities.items():
        key = str(key)
        if not key.strip():
            continue

        entity_key = key.lower().replace(' ', '_')
        predicate = f"modelcard:has{entity_key[0].upper() + entity_key[1:]}"

        # Treat a scalar as a one-element list so both shapes share one path.
        items = value if isinstance(value, list) else [value]
        for item in items:
            if item is None:
                continue
            # LLM output may contain numbers or nested objects; stringify
            # before .strip() so non-string items do not raise AttributeError.
            literal = str(item)
            if not literal.strip():
                continue
            entity_uri = f"hf:{entity_key}-{clean_identifier(literal)}"
            triples.append({"s": model_uri, "p": predicate, "o": entity_uri})
            triples.append({"s": entity_uri, "p": "dul:hasParameterDataValue", "o": literal})

    return triples

def process_huggingface_models(limit=20, output_path="model_triples.json"):
    """Extract triples for the most-downloaded Hub models and save them as JSON.

    For each listed model the model card is loaded, entities are extracted
    with ``extract_hf_entities`` and converted with ``generate_hf_triples``.
    A failure on one model is printed and does not stop the run.

    Args:
        limit: Maximum number of models to request from the Hub listing.
        output_path: File the collected triples are written to.

    Returns:
        The list of all generated triples (also written to ``output_path``).
    """
    from collections import Counter

    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []

    # Start at 1 so progress reads "1/N ... N/N" rather than "0/N ... N-1/N".
    for idx, model in enumerate(models, start=1):
        try:
            card = ModelCard.load(model.modelId, token=LTOKEN)
            card_text = card.text

            # Extract entities with Gemini/LLaMA
            extracted = extract_hf_entities(card_text)
            if not extracted:
                print(f"Skipping {model.modelId} - no data")
                continue

            # Generate triples dynamically
            model_data = {
                "id": model.modelId,
                "entities": extracted
            }
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)

            print(f"Processed {idx}/{len(models)}: {model.modelId}")
            print(f"  Triples: {len(triples)}")

        except Exception as e:
            # Per-model boundary: report and continue with the next model.
            print(f"Error processing {model.modelId}: {str(e)}")

    # Save results (explicit encoding so the output does not depend on the
    # platform's default locale)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(all_triples, f, indent=2)

    # Print statistics: predicates ordered by descending frequency
    print("\n=== STATISTICS ===")
    predicate_counts = Counter(t['p'] for t in all_triples)
    for p, count in predicate_counts.most_common():
        print(f"  {p}: {count}")

    return all_triples

if __name__ == "__main__":
    # Script entry point. CUDA_VISIBLE_DEVICES is set to "-1" before any
    # model is loaded; the intent is to force the LLaMA fallback onto CPU.
    os.environ.update(CUDA_VISIBLE_DEVICES="-1")
    process_huggingface_models(limit=20)

Error processing FacebookAI/xlm-roberta-large: name 'GTOKEN' is not defined
Error processing google-bert/bert-base-uncased: name 'GTOKEN' is not defined
Error processing sentence-transformers/all-MiniLM-L6-v2: name 'GTOKEN' is not defined
Error processing Falconsai/nsfw_image_detection: name 'GTOKEN' is not defined
Error processing dima806/fairface_age_image_detection: name 'GTOKEN' is not defined

=== STATISTICS ===
