In [None]:
import re
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import HfApi, ModelCard
import os

# Configuration
LTOKEN = "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"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
DEVICE = "cpu"

# Initialize HuggingFace API
api = HfApi(token=LTOKEN)

# Enhanced prompt to extract more entities
EXTRACTION_PROMPT = """
Extract ALL possible structured information from this model card. Include these entities:
1. Architecture (e.g., BERT, GPT, ResNet, VIT, Transformer)
2. Task (e.g., Text Classification, Object Detection, NER)
3. Language (e.g., English, Multilingual)
4. Dataset (e.g., COCO, ImageNet)
5. License (e.g., MIT, Apache)
6. Tags/Categories (as list)
7. Framework (e.g., PyTorch, TensorFlow)
8. Modalities (e.g., Text, Image, Audio)
9. Metrics (e.g., Accuracy: 92%, F1: 0.85)
10. Version

### Rules:
1. Return JSON with ALL entities (use "Unknown" for missing fields)
2. No markdown or extra text
3. Preserve all original terminology

Example:
Text: "BERT model for Spanish NER trained on CoNLL-2002"
Output: {"Architecture": "BERT", "Task": "Named Entity Recognition", "Language": "Spanish", "Dataset": "CoNLL-2002"}

Text to Analyze:
{text}
JSON Response:
""".strip()

def create_extraction_prompt(text):
    max_chars = 15000
    sanitized_text = text[:max_chars].replace("{", "{{").replace("}", "}}")
    return EXTRACTION_PROMPT.replace("{text}", sanitized_text)

def clean_identifier(text):
    return re.sub(r'[^a-zA-Z0-9-_ ]', '', str(text)).replace(' ', '-').lower()[:64]

def extract_hf_entities(text):
    prompt = create_extraction_prompt(text)
    extracted = {
        "Architecture": "Unknown",
        "Task": "Unknown",
        "Language": "Unknown",
        "Dataset": "Unknown",
        "License": "Unknown",
        "Tags": "Unknown",
        "Framework": "Unknown",
        "Modalities": "Unknown",
        "Metrics": "Unknown",
        "Version": "Unknown"
    }
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=LTOKEN,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            device_map=DEVICE
        )
        
        inputs = tokenizer(prompt, return_tensors="pt", max_length=4096, truncation=True)
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=800,
                temperature=0.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
        json_match = re.search(r'\{.*\}', raw, re.DOTALL)
        
        if json_match:
            json_str = json_match.group().strip()
            try:
                llm_extracted = json.loads(json_str)
                extracted.update({k: v for k, v in llm_extracted.items() if v not in ["", "Unknown"]})
            except json.JSONDecodeError:
                print("JSON decode error, using regex fallback")

    except Exception as e:
        print(f"LLM extraction failed: {str(e)}")

    # Regex fallback enhancements
    extracted["Architecture"] = detect_architecture(text) or extracted["Architecture"]
    extracted["Task"] = detect_task(text) or extracted["Task"]
    extracted["Language"] = detect_language(text) or extracted["Language"]
    extracted["License"] = detect_license(text) or extracted["License"]
    extracted["Dataset"] = detect_dataset(text) or extracted["Dataset"]
    extracted["Framework"] = detect_framework(text) or extracted["Framework"]
    extracted["Modalities"] = detect_modalities(text) or extracted["Modalities"]
    extracted["Tags"] = detect_tags(text) or extracted["Tags"]
    extracted["Metrics"] = detect_metrics(text) or extracted["Metrics"]
    extracted["Version"] = detect_version(text) or extracted["Version"]

    # Cleanup
    extracted = {k: v if v not in ["", "Unknown"] else "Unknown" for k, v in extracted.items()}
    print(f"Final extraction: {json.dumps(extracted, indent=2)}")
    return extracted

# Enhanced regex detectors
def detect_architecture(text):
    patterns = [
        r"(?i)\b(bert|roberta|gpt|vit|transformer|resnet|distilbert|albert|t5|llama|falcon|bloom|mistral|xlm|deberta|electra|mobilebert|convnext|efficientnet)\b",
        r"(?i)architecture:\s*([A-Za-z0-9-]+)",
        r"(?i)(based on|derived from)\s+([A-Za-z0-9-]+)",
    ]
    return _match_first(patterns, text)

def detect_task(text):
    patterns = [
        r"(?i)\b(text classification|named entity recognition|question answering|object detection|image segmentation|machine translation|summarization|sentiment analysis|speech recognition|tabular regression)\b",
        r"(?i)task:\s*([A-Za-z0-9 ]+)",
        r"(?i)for\s+([A-Za-z0-9 ]+)\s+task",
    ]
    return _match_first(patterns, text)

def detect_language(text):
    patterns = [
        r"(?i)\b(english|french|spanish|german|chinese|japanese|arabic|russian|portuguese|italian|dutch|multilingual)\b",
        r"(?i)language:\s*([A-Za-z]+)",
        r"(?i)trained on\s+([A-Za-z]+)\s+language",
    ]
    return _match_first(patterns, text)

def detect_license(text):
    patterns = [
        r"(?i)\b(MIT|Apache|GPL|BSD|CC-BY|CC-BY-SA|CC-BY-NC|AGPL|LGPL|MPL)\b",
        r"(?i)license:\s*([A-Za-z0-9-]+)",
        r"(?i)released under\s+([A-Za-z0-9-]+)",
    ]
    return _match_first(patterns, text)

def detect_dataset(text):
    patterns = [
        r"(?i)\b(Imagenet|COCO|Wikipedia|Common Crawl|GLUE|SQuAD|MNIST|CIFAR|WikiText|BookCorpus|OpenWebText|PubMed|SNLI|CoNLL)\b",
        r"(?i)dataset:\s*([A-Za-z0-9-]+)",
        r"(?i)trained on\s+([A-Za-z0-9-]+)\s+dataset",
    ]
    return _match_first(patterns, text)

def detect_framework(text):
    patterns = [
        r"(?i)\b(pytorch|tensorflow|jax|keras|sklearn|fastai|huggingface)\b",
        r"(?i)framework:\s*([A-Za-z0-9]+)",
    ]
    return _match_first(patterns, text)

def detect_modalities(text):
    patterns = [
        r"(?i)\b(text|image|audio|video|tabular|multimodal)\b",
        r"(?i)modality:\s*([A-Za-z0-9]+)",
    ]
    return _match_first(patterns, text)

def detect_tags(text):
    patterns = [
        r"(?i)tags:\s*([A-Za-z0-9, ]+)",
        r"(?i)categor(y|ies):\s*([A-Za-z0-9, ]+)",
    ]
    match = _match_first(patterns, text)
    return [t.strip() for t in match.split(',')] if match else "Unknown"

def detect_metrics(text):
    patterns = [
        r"(?i)(accuracy|f1|precision|recall|bleu|rouge|loss|perplexity):\s*([\d.]+%?)",
        r"(?i)metrics:\s*([A-Za-z0-9:., ]+)",
    ]
    metrics = {}
    for pattern in patterns:
        for m in re.finditer(pattern, text, re.IGNORECASE):
            if len(m.groups()) == 2:
                metrics[m.group(1).lower()] = m.group(2)
    return metrics if metrics else "Unknown"

def detect_version(text):
    patterns = [
        r"(?i)version:\s*(\d+\.\d+)",
        r"(?i)v(\d+\.\d+)",
    ]
    return _match_first(patterns, text)

def _match_first(patterns, text):
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.groups()[-1].strip().title()
    return None

def generate_hf_triples(model_data):
    triples = []
    model_id = model_data["id"]
    model_uri = f"hf:{clean_identifier(model_id)}"
    triples.append({"s": model_uri, "p": "rdf:type", "o": "ModelCard:Model"})
    
    for key, value in model_data["entities"].items():
        if value == "Unknown":
            continue
            
        predicate = f"modelcard:has{key.title().replace(' ', '')}"
        clean_key = key.lower().replace(' ', '_')
        
        if isinstance(value, list):
            for item in value:
                entity_uri = f"hf:{clean_key}-{clean_identifier(str(item))}"
                triples.append({"s": model_uri, "p": predicate, "o": entity_uri})
                triples.append({"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(item)})
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                entity_uri = f"hf:{clean_key}-{subkey}-{clean_identifier(str(subvalue))}"
                triples.append({"s": model_uri, "p": predicate, "o": entity_uri})
                triples.append({"s": entity_uri, "p": "dul:hasParameterDataValue", "o": f"{subkey}: {subvalue}"})
        else:
            entity_uri = f"hf:{clean_key}-{clean_identifier(str(value))}"
            triples.append({"s": model_uri, "p": predicate, "o": entity_uri})
            triples.append({"s": entity_uri, "p": "dul:hasParameterDataValue", "o": str(value)})
            
    return triples

def process_huggingface_models(limit=20):
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []
    
    for idx, model in enumerate(models):
        try:
            print(f"\nProcessing {idx+1}/{len(models)}: {model.modelId}")
            card = ModelCard.load(model.modelId, token=LTOKEN)
            card_text = card.text or "No text available"
            
            extracted = extract_hf_entities(card_text)
            if all(v == "Unknown" for v in extracted.values()):
                print(f"Skipping {model.modelId} - no data")
                continue
                
            model_data = {
                "id": model.modelId,
                "entities": extracted
            }
            
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)
            print(f"  Triples generated: {len(triples)}")
            
        except Exception as e:
            print(f"Error processing {model.modelId}: {str(e)}")
    
    with open("model_triples.json", "w") as f:
        json.dump(all_triples, f, indent=2)
        
    print("\n=== STATISTICS ===")
    print(f"Total models processed: {len(models)}")
    print(f"Total triples generated: {len(all_triples)}")
    print(f"Average triples per model: {len(all_triples)/max(1, len(models)):.2f}")

if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    process_huggingface_models(limit=5)

In [None]:
New code

In [2]:
import json
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import HfApi, ModelCard
import os
import traceback

# Configuration
LTOKEN = "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"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
DEVICE = "cpu"

# Initialize HuggingFace API
api = HfApi(token=LTOKEN)

# Global tokenizer and model variables
tokenizer = None
model = None

# Simplified extraction prompt focused on JSON format
EXTRACTION_PROMPT = """
Extract structured information from this model card text.

Rules:
1. Return ONLY a valid JSON object with properties of the model
2. Use double quotes for keys and string values
3. Format the JSON properly with correct syntax

Extract these properties if available:
- architecture (model architecture like Transformer, BERT, etc.)
- task (model's purpose like classification, translation, etc.)
- license (MIT, Apache, etc.)
- language (English, Chinese, etc.)
- framework (PyTorch, TensorFlow, etc.)
- dataset (training data like COCO, ImageNet, etc.)
- modality (text, image, audio, etc.)
- size (parameter count)
- metrics (performance)

Text to Analyze:
{text}

Return VALID JSON only:
""".strip()

def initialize_model():
    """Initialize the LLM model once"""
    global tokenizer, model
    
    if tokenizer is None or model is None:
        print("Loading LLaMA model...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=LTOKEN)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            token=LTOKEN,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            device_map=DEVICE
        )
        print("Model loaded successfully")

def create_extraction_prompt(text):
    """Create a prompt for extraction"""
    # Truncate to avoid exceeding context length
    max_chars = 12000
    if len(text) > max_chars:
        text = text[:max_chars]
    
    # Format for the prompt
    sanitized_text = text.replace("{", "{{").replace("}", "}}")
    return EXTRACTION_PROMPT.replace("{text}", sanitized_text)

def clean_identifier(text):
    """Clean identifier for URI creation"""
    if not text:
        return "unknown"
    return re.sub(r'[^a-zA-Z0-9-_ ]', '', str(text)).replace(' ', '-').lower()[:64]

def extract_json_from_text(text):
    """Extract JSON from text in multiple ways"""
    # Try to find JSON pattern
    json_match = re.search(r'\{.*\}', text, re.DOTALL)
    if not json_match:
        return None
    
    json_str = json_match.group()
    
    # Try direct parsing
    try:
        return json.loads(json_str)
    except:
        pass
    
    # Try with single quotes replaced
    try:
        fixed = json_str.replace("'", '"')
        return json.loads(fixed)
    except:
        pass
    
    # Try with more aggressive cleaning
    try:
        # Replace single-quoted keys and values with double quotes
        pattern = r"'([^']*)'(\s*):(\s*)'([^']*)'"
        replacement = r'"\1"\2:\3"\4"'
        fixed = re.sub(pattern, replacement, json_str)
        fixed = fixed.replace("'", '"')
        return json.loads(fixed)
    except:
        return None

def extract_hf_entities(text):
    """Extract all entities using LLM approach with better JSON handling"""
    initialize_model()
    
    prompt = create_extraction_prompt(text)
    
    try:
        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt", max_length=3072, truncation=True)
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1000,
                temperature=0.1,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )
        
        # Decode the output
        raw = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Raw output sample: {raw[:100]}...")
        
        # Extract and parse JSON
        extracted = extract_json_from_text(raw)
        
        if extracted:
            print(f"Successfully extracted {len(extracted)} fields")
            return extracted
        
        # If all JSON extraction attempts failed, create default extraction
        print("JSON extraction failed, creating default extraction")
        return {"extraction_error": True, "raw_sample": raw[:200]}

    except Exception as e:
        traceback.print_exc()
        print(f"LLM extraction failed: {str(e)}")
        return {"extraction_error": True, "error": str(e)}

def generate_hf_triples(model_data):
    """Generate RDF-like triples with proper class hierarchies"""
    triples = []
    model_id = model_data["id"]
    model_uri = f"hf:{clean_identifier(model_id)}"
    
    # Basic model type triple
    triples.append({
        "s": model_uri, 
        "p": "rdf:type", 
        "o": "ModelCard:Model"
    })
    
    # Handle extraction errors
    if model_data["entities"].get("extraction_error", False):
        triples.append({
            "s": model_uri, 
            "p": "modelcard:extractionStatus", 
            "o": "Failed"
        })
        return triples
    
    # Define property mapping to ontology classes
    property_classes = {
        "architecture": "ModelCard:Architecture",
        "task": "ModelCard:Task",
        "license": "ModelCard:License",
        "language": "ModelCard:Language",
        "framework": "ModelCard:Framework",
        "dataset": "ModelCard:Dataset",
        "modality": "ModelCard:Modality",
        "size": "ModelCard:Size",
        "metrics": "ModelCard:Metrics",
        "tags": "ModelCard:Tag",
        "version": "ModelCard:Version",
    }
    
    # Process extracted fields
    for key, value in model_data["entities"].items():
        if value is None or value == "":
            continue
            
        # Normalize key name
        clean_key = key.lower()
        
        # Get property class for this key
        property_class = property_classes.get(clean_key, "ModelCard:Property")
        
        # Generate predicate name
        predicate = f"modelcard:has{key.title().replace(' ', '').replace('-', '')}"
        
        # Handle different value types
        if isinstance(value, list):
            for item in value:
                if item:  # Skip empty items
                    # Create clean identifier for this value
                    clean_value = clean_identifier(str(item))
                    
                    # Entity URI for this value
                    entity_uri = f"hf:{clean_key}-{clean_value}"
                    
                    # Triple connecting model to property value
                    triples.append({
                        "s": model_uri, 
                        "p": predicate, 
                        "o": entity_uri
                    })
                    
                    # Triple classifying the property value
                    triples.append({
                        "s": entity_uri, 
                        "p": "rdfs:subClassOf", 
                        "o": property_class
                    })
                    
                    # Triple with actual value
                    triples.append({
                        "s": entity_uri, 
                        "p": "dul:hasParameterDataValue", 
                        "o": str(item)
                    })
                    
        elif isinstance(value, dict):
            for subkey, subvalue in value.items():
                if subvalue:  # Skip empty values
                    # Create clean identifier for this subkey
                    clean_subkey = clean_identifier(subkey)
                    
                    # Entity URI for this subkey-value pair
                    entity_uri = f"hf:{clean_key}-{clean_subkey}"
                    
                    # Triple connecting model to property value
                    triples.append({
                        "s": model_uri, 
                        "p": predicate, 
                        "o": entity_uri
                    })
                    
                    # Triple classifying the property value
                    triples.append({
                        "s": entity_uri, 
                        "p": "rdfs:subClassOf", 
                        "o": property_class
                    })
                    
                    # Triple with actual value
                    triples.append({
                        "s": entity_uri, 
                        "p": "dul:hasParameterDataValue", 
                        "o": f"{subkey}: {subvalue}"
                    })
        else:
            # Create clean identifier for this value
            clean_value = clean_identifier(str(value))
            
            # Entity URI for this value
            entity_uri = f"hf:{clean_key}-{clean_value}"
            
            # Triple connecting model to property value
            triples.append({
                "s": model_uri, 
                "p": predicate, 
                "o": entity_uri
            })
            
            # Triple classifying the property value
            triples.append({
                "s": entity_uri, 
                "p": "rdfs:subClassOf", 
                "o": property_class
            })
            
            # Triple with actual value
            triples.append({
                "s": entity_uri, 
                "p": "dul:hasParameterDataValue", 
                "o": str(value)
            })
            
    return triples

def process_huggingface_models(limit=20):
    """Process HuggingFace models and extract information"""
    models = list(api.list_models(sort="downloads", direction=-1, limit=limit))
    all_triples = []
    extracted_data = []
    
    # Initialize model once before processing
    initialize_model()
    
    for idx, model in enumerate(models):
        try:
            print(f"\nProcessing {idx+1}/{len(models)}: {model.modelId}")
            card = ModelCard.load(model.modelId, token=LTOKEN)
            card_text = card.text or "No text available"
            
            extracted = extract_hf_entities(card_text)
            
            model_data = {
                "id": model.modelId,
                "entities": extracted
            }
            
            # Save extracted data after each model (incremental saving)
            extracted_data.append(model_data)
            with open("extracted_model_data_incremental.json", "w") as f:
                json.dump(extracted_data, f, indent=2)
            
            # Generate and save triples after each model
            triples = generate_hf_triples(model_data)
            all_triples.extend(triples)
            with open("model_triples_incremental.json", "w") as f:
                json.dump(all_triples, f, indent=2)
                
            print(f"  Triples generated: {len(triples)}")
            
        except Exception as e:
            traceback.print_exc()
            print(f"Error processing {model.modelId}: {str(e)}")
    
    # Save final files
    try:
        with open("model_triples.json", "w") as f:
            json.dump(all_triples, f, indent=2)
        print(f"Saved {len(all_triples)} triples to model_triples.json")
        
        with open("extracted_model_data.json", "w") as f:
            json.dump(extracted_data, f, indent=2)
        print(f"Saved {len(extracted_data)} model data entries to extracted_model_data.json")
    except Exception as e:
        traceback.print_exc()
        print(f"Error saving final files: {str(e)}")
        
    print("\n=== STATISTICS ===")
    print(f"Total models processed: {len(models)}")
    print(f"Total triples generated: {len(all_triples)}")
    print(f"Average triples per model: {len(all_triples)/max(1, len(models)):.2f}")

if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU
    process_huggingface_models(limit=5)

Loading LLaMA model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully

Processing 1/5: FacebookAI/xlm-roberta-large
Raw output sample: Extract structured information from this model card text.

Rules:
1. Return ONLY a valid JSON object...
JSON extraction failed, creating default extraction
  Triples generated: 2

Processing 2/5: Falconsai/nsfw_image_detection
Raw output sample: Extract structured information from this model card text.

Rules:
1. Return ONLY a valid JSON object...
Successfully extracted 9 fields
  Triples generated: 31

Processing 3/5: sentence-transformers/all-MiniLM-L6-v2
Raw output sample: Extract structured information from this model card text.

Rules:
1. Return ONLY a valid JSON object...
JSON extraction failed, creating default extraction
  Triples generated: 2

Processing 4/5: google-bert/bert-base-uncased
Raw output sample: Extract structured information from this model card text.

Rules:
1. Return ONLY a valid JSON object...
JSON extraction failed, creating default extraction
  Triples generated: 2

Pro