# Export Fine-Tuned BERT Model for Hugging Face Hub

This notebook loads the fine-tuned DistilBERT weights from `controlled_bert_model.pth` and saves them in the standard Hugging Face format, together with the tokenizer and metadata files needed to upload the model to the Hub.

In [13]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertConfig
import os
import json
from pathlib import Path

## Step 1: Recreate the Label Mapping

First, we need to recreate the exact label mapping that was used during training.

In [14]:
# Load the regex-annotated Nova logs; the training label mapping is rebuilt from this table
df = pd.read_csv('../results/nova_logs_with_regex.csv')

# Fail early with a readable message if the CSV lacks the columns the label
# reconstruction reads, instead of a bare KeyError deep inside the helper.
missing_columns = {'raw_log_text', 'regex_label', 'cluster_id'} - set(df.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required columns: {sorted(missing_columns)}")

print(f"Loaded dataset with {len(df)} logs")

def create_comprehensive_bert_dataset(df):
    """Rebuild the (raw_log_text, bert_training_label) table from the annotated logs.

    Rows come from two disjoint sources, concatenated in this order:

    1. rows that already carry a ``regex_label``; the regex category is
       translated to its coarser training label;
    2. rows without a ``regex_label`` whose ``cluster_id`` is one of the
       target clusters (3, 5, 6, 9, 13); the cluster id is translated to a
       semantic training label.

    Rows whose text or translated label is missing (for example a regex
    category that has no entry in the lookup table) are dropped.

    Args:
        df: DataFrame with ``raw_log_text``, ``regex_label`` and
            ``cluster_id`` columns.

    Returns:
        DataFrame with the columns ``raw_log_text`` and
        ``bert_training_label``. The index holds the positions assigned by the
        concatenation, so it has gaps wherever rows were dropped.
    """
    kept_columns = ['raw_log_text', 'bert_training_label']
    target_cluster_ids = [3, 5, 6, 9, 13]

    # A single lookup table serves both sources: string keys are regex
    # categories, integer keys are cluster ids.
    label_lookup = {
        'System_Operations_LibVirt': 'System_Operations',
        'Instance_Management_Compute': 'Instance_Management',
        'Instance_Management_System': 'Instance_Management',
        3: 'Network_Operations',      # os_vif operations
        5: 'Resource_Management',     # compute claims
        6: 'Scheduler_Operations',    # scheduler reports
        9: 'Network_Operations',      # VIF operations (merge with cluster 3)
        13: 'Error_Handling',         # error patterns
    }

    has_regex_label = df['regex_label'].notnull()
    in_target_cluster = df['cluster_id'].isin(target_cluster_ids)

    # Source 1: logs the regex stage already classified
    from_regex = df[has_regex_label].assign(
        bert_training_label=lambda frame: frame['regex_label'].map(label_lookup)
    )

    # Source 2: unclassified logs, restricted to the target clusters
    from_clusters = df[~has_regex_label & in_target_cluster].assign(
        bert_training_label=lambda frame: frame['cluster_id'].map(label_lookup)
    )

    stacked = pd.concat(
        [from_regex[kept_columns], from_clusters[kept_columns]],
        ignore_index=True,
    )

    # Discard rows with a missing text or an untranslated label
    return stacked.dropna()

# Build the (text, label) table that the label encoder is fitted on below
comprehensive_data = create_comprehensive_bert_dataset(df)

print(f"\nComprehensive dataset created with {len(comprehensive_data)} samples")
print("Label distribution:")
print(comprehensive_data['bert_training_label'].value_counts())

Loaded dataset with 54646 logs

Comprehensive dataset created with 45174 samples
Label distribution:
bert_training_label
Instance_Management     25378
System_Operations        9863
Network_Operations       4067
Resource_Management      2467
Scheduler_Operations     2462
Error_Handling            937
Name: count, dtype: int64


In [15]:
# Refit a LabelEncoder on the rebuilt labels. The resulting ids only match the
# ones used in training if the label set is the same - compare the printout
# below against the training notebook.
label_encoder = LabelEncoder().fit(comprehensive_data['bert_training_label'])

# Both directions of the mapping, kept for the model config and metadata files
id2label = dict(enumerate(label_encoder.classes_))
label2id = {label: idx for idx, label in id2label.items()}

print(f"Number of labels: {len(label_encoder.classes_)}")
print(f"Label classes: {label_encoder.classes_}")
print(f"Label mapping: {dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))}")

print(f"\nid2label mapping: {id2label}")
print(f"label2id mapping: {label2id}")

Number of labels: 6
Label classes: ['Error_Handling' 'Instance_Management' 'Network_Operations'
 'Resource_Management' 'Scheduler_Operations' 'System_Operations']
Label mapping: {'Error_Handling': 0, 'Instance_Management': 1, 'Network_Operations': 2, 'Resource_Management': 3, 'Scheduler_Operations': 4, 'System_Operations': 5}

id2label mapping: {0: 'Error_Handling', 1: 'Instance_Management', 2: 'Network_Operations', 3: 'Resource_Management', 4: 'Scheduler_Operations', 5: 'System_Operations'}
label2id mapping: {'Error_Handling': 0, 'Instance_Management': 1, 'Network_Operations': 2, 'Resource_Management': 3, 'Scheduler_Operations': 4, 'System_Operations': 5}


## Step 2: Load and Initialize the Base Model Architecture

In [16]:
# Base checkpoint and head size (presumably the same as in training - verify
# against the training notebook)
model_name = 'distilbert-base-uncased'
num_labels = len(label_encoder.classes_)

print(f"Loading base model: {model_name}")
print(f"Number of labels: {num_labels}")

# Tokenizer comes straight from the base checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
print("Tokenizer loaded successfully")

# Instantiate the classification architecture; the label maps are stored in
# the model config so the exported config.json carries readable class names
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

print(f"Base model loaded with {num_labels} labels")
print(f"Model configuration: {model.config}")

Loading base model: distilbert-base-uncased
Number of labels: 6
Tokenizer loaded successfully
Tokenizer loaded successfully


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Base model loaded with 6 labels
Model configuration: DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Error_Handling",
    "1": "Instance_Management",
    "2": "Network_Operations",
    "3": "Resource_Management",
    "4": "Scheduler_Operations",
    "5": "System_Operations"
  },
  "initializer_range": 0.02,
  "label2id": {
    "Error_Handling": 0,
    "Instance_Management": 1,
    "Network_Operations": 2,
    "Resource_Management": 3,
    "Scheduler_Operations": 4,
    "System_Operations": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.52.4",
  "vocab_size": 30522
}



## Step 3: Load the Fine-Tuned Weights

In [17]:
# Location of the fine-tuned state dict
model_path = '../models/controlled_bert_model.pth'

# Fail fast: if the checkpoint is missing, continuing would silently export the
# base model with a freshly initialised classification head.
if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"Model file not found at {model_path}. "
        "Please make sure you have trained and saved the model first."
    )

print(f"Loading fine-tuned weights from: {model_path}")

# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state dict needs, and avoids executing code embedded in the file.
state_dict = torch.load(model_path, map_location='cpu', weights_only=True)

# Strict loading (the default) raises if any key is missing or unexpected,
# so an architecture mismatch cannot go unnoticed.
model.load_state_dict(state_dict)

print("Fine-tuned weights loaded successfully!")
print("Model is now ready for inference")

Loading fine-tuned weights from: ../models/controlled_bert_model.pth
Fine-tuned weights loaded successfully!
Model is now ready for inference
Fine-tuned weights loaded successfully!
Model is now ready for inference


## Step 4: Test the Model to Verify It Works

In [18]:
# Smoke test: push one log line from the dataset through the loaded model
model.eval()

# First row of the rebuilt dataset serves as the probe
sample_log = comprehensive_data['raw_log_text'].iloc[0]
print("Testing with sample log:")
print(f"'{sample_log[:100]}...'")

# Tokenize and predict
inputs = tokenizer(
    sample_log,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.softmax(outputs.logits, dim=-1)
    # The batch holds a single row, so the flat argmax is the class index
    predicted_class_id = predictions.argmax().item()
    predicted_label = id2label[predicted_class_id]
    confidence = predictions[0, predicted_class_id].item()

print("\nPrediction successful!")
print(f"Predicted label: {predicted_label}")
print(f"Confidence: {confidence:.4f}")
print("Model is working correctly!")

Testing with sample log:
'INFO nova.compute.manager [req-b9d6411c-b3ea-4307-a707-ec546b0192b3] [instance: 8192614e-4a86-47cc-a...'

Prediction successful!
Predicted label: Instance_Management
Confidence: 0.7135
Model is working correctly!


## Step 5: Save Model in Hugging Face Format

This is the most important step - saving all the files needed for Hugging Face Hub.

In [19]:
# Export directory inside the models folder
output_dir = Path('../models/infrnce_bert_model_complete')
output_dir.mkdir(exist_ok=True)

print(f"Saving model and tokenizer to: {output_dir}")

# Writes config.json and the weights. safe_serialization=True pins the
# safetensors format so the status message below is accurate whatever the
# installed transformers version uses as its default.
model.save_pretrained(output_dir, safe_serialization=True)
print("✓ Model saved (model.safetensors, config.json created)")

# Writes the tokenizer files alongside the model
tokenizer.save_pretrained(output_dir)
print("✓ Tokenizer saved (tokenizer.json, vocab.txt, etc. created)")

print("\nModel export completed successfully!")
print(f"All files are now in: {output_dir.absolute()}")

Saving model and tokenizer to: ../models/infrnce_bert_model_complete
✓ Model saved (pytorch_model.bin, config.json created)
✓ Tokenizer saved (tokenizer.json, vocab.txt, etc. created)

Model export completed successfully!
All files are now in: /Users/kxshrx/dev/infrnce/log_classification_system/notebooks/../models/infrnce_bert_model_complete
✓ Model saved (pytorch_model.bin, config.json created)
✓ Tokenizer saved (tokenizer.json, vocab.txt, etc. created)

Model export completed successfully!
All files are now in: /Users/kxshrx/dev/infrnce/log_classification_system/notebooks/../models/infrnce_bert_model_complete


## Step 6: Verify All Required Files Are Created

In [20]:
# Inventory of the export directory, one line per entry with its size
print("Files created in the export directory:")
print("=" * 50)

for file_path in sorted(output_dir.iterdir()):
    file_size = file_path.stat().st_size / (1024 * 1024)  # bytes -> MB
    print(f"✓ {file_path.name:<25} ({file_size:.2f} MB)")

print("\nRequired files for Hugging Face Hub:")
required_files = [
    'config.json',
    'tokenizer.json',
    'tokenizer_config.json',
    'vocab.txt',
    'special_tokens_map.json',
]

# Weights may be stored in either format; the first one found is reported
model_files = ['pytorch_model.bin', 'model.safetensors']
found_model_file = next(
    (candidate for candidate in model_files if (output_dir / candidate).exists()),
    None,
)
model_present = found_model_file is not None

if model_present:
    print(f"✓ {found_model_file} - Present")
else:
    print(f"✗ Model file - Missing (expected one of: {', '.join(model_files)})")

# Every remaining file is mandatory; report each one and track the overall result
all_present = model_present
for required_file in required_files:
    if (output_dir / required_file).exists():
        print(f"✓ {required_file} - Present")
    else:
        print(f"✗ {required_file} - Missing")
        all_present = False

if not all_present:
    print("\n⚠️  WARNING: Some required files are missing.")
else:
    print("\n🎉 SUCCESS: All required files are present!")
    print("Your model is ready to be uploaded to Hugging Face Hub.")
    print("\nNote: model.safetensors is the newer, safer format preferred by Hugging Face.")

Files created in the export directory:
✓ README.md                 (0.00 MB)
✓ config.json               (0.00 MB)
✓ model.safetensors         (255.44 MB)
✓ model_card.json           (0.00 MB)
✓ special_tokens_map.json   (0.00 MB)
✓ tokenizer.json            (0.68 MB)
✓ tokenizer_config.json     (0.00 MB)
✓ vocab.txt                 (0.22 MB)

Required files for Hugging Face Hub:
✓ model.safetensors - Present
✓ config.json - Present
✓ tokenizer.json - Present
✓ tokenizer_config.json - Present
✓ vocab.txt - Present
✓ special_tokens_map.json - Present

🎉 SUCCESS: All required files are present!
Your model is ready to be uploaded to Hugging Face Hub.

Note: model.safetensors is the newer, safer format preferred by Hugging Face.


## Step 7: Create Additional Metadata Files

In [21]:
# Render the labels as one Markdown bullet per line. Joining with ", " put all
# labels on a single line, which does not render as a list. The string is built
# outside the f-string because backslashes are not allowed inside f-string
# expressions before Python 3.12.
label_bullets = "\n".join(f"- {label}" for label in label_encoder.classes_)

# Create a README.md file for the model (YAML front matter + model card text)
readme_content = f"""---
license: apache-2.0
base_model: distilbert-base-uncased
tags:
- text-classification
- log-analysis
- openstack
- distilbert
- fine-tuned
datasets:
- custom
language:
- en
pipeline_tag: text-classification
---

# INFRNCE BERT Log Classification Model

This is a fine-tuned DistilBERT model for classifying OpenStack Nova log entries into different operational categories.

## Model Details

- **Base Model**: distilbert-base-uncased
- **Task**: Multi-class text classification
- **Number of Labels**: {len(label_encoder.classes_)}
- **Domain**: OpenStack log analysis

## Labels

The model classifies logs into the following categories:

{label_bullets}

## Usage

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("your-username/infrnce-bert-log-classifier")
model = AutoModelForSequenceClassification.from_pretrained("your-username/infrnce-bert-log-classifier")

# Example usage
log_text = "Your OpenStack log entry here"
inputs = tokenizer(log_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class_id = predictions.argmax().item()
    
print(f"Predicted class: {{model.config.id2label[predicted_class_id]}}")
```

## Training Data

The model was trained on a curated dataset of OpenStack Nova logs with both regex-based classifications and semantic clustering.

## Performance

The model was trained with controlled accuracy to achieve optimal performance on log classification tasks.
"""

# Save README.md (explicit UTF-8 so the result does not depend on the platform default)
readme_path = output_dir / 'README.md'
with open(readme_path, 'w', encoding='utf-8') as f:
    f.write(readme_content)

print("✓ README.md created")

# Create a model card metadata file; the integer keys of id2label are written
# as strings because JSON object keys are always strings
model_card = {
    "model_type": "distilbert",
    "task": "text-classification",
    "tags": ["log-analysis", "openstack", "text-classification"],
    "base_model": "distilbert-base-uncased",
    "num_labels": len(label_encoder.classes_),
    "labels": label_encoder.classes_.tolist(),
    "id2label": id2label,
    "label2id": label2id
}

# Save model card
model_card_path = output_dir / 'model_card.json'
with open(model_card_path, 'w', encoding='utf-8') as f:
    json.dump(model_card, f, indent=2)

print("✓ model_card.json created")
print("\nAll files ready for Hugging Face Hub upload!")

✓ README.md created
✓ model_card.json created

All files ready for Hugging Face Hub upload!


## Summary

Your model has been successfully exported! The `infrnce_bert_model_complete` directory now contains all the files needed to upload to Hugging Face Hub:

1. **model.safetensors** - Your fine-tuned model weights (newer, safer format)
2. **config.json** - Model architecture configuration
3. **tokenizer.json** - Serialized fast tokenizer (vocabulary and tokenization rules)
4. **vocab.txt** - Vocabulary file
5. **tokenizer_config.json** - Tokenizer settings
6. **special_tokens_map.json** - Special tokens mapping
7. **README.md** - Model documentation
8. **model_card.json** - Additional metadata

You can now create a private repository on Hugging Face Hub and upload the entire contents of the `infrnce_bert_model_complete` directory.

**Note**: The model was saved in the `model.safetensors` format, which is the newer, safer format preferred by Hugging Face over `pytorch_model.bin`.