# NexusML Experiment Template

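This notebook is a template for running experiments with the NexusML package. It shows how to use the new pipeline architecture (component registry, dependency-injection container, pipeline factory, and orchestrator) to train, evaluate, and apply an equipment classification model.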

## Setup

First, let's import the necessary modules and set up the environment.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Put the directory one level above the current working directory on the import path,
# appending it only when it is not already listed
project_root = os.path.abspath(
    os.path.join(os.getcwd(), '..')
)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import NexusML modules
from nexusml.core.di.container import DIContainer
from nexusml.core.pipeline.context import PipelineContext
from nexusml.core.pipeline.factory import PipelineFactory
from nexusml.core.pipeline.orchestrator import PipelineOrchestrator
from nexusml.core.pipeline.registry import ComponentRegistry
from nexusml.core.config.provider import ConfigurationProvider

# Set up plotting
# `%matplotlib inline` is an IPython line magic, not plain Python, so this cell only runs
# inside a notebook/IPython kernel.
%matplotlib inline
# Select the matplotlib stylesheet by name.
# NOTE(review): the 'seaborn-v0_8-' prefixed style names require matplotlib >= 3.6 — confirm
# the environment provides it.
plt.style.use('seaborn-v0_8-whitegrid')
# Apply seaborn's 'notebook' plotting context.
sns.set_context('notebook')

## Configuration

Let's set up the configuration for our experiment.

In [2]:
# Point the NEXUSML_CONFIG environment variable at the configuration file before the
# provider is created.
# Candidate locations are tried in priority order. The first entry is the original
# hard-coded location; the recorded run of this cell shows that path did not exist, so a
# second location is also tried.
# NOTE(review): the second candidate assumes `project_root` may itself be the package
# directory — confirm against the repository layout.
config_candidates = [
    os.path.abspath(os.path.join(project_root, 'nexusml', 'config', 'nexusml_config.yml')),
    os.path.abspath(os.path.join(project_root, 'config', 'nexusml_config.yml')),
]

# Use the first candidate that exists; otherwise keep the original path so the behaviour
# (including the fallback to a default configuration below) is unchanged.
config_file_path = next(
    (path for path in config_candidates if os.path.isfile(path)),
    config_candidates[0],
)
if not os.path.isfile(config_file_path):
    print(f"Warning: no configuration file found; tried: {config_candidates}")

os.environ['NEXUSML_CONFIG'] = config_file_path
print(f"Setting NEXUSML_CONFIG to: {config_file_path}")

# Get the configuration provider
config_provider = ConfigurationProvider()

# Get the configuration with error handling: any failure while loading or inspecting the
# configuration falls back to a default NexusMLConfig so the notebook can keep running.
try:
    config = config_provider.config
    print("Configuration loaded successfully")
    print(f"Feature Engineering Configuration: {len(config.feature_engineering.text_combinations)} text combinations, {len(config.feature_engineering.numeric_columns)} numeric columns")
    print(f"Classification Configuration: {len(config.classification.classification_targets)} classification targets")
    print(f"Data Configuration: {len(config.data.required_columns)} required columns")
except Exception as e:
    print(f"Error loading configuration: {e}")
    print("Creating default configuration...")
    from nexusml.core.config.configuration import NexusMLConfig
    config = NexusMLConfig()
    config_provider.set_config(config)
    print("Default configuration created successfully")

Setting NEXUSML_CONFIG to: c:\Repos\fca-dashboard4\nexusml\nexusml\config\nexusml_config.yml
Error loading configuration: Configuration file not found: c:\Repos\fca-dashboard4\nexusml\nexusml\config\nexusml_config.yml
Creating default configuration...
Default configuration created successfully


## Data Loading and Exploration

Now, let's load the data and explore it.

In [3]:
# Import the data loading utility from notebook_utils
from nexusml.utils.notebook_utils import discover_and_load_data

# Discover and load data
# The helper is called with no arguments and its result is unpacked into two names:
# `data` (explored in the next cell) and `data_path` (passed to the training and
# evaluation calls later in the notebook).
# NOTE(review): where the helper searches is not visible here. The recorded run of this
# cell ended in FileNotFoundError for '../examples/sample_data.xlsx' — confirm the sample
# file is present before running.
data, data_path = discover_and_load_data()

# Display the first few rows
# `data` is presumably a pandas DataFrame (it exposes .shape and .head()) — TODO confirm.
print(f"Data shape: {data.shape}")
# Left as the bare last expression so the notebook renders it as the cell output.
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '../examples/sample_data.xlsx'

In [None]:
# Structural overview of the loaded data: per-column dtypes, then per-column null counts.
# Each heading and its value are emitted by one print call, separated by a newline.
print("Data types:", data.dtypes, sep="\n")

print("\nMissing values:", data.isnull().sum(), sep="\n")

# Heading for the summary table; the table itself is the cell's displayed output
print("\nSummary statistics:")
data.describe()

## Pipeline Setup

Let's set up the pipeline components for our experiment.

In [None]:
# Import the component interfaces used as registry keys below. Without this import every
# `registry.register(...)` call fails with NameError on a fresh kernel, because none of
# these names is defined anywhere else in the notebook.
# NOTE(review): the module path is assumed to be `nexusml.core.pipeline.interfaces` —
# confirm against the package.
from nexusml.core.pipeline.interfaces import (
    DataLoader,
    DataPreprocessor,
    FeatureEngineer,
    ModelBuilder,
    ModelEvaluator,
    ModelSerializer,
    ModelTrainer,
    Predictor,
)

# Import component implementations
from nexusml.core.pipeline.components.data_loader import CSVDataLoader, ExcelDataLoader
from nexusml.core.pipeline.components.data_preprocessor import StandardPreprocessor
from nexusml.core.pipeline.components.feature_engineer import TextFeatureEngineer
from nexusml.core.pipeline.components.model_builder import RandomForestModelBuilder
from nexusml.core.pipeline.components.model_trainer import StandardModelTrainer
from nexusml.core.pipeline.components.model_evaluator import StandardModelEvaluator
from nexusml.core.pipeline.components.model_serializer import PickleModelSerializer
from nexusml.core.pipeline.components.predictor import StandardPredictor

# Create a registry and container
registry = ComponentRegistry()
container = DIContainer()

# Register components: each call maps (interface, name) to an implementation class
registry.register(DataLoader, "csv", CSVDataLoader)
registry.register(DataLoader, "excel", ExcelDataLoader)
registry.register(DataPreprocessor, "standard", StandardPreprocessor)
registry.register(FeatureEngineer, "text", TextFeatureEngineer)
registry.register(ModelBuilder, "random_forest", RandomForestModelBuilder)
registry.register(ModelTrainer, "standard", StandardModelTrainer)
registry.register(ModelEvaluator, "standard", StandardModelEvaluator)
registry.register(ModelSerializer, "pickle", PickleModelSerializer)
registry.register(Predictor, "standard", StandardPredictor)

# Set default implementations: one registered name per interface.
# DataLoader is the only interface with two registrations; "excel" is chosen as its default.
registry.set_default_implementation(DataLoader, "excel")
registry.set_default_implementation(DataPreprocessor, "standard")
registry.set_default_implementation(FeatureEngineer, "text")
registry.set_default_implementation(ModelBuilder, "random_forest")
registry.set_default_implementation(ModelTrainer, "standard")
registry.set_default_implementation(ModelEvaluator, "standard")
registry.set_default_implementation(ModelSerializer, "pickle")
registry.set_default_implementation(Predictor, "standard")

# Create a factory and orchestrator: the factory receives the registry and DI container,
# and the orchestrator receives the factory plus a fresh pipeline context
factory = PipelineFactory(registry, container)
context = PipelineContext()
orchestrator = PipelineOrchestrator(factory, context)

## Model Training

Now, let's train a model using the pipeline.

In [None]:
# Fit a model through the orchestrator, then report the artifact locations stored in the
# orchestrator's context and every returned metric. Any failure is reported, not raised.
try:
    training_output = orchestrator.train_model(
        data_path=data_path,
        test_size=0.3,
        random_state=42,
        optimize_hyperparameters=True,
        output_dir="../outputs/models",
        model_name="equipment_classifier_experiment",
    )
    # The call yields a pair: the trained model and its metrics mapping
    model, metrics = training_output

    print("Model training completed successfully")
    print(f"Model saved to: {orchestrator.context.get('model_path')}")
    print(f"Metadata saved to: {orchestrator.context.get('metadata_path')}")
    print("Metrics:")
    for metric_name, metric_value in metrics.items():
        print(f"  {metric_name}: {metric_value}")
except Exception as exc:
    print(f"Error training model: {exc}")

## Model Evaluation

Let's evaluate the model in more detail.

In [None]:
# Evaluate the model
# Single source of truth for the results file location: the same value is passed to the
# orchestrator and echoed in the message, so the two cannot drift apart.
evaluation_output_path = "../outputs/evaluation_results_experiment.json"

try:
    results = orchestrator.evaluate(
        model=model,
        data_path=data_path,
        output_path=evaluation_output_path,
    )

    print("Evaluation completed successfully")
    print(f"Evaluation results saved to: {evaluation_output_path}")
    print("Metrics:")
    # `results` is indexed by key here; the "metrics" entry is iterated as a mapping
    for key, value in results["metrics"].items():
        print(f"  {key}: {value}")
except Exception as e:
    # Report the failure instead of raising so the rest of the notebook can still be run
    print(f"Error evaluating model: {e}")

## Visualization

Let's visualize the results.

In [None]:
# Plot the training metrics and, when the evaluation results carry one, the confusion matrix.
# Any failure is reported rather than raised.
try:
    # Bar chart: one bar per (metric name, value) pair from the training metrics
    metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Metric', y='Value', data=metrics_df, ax=bar_ax)
    bar_ax.set_title('Model Metrics')
    plt.xticks(rotation=45)
    bar_fig.tight_layout()
    plt.show()

    # Heatmap of the confusion matrix, drawn only if the analysis section provides it
    analysis = results['analysis']
    if 'confusion_matrix' in analysis:
        cm = analysis['confusion_matrix']
        cm_fig, cm_ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
        cm_ax.set_title('Confusion Matrix')
        cm_ax.set_xlabel('Predicted')
        cm_ax.set_ylabel('Actual')
        cm_fig.tight_layout()
        plt.show()
except Exception as exc:
    print(f"Error visualizing results: {exc}")

## Prediction

Finally, let's make predictions on new data.

In [None]:
# Three hand-written equipment records used as prediction input, one tuple per row
sample_columns = ["equipment_tag", "manufacturer", "model", "description"]
sample_rows = [
    ("AHU-01", "Trane", "M-1000", "Air Handling Unit with cooling coil"),
    ("CHW-01", "Carrier", "C-2000", "Centrifugal Chiller for HVAC system"),
    ("P-01", "Armstrong", "A-3000", "Centrifugal Pump for chilled water"),
]
prediction_data = pd.DataFrame(sample_rows, columns=sample_columns)

# Run the records through the orchestrator and show the result; failures are reported, not raised
try:
    predictions = orchestrator.predict(
        model=model,
        data=prediction_data,
        output_path="../outputs/predictions_experiment.csv",
    )

    print("Predictions completed successfully")
    print(f"Predictions saved to: {orchestrator.context.get('output_path')}")
    print("Sample predictions:")
    display(predictions)
except Exception as exc:
    print(f"Error making predictions: {exc}")

## Conclusion

In this notebook, we demonstrated how to use the NexusML package for equipment classification. We loaded data, trained a model, evaluated it, and made predictions using the new architecture.

The new architecture provides a flexible, maintainable, and testable system for equipment classification. By following a modular design with clear interfaces, dependency injection, and a factory pattern, it makes it easy to create, configure, and extend the pipeline.