# Hierarchical Ensemble Classifier Examples

This notebook demonstrates the key features of the Hierarchical Ensemble Classifier package.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

from hierarchical_ensemble_classifier import (
    HierarchicalEnsembleClassifier,
    HierarchyBuilder,
    create_iris_example,
    create_hierarchy_builder_example,
    plot_hierarchy_with_performance,
    plot_feature_usage_heatmap
)

# Seed NumPy's global RNG once, up front, so repeated runs draw the same numbers
SEED = 42
np.random.seed(SEED)

## Example 1: Quick Start with Built-in Examples

In [None]:
# Run the packaged Iris demo. The mapping it returns is read below for the
# 'accuracy', 'cv_results' and 'model' entries.
print("Running Iris Example...")
iris_results = create_iris_example()

# Accuracy first, then the cross-validation results under their own heading
print(f"Accuracy: {iris_results['accuracy']:.3f}")
print("\nCross-validation results:", iris_results['cv_results'], sep="\n")

# Draw the hierarchy of the model produced by the demo
iris_model = iris_results['model']
iris_model.visualize_hierarchy(figsize=(8, 6))

In [None]:
# Run the packaged hierarchy-builder demo. The mapping it returns is read
# below for the 'accuracy', 'paths' and 'hierarchy_builder' entries.
print("Running Hierarchy Builder Example...")
hierarchy_results = create_hierarchy_builder_example()

print(f"Accuracy: {hierarchy_results['accuracy']:.3f}")

# One line per terminal class: the node sequence joined with arrows
print("\nDiscovered hierarchical paths:")
discovered_paths = hierarchy_results['paths']
for terminal_class, path in discovered_paths.items():
    print(f"{terminal_class}: {' -> '.join(path)}")

# Dendrogram of the hierarchy held by the demo's builder object
demo_builder = hierarchy_results['hierarchy_builder']
demo_builder.visualize_dendrogram(figsize=(10, 6))

## Example 2: Manual Hierarchy Construction

In [None]:
# Iris as pandas objects: a DataFrame with named feature columns and a Series
# of species names (integer targets are mapped through target_names)
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(iris.target_names[iris.target])

print("Dataset shape:", X.shape)
print("Classes:", y.unique())
print("Features:", list(X.columns))

# 70/30 split, stratified on the label so class proportions are kept
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Hand-built two-level hierarchy on Iris
hec = HierarchicalEnsembleClassifier(name="ManualIrisHEC", verbose=True)

# Column groups used by the two sub-classifiers
sepal_features = ["sepal length (cm)", "sepal width (cm)"]
petal_features = ["petal length (cm)", "petal width (cm)"]

# Level 1: logistic regression restricted to the sepal measurements
hec.add_sub_classifier(
    name="sepal_classifier",
    estimator=LogisticRegression(random_state=42),
    features=sepal_features,
)

# Level 2: decision tree on the petal measurements, attached below level 1
hec.add_sub_classifier(
    name="petal_classifier",
    estimator=DecisionTreeClassifier(random_state=42),
    features=petal_features,
    parent="sepal_classifier",
)

# Every observed class label becomes a terminal node under the petal classifier
for class_name in y.unique():
    hec.add_terminal_class(class_name, parent="petal_classifier")

print("Hierarchy structure created!")
print(f"Sub-classifiers: {list(hec.sub_classifiers_.keys())}")
print(f"Graph nodes: {list(hec.graph_.nodes())}")

In [None]:
# Train on the training split
hec.fit(X_train, y_train)

# Hard labels and class probabilities for the held-out split
y_pred = hec.predict(X_test)
y_proba = hec.predict_proba(X_test)

# Score the hard predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.3f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# 5-fold cross-validation of the sub-classifiers on the training split
cv_results = hec.cross_validate_sub_classifiers(X_train, y_train, cv=5)
print("Cross-validation results for sub-classifiers:", cv_results, sep="\n")

# Feature importance as reported by the fitted hierarchy
importance_df = hec.get_feature_importance()
print("\nFeature importance:", importance_df, sep="\n")

In [None]:
# Hierarchy diagram for the manual model, with accuracy display switched on
plot_hierarchy_with_performance(
    hec,
    figsize=(10, 8),
    show_accuracy=True,
)

# Heatmap of feature usage for the same model
plot_feature_usage_heatmap(
    hec,
    figsize=(8, 4),
)

## Example 3: Automatic Hierarchy Discovery

In [None]:
# Synthetic 8-class problem: 800 samples, 25 features (20 informative plus
# 5 redundant), one cluster per class
X_synth, y_synth = make_classification(
    n_samples=800,
    n_classes=8,
    n_clusters_per_class=1,
    n_features=25,
    n_informative=20,
    n_redundant=5,
    random_state=42,
)

# Wrap in pandas: gene_<k> column names, MOA_<k> string labels
feature_names = [f'gene_{i}' for i in range(X_synth.shape[1])]
X_synth = pd.DataFrame(X_synth, columns=feature_names)
class_names = [f'MOA_{i}' for i in range(8)]  # Mechanism of Action classes
y_synth = pd.Series(list(map(class_names.__getitem__, y_synth)))

print(f"Synthetic dataset: {X_synth.shape[0]} samples, {X_synth.shape[1]} features, {len(y_synth.unique())} classes")
print(f"Classes: {list(y_synth.unique())}")

# 70/30 split, stratified on the class label
X_train_synth, X_test_synth, y_train_synth, y_test_synth = train_test_split(
    X_synth,
    y_synth,
    stratify=y_synth,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Build the class hierarchy from the training split (ward linkage, euclidean
# distance, n_components=15)
hierarchy_builder = HierarchyBuilder(linkage_method='ward', distance_metric='euclidean')
hierarchy_builder.build_from_class_profiles(X_train_synth, y_train_synth, n_components=15)

# Dendrogram of the resulting hierarchy
hierarchy_builder.visualize_dendrogram(figsize=(12, 6))

# Paths for 2, 3 and 4 clusters, requested in that order
paths_2, paths_3, paths_4 = (
    hierarchy_builder.get_paths(n_clusters=k) for k in (2, 3, 4)
)

# Print the 2- and 3-cluster paths, each under its own heading
path_reports = (
    ("\nHierarchical paths with 2 clusters:", paths_2),
    ("\nHierarchical paths with 3 clusters:", paths_3),
)
for heading, cluster_paths in path_reports:
    print(heading)
    for terminal_class, path in cluster_paths.items():
        print(f"{terminal_class}: {' -> '.join(path)}")

In [None]:
# Build HEC using discovered hierarchy (using 3 clusters)
hec_auto = HierarchicalEnsembleClassifier(name="AutoHierarchyHEC", verbose=True)

# Walk every path once and record, for each intermediate node, its depth and
# (when it is not directly below the input node) its intermediate parent.
# path[0] is the input node and path[-1] the terminal class; both are skipped.
node_depth = {}
node_parent = {}
for path in paths_3.values():
    for depth in range(1, len(path) - 1):
        node = path[depth]
        node_depth.setdefault(node, depth)
        if depth > 1:
            node_parent.setdefault(node, path[depth - 1])

intermediate_nodes = set(node_depth)
print(f"Intermediate nodes: {sorted(intermediate_nodes)}")

# Prototype estimators; each node receives its own clone so that no fitted
# state is shared when there are more nodes than prototypes.
algorithms = [
    LogisticRegression(random_state=42, max_iter=1000),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(n_estimators=50, random_state=42)
]

# Sliding feature window: up to 12 columns, advancing 8 columns per node
feature_window = 12
feature_stride = 8
n_features = len(feature_names)

# Shallow nodes first so a parent is registered before its children; for a
# flat hierarchy this is the plain alphabetical order.
ordered_nodes = sorted(intermediate_nodes, key=lambda n: (node_depth[n], n))

for i, node in enumerate(ordered_nodes):
    # Use different feature subsets for each sub-classifier. The start index
    # wraps around so a node can never end up with an empty feature list.
    start_idx = (i * feature_stride) % n_features
    end_idx = min(start_idx + feature_window, n_features)
    features = feature_names[start_idx:end_idx]

    # Rotate through different algorithms, one fresh unfitted copy per node
    estimator = clone(algorithms[i % len(algorithms)])

    # Pass parent= only for nested nodes; nodes directly below the input keep
    # the default attachment.
    # NOTE(review): assumes the parent must already be registered - confirm.
    parent_kwargs = {"parent": node_parent[node]} if node in node_parent else {}

    hec_auto.add_sub_classifier(
        name=node,
        estimator=estimator,
        features=features,
        **parent_kwargs
    )

    print(f"Added {node}: {type(estimator).__name__} with {len(features)} features")

# Add terminal classes
for terminal_class, path in paths_3.items():
    parent = path[-2]  # Second to last node
    hec_auto.add_terminal_class(terminal_class, parent=parent)

In [None]:
# Train the automatically assembled hierarchy on the synthetic training split
hec_auto.fit(X_train_synth, y_train_synth)

# Predict the held-out split and score it
y_pred_auto = hec_auto.predict(X_test_synth)
accuracy_auto = accuracy_score(y_test_synth, y_pred_auto)

print(f"Test Accuracy (Auto Hierarchy): {accuracy_auto:.3f}")
print(
    "\nClassification Report:",
    classification_report(y_test_synth, y_pred_auto),
    sep="\n",
)

In [None]:
# 5-fold cross-validation of each sub-classifier on the synthetic training split
cv_results_auto = hec_auto.cross_validate_sub_classifiers(X_train_synth, y_train_synth, cv=5)

print("Cross-validation results for automatically discovered hierarchy:", cv_results_auto, sep="\n")

# Hierarchy diagram with accuracy display switched on
plot_hierarchy_with_performance(
    hec_auto,
    figsize=(12, 8),
    show_accuracy=True,
)

# Heatmap of feature usage for the same model
plot_feature_usage_heatmap(
    hec_auto,
    figsize=(12, 6),
)

## Example 4: Comparison with Standard Classifiers

In [None]:
# Compare HEC with standard classifiers.
# All imports used here (SVC, KNeighborsClassifier, cross_val_score, ...) are
# in the notebook's first code cell.

# Use the synthetic dataset
X_compare = X_train_synth
y_compare = y_train_synth

# Standard classifiers
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42),
    'K-NN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Cross-validate standard classifiers (end-to-end accuracy, 5 folds)
results_comparison = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X_compare, y_compare, cv=5, scoring='accuracy')
    results_comparison[name] = {
        'mean': scores.mean(),
        'std': scores.std(),
        'scores': scores
    }

# HEC entry: the average of the per-sub-classifier CV scores. This is NOT an
# end-to-end accuracy of the full hierarchy, so it is only an approximate
# reference point and is flagged with '*' wherever it is shown.
hec_key = 'HEC (Auto)'
hec_footnote = "* mean of per-sub-classifier CV accuracy, not end-to-end accuracy"
hec_mean_score = cv_results_auto['mean_score'].mean()
results_comparison[hec_key] = {
    'mean': hec_mean_score,
    'std': cv_results_auto['std_score'].mean(),
    # Real values, one per sub-classifier (not per fold); no made-up fold scores
    'scores': list(cv_results_auto['mean_score'])
}

# Display comparison
print("\nClassifier Comparison (5-fold CV):")
print("-" * 50)
for name, result in results_comparison.items():
    label = f"{name}*" if name == hec_key else name
    print(f"{label:20s}: {result['mean']:.3f} ({result['std']:.3f})")
print(hec_footnote)

# Plot comparison
names = list(results_comparison.keys())
means = [results_comparison[name]['mean'] for name in names]
stds = [results_comparison[name]['std'] for name in names]
labels = [f"{name}*" if name == hec_key else name for name in names]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, means, yerr=stds, capsize=5, alpha=0.7)
ax.set_title('Classifier Comparison (5-fold Cross-Validation)')
ax.set_ylabel('Accuracy')
ax.set_xlabel(hec_footnote)
ax.tick_params(axis='x', labelrotation=45)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Example 5: Feature Importance Analysis

In [None]:
# Feature importance reported by the automatically built hierarchy
importance_df = hec_auto.get_feature_importance()

print("Feature importance by sub-classifier:")
print(importance_df.head(10))

# Per-feature mean / std / count of the 'importance' column, highest mean first
feature_importance_agg = (
    importance_df
    .groupby('feature')['importance']
    .agg(['mean', 'std', 'count'])
    .sort_values('mean', ascending=False)
)

print("\nTop 10 most important features (aggregated):")
print(feature_importance_agg.head(10))

# Bar chart of the 15 highest-ranked features, with the std column as error bars
top_features = feature_importance_agg.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    bar_positions,
    top_features['mean'],
    yerr=top_features['std'],
    capsize=3,
    alpha=0.7,
)
ax.set_xlabel('Features')
ax.set_ylabel('Mean Importance')
ax.set_title('Top 15 Feature Importance (Aggregated Across Sub-Classifiers)')
ax.set_xticks(bar_positions)
ax.set_xticklabels(top_features.index, rotation=45)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Summary

This notebook demonstrated:

1. **Quick start** with built-in example functions
2. **Manual hierarchy construction** to encode domain-specific knowledge
3. **Automatic hierarchy discovery** using the HierarchyBuilder class
4. **Performance comparison** with standard classifiers
5. **Feature importance analysis** across the hierarchical structure

The Hierarchical Ensemble Classifier provides:
- Flexibility in using different algorithms and features for each sub-problem
- Interpretable decision paths
- Automatic discovery of natural hierarchies in data
- Integration with the scikit-learn ecosystem
- Enhanced visualization capabilities (especially with skclust integration)