# Clustering Platform

An interactive platform for testing various clustering algorithms with support for multiple use cases, data formats (JSON/CSV), and guided workflows.

## Features
- Use Case Management
- Guided Questionnaire
- Multiple Data Formats (CSV/JSON)
- Algorithm Selection (K-Means, DBSCAN, HDBSCAN, Hierarchical, GMM)
- Interactive Visualization
- Comprehensive Metrics



In [None]:
# Setup & Imports
import json
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Widgets
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
from IPython.display import display, HTML, clear_output

# Clustering algorithms
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import hdbscan

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

# Metrics
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Set style
# NOTE(review): the 'seaborn-v0_8-*' style names are, as far as I know, only
# shipped with matplotlib >= 3.6 — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Default seaborn colour palette for the plots created later in the notebook.
sns.set_palette("husl")

# Reaching this line means every import above succeeded.
print("✓ All libraries imported successfully!")



## 1. Use Case Management

Select an existing use case or create a new one.



In [None]:
# Use Case Management Functions

# Location of the on-disk use-case registry (paths are relative to the
# notebook's working directory).
USE_CASES_DIR = Path("use_cases")
USE_CASES_FILE = USE_CASES_DIR / "use_cases.json"

# Global state
# Shared between notebook cells; the widget callbacks below rebind these via
# ``global`` statements.
current_use_case = None      # ID (registry key) of the selected use case, or None
use_cases_registry = {}      # use-case ID -> use-case definition dict
questionnaire_answers = {}   # question ID -> answer, filled on questionnaire submit
uploaded_data = None         # DataFrame parsed from the uploaded file
processed_data = None        # DataFrame produced by preprocess_data()
selected_features = []       # column names chosen in the feature selector
clustering_results = {}      # written by the clustering cell (contents not shown here)

def load_use_cases():
    """Load the use-case registry from ``USE_CASES_FILE``.

    On success the module-level ``use_cases_registry`` is replaced by the
    parsed mapping, which is also returned. If the file is missing,
    unreadable, not valid JSON, or does not contain a JSON object, a message
    is printed, the global is left untouched and an empty dict is returned.

    Returns:
        dict: Mapping of use-case ID to use-case definition (possibly empty).
    """
    global use_cases_registry
    try:
        if not USE_CASES_FILE.exists():
            print(f"Warning: {USE_CASES_FILE} not found. Using an empty registry.")
            return {}
        # Explicit encoding so the registry reads the same on every platform.
        with open(USE_CASES_FILE, 'r', encoding='utf-8') as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading use cases: {e}")
        return {}

    if not isinstance(loaded, dict):
        # Every consumer iterates the registry with .items()/.keys().
        print(f"Error loading use cases: {USE_CASES_FILE} must contain a JSON object")
        return {}

    use_cases_registry = loaded
    return use_cases_registry

def save_use_case(use_case_id, use_case_data):
    """Add or replace a use case and persist the registry to disk.

    The updated registry is serialized *before* the file is opened, so a
    serialization failure can no longer truncate an existing registry file.
    The in-memory ``use_cases_registry`` is only updated once the write has
    succeeded, keeping memory and disk consistent when saving fails.

    Args:
        use_case_id: Registry key for the use case.
        use_case_data: JSON-serializable definition of the use case.

    Returns:
        bool: True if the registry was written, False otherwise (a message
        is printed in that case).
    """
    updated_registry = {**use_cases_registry, use_case_id: use_case_data}
    try:
        payload = json.dumps(updated_registry, indent=2)
        USE_CASES_DIR.mkdir(parents=True, exist_ok=True)
        with open(USE_CASES_FILE, 'w', encoding='utf-8') as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        # TypeError/ValueError: data that json cannot serialize.
        print(f"Error saving use case: {e}")
        return False

    # Mutate in place so other references to the registry see the new entry.
    use_cases_registry[use_case_id] = use_case_data
    return True

def get_use_case_list():
    """Return the dropdown options: the "create" entry followed by all names.

    The registry is re-read from disk through ``load_use_cases()`` on every
    call.
    """
    registry = load_use_cases()
    names = [entry["name"] for entry in registry.values()]
    return ["Create New Use Case", *names]

# Load use cases
# Populate the in-memory registry once when this cell runs; load_use_cases()
# returns an empty dict if the registry file is missing or unreadable.
use_cases_registry = load_use_cases()
print(f"✓ Loaded {len(use_cases_registry)} use case(s)")



In [None]:
# Use Case Selection UI

def get_use_case_id_by_name(name):
    """Return the ID of the first registered use case called ``name``.

    Returns None when no use case in ``use_cases_registry`` has that name.
    """
    matches = (
        uc_id
        for uc_id, uc_data in use_cases_registry.items()
        if uc_data["name"] == name
    )
    return next(matches, None)

def on_use_case_change(change):
    """React to a new selection in the use-case dropdown.

    Selecting "Create New Use Case" shows the creation form. Selecting an
    existing use case stores its ID in ``current_use_case`` and shows its
    details. If the selected name cannot be resolved, ``current_use_case`` is
    cleared and the (now stale) info panel is hidden.

    Args:
        change: Traitlets change dict; only ``change['new']`` is read.
    """
    global current_use_case
    # Local import: names and descriptions are user-entered text and must be
    # escaped before being interpolated into the HTML widget.
    from html import escape

    selection = change['new']
    if selection == "Create New Use Case":
        current_use_case = None
        new_use_case_ui.layout.display = 'flex'
        use_case_info_ui.layout.display = 'none'
        return

    uc_id = get_use_case_id_by_name(selection)
    if uc_id is None:
        current_use_case = None
        # Do not keep showing the details of the previously selected use case.
        use_case_info_ui.layout.display = 'none'
        return

    current_use_case = uc_id
    entry = use_cases_registry[uc_id]
    # .get() keeps hand-edited registry entries with missing optional fields
    # from raising KeyError inside the callback.
    title = escape(str(entry.get('name', selection)))
    description = escape(str(entry.get('description', '')))
    algorithms = ', '.join(
        escape(str(alg)) for alg in entry.get('recommended_algorithms', [])
    )
    use_case_info_ui.value = f"""
            <h4>{title}</h4>
            <p><strong>Description:</strong> {description}</p>
            <p><strong>Recommended Algorithms:</strong> {algorithms}</p>
            """
    use_case_info_ui.layout.display = 'flex'
    new_use_case_ui.layout.display = 'none'

# Use Case Selection Widgets
# Options are "Create New Use Case" followed by the names in the registry;
# get_use_case_list() re-reads the registry file when this cell runs.
use_case_dropdown = widgets.Dropdown(
    options=get_use_case_list(),
    description='Use Case:',
    style={'description_width': 'initial'},
    layout=Layout(width='400px')
)

# Details panel for the selected use case; hidden until on_use_case_change()
# fills it in.
use_case_info_ui = widgets.HTML(
    value="",
    layout=Layout(display='none', margin='10px 0px')
)

# Inputs of the "create new use case" form (read by create_new_use_case()).
new_use_case_name = widgets.Text(
    description='Name:',
    placeholder='e.g., Network Traffic Clustering',
    style={'description_width': 'initial'},
    layout=Layout(width='400px')
)

new_use_case_desc = widgets.Textarea(
    description='Description:',
    placeholder='Describe your clustering use case...',
    style={'description_width': 'initial'},
    layout=Layout(width='400px', height='100px')
)

def create_new_use_case(b):
    """Create a use case from the form inputs and select it in the dropdown.

    The ID is derived from the name (lower-cased, spaces replaced by
    underscores). Creation is refused when the name is empty, equals the
    reserved "Create New Use Case" dropdown entry, or collides with an
    existing use case (same ID or same name) — previously a collision
    silently overwrote the existing definition.

    Args:
        b: The clicked button (unused; required by ``Button.on_click``).
    """
    global use_cases_registry

    name = new_use_case_name.value.strip()
    desc = new_use_case_desc.value.strip()
    if not name:
        print("Error: Use case name is required")
        return
    if name == "Create New Use Case":
        # This label is the dropdown's sentinel entry and cannot be a real name.
        print("Error: 'Create New Use Case' is reserved; choose another name")
        return

    uc_id = name.lower().replace(' ', '_')
    name_taken = any(uc.get("name") == name for uc in use_cases_registry.values())
    if uc_id in use_cases_registry or name_taken:
        print(f"Error: A use case named '{name}' already exists")
        return

    new_use_case = {
        "name": name,
        "description": desc,
        "data_format": ["json", "csv"],
        "recommended_algorithms": ["K-Means", "HDBSCAN"],
        "typical_features": [],
        "questions": [],
        "default_preprocessing": {
            "handle_missing": "drop",
            "scale_features": True,
            "scaler_type": "StandardScaler",
            "encode_categorical": True
        },
        "default_parameters": {}
    }

    if save_use_case(uc_id, new_use_case):
        # Re-read from disk so memory reflects exactly what was persisted.
        use_cases_registry = load_use_cases()
        use_case_dropdown.options = get_use_case_list()
        use_case_dropdown.value = name
        print(f"✓ Created new use case: {name}")
        new_use_case_name.value = ""
        new_use_case_desc.value = ""

# Button that submits the "create new use case" form.
create_use_case_btn = widgets.Button(
    description='Create Use Case',
    button_style='success',
    layout=Layout(width='200px')
)
create_use_case_btn.on_click(create_new_use_case)

# Creation form; its visibility is controlled by on_use_case_change().
new_use_case_ui = widgets.VBox([
    widgets.HTML("<h4>Create New Use Case</h4>"),
    new_use_case_name,
    new_use_case_desc,
    create_use_case_btn
], layout=Layout(display='none', border='1px solid #ccc', padding='10px', margin='10px 0px'))

use_case_dropdown.observe(on_use_case_change, names='value')
# Observers only fire on *changes*, so sync the panels with the initial
# selection once. Without this the form stays hidden while "Create New Use
# Case" is selected, and with an empty registry (a single dropdown option)
# it could never be opened.
on_use_case_change({'new': use_case_dropdown.value})

display(widgets.VBox([
    widgets.HTML("<h3>Use Case Selection</h3>"),
    use_case_dropdown,
    use_case_info_ui,
    new_use_case_ui
]))



## 2. Guided Questionnaire

Answer questions about your clustering goals to get algorithm recommendations.



In [None]:
# Questionnaire Functions

def get_use_case_questions():
    """Return the question definitions for the active use case.

    Falls back to the questions of the ``"generic"`` registry entry when no
    registered use case is selected; returns an empty list if neither
    provides any questions.
    """
    has_selection = bool(current_use_case) and current_use_case in use_cases_registry
    registry_key = current_use_case if has_selection else "generic"
    return use_cases_registry.get(registry_key, {}).get("questions", [])

def render_question_widget(question):
    """Build the input widget for one questionnaire entry.

    Args:
        question: Dict with optional keys ``id``, ``question`` (label text),
            ``type`` (``"dropdown"``, ``"text"`` or ``"textarea"``; anything
            else falls back to a plain text box), ``options``, ``default``
            and ``required``.

    Returns:
        The widget, tagged with ``question_id`` and ``question_required``
        attributes that ``collect_questionnaire_answers()`` reads.
    """
    q_id = question.get("id", "")
    q_text = question.get("question", "")
    q_type = question.get("type", "text")
    q_options = list(question.get("options") or [])
    q_default = question.get("default", "")
    q_required = question.get("required", False)

    # Shared by every widget type; required questions are marked with " *".
    description = q_text + (" *" if q_required else "")
    style = {'description_width': 'initial'}
    # Text widgets only accept strings; coerce numeric or None defaults.
    text_default = "" if q_default is None else str(q_default)

    if q_type == "dropdown":
        if q_default in q_options:
            selected = q_default
        elif q_options:
            selected = q_options[0]
        else:
            # "" is not a valid selection for an empty option list.
            selected = None
        widget = widgets.Dropdown(
            options=q_options,
            description=description,
            value=selected,
            style=style,
            layout=Layout(width='500px')
        )
    elif q_type == "text":
        widget = widgets.Text(
            description=description,
            value=text_default,
            placeholder="Enter your answer...",
            style=style,
            layout=Layout(width='500px')
        )
    elif q_type == "textarea":
        widget = widgets.Textarea(
            description=description,
            value=text_default,
            placeholder="Enter your answer...",
            style=style,
            layout=Layout(width='500px', height='100px')
        )
    else:
        # Unknown question type: plain text box without a placeholder.
        widget = widgets.Text(
            description=description,
            value=text_default,
            style=style,
            layout=Layout(width='500px')
        )

    widget.question_id = q_id
    widget.question_required = q_required
    return widget

def collect_questionnaire_answers(question_widgets):
    """Gather the answers held by the questionnaire widgets.

    Widgets without a ``question_id`` attribute are ignored. String answers
    are stripped of surrounding whitespace.

    Returns:
        tuple: ``(answers, None)`` on success, where ``answers`` maps question
        ID to value, or ``(None, message)`` as soon as a required question is
        found without an answer.
    """
    collected = {}
    for entry in question_widgets:
        if not hasattr(entry, 'question_id'):
            continue

        answer = entry.value
        if isinstance(answer, str):
            answer = answer.strip()

        if entry.question_required and not answer:
            return None, f"Required question '{entry.description}' not answered"

        collected[entry.question_id] = answer
    return collected, None

def recommend_algorithms(answers):
    """Recommend up to three algorithms from the questionnaire answers.

    Answers are matched on whole words (case-insensitive), so e.g. "Unknown"
    no longer counts as "no" — with substring matching the default
    ``cluster_size_variation`` of "Unknown" always triggered the
    "varying cluster sizes" rule. When several rules suggest the same
    algorithm its highest score is kept.

    Args:
        answers: Mapping of question ID to answer. The keys ``noise_level``,
            ``known_cluster_count``, ``known_entity_count`` and
            ``cluster_size_variation`` are read; missing or None values fall
            back to "Medium", "No", "No" and "Unknown" respectively.

    Returns:
        tuple: ``(algorithms, reasoning)`` — the top three algorithm names
        ordered by score (ties keep the order in which they were first
        suggested) and the list of human-readable reasons.
    """
    import re

    answers = answers or {}

    def _answer(key, default):
        # Non-string answers (None, numbers) must not crash .lower().
        raw = answers.get(key)
        return str(default if raw is None else raw).lower()

    def _says(text, word):
        return re.search(rf"\b{re.escape(word)}\b", text) is not None

    noise_level = _answer("noise_level", "Medium")
    known_cluster_count = _answer("known_cluster_count", "No")
    known_entity_count = _answer("known_entity_count", "No")
    cluster_size_variation = _answer("cluster_size_variation", "Unknown")

    scores = {}  # algorithm -> best score; insertion order breaks ties
    reasoning = []

    def _suggest(algorithm, score):
        if score > scores.get(algorithm, 0):
            scores[algorithm] = score

    # High noise -> HDBSCAN or DBSCAN
    if _says(noise_level, "high"):
        _suggest("HDBSCAN", 5)
        _suggest("DBSCAN", 4)
        reasoning.append("High noise level detected - density-based algorithms recommended")

    # Unknown cluster count -> HDBSCAN, DBSCAN, or Hierarchical
    if _says(known_cluster_count, "no") or _says(known_entity_count, "no"):
        _suggest("HDBSCAN", 5)
        _suggest("DBSCAN", 4)
        _suggest("Hierarchical", 3)
        reasoning.append("Unknown cluster count - algorithms that don't require cluster count recommended")

    # Known cluster count -> K-Means or GMM
    if _says(known_cluster_count, "yes") or _says(known_entity_count, "yes"):
        _suggest("K-Means", 4)
        _suggest("GMM", 3)
        reasoning.append("Known cluster count - centroid/probabilistic algorithms suitable")

    # Varying cluster sizes -> HDBSCAN or Hierarchical
    if _says(cluster_size_variation, "no"):
        _suggest("HDBSCAN", 4)
        _suggest("Hierarchical", 4)
        reasoning.append("Varying cluster sizes expected - hierarchical methods recommended")

    # Similar cluster sizes -> K-Means or GMM
    if _says(cluster_size_variation, "yes"):
        _suggest("K-Means", 4)
        _suggest("GMM", 3)

    # sorted() is stable, so equally scored algorithms keep suggestion order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return [alg for alg, _ in ranked[:3]], reasoning

question_widgets = []
questionnaire_container = widgets.VBox([])

def on_submit_questionnaire(b):
    """Validate the answers, store them and show the recommendations.

    Stops with a printed error if a required question is unanswered.
    Otherwise the answers are saved in ``questionnaire_answers`` and the
    recommendation panel is filled in and made visible.

    Args:
        b: The clicked button (unused; required by ``Button.on_click``).
    """
    global questionnaire_answers

    answers, error = collect_questionnaire_answers(question_widgets)
    if error:
        print(f"Error: {error}")
        return

    questionnaire_answers = answers
    recommendations, reasoning = recommend_algorithms(answers)

    # Assemble the HTML from fragments and join once at the end.
    fragments = ["<h4>Algorithm Recommendations:</h4><ul>"]
    fragments.extend(f"<li><strong>{rec}</strong></li>" for rec in recommendations)
    fragments.append("</ul>")

    if reasoning:
        fragments.append("<h5>Reasoning:</h5><ul>")
        fragments.extend(f"<li>{reason}</li>" for reason in reasoning)
        fragments.append("</ul>")

    recommendation_display.value = "".join(fragments)
    recommendation_display.layout.display = 'flex'
    print("✓ Questionnaire submitted successfully!")

# Create questionnaire UI
# Panel for the recommendation HTML; hidden until on_submit_questionnaire()
# fills it in and switches its display to 'flex'.
recommendation_display = widgets.HTML(
    value="",
    layout=Layout(display='none', border='1px solid #4CAF50', padding='10px', margin='10px 0px')
)

# Submit button; update_questionnaire() appends it below the question widgets.
submit_btn = widgets.Button(
    description='Submit Answers',
    button_style='primary',
    layout=Layout(width='200px', margin='10px 0px')
)
submit_btn.on_click(on_submit_questionnaire)

def update_questionnaire():
    """Rebuild the questionnaire for the currently selected use case.

    Replaces the children of ``questionnaire_container`` with one widget per
    question plus the submit button, or with a short notice when the use case
    defines no questions. ``question_widgets`` is always reset, so widgets of
    a previously selected use case are never left behind for a later submit.
    """
    global question_widgets

    questions = get_use_case_questions()
    if not questions:
        question_widgets = []
        questionnaire_container.children = [
            widgets.HTML("<p>No questions defined for this use case. You can proceed to data upload.</p>")
        ]
        return

    question_widgets = [render_question_widget(q) for q in questions]

    questionnaire_container.children = [
        widgets.HTML("<h4>Please answer the following questions:</h4>"),
        *question_widgets,
        submit_btn,
    ]

# Auto-update questionnaire when use case changes
def on_use_case_change_for_questionnaire(change):
    """Dropdown observer: rebuild the questionnaire for the new selection.

    Args:
        change: Traitlets change dict (unused; the selection is read from
            ``current_use_case`` inside ``update_questionnaire()``).
    """
    update_questionnaire()

# use_case_dropdown is created by the use-case selection cell, which must run
# before this one. No visible cell registered this handler, so the
# questionnaire never rendered; register it here. on_use_case_change() was
# attached first and therefore updates current_use_case before this runs.
use_case_dropdown.observe(on_use_case_change_for_questionnaire, names='value')

# Observers only fire on changes: render the questionnaire for the initial
# selection once.
update_questionnaire()

display(widgets.VBox([
    widgets.HTML("<h3>Guided Questionnaire</h3>"),
    questionnaire_container,
    recommendation_display
]))



In [None]:
# JSON Parsing Utilities

def detect_json_structure(json_data):
    """Classify the top-level shape of decoded JSON data.

    Returns:
        str: ``"array_of_objects"`` for a non-empty list whose first element
        is a dict, ``"array"`` for any other list, ``"nested_objects"`` for a
        dict with exactly one key whose value is a list, ``"object"`` for any
        other dict, and ``"unknown"`` for everything else.
    """
    if isinstance(json_data, list):
        first_is_object = bool(json_data) and isinstance(json_data[0], dict)
        return "array_of_objects" if first_is_object else "array"

    if not isinstance(json_data, dict):
        return "unknown"

    # A single key wrapping a list, e.g. {"records": [...]}.
    if len(json_data) == 1:
        (only_value,) = json_data.values()
        if isinstance(only_value, list):
            return "nested_objects"
    return "object"

def flatten_dict(d, parent_key='', sep='_'):
    """Flatten a nested dict into a single-level dict.

    Nested keys are joined with ``sep``. Lists are reduced to a scalar:
    an empty list becomes None, a one-element list is replaced by its element
    (flattened further if that element is a dict), and longer lists are
    rendered as one ``", "``-joined string.

    Args:
        d: Dictionary to flatten.
        parent_key: Prefix applied to every key (used by the recursion).
        sep: Separator placed between nested key parts.

    Returns:
        dict: Flat mapping of joined key to value.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"

        if isinstance(value, list):
            if not value:
                flat[path] = None
                continue
            if len(value) > 1:
                flat[path] = ', '.join(map(str, value))
                continue
            # Exactly one element: unwrap it; only a dict is flattened further.
            value = value[0]
            if not isinstance(value, dict):
                flat[path] = value
                continue

        if isinstance(value, dict):
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

def parse_json_data(json_content):
    """Parse JSON or JSON Lines content into a DataFrame.

    The content is first parsed as one JSON document. Only if that fails is
    it treated as JSON Lines (one object per line); lines that are not valid
    JSON objects are skipped and reported with a printed warning. Trying the
    line-by-line pass first could mis-read a pretty-printed document, e.g. a
    lone ``2`` on its own line became a "record".

    Args:
        json_content: ``str`` or bytes-like object (``bytes``, ``bytearray``
            or ``memoryview``), decoded as UTF-8 with an optional BOM.

    Returns:
        tuple: ``(DataFrame, None)`` on success, ``(None, message)`` on
        failure.
    """
    try:
        # ipywidgets 8 hands uploaded content over as a memoryview.
        if isinstance(json_content, (bytes, bytearray, memoryview)):
            json_content = bytes(json_content).decode('utf-8-sig')
        else:
            # json.loads() rejects a leading byte-order mark in str input.
            json_content = json_content.lstrip('\ufeff')

        try:
            json_data = json.loads(json_content)
        except json.JSONDecodeError as document_error:
            # Fallback: JSON Lines. Split on '\n' only — other line separators
            # may legally occur inside JSON strings.
            lines = [ln.strip() for ln in json_content.split('\n') if ln.strip()]
            records = []
            skipped = 0
            for line in lines:
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                if isinstance(record, dict):
                    records.append(record)
                else:
                    # Bare scalars/arrays are fragments, not records.
                    skipped += 1
            if len(lines) < 2 or not records:
                raise document_error
            if skipped:
                print(f"Warning: skipped {skipped} line(s) that are not valid JSON objects")
            json_data = records

        # Detect structure
        structure_type = detect_json_structure(json_data)

        if structure_type == "array_of_objects":
            # One row per object
            df = pd.DataFrame([flatten_dict(obj) for obj in json_data])
        elif structure_type == "nested_objects":
            # Single key wrapping a list, e.g. {"records": [{...}, {...}]}:
            # expand the wrapped records into rows rather than collapsing the
            # whole list into one string cell.
            (wrapped,) = json_data.values()
            if wrapped and all(isinstance(item, dict) for item in wrapped):
                df = pd.DataFrame([flatten_dict(item) for item in wrapped])
            else:
                df = pd.DataFrame([flatten_dict(json_data)])
        elif structure_type == "object":
            # Single object -> single row
            df = pd.DataFrame([flatten_dict(json_data)])
        else:
            df = pd.DataFrame(json_data)

        return df, None

    except json.JSONDecodeError as e:
        return None, f"Invalid JSON format: {e}"
    except Exception as e:
        # Boundary handler: callers expect an error message, not an exception.
        return None, f"Error parsing JSON: {e}"

def parse_csv_data(csv_content):
    """Parse CSV content into a DataFrame.

    Args:
        csv_content: ``str`` or bytes-like object (``bytes``, ``bytearray``
            or ``memoryview``, the latter being what ipywidgets 8 uploads
            provide), decoded as UTF-8 with an optional BOM.

    Returns:
        tuple: ``(DataFrame, None)`` on success, ``(None, message)`` on
        failure.
    """
    from io import StringIO

    try:
        if isinstance(csv_content, (bytes, bytearray, memoryview)):
            # 'utf-8-sig' also drops the BOM that some spreadsheet exports add.
            csv_content = bytes(csv_content).decode('utf-8-sig')

        df = pd.read_csv(StringIO(csv_content))
        return df, None
    except Exception as e:
        # Boundary handler: callers expect an error message, not an exception.
        return None, f"Error parsing CSV: {e}"

print("✓ JSON parsing utilities loaded")



In [None]:
# Data Upload UI

# Scrollable output area that handle_upload() prints the data preview into.
data_preview = widgets.Output(layout=Layout(height='300px', overflow='auto'))
# HTML summary (file name, type, shape, columns) filled in by handle_upload().
data_info = widgets.HTML(value="", layout=Layout(margin='10px 0px'))

def handle_upload(change):
    """Parse the uploaded file, store it and show a summary and preview.

    The format is chosen from the file extension (case-insensitive); for
    other extensions the content is sniffed: text starting with ``{`` or
    ``[`` is treated as JSON, anything else as CSV. On success the DataFrame
    is stored in ``uploaded_data`` and the feature selector is refreshed.

    Args:
        change: Traitlets change dict of the FileUpload widget.
            ``change['new']`` is indexed positionally and each item is read
            through its ``'name'`` and ``'content'`` keys.
    """
    global uploaded_data
    # Local import: file and column names end up inside an HTML widget.
    from html import escape

    uploads = change['new']
    if len(uploads) == 0:
        return

    uploaded_file = uploads[0]
    file_name = uploaded_file['name']
    file_content = uploaded_file['content']
    # ipywidgets 8 delivers the content as a memoryview; normalize to bytes.
    if isinstance(file_content, (bytearray, memoryview)):
        file_content = bytes(file_content)

    with data_preview:
        clear_output()

        # Detect file type
        lowered_name = file_name.lower()
        if lowered_name.endswith('.json'):
            df, error = parse_json_data(file_content)
            file_type = "JSON"
        elif lowered_name.endswith('.csv'):
            df, error = parse_csv_data(file_content)
            file_type = "CSV"
        else:
            # Try to detect by content
            try:
                if isinstance(file_content, bytes):
                    content_str = file_content.decode('utf-8-sig')
                else:
                    content_str = file_content
            except UnicodeDecodeError:
                print(f"Error: Could not determine file type for {file_name}")
                return

            if content_str.lstrip().startswith(('{', '[')):
                df, error = parse_json_data(content_str)
                file_type = "JSON"
            else:
                df, error = parse_csv_data(content_str)
                file_type = "CSV"

        if error:
            print(f"Error: {error}")
            return

        uploaded_data = df

        # Update feature selection
        update_feature_selection()

        # Display data info. Column labels are not necessarily strings (e.g.
        # integer labels), so convert before joining.
        shown_columns = ', '.join(escape(str(col)) for col in df.columns[:10])
        info_text = f"""
        <h4>Data Uploaded Successfully!</h4>
        <p><strong>File:</strong> {escape(file_name)}</p>
        <p><strong>Type:</strong> {file_type}</p>
        <p><strong>Shape:</strong> {df.shape[0]} rows × {df.shape[1]} columns</p>
        <p><strong>Columns:</strong> {shown_columns}{'...' if len(df.columns) > 10 else ''}</p>
        """
        data_info.value = info_text

        # Display preview
        print(f"Data Preview (first 10 rows):")
        display(df.head(10))
        print(f"\nData Types:")
        display(df.dtypes)
        print(f"\nMissing Values:")
        display(df.isnull().sum())

# Single-file upload button; the browser file dialog is filtered to
# .csv/.json by `accept`.
file_upload = widgets.FileUpload(
    accept='.csv,.json',
    multiple=False,
    description='Upload Data',
    style={'description_width': 'initial'}
)

# handle_upload() runs whenever the widget's value (the uploaded files) changes.
file_upload.observe(handle_upload, names='value')

display(widgets.VBox([
    widgets.HTML("<h3>Data Upload</h3>"),
    file_upload,
    data_info,
    data_preview
]))



## 4. Data Preprocessing

Select features and configure preprocessing options.



In [None]:
# Preprocessing Functions

def get_numeric_columns(df):
    """Return the names of the columns of ``df`` that have a numeric dtype."""
    numeric_part = df.select_dtypes(include=[np.number])
    return numeric_part.columns.tolist()

def get_categorical_columns(df):
    """Return the names of the ``object`` and ``category`` columns of ``df``."""
    categorical_part = df.select_dtypes(include=['object', 'category'])
    return categorical_part.columns.tolist()

def preprocess_data(df, selected_features, handle_missing='drop', scale_features=True,
                   scaler_type='StandardScaler', encode_categorical=True):
    """Prepare the selected columns of ``df`` for clustering.

    Steps: handle missing values, label-encode categorical columns, coerce
    everything to numeric, drop rows that still contain NaN, then optionally
    scale. On success the result is also stored in the module-level
    ``processed_data``.

    Args:
        df: Source DataFrame (may be None).
        selected_features: Column names to keep.
        handle_missing: ``'drop'`` removes rows with missing values;
            ``'mean'`` / ``'median'`` impute numeric columns only. Any other
            value skips this step.
        scale_features: Whether to scale the result.
        scaler_type: ``'MinMaxScaler'`` or ``'StandardScaler'`` (also used
            for any unrecognized value).
        encode_categorical: Whether to label-encode object/category columns.
            If False, non-numeric values become NaN and their rows are
            dropped.

    Returns:
        tuple: ``(DataFrame, None)`` on success, ``(None, message)`` on
        failure (nothing selected, unknown columns, or no rows left).
    """
    global processed_data

    if df is None or len(selected_features) == 0:
        return None, "No data or features selected"

    unknown = [col for col in selected_features if col not in df.columns]
    if unknown:
        return None, f"Selected features not found in data: {', '.join(map(str, unknown))}"

    # Select features (list() so a tuple of names is not read as one key)
    data = df[list(selected_features)].copy()

    # Handle missing values
    if handle_missing == 'drop':
        data = data.dropna()
    elif handle_missing in ('mean', 'median'):
        numeric_cols = get_numeric_columns(data)
        if numeric_cols:
            imputer = SimpleImputer(strategy=handle_missing)
            data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

    # Encode categorical variables
    if encode_categorical:
        for col in get_categorical_columns(data):
            # astype(str) turns missing values into the label "nan".
            data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    # Convert to numeric (anything still non-numeric becomes NaN)
    for col in data.columns:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    # Drop any remaining NaN
    data = data.dropna()

    if data.empty:
        # The scalers raise on zero samples; report this to the caller instead.
        return None, "No rows left after preprocessing; check missing-value handling and categorical encoding"

    # Scale features
    if scale_features:
        scaler = MinMaxScaler() if scaler_type == 'MinMaxScaler' else StandardScaler()
        scaled_data = scaler.fit_transform(data)
        processed_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)
    else:
        processed_data = data

    return processed_data, None

# Scrollable output area for the messages and preview of apply_preprocessing().
preprocessing_output = widgets.Output(layout=Layout(height='200px', overflow='auto'))

# Feature selection widgets
# Starts empty and disabled; update_feature_selection() fills in the column
# names and enables it once data has been uploaded.
feature_selection = widgets.SelectMultiple(
    options=[],
    description='Select Features:',
    disabled=True,
    style={'description_width': 'initial'},
    layout=Layout(width='500px', height='150px')
)

def update_feature_selection():
    """Sync the feature selector with ``uploaded_data``.

    Without data the selector is emptied and disabled. Otherwise it lists
    every column and pre-selects up to the first ten numeric ones.
    """
    if uploaded_data is None:
        feature_selection.options = []
        feature_selection.disabled = True
        return

    feature_selection.options = uploaded_data.columns.tolist()
    feature_selection.disabled = False

    # Auto-select numeric columns (at most ten)
    numeric_cols = get_numeric_columns(uploaded_data)
    if numeric_cols:
        feature_selection.value = tuple(numeric_cols[:10])

# Preprocessing options
# The values of these widgets are passed to preprocess_data() by
# apply_preprocessing().

# Strategy for missing values: drop rows, or impute numeric columns.
handle_missing_dropdown = widgets.Dropdown(
    options=['drop', 'mean', 'median'],
    value='drop',
    description='Handle Missing:',
    style={'description_width': 'initial'},
    layout=Layout(width='300px')
)

scale_features_checkbox = widgets.Checkbox(
    value=True,
    description='Scale Features',
    style={'description_width': 'initial'}
)

# Only takes effect when 'Scale Features' is checked.
scaler_type_dropdown = widgets.Dropdown(
    options=['StandardScaler', 'MinMaxScaler'],
    value='StandardScaler',
    description='Scaler Type:',
    style={'description_width': 'initial'},
    layout=Layout(width='300px')
)

encode_categorical_checkbox = widgets.Checkbox(
    value=True,
    description='Encode Categorical',
    style={'description_width': 'initial'}
)

def apply_preprocessing(b):
    """Run preprocessing with the options currently chosen in the UI.

    All messages — including validation errors — are written to
    ``preprocessing_output``. Previously the validation errors were printed
    outside the Output widget, where a callback's output may not be shown in
    the notebook and stale results stayed on screen.

    Args:
        b: The clicked button (unused; required by ``Button.on_click``).
    """
    global processed_data, selected_features

    with preprocessing_output:
        clear_output()

        if uploaded_data is None:
            print("Error: Please upload data first")
            return

        selected_features = list(feature_selection.value)
        if len(selected_features) == 0:
            print("Error: Please select at least one feature")
            return

        processed_data, error = preprocess_data(
            uploaded_data,
            selected_features,
            handle_missing=handle_missing_dropdown.value,
            scale_features=scale_features_checkbox.value,
            scaler_type=scaler_type_dropdown.value,
            encode_categorical=encode_categorical_checkbox.value
        )

        if error:
            print(f"Error: {error}")
            return

        print(f"✓ Preprocessing completed!")
        print(f"Processed data shape: {processed_data.shape}")
        print(f"Selected features: {len(selected_features)}")
        print(f"\nProcessed data preview:")
        display(processed_data.head())

# Button that runs apply_preprocessing() with the options selected above.
apply_preprocessing_btn = widgets.Button(
    description='Apply Preprocessing',
    button_style='primary',
    layout=Layout(width='200px')
)
apply_preprocessing_btn.on_click(apply_preprocessing)

# Hook for refreshing the feature selector after an upload.
# NOTE(review): nothing in the visible cells calls this; handle_upload()
# invokes update_feature_selection() directly — confirm whether it is still needed.
def on_data_uploaded_for_preprocessing():
    """Refresh the feature selector from the currently uploaded data."""
    update_feature_selection()

# Preprocessing panel: feature selector, option rows, apply button and the
# output area, stacked vertically.
display(widgets.VBox([
    widgets.HTML("<h3>Data Preprocessing</h3>"),
    feature_selection,
    widgets.HBox([
        handle_missing_dropdown,
        scaler_type_dropdown
    ]),
    widgets.HBox([
        scale_features_checkbox,
        encode_categorical_checkbox
    ]),
    apply_preprocessing_btn,
    preprocessing_output
]))



## 5. Algorithm Selection & Configuration

Select a clustering algorithm and configure its parameters.



In [None]:
# Algorithm Selection & Parameter Configuration

# Algorithms offered in the dropdown; update_parameter_widgets() and
# run_clustering() branch on these exact names.
ALGORITHMS = ["K-Means", "DBSCAN", "HDBSCAN", "Hierarchical", "GMM"]

algorithm_dropdown = widgets.Dropdown(
    options=ALGORITHMS,
    description='Algorithm:',
    style={'description_width': 'initial'},
    layout=Layout(width='300px')
)

# Parameter widgets (will be updated based on algorithm)
# update_parameter_widgets() swaps this container's children.
parameter_container = widgets.VBox([])

# K-Means parameters
kmeans_n_clusters = widgets.IntText(
    value=3,
    description='n_clusters:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

# DBSCAN parameters
dbscan_eps = widgets.FloatText(
    value=0.5,
    description='eps:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

dbscan_min_samples = widgets.IntText(
    value=5,
    description='min_samples:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

# HDBSCAN parameters
hdbscan_min_cluster_size = widgets.IntText(
    value=5,
    description='min_cluster_size:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

hdbscan_min_samples = widgets.IntText(
    value=3,
    description='min_samples:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

# Hierarchical parameters
hierarchical_n_clusters = widgets.IntText(
    value=3,
    description='n_clusters:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

hierarchical_linkage = widgets.Dropdown(
    options=['ward', 'complete', 'average', 'single'],
    value='ward',
    description='linkage:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

# GMM parameters
gmm_n_components = widgets.IntText(
    value=3,
    description='n_components:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

gmm_covariance_type = widgets.Dropdown(
    options=['full', 'tied', 'diag', 'spherical'],
    value='full',
    description='covariance_type:',
    style={'description_width': 'initial'},
    layout=Layout(width='200px')
)

def update_parameter_widgets(change):
    """Show the parameter inputs that belong to the selected algorithm.

    Args:
        change: Traitlets-style change dict; ``change['new']`` is the
            algorithm name. Unknown names leave the container untouched.
    """
    # Algorithm name -> (heading HTML, parameter widgets in display order).
    panels = {
        "K-Means": (
            "<h4>K-Means Parameters</h4>",
            [kmeans_n_clusters],
        ),
        "DBSCAN": (
            "<h4>DBSCAN Parameters</h4>",
            [dbscan_eps, dbscan_min_samples],
        ),
        "HDBSCAN": (
            "<h4>HDBSCAN Parameters</h4>",
            [hdbscan_min_cluster_size, hdbscan_min_samples],
        ),
        "Hierarchical": (
            "<h4>Hierarchical Clustering Parameters</h4>",
            [hierarchical_n_clusters, hierarchical_linkage],
        ),
        "GMM": (
            "<h4>Gaussian Mixture Model Parameters</h4>",
            [gmm_n_components, gmm_covariance_type],
        ),
    }

    panel = panels.get(change['new'])
    if panel is None:
        return

    heading, controls = panel
    parameter_container.children = [widgets.HTML(heading), *controls]

# Swap the parameter inputs whenever another algorithm is selected...
algorithm_dropdown.observe(update_parameter_widgets, names='value')
# ...and render them once for the initial selection (observers only fire on changes).
update_parameter_widgets({'new': algorithm_dropdown.value})

# Apply use case defaults
def apply_use_case_defaults():
    """Copy the selected use case's ``default_parameters`` into the widgets.

    Does nothing unless a registered use case is selected. For each algorithm
    present in ``default_parameters`` the matching parameter widgets are set;
    parameters missing from an entry fall back to the widgets' built-in
    defaults. Hierarchical and GMM entries are now honoured as well (keys
    named after the widgets' parameters); their dropdown values are applied
    only when they are one of the dropdown's options.
    """
    if not (current_use_case and current_use_case in use_cases_registry):
        return

    defaults = use_cases_registry[current_use_case].get("default_parameters", {})

    if "HDBSCAN" in defaults:
        params = defaults["HDBSCAN"]
        hdbscan_min_cluster_size.value = params.get("min_cluster_size", 5)
        hdbscan_min_samples.value = params.get("min_samples", 3)

    if "DBSCAN" in defaults:
        params = defaults["DBSCAN"]
        dbscan_eps.value = params.get("eps", 0.5)
        dbscan_min_samples.value = params.get("min_samples", 5)

    if "K-Means" in defaults:
        kmeans_n_clusters.value = defaults["K-Means"].get("n_clusters", 3)

    if "Hierarchical" in defaults:
        params = defaults["Hierarchical"]
        hierarchical_n_clusters.value = params.get("n_clusters", 3)
        linkage = params.get("linkage", 'ward')
        # Assigning a value outside the options would raise in the widget.
        if linkage in hierarchical_linkage.options:
            hierarchical_linkage.value = linkage

    if "GMM" in defaults:
        params = defaults["GMM"]
        gmm_n_components.value = params.get("n_components", 3)
        covariance_type = params.get("covariance_type", 'full')
        if covariance_type in gmm_covariance_type.options:
            gmm_covariance_type.value = covariance_type
# Show recommendations if available
# Hidden until show_recommendations() fills it in and switches its display
# to 'flex'.
recommendation_info = widgets.HTML(
    value="",
    layout=Layout(display='none', border='1px solid #2196F3', padding='10px', margin='10px 0px')
)

def show_recommendations():
    """Display the questionnaire-based recommendations in the algorithm cell.

    Does nothing when no questionnaire has been submitted or when no
    algorithm is recommended.
    """
    if not questionnaire_answers:
        return

    recs, _reasoning = recommend_algorithms(questionnaire_answers)
    if not recs:
        return

    items = "".join(f"<li><strong>{rec}</strong></li>" for rec in recs)
    recommendation_info.value = (
        "<h4>Recommended Algorithms (from questionnaire):</h4><ul>" + items + "</ul>"
    )
    recommendation_info.layout.display = 'flex'

# Algorithm panel: optional recommendations, the algorithm dropdown and the
# parameter inputs of the selected algorithm.
display(widgets.VBox([
    widgets.HTML("<h3>Algorithm Selection</h3>"),
    recommendation_info,
    algorithm_dropdown,
    parameter_container
]))



In [None]:
# Clustering Execution Functions

# Scrollable output area that run_clustering() writes into.
clustering_output = widgets.Output(layout=Layout(height='300px', overflow='auto'))
# Declared ``global`` in run_clustering(); both start out unset.
current_model = None
cluster_labels = None

def execute_kmeans(data, n_clusters):
    """Fit K-Means (fixed seed 42, 10 initializations) on ``data``.

    Returns:
        tuple: ``(fitted model, cluster labels)``.
    """
    estimator = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    assignments = estimator.fit_predict(data)
    return estimator, assignments

def execute_dbscan(data, eps, min_samples):
    """Fit DBSCAN on ``data`` with the given ``eps`` and ``min_samples``.

    Returns:
        tuple: ``(fitted model, cluster labels)``.
    """
    estimator = DBSCAN(min_samples=min_samples, eps=eps)
    assignments = estimator.fit_predict(data)
    return estimator, assignments

def execute_hdbscan(data, min_cluster_size, min_samples):
    """Fit HDBSCAN on ``data`` and return ``(model, labels)``.

    ``min_cluster_size`` and ``min_samples`` are forwarded to
    ``hdbscan.HDBSCAN`` unchanged; the labels are the result of
    ``fit_predict``.
    """
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
    )
    return clusterer, clusterer.fit_predict(data)

def execute_hierarchical(data, n_clusters, linkage):
    """Fit agglomerative clustering on ``data`` and return ``(model, labels)``.

    ``n_clusters`` and ``linkage`` are forwarded to
    ``AgglomerativeClustering`` unchanged; the labels are the result of
    ``fit_predict``.
    """
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
    return agglomerative, agglomerative.fit_predict(data)

def execute_gmm(data, n_components, covariance_type):
    """Fit a Gaussian mixture on ``data`` and return ``(model, labels)``.

    The estimator is created with ``random_state=42``; the labels are the
    result of ``fit_predict``.
    """
    mixture = GaussianMixture(
        n_components=n_components,
        covariance_type=covariance_type,
        random_state=42,
    )
    return mixture, mixture.fit_predict(data)

def run_clustering(b):
    """Run the algorithm chosen in ``algorithm_dropdown`` on ``processed_data``.

    Button callback. Every message, including the "preprocess first" error,
    is printed inside ``clustering_output`` so it is visible next to the
    button that triggered it.

    On success the module-level ``current_model`` and ``cluster_labels`` are
    updated and ``clustering_results`` is replaced by a dict with the keys
    ``algorithm``, ``parameters`` (the keyword arguments the algorithm was
    run with), ``model``, ``labels``, ``n_clusters`` and ``n_noise``.
    Label ``-1`` is counted as noise and not as a cluster.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    global current_model, cluster_labels, clustering_results

    with clustering_output:
        clear_output()

        if processed_data is None:
            print("Error: Please preprocess data first")
            return

        algorithm = algorithm_dropdown.value

        try:
            data_array = processed_data.values

            # Only the widgets of the selected algorithm are read.
            if algorithm == "K-Means":
                runner = execute_kmeans
                params = {'n_clusters': kmeans_n_clusters.value}
            elif algorithm == "DBSCAN":
                runner = execute_dbscan
                params = {
                    'eps': dbscan_eps.value,
                    'min_samples': dbscan_min_samples.value,
                }
            elif algorithm == "HDBSCAN":
                runner = execute_hdbscan
                params = {
                    'min_cluster_size': hdbscan_min_cluster_size.value,
                    'min_samples': hdbscan_min_samples.value,
                }
            elif algorithm == "Hierarchical":
                runner = execute_hierarchical
                params = {
                    'n_clusters': hierarchical_n_clusters.value,
                    'linkage': hierarchical_linkage.value,
                }
            elif algorithm == "GMM":
                runner = execute_gmm
                params = {
                    'n_components': gmm_n_components.value,
                    'covariance_type': gmm_covariance_type.value,
                }
            else:
                print(f"Error: Unknown algorithm {algorithm}")
                return

            model, labels = runner(data_array, **params)

            current_model = model
            cluster_labels = labels

            unique, counts = np.unique(labels, return_counts=True)

            # Store results together with the parameters that produced them,
            # so they stay correct even if the widgets are changed afterwards.
            clustering_results = {
                'algorithm': algorithm,
                'parameters': params,
                'model': model,
                'labels': labels,
                'n_clusters': int(np.sum(unique != -1)),
                'n_noise': int(np.sum(labels == -1)),
            }

            print("✓ Clustering completed successfully!")
            print(f"Algorithm: {algorithm}")
            print(f"Number of clusters: {clustering_results['n_clusters']}")
            if clustering_results['n_noise'] > 0:
                print(f"Number of noise points: {clustering_results['n_noise']}")

            # Show cluster distribution
            print("\nCluster distribution:")
            for cluster_id, count in zip(unique, counts):
                if cluster_id == -1:
                    print(f"  Noise: {count} points")
                else:
                    print(f"  Cluster {cluster_id}: {count} points")

        except Exception as e:
            # Callback boundary: report the failure in the output widget
            # instead of letting it disappear in the kernel log.
            print(f"Error during clustering: {e}")
            import traceback
            traceback.print_exc()

# Button that triggers run_clustering(); on_click passes the button instance
# as the callback's single argument.
run_clustering_btn = widgets.Button(
    description='Run Clustering',
    button_style='success',
    layout=Layout(width='200px', margin='10px 0px')
)
run_clustering_btn.on_click(run_clustering)

# Render the section: heading, button, and the output area the callback prints into.
display(widgets.VBox([
    widgets.HTML("<h3>Clustering Execution</h3>"),
    run_clustering_btn,
    clustering_output
]))



## 7. Visualization

Visualize clustering results with interactive plots.



In [None]:
# Visualization Functions

# Scrollable output area that create_visualization() renders its figures into.
visualization_output = widgets.Output(layout=Layout(height='600px', overflow='auto'))

def create_visualization():
    """Plot the current clustering result as 2-D scatter charts.

    Reads the module-level ``processed_data``, ``cluster_labels`` and
    ``clustering_results``. Data with more than two features is projected
    onto its first two principal components; two-feature data is plotted as
    is; single-feature data is plotted against a constant zero baseline so
    it no longer fails on the missing second column.

    An interactive Plotly figure is shown first, followed by a static
    Matplotlib version. Points labelled ``-1`` are drawn as gray crosses
    named "Noise".

    All output, including error messages, is written to
    ``visualization_output``.
    """
    with visualization_output:
        clear_output()

        if processed_data is None or cluster_labels is None:
            print("Error: Please run clustering first")
            return

        n_rows, n_features = processed_data.shape
        if n_features == 0:
            print("Error: No features available to visualize")
            return
        if len(cluster_labels) != n_rows:
            # Happens when the data was re-processed after the last run.
            print("Error: Cluster labels do not match the current data; please run clustering again")
            return

        values = processed_data.values

        # Reduce (or pad) to exactly two plotting dimensions
        if n_features > 2:
            pca = PCA(n_components=2, random_state=42)
            data_2d = pca.fit_transform(values)
            explained_var = pca.explained_variance_ratio_.sum()
            print(f"Using PCA for visualization (explained variance: {explained_var:.2%})")
            x_label = 'First Principal Component'
            y_label = 'Second Principal Component'
        elif n_features == 2:
            data_2d = values
            x_label = processed_data.columns[0]
            y_label = processed_data.columns[1]
        else:
            # Single feature: there is no second column, so use a flat baseline.
            data_2d = np.column_stack([values[:, 0], np.zeros(n_rows)])
            x_label = processed_data.columns[0]
            y_label = 'Value'

        plot_df = pd.DataFrame({
            'x': data_2d[:, 0],
            'y': data_2d[:, 1],
            'cluster': cluster_labels
        })

        noise_mask = plot_df['cluster'] == -1
        noise_data = plot_df[noise_mask]
        has_noise = bool(noise_mask.any())
        unique_clusters = sorted(c for c in plot_df['cluster'].unique() if c != -1)
        title = f'Clustering Results - {clustering_results["algorithm"]}'

        # Interactive Plotly figure
        fig = go.Figure()
        colors = px.colors.qualitative.Set3

        for i, cluster_id in enumerate(unique_clusters):
            cluster_data = plot_df[plot_df['cluster'] == cluster_id]
            fig.add_trace(go.Scatter(
                x=cluster_data['x'],
                y=cluster_data['y'],
                mode='markers',
                name=f'Cluster {cluster_id}',
                marker=dict(
                    size=8,
                    # Cycle through the palette when there are more clusters than colors.
                    color=colors[i % len(colors)],
                    opacity=0.7
                )
            ))

        if has_noise:
            fig.add_trace(go.Scatter(
                x=noise_data['x'],
                y=noise_data['y'],
                mode='markers',
                name='Noise',
                marker=dict(
                    size=6,
                    color='gray',
                    opacity=0.5,
                    symbol='x'
                )
            ))

        fig.update_layout(
            title=title,
            xaxis_title=x_label,
            yaxis_title=y_label,
            hovermode='closest',
            width=800,
            height=600
        )

        fig.show()

        # Static Matplotlib version of the same plot
        plt.figure(figsize=(10, 8))

        for cluster_id in unique_clusters:
            cluster_data = plot_df[plot_df['cluster'] == cluster_id]
            plt.scatter(cluster_data['x'], cluster_data['y'],
                        label=f'Cluster {cluster_id}',
                        alpha=0.7, s=50)

        if has_noise:
            plt.scatter(noise_data['x'], noise_data['y'],
                        label='Noise',
                        color='gray', marker='x', alpha=0.5, s=30)

        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

# Button that triggers create_visualization(); the lambda discards the button
# argument that on_click passes to its callback.
visualize_btn = widgets.Button(
    description='Visualize Results',
    button_style='info',
    layout=Layout(width='200px', margin='10px 0px')
)
visualize_btn.on_click(lambda b: create_visualization())

# Render the section: heading, button, and the output area for the figures.
display(widgets.VBox([
    widgets.HTML("<h3>Visualization</h3>"),
    visualize_btn,
    visualization_output
]))



## 8. Metrics & Analysis

View clustering performance metrics and analysis.



In [None]:
# Metrics Calculation

# Scrollable output area that calculate_metrics() prints its report into.
metrics_output = widgets.Output(layout=Layout(height='400px', overflow='auto'))

def calculate_metrics():
    """Compute and print quality metrics for the current clustering.

    Reads the module-level ``processed_data``, ``cluster_labels``,
    ``clustering_results`` and ``current_use_case``. Points labelled ``-1``
    (noise) are excluded before computing the silhouette score,
    Davies-Bouldin index and Calinski-Harabasz index; at least two non-noise
    points and two distinct clusters are required.

    The report also lists the size and share of every cluster, plus an extra
    summary when the current use case is ``"intersight_alarms"``. The
    metrics are stored under ``clustering_results['metrics']`` as built-in
    ``float``/``int`` values so they serialize to JSON as numbers.

    All output, including error messages, is written to ``metrics_output``.
    """
    with metrics_output:
        clear_output()

        if processed_data is None or cluster_labels is None:
            print("Error: Please run clustering first")
            return

        labels = np.asarray(cluster_labels)
        n_total = len(labels)
        if n_total != len(processed_data):
            # Happens when the data was re-processed after the last run.
            print("Error: Cluster labels do not match the current data; please run clustering again")
            return

        # Filter out noise points for metrics calculation
        valid_mask = labels != -1
        if valid_mask.sum() < 2:
            print("Error: Not enough valid clusters for metrics calculation")
            return

        data_for_metrics = processed_data.values[valid_mask]
        labels_for_metrics = labels[valid_mask]

        if len(set(labels_for_metrics)) < 2:
            print("Error: Need at least 2 clusters for metrics calculation")
            return

        try:
            # Cast to built-in floats: NumPy scalars are not all JSON-serializable.
            silhouette = float(silhouette_score(data_for_metrics, labels_for_metrics))
            davies_bouldin = float(davies_bouldin_score(data_for_metrics, labels_for_metrics))
            calinski_harabasz = float(calinski_harabasz_score(data_for_metrics, labels_for_metrics))

            # Cluster statistics
            unique_labels, counts = np.unique(labels, return_counts=True)
            n_clusters = int(np.sum(unique_labels != -1))
            n_noise = int(np.sum(labels == -1))

            # Display metrics
            print("=" * 60)
            print("CLUSTERING METRICS")
            print("=" * 60)
            print(f"\nAlgorithm: {clustering_results['algorithm']}")
            print(f"Number of clusters: {n_clusters}")
            if n_noise > 0:
                print(f"Number of noise points: {n_noise}")

            print("\n--- Performance Metrics ---")
            print(f"Silhouette Score: {silhouette:.4f}")
            print("  (Range: -1 to 1, higher is better)")
            print(f"Davies-Bouldin Index: {davies_bouldin:.4f}")
            print("  (Lower is better)")
            print(f"Calinski-Harabasz Index: {calinski_harabasz:.4f}")
            print("  (Higher is better)")

            print("\n--- Cluster Statistics ---")
            for label, count in zip(unique_labels, counts):
                share = count / n_total * 100
                if label == -1:
                    print(f"Noise: {count} points ({share:.2f}%)")
                else:
                    print(f"Cluster {label}: {count} points ({share:.2f}%)")

            # Use case-specific metrics
            if current_use_case == "intersight_alarms":
                print("\n--- Use Case Specific Analysis ---")
                print(f"Total alarms: {n_total}")
                print(f"Alarms grouped into {n_clusters} clusters")
                if n_noise > 0:
                    print(f"Unclustered alarms (noise): {n_noise}")
                # n_clusters >= 2 is guaranteed by the check above.
                print(f"Average alarms per cluster: {(n_total - n_noise) / n_clusters:.2f}")

            # Store metrics
            clustering_results['metrics'] = {
                'silhouette_score': silhouette,
                'davies_bouldin_index': davies_bouldin,
                'calinski_harabasz_index': calinski_harabasz,
                'n_clusters': n_clusters,
                'n_noise': n_noise
            }

            print("\n" + "=" * 60)

        except Exception as e:
            # Callback boundary: report the failure in the output widget.
            print(f"Error calculating metrics: {e}")
            import traceback
            traceback.print_exc()

# Button that triggers calculate_metrics(); the lambda discards the button
# argument that on_click passes to its callback.
calculate_metrics_btn = widgets.Button(
    description='Calculate Metrics',
    button_style='info',
    layout=Layout(width='200px', margin='10px 0px')
)
calculate_metrics_btn.on_click(lambda b: calculate_metrics())

# Render the section: heading, button, and the output area for the report.
display(widgets.VBox([
    widgets.HTML("<h3>Metrics & Analysis</h3>"),
    calculate_metrics_btn,
    metrics_output
]))



## 9. Save Results

Save clustering results, configurations, and visualizations.



In [None]:
# Results Saving Functions

# Directory (relative to the working directory) that save_results() writes into.
RESULTS_DIR = Path("results")
# Scrollable output area that save_results() prints its status into.
results_output = widgets.Output(layout=Layout(height='200px', overflow='auto'))

def save_results():
    """Write the current clustering run to ``RESULTS_DIR``.

    Two files sharing one timestamp are produced:

    * ``<use_case>_<algorithm>_<timestamp>.json`` with the use case,
      algorithm, parameters, metrics, cluster counts, questionnaire answers,
      selected features and preprocessing settings.
    * ``<use_case>_<algorithm>_<timestamp>_labels.csv`` (only when both
      ``uploaded_data`` and ``cluster_labels`` exist) with the uploaded rows
      plus a ``cluster_label`` column. If the uploaded data and the labels
      differ in length, only the label column is written, because rows
      cannot be matched reliably.

    Parameters are taken from ``clustering_results['parameters']`` when that
    key is present; otherwise the current values of the parameter widgets
    are used.

    All output, including error messages, is written to ``results_output``.
    """
    with results_output:
        clear_output()

        if not clustering_results:
            print("Error: No clustering results to save")
            return

        try:
            RESULTS_DIR.mkdir(parents=True, exist_ok=True)

            # One timestamp for the JSON body and both filenames.
            now = pd.Timestamp.now()
            algorithm = clustering_results['algorithm']

            parameters = clustering_results.get('parameters')
            if parameters is None:
                # No recorded parameters: fall back to the widgets' current values.
                if algorithm == "K-Means":
                    parameters = {'n_clusters': kmeans_n_clusters.value}
                elif algorithm == "DBSCAN":
                    parameters = {'eps': dbscan_eps.value, 'min_samples': dbscan_min_samples.value}
                elif algorithm == "HDBSCAN":
                    parameters = {'min_cluster_size': hdbscan_min_cluster_size.value, 'min_samples': hdbscan_min_samples.value}
                elif algorithm == "Hierarchical":
                    parameters = {'n_clusters': hierarchical_n_clusters.value, 'linkage': hierarchical_linkage.value}
                elif algorithm == "GMM":
                    parameters = {'n_components': gmm_n_components.value, 'covariance_type': gmm_covariance_type.value}
                else:
                    parameters = {}

            if current_use_case:
                use_case_name = use_cases_registry.get(current_use_case, {}).get('name', 'Unknown')
            else:
                use_case_name = 'Unknown'

            metrics = clustering_results.get('metrics', {})

            # Create results dictionary
            save_data = {
                'use_case': current_use_case,
                'use_case_name': use_case_name,
                'algorithm': algorithm,
                'parameters': parameters,
                'metrics': metrics,
                'cluster_info': {
                    'n_clusters': clustering_results['n_clusters'],
                    'n_noise': clustering_results['n_noise']
                },
                'questionnaire_answers': questionnaire_answers,
                'selected_features': selected_features,
                'preprocessing': {
                    'handle_missing': handle_missing_dropdown.value,
                    'scale_features': scale_features_checkbox.value,
                    'scaler_type': scaler_type_dropdown.value,
                    'encode_categorical': encode_categorical_checkbox.value
                },
                'timestamp': now.isoformat()
            }

            # Generate filename
            timestamp_str = now.strftime("%Y%m%d_%H%M%S")
            use_case_str = current_use_case if current_use_case else "generic"
            filename = f"{use_case_str}_{algorithm}_{timestamp_str}.json"
            filepath = RESULTS_DIR / filename

            # Save JSON; default=str stringifies values json cannot encode natively.
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(save_data, f, indent=2, default=str)

            # Save cluster labels to CSV
            if uploaded_data is not None and cluster_labels is not None:
                if len(uploaded_data) == len(cluster_labels):
                    results_df = uploaded_data.copy()
                    results_df['cluster_label'] = cluster_labels
                else:
                    print(
                        f"Warning: uploaded data has {len(uploaded_data)} rows but there are "
                        f"{len(cluster_labels)} labels; saving labels only"
                    )
                    results_df = pd.DataFrame({'cluster_label': cluster_labels})
                csv_filename = f"{use_case_str}_{algorithm}_{timestamp_str}_labels.csv"
                csv_filepath = RESULTS_DIR / csv_filename
                results_df.to_csv(csv_filepath, index=False)
                print(f"✓ Saved cluster labels to: {csv_filename}")

            print("✓ Results saved successfully!")
            print(f"File: {filename}")
            print(f"Location: {filepath}")
            print("\nSaved information:")
            print(f"  - Use case: {save_data['use_case_name']}")
            print(f"  - Algorithm: {save_data['algorithm']}")
            print(f"  - Parameters: {save_data['parameters']}")
            if metrics:
                silhouette = metrics.get('silhouette_score')
                # Format only a real number; 'N/A' cannot take a float format spec.
                silhouette_text = f"{silhouette:.4f}" if silhouette is not None else "N/A"
                print(f"  - Metrics: Silhouette={silhouette_text}")

        except Exception as e:
            # Callback boundary: report the failure in the output widget.
            print(f"Error saving results: {e}")
            import traceback
            traceback.print_exc()

# Button that triggers save_results(); the lambda discards the button
# argument that on_click passes to its callback.
save_results_btn = widgets.Button(
    description='Save Results',
    button_style='success',
    layout=Layout(width='200px', margin='10px 0px')
)
save_results_btn.on_click(lambda b: save_results())

# Render the section: heading, button, and the output area for status messages.
display(widgets.VBox([
    widgets.HTML("<h3>Save Results</h3>"),
    save_results_btn,
    results_output
]))



## 10. Integration & Auto-updates

This section ensures components are properly connected and update automatically.



In [None]:
# Integration: Connect file upload to feature selection update

def on_file_upload_for_integration(change):
    """Observer callback: refresh the feature selector once data is loaded.

    ``change`` is the notification passed by the observed widget; it is not
    inspected. Nothing happens while ``uploaded_data`` is still ``None``.
    """
    if uploaded_data is None:
        return
    update_feature_selection()

# Re-observe file upload to trigger feature selection update.
# The globals() check skips the wiring when the widget has not been created
# (e.g. the cell that defines it was not run).
if 'file_upload' in globals():
    file_upload.observe(on_file_upload_for_integration, names='value')

# Auto-update questionnaire when use case changes.
# NOTE(review): on_use_case_change_for_questionnaire is not defined in this
# cell and is not covered by the guard; presumably it comes from the
# questionnaire cell. Confirm that cell always runs first.
if 'use_case_dropdown' in globals():
    use_case_dropdown.observe(on_use_case_change_for_questionnaire, names='value')

# Apply use case defaults when use case is selected
def on_use_case_selected_for_defaults(change):
    """Observer callback for the use case dropdown.

    Calls ``apply_use_case_defaults()`` and then ``show_recommendations()``.
    ``change`` is the notification passed by the observed widget; it is not
    inspected.
    """
    apply_use_case_defaults()
    show_recommendations()

# Register the defaults/recommendations callback only if the dropdown exists.
if 'use_case_dropdown' in globals():
    use_case_dropdown.observe(on_use_case_selected_for_defaults, names='value')

# Printed unconditionally, i.e. also when the guards above skipped the wiring.
print("✓ Integration complete - components are connected")

