In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import glob
import os
import warnings

warnings.filterwarnings('ignore')

def train_neural_network(df):
    """Train a neural network to predict the quality of a clustering combination.

    Every row is labelled 'Good', 'Acceptable' or 'Poor' from its 'Silhouette'
    value, and a small dense classifier is fitted to predict that label from
    the dataset characteristics and clustering metrics listed in ``features``.

    Args:
        df: DataFrame containing 'Silhouette', 'Method', 'Algorithm' and every
            column named in ``features`` (except the two ``*_Encoded`` columns,
            which are derived here). The frame is not modified.

    Returns:
        A tuple ``(model, scaler, le_method, le_algorithm, le_quality)``.
        If any step fails, the error is printed and five ``None`` values are
        returned so the caller can fall back to a non-learned selection.
    """
    print("Training neural network...")

    # Silhouette cut-offs used to derive the target label.
    thresholds = {'good': 0.8, 'acceptable': 0.5}
    # Fixed label vocabulary: the encoder is fitted on this list so class
    # indices always line up with the softmax outputs, even when a class is
    # absent from the data or from the training split.
    quality_classes = ['Acceptable', 'Good', 'Poor']

    def assign_quality_dynamic(silhouette):
        """Map a silhouette value to a quality label (NaN falls through to 'Poor')."""
        if silhouette >= thresholds['good']:
            return 'Good'
        if silhouette >= thresholds['acceptable']:
            return 'Acceptable'
        return 'Poor'

    try:
        # Work on a copy so the caller's DataFrame keeps its original columns.
        df = df.copy()
        df['Quality'] = df['Silhouette'].apply(assign_quality_dynamic)

        # Encode categorical variables
        le_method = LabelEncoder()
        le_algorithm = LabelEncoder()
        df['Method_Encoded'] = le_method.fit_transform(df['Method'])
        df['Algorithm_Encoded'] = le_algorithm.fit_transform(df['Algorithm'])

        # Features: dataset characteristics + clustering metrics
        features = ['n_traces', 'n_unique_activities', 'avg_trace_length', 'max_trace_length', 'num_events',
                    'N_Clusters', 'Fitness', 'Simplicity', 'Precision', 'Generalization', 'Method_Encoded', 'Algorithm_Encoded']
        X = df[features].to_numpy()

        # Handle NaN/infinite values
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

        # Encode the target before splitting, against the full vocabulary, so
        # a class that lands only in the test split cannot break transform().
        le_quality = LabelEncoder()
        le_quality.fit(quality_classes)
        y = le_quality.transform(df['Quality'])

        # Split data (stratified when every class has enough members)
        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
        except ValueError as e:
            print(f"Error during train-test split: {e}")
            print("Falling back to non-stratified split...")
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Standardize features; fit on the training split only so test-set
        # statistics do not leak into training.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Define neural network; one softmax unit per quality class.
        model = Sequential([
            Dense(64, activation='relu', input_shape=(len(features),)),
            Dropout(0.2),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(16, activation='relu'),
            Dense(len(quality_classes), activation='softmax')
        ])

        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        # Train model
        model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=1)

        # Evaluate
        loss, accuracy = model.evaluate(X_test, y_test)
        print(f"Neural Network Test Accuracy: {accuracy:.4f}")

        return model, scaler, le_method, le_algorithm, le_quality
    except Exception as e:
        # Deliberate best-effort: report and let the caller fall back.
        print(f"Error in train_neural_network: {e}")
        return None, None, None, None, None

def _best_by_composite_score(results_df):
    """Return a copy of the highest-'Composite_Score' row, tagged Predicted_Quality='N/A'."""
    best_row = results_df.loc[results_df['Composite_Score'].idxmax()].copy()
    # No model prediction exists on this path; the key is still provided so
    # callers that read 'Predicted_Quality' do not hit a KeyError.
    best_row['Predicted_Quality'] = 'N/A'
    return best_row


def predict_best_combination(model, scaler, le_method, le_algorithm, le_quality, results_df):
    """Predict the best combination across all datasets.

    Rows are classified with the trained model; the row with the highest
    'Composite_Score' among those predicted 'Good' is returned, falling back
    to 'Acceptable' rows and then to all rows when a tier is empty.

    Args:
        model: Trained classifier exposing ``predict``, or ``None``.
        scaler: Fitted scaler exposing ``transform``.
        le_method: Fitted encoder for the 'Method' column.
        le_algorithm: Fitted encoder for the 'Algorithm' column.
        le_quality: Fitted encoder for the quality labels.
        results_df: DataFrame with 'Method', 'Algorithm', 'Composite_Score'
            and the feature columns listed below. Not modified.

    Returns:
        A pandas Series for the selected row. It always carries a
        'Predicted_Quality' entry: the model's label, or 'N/A' when the model
        is missing or prediction fails and selection falls back to the
        highest 'Composite_Score'.
    """
    if model is None:
        print("Error: Neural network model not trained. Selecting best combination by Composite Score.")
        return _best_by_composite_score(results_df)

    try:
        df = results_df.copy()
        df['Method_Encoded'] = le_method.transform(df['Method'])
        df['Algorithm_Encoded'] = le_algorithm.transform(df['Algorithm'])

        # Features
        features = ['n_traces', 'n_unique_activities', 'avg_trace_length', 'max_trace_length', 'num_events',
                    'N_Clusters', 'Fitness', 'Simplicity', 'Precision', 'Generalization', 'Method_Encoded', 'Algorithm_Encoded']
        X = df[features].to_numpy()
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
        X_scaled = scaler.transform(X)

        # Predict
        predictions = model.predict(X_scaled)
        df['Predicted_Quality'] = le_quality.inverse_transform(np.argmax(predictions, axis=1))

        # Select best combination: prefer the best non-empty quality tier.
        candidates = df
        for label in ('Good', 'Acceptable'):
            tier = df[df['Predicted_Quality'] == label]
            if not tier.empty:
                candidates = tier
                break

        return candidates.loc[candidates['Composite_Score'].idxmax()]
    except Exception as e:
        # Deliberate best-effort: report and fall back to the raw score.
        print(f"Error in predict_best_combination: {e}")
        return _best_by_composite_score(results_df)

def main_part2(results_dir="/kaggle/input/results"):
    """Aggregate results from Part 1 and train neural network to find the best combination.

    Reads every ``results_*.csv`` in ``results_dir``, concatenates them, writes
    the aggregate to 'all_best_combinations_aggregated.csv', trains the
    classifier, selects the best combination, prints it and writes it to
    'final_clustering_results.csv'.

    Args:
        results_dir: Directory holding the Part 1 result CSV files.

    Returns:
        None. Returns early (after printing a message) when the directory is
        missing, contains no matching files, or no file could be read.
    """
    # Check if results directory exists
    if not os.path.exists(results_dir):
        print(f"Results directory not found: {results_dir}")
        return

    # Aggregate results from all CSV files in the results directory.
    # Sorted because glob order is filesystem-dependent; a stable row order
    # keeps the seeded split and idxmax tie-breaks reproducible.
    result_files = sorted(glob.glob(os.path.join(results_dir, "results_*.csv")))
    if not result_files:
        print(f"No result files found in {results_dir}. Run Part 1 first.")
        return

    all_results = []
    for path in result_files:
        try:
            all_results.append(pd.read_csv(path))
        except Exception as e:
            # Skip unreadable files but keep processing the rest.
            print(f"Error reading {path}: {e}")

    if not all_results:
        print("No valid results to process.")
        return

    results_df = pd.concat(all_results, ignore_index=True)
    results_df.to_csv('all_best_combinations_aggregated.csv', index=False)
    print("Aggregated results saved to 'all_best_combinations_aggregated.csv'")

    # Train neural network
    model, scaler, le_method, le_algorithm, le_quality = train_neural_network(results_df)

    # Predict best combination
    best_combination = predict_best_combination(model, scaler, le_method, le_algorithm, le_quality, results_df)

    # Output best combination
    print("\nOverall Best Clustering Combination Across All Datasets:")
    print(f"Method: {best_combination['Method']}, Algorithm: {best_combination['Algorithm']}")
    # The predictor may have fallen back to score-only selection, in which
    # case the row may carry no 'Predicted_Quality' entry.
    print(f"Predicted Quality: {best_combination.get('Predicted_Quality', 'N/A')}")
    print(f"Metrics: Fitness={best_combination['Fitness']:.4f}, Simplicity={best_combination['Simplicity']:.4f}, "
          f"Precision={best_combination['Precision']:.4f}, Generalization={best_combination['Generalization']:.4f}")
    print(f"Silhouette Index: {best_combination['Silhouette']:.4f}, Composite Score: {best_combination['Composite_Score']:.4f}")

    # Save final results
    pd.DataFrame([best_combination]).to_csv('final_clustering_results.csv', index=False)
    print("Final best combination saved to 'final_clustering_results.csv'")

# Run the aggregation/training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main_part2()

2025-07-14 01:37:33.162223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752457053.350762      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752457053.409901      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Aggregated results saved to 'all_best_combinations_aggregated.csv'
Training neural network...


I0000 00:00:1752457065.535368      37 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/20


I0000 00:00:1752457068.843656      96 service.cc:148] XLA service 0x7c7f1800b4f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1752457068.844449      96 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1752457069.091863      96 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.2727 - loss: 1.1374

I0000 00:00:1752457070.602295      96 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.2727 - loss: 1.1374 - val_accuracy: 0.0000e+00 - val_loss: 1.2046
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.6364 - loss: 1.0132 - val_accuracy: 0.0000e+00 - val_loss: 1.1727
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.3636 - loss: 1.0706 - val_accuracy: 0.0000e+00 - val_loss: 1.1419
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.4545 - loss: 1.0454 - val_accuracy: 0.0000e+00 - val_loss: 1.1092
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.7273 - loss: 0.9420 - val_accuracy: 0.3333 - val_loss: 1.0743
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.6364 - loss: 0.9428 - val_accuracy: 0.6667 - val_loss: 1.0411
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━