In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv1D, Dropout, Activation, LayerNormalization
from tensorflow.keras.layers import MultiHeadAttention, GlobalAveragePooling1D, Reshape, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import json
from collections import Counter
import time
import joblib

In [2]:
# Seed the NumPy and TensorFlow global random generators with the same value
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [3]:
# Pipeline-wide settings; the functions below read them through CONFIG[...]
CONFIG = {
    # --- Data location and windowing ---
    'data_dir': 'NPPAD',              # Root folder that is walked for *.csv files
    'sequence_length': 18,            # Rows per input window (18 rows x 10 s = 3 minutes of history)
    # Number of rows right after the window that are checked for label == 1.
    # With the value 1 only the single next row is inspected.
    # NOTE(review): an earlier comment described this as a 180 s look-ahead; that
    # only holds if the CSV labels already encode it - confirm against the data.
    'prediction_horizon': 1,

    # --- Training loop ---
    'k_folds': 5,                     # Number of cross-validation folds
    'batch_size': 64,                 # Mini-batch size passed to model.fit
    'epochs': 100,                    # Upper bound on training epochs
    'patience': 10,                   # EarlyStopping patience (epochs without val_loss improvement)

    # --- Network architecture ---
    'tcn_filters': [64, 128, 128],    # Conv filters, one entry per TCN residual block
    'tcn_kernel_size': 3,             # Kernel size of every dilated convolution
    # NOTE(review): four dilations but only three filter entries; the model
    # builder zips the two lists, so the last dilation (8) is never used.
    'tcn_dilations': [1, 2, 4, 8],
    'attention_heads': 4,             # Heads of the self-attention layer
    'dropout_rate': 0.3,              # Dropout applied after each convolution

    # --- Optimisation and data splits ---
    'learning_rate': 0.001,           # Initial Adam learning rate
    'test_size': 0.2,                 # Fraction of all windows held out for testing
    'val_size': 0.2,                  # Fraction of the remaining training windows used for validation

    # --- Output locations ---
    'model_dir': 'models',            # Where checkpoints, scaler and feature list are written
    'results_dir': 'results',         # Where metrics JSON and figures are written

    # --- Class imbalance ---
    'class_weight': {0: 1, 1: 2}      # Per-class loss weights handed to model.fit
}

In [4]:
def create_directories():
    """Make sure the model, results and results/figures output folders exist."""
    figures_dir = os.path.join(CONFIG['results_dir'], 'figures')
    for folder in (CONFIG['model_dir'], CONFIG['results_dir'], figures_dir):
        # exist_ok=True turns the call into a no-op for folders that are already present
        os.makedirs(folder, exist_ok=True)


In [5]:
def load_and_preprocess_data():
    """Load every CSV under CONFIG['data_dir'] and cut it into sliding windows.

    Each file is read with pandas. All columns except TIME, label,
    accident_timestamp and accident_type are treated as features; the feature
    list is taken from the first file (in sorted path order) and applied to
    every file. A window of CONFIG['sequence_length'] consecutive rows becomes
    one input sequence; its target is 1 when any of the following
    CONFIG['prediction_horizon'] rows has label == 1, else 0.

    A window is only kept when every gap between consecutive TIME values
    inside the window *and* its target rows equals the sampling interval
    (CONFIG['sampling_interval'] if present, otherwise 10.0) within 1e-5.
    Files that are too short, or that raise while being processed, are skipped
    as a whole (the error is printed).

    Returns:
        X: np.ndarray of stacked windows, shape
           (n_sequences, sequence_length, n_features).
        y: np.ndarray of 0/1 targets, one per window.
        feature_cols: list of feature column names, in column order.

    Raises:
        FileNotFoundError: no CSV file exists under CONFIG['data_dir'].
        ValueError: files were found but no valid window could be built.
    """
    print("Loading and preprocessing data...")

    seq_len = CONFIG['sequence_length']
    horizon = CONFIG['prediction_horizon']
    window_span = seq_len + horizon          # rows needed for one (input, target) pair
    step_seconds = CONFIG.get('sampling_interval', 10.0)
    step_tolerance = 1e-5

    # Find all CSV files; sort them because directory/glob order is OS-dependent
    # and the first file decides the feature schema.
    all_files = []
    for root, _, _ in os.walk(CONFIG['data_dir']):
        all_files.extend(glob.glob(os.path.join(root, '*.csv')))
    all_files.sort()

    print(f"Found {len(all_files)} CSV files")
    if not all_files:
        raise FileNotFoundError(
            f"No CSV files found under '{CONFIG['data_dir']}'")

    # Load one file to determine the feature columns
    sample_df = pd.read_csv(all_files[0])
    non_feature_cols = ['TIME', 'label', 'accident_timestamp', 'accident_type']
    feature_cols = [col for col in sample_df.columns if col not in non_feature_cols]

    print(f"Found {len(feature_cols)} feature columns")

    all_sequences = []
    all_labels = []
    accident_types = []

    for file in all_files:
        try:
            df = pd.read_csv(file)

            # Skip files with too few rows for even a single window
            if len(df) < window_span:
                continue

            features = df[feature_cols].values
            times = df['TIME'].values
            labels = df['label'].values

            # Accident type of this file (only meaningful when it contains label 1)
            file_accident_type = None
            if 1 in labels:
                candidate = df['accident_type'].iloc[0]
                if isinstance(candidate, str):
                    file_accident_type = candidate

            # gap_ok[j] tells whether TIME[j+1] - TIME[j] matches the sampling step
            # (a NaN gap compares False and is therefore treated as irregular).
            # bad_gaps_before[k] counts irregular gaps among gaps 0..k-1, so the
            # number of irregular gaps inside a window is a simple difference.
            gap_ok = np.abs(np.diff(times) - step_seconds) <= step_tolerance
            bad_gaps_before = np.concatenate(([0], np.cumsum(~gap_ok)))

            file_sequences = []
            file_labels = []
            for i in range(len(df) - window_span + 1):
                # Rows i .. i+window_span-1 span gaps i .. i+window_span-2
                if bad_gaps_before[i + window_span - 1] != bad_gaps_before[i]:
                    continue

                file_sequences.append(features[i:i + seq_len])

                # Target is 1 if any row in the prediction horizon has label 1
                target_labels = labels[i + seq_len:i + window_span]
                file_labels.append(1 if 1 in target_labels else 0)

        except Exception as e:
            # Best effort: report the problem and move on to the next file
            print(f"Error processing {file}: {e}")
            continue

        # Commit only after the whole file was processed, so a failure half-way
        # through never leaves partial data from that file in the dataset.
        all_sequences.extend(file_sequences)
        all_labels.extend(file_labels)
        if file_accident_type is not None:
            accident_types.append(file_accident_type)

    if not all_sequences:
        raise ValueError(
            f"No sequences could be created from the {len(all_files)} CSV files "
            f"under '{CONFIG['data_dir']}'")

    # Convert to numpy arrays
    X = np.array(all_sequences)
    y = np.array(all_labels)

    print(f"Created {len(X)} sequences")
    print(f"Class distribution: {Counter(y)}")

    # Print accident type distribution
    if accident_types:
        print("Accident type distribution:")
        for acc_type, count in Counter(accident_types).items():
            print(f"  {acc_type}: {count}")

    return X, y, feature_cols

In [6]:
def residual_block(x, dilation_rate, nb_filters, kernel_size, dropout_rate):
    """Build one TCN residual block on top of tensor `x`.

    The main path is LayerNormalization followed by two identical stages of
    (causal dilated Conv1D with ReLU -> Dropout). The input is added back to
    the result; when its channel count differs from `nb_filters` it is first
    projected with a kernel-size-1 convolution.

    Args:
        x: input Keras tensor; its last axis is compared with `nb_filters`.
        dilation_rate: dilation used by both convolutions.
        nb_filters: number of filters of both convolutions (output channels).
        kernel_size: kernel size of both convolutions.
        dropout_rate: rate of the Dropout layer after each convolution.

    Returns:
        The tensor `shortcut + main_path`.
    """
    shortcut = x

    out = LayerNormalization()(x)

    # Two identical conv -> dropout stages
    for _ in range(2):
        out = Conv1D(
            filters=nb_filters,
            kernel_size=kernel_size,
            padding='causal',
            dilation_rate=dilation_rate,
            activation='relu',
        )(out)
        out = Dropout(dropout_rate)(out)

    # Project the shortcut when the channel counts disagree, so the sum is valid
    needs_projection = shortcut.shape[-1] != nb_filters
    if needs_projection:
        shortcut = Conv1D(nb_filters, 1, padding='same')(shortcut)

    return shortcut + out

In [7]:
def attention_block(x, num_heads, key_dim):
    """Apply multi-head self-attention to `x` and add the result back to `x`.

    Args:
        x: input Keras tensor, used as both query and value.
        num_heads: number of attention heads.
        key_dim: per-head key dimension passed to MultiHeadAttention.

    Returns:
        The tensor `x + attention(x, x)`.
    """
    self_attention = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
    attended = self_attention(x, x)

    # Residual (skip) connection around the attention layer
    return x + attended

In [8]:
def build_tcn_attention_model(input_shape):
    """Build and compile the TCN + self-attention binary classifier.

    Architecture: one residual block per (filters, dilation) pair taken from
    CONFIG['tcn_filters'] and CONFIG['tcn_dilations'], then a self-attention
    block, global average pooling over the sequence axis and a single sigmoid
    output unit.

    Args:
        input_shape: shape of one sample passed to `Input` (the callers in this
            file pass (sequence_length, n_features)).

    Returns:
        A compiled Keras `Model` (Adam, binary cross-entropy). Its metrics are
        named exactly 'accuracy', 'precision', 'recall' and 'auc'.
    """
    filters = CONFIG['tcn_filters']
    dilations = CONFIG['tcn_dilations']

    # zip() stops at the shorter list, so a length mismatch silently drops
    # entries. The resulting architecture is kept as it was, but the mismatch
    # is surfaced instead of being hidden.
    if len(filters) != len(dilations):
        import warnings
        warnings.warn(
            f"CONFIG['tcn_filters'] has {len(filters)} entries but "
            f"CONFIG['tcn_dilations'] has {len(dilations)}; only the first "
            f"{min(len(filters), len(dilations))} (filters, dilation) pairs are used."
        )

    inputs = Input(shape=input_shape)
    x = inputs

    # TCN blocks with increasing dilation rates
    for nb_filters, dilation_rate in zip(filters, dilations):
        x = residual_block(
            x,
            dilation_rate=dilation_rate,
            nb_filters=nb_filters,
            kernel_size=CONFIG['tcn_kernel_size'],
            dropout_rate=CONFIG['dropout_rate']
        )

    # Attention mechanism; guard against a zero key_dim when there are more
    # heads than filters
    key_dim = max(1, filters[-1] // CONFIG['attention_heads'])
    x = attention_block(x, CONFIG['attention_heads'], key_dim=key_dim)

    # Global pooling to reduce sequence dimension
    x = GlobalAveragePooling1D()(x)

    # Output layer
    outputs = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=inputs, outputs=outputs)

    # Metric objects get explicit names. Without them Keras auto-generates
    # names that receive a numeric suffix ('precision_1', 'auc_2', ...) for every
    # further model built in the same session, which breaks lookups such as
    # history.history['precision'] from the second cross-validation fold on.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=CONFIG['learning_rate']),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(name='precision'),
            tf.keras.metrics.Recall(name='recall'),
            tf.keras.metrics.AUC(name='auc')
        ]
    )

    return model

In [9]:
def _resolve_metric_key(history_dict, base_name):
    """Return the key in `history_dict` that holds metric `base_name`, or None.

    Accepts the exact name as well as the same name followed by '_<digits>'
    (e.g. 'precision_3'), which is how auto-named Keras metrics can show up
    when several models are created in one session.
    """
    if base_name in history_dict:
        return base_name
    prefix = base_name + '_'
    for key in history_dict:
        if key.startswith(prefix) and key[len(prefix):].isdigit():
            return key
    return None


def plot_training_history(history, fold=None):
    """Plot train/validation accuracy, loss, precision and recall in a 2x2 grid.

    The figure is written to <results_dir>/figures/training_history.png, or
    training_history_fold<fold>.png when `fold` is given, and then closed.

    Args:
        history: object returned by `model.fit`; only its `.history` dict is read.
        fold: optional fold identifier appended to the file name.
    """
    logged = history.history

    # (metric name, axis label, legend position) for the four panels, in grid order
    panels = [
        ('accuracy', 'Accuracy', 'lower right'),
        ('loss', 'Loss', 'upper right'),
        ('precision', 'Precision', 'lower right'),
        ('recall', 'Recall', 'lower right'),
    ]

    plt.figure(figsize=(12, 10))

    for position, (metric, label, legend_loc) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.title(f'Model {label}')
        plt.ylabel(label)
        plt.xlabel('Epoch')

        train_key = _resolve_metric_key(logged, metric)
        if train_key is None:
            # Leave the panel empty rather than aborting the whole figure
            print(f"Metric '{metric}' not found in training history; panel left empty")
            continue

        curve_names = ['Train']
        plt.plot(logged[train_key])

        # Validation values are logged under 'val_' + the training key
        val_key = 'val_' + train_key
        if val_key in logged:
            plt.plot(logged[val_key])
            curve_names.append('Validation')

        plt.legend(curve_names, loc=legend_loc)

    plt.tight_layout()

    # Save figure
    plt_path = os.path.join(CONFIG['results_dir'], 'figures',
                           f'training_history{"_fold"+str(fold) if fold is not None else ""}.png')
    plt.savefig(plt_path)
    plt.close()

In [10]:
def plot_confusion_matrix(y_true, y_pred, fold=None):
    """Plot and save the confusion matrix of thresholded predictions.

    The figure is written to <results_dir>/figures/confusion_matrix.png, or
    confusion_matrix_fold<fold>.png when `fold` is given, and then closed.

    Args:
        y_true: true 0/1 labels; any array-like (list or ndarray).
        y_pred: predicted scores; any array-like, including a plain Python
            list or an (n, 1) array. Scores above 0.5 count as class 1.
        fold: optional fold identifier appended to the file name.
    """
    # Coerce to flat arrays first: comparing a Python list with 0.5 raises
    # TypeError, and model.predict returns a column vector.
    true_labels = np.asarray(y_true).ravel()
    predicted_labels = (np.asarray(y_pred).ravel() > 0.5).astype(int)

    # Fix the label set so the matrix is always 2x2 and matches the tick
    # labels below, even when one class is missing from this subset.
    cm = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Accident'],
                yticklabels=['Normal', 'Accident'])
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    # Save figure
    plt_path = os.path.join(CONFIG['results_dir'], 'figures',
                           f'confusion_matrix{"_fold"+str(fold) if fold is not None else ""}.png')
    plt.savefig(plt_path)
    plt.close()

In [11]:
def train_with_kfold(X, y):
    """Train and evaluate the model with k-fold cross-validation.

    For every fold a MinMaxScaler is fitted on the training windows only, a
    fresh model is built and trained with early stopping, LR reduction and
    checkpointing, and the validation fold is evaluated. Per-fold figures, the
    pooled confusion matrix and <results_dir>/kfold_results.json are written.

    NOTE(review): KFold shuffles individual windows. The windows are cut with
    stride 1 by load_and_preprocess_data, so heavily overlapping windows from
    the same file can end up in both the training and the validation part of a
    fold; the reported scores are therefore likely optimistic. Splitting by
    file would need per-window group ids, which X and y do not carry.

    Args:
        X: array of windows, indexed as (sample, time step, feature).
        y: array of 0/1 targets, one per window.

    Returns:
        dict with keys 'config', 'fold_results', 'overall_auc' and
        'classification_report'.
    """
    print(f"Starting {CONFIG['k_folds']}-fold cross-validation...")

    # Initialize k-fold
    kfold = KFold(n_splits=CONFIG['k_folds'], shuffle=True, random_state=42)

    # Initialize results tracking
    fold_results = []
    all_val_predictions = []
    all_val_true = []

    # Train and evaluate for each fold
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f"\nTraining fold {fold+1}/{CONFIG['k_folds']}")

        # Release the previous fold's graph/layer state: keeps memory flat over
        # the folds and resets Keras' automatic layer/metric name counters.
        tf.keras.backend.clear_session()

        # Split data
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Min-Max scaling works on 2D data: flatten (sample, step) into rows,
        # fit on the training rows only, then restore the 3D shape.
        n_features = X_train_fold.shape[-1]
        scaler = MinMaxScaler()
        X_train_fold = scaler.fit_transform(
            X_train_fold.reshape(-1, n_features)).reshape(X_train_fold.shape)
        X_val_fold = scaler.transform(
            X_val_fold.reshape(-1, n_features)).reshape(X_val_fold.shape)

        # Build model
        model = build_tcn_attention_model((X_train_fold.shape[1], X_train_fold.shape[2]))

        if fold == 0:
            # Print model summary for the first fold
            model.summary()
            try:
                plot_model(model, to_file=os.path.join(CONFIG['results_dir'], 'model_architecture.png'),
                           show_shapes=True)
            except Exception as e:
                # plot_model needs optional system packages; the plot is not essential
                print(f"Could not generate model plot: {e}")

        # Callbacks
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                patience=CONFIG['patience'],
                restore_best_weights=True
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-6
            ),
            ModelCheckpoint(
                filepath=os.path.join(CONFIG['model_dir'], f'model_fold_{fold+1}.h5'),
                monitor='val_loss',
                save_best_only=True
            )
        ]

        # Train model
        start_time = time.time()
        history = model.fit(
            X_train_fold, y_train_fold,
            epochs=CONFIG['epochs'],
            batch_size=CONFIG['batch_size'],
            validation_data=(X_val_fold, y_val_fold),
            callbacks=callbacks,
            class_weight=CONFIG['class_weight']
        )
        train_time = time.time() - start_time

        # Plot training history
        plot_training_history(history, fold+1)

        # Evaluate on validation set
        val_loss, val_acc, val_precision, val_recall, val_auc = model.evaluate(X_val_fold, y_val_fold)

        # Get predictions as a flat vector (predict returns one column)
        val_pred = model.predict(X_val_fold).ravel()
        all_val_predictions.extend(val_pred)
        all_val_true.extend(y_val_fold)

        # Plot confusion matrix
        plot_confusion_matrix(y_val_fold, val_pred, fold+1)

        # Best epoch = epoch with the lowest validation loss (1-based). Taking
        # "epochs run minus patience" is wrong whenever training reaches the
        # epoch limit without early stopping.
        best_epoch = int(np.argmin(history.history['val_loss'])) + 1

        # Save fold results
        fold_result = {
            'fold': fold + 1,
            'val_loss': float(val_loss),
            'val_accuracy': float(val_acc),
            'val_precision': float(val_precision),
            'val_recall': float(val_recall),
            'val_auc': float(val_auc),
            'training_time': train_time,
            'best_epoch': best_epoch
        }

        fold_results.append(fold_result)
        print(f"Fold {fold+1} results: {fold_result}")

    # Pool the out-of-fold predictions. They are converted to arrays because a
    # Python list cannot be compared with a threshold.
    all_val_true = np.asarray(all_val_true)
    all_val_predictions = np.asarray(all_val_predictions)

    # Calculate overall performance
    overall_auc = roc_auc_score(all_val_true, all_val_predictions)
    binary_predictions = (all_val_predictions > 0.5).astype(int)
    report = classification_report(all_val_true, binary_predictions, output_dict=True)

    # Save results
    results = {
        'config': CONFIG,
        'fold_results': fold_results,
        'overall_auc': float(overall_auc),
        'classification_report': report
    }

    with open(os.path.join(CONFIG['results_dir'], 'kfold_results.json'), 'w') as f:
        json.dump(results, f, indent=4)

    # Plot overall confusion matrix
    plot_confusion_matrix(all_val_true, all_val_predictions)

    return results

In [12]:
def train_final_model(X, y):
    """Train the final model on a train/validation/test split and evaluate it.

    The data is split (stratified) into test (CONFIG['test_size']) and the
    rest; the rest is split again into validation (CONFIG['val_size']) and
    training. A MinMaxScaler fitted on the training windows is saved to
    <model_dir>/scaler.pkl. The best checkpoint goes to
    <model_dir>/final_model.h5, the trained model to
    <model_dir>/final_model_saved and the test metrics to
    <results_dir>/test_results.json.

    NOTE(review): the split is done on individual windows, which are cut with
    stride 1 by load_and_preprocess_data, so overlapping windows of one file
    can appear in both training and test data; test scores are likely
    optimistic.
    NOTE(review): plot_training_history / plot_confusion_matrix are called
    without a fold id here, so they write to the same file names as the pooled
    k-fold figures and replace them.

    Args:
        X: array of windows, indexed as (sample, time step, feature).
        y: array of 0/1 targets, one per window.

    Returns:
        (model, test_results) - the trained Keras model and a dict with
        'test_loss', 'test_accuracy', 'test_precision', 'test_recall',
        'test_auc' and 'classification_report'.
    """
    print("\nTraining final model...")

    # Start from a clean Keras session: frees models left over from earlier
    # training runs and resets the automatic layer/metric name counters, so the
    # history keys are 'precision'/'recall' and not 'precision_5' etc.
    tf.keras.backend.clear_session()

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=CONFIG['test_size'], random_state=42, stratify=y
    )

    # Further split train into train and validation
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=CONFIG['val_size'], random_state=42, stratify=y_train
    )

    print(f"Train set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")

    # Scale features: fit on the training windows only, apply to all three sets
    n_features = X_train.shape[-1]
    scaler = MinMaxScaler()
    scaler.fit(X_train.reshape(-1, n_features))

    def _scale(windows):
        # MinMaxScaler expects 2D input: flatten, transform, restore the 3D shape
        return scaler.transform(windows.reshape(-1, n_features)).reshape(windows.shape)

    X_train, X_val, X_test = _scale(X_train), _scale(X_val), _scale(X_test)

    # Save scaler for future use
    joblib.dump(scaler, os.path.join(CONFIG['model_dir'], 'scaler.pkl'))

    # Build model
    model = build_tcn_attention_model((X_train.shape[1], X_train.shape[2]))

    # Callbacks
    callbacks = [
        EarlyStopping(
            monitor='val_loss',
            patience=CONFIG['patience'],
            restore_best_weights=True
        ),
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6
        ),
        ModelCheckpoint(
            filepath=os.path.join(CONFIG['model_dir'], 'final_model.h5'),
            monitor='val_loss',
            save_best_only=True
        )
    ]

    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=CONFIG['epochs'],
        batch_size=CONFIG['batch_size'],
        validation_data=(X_val, y_val),
        callbacks=callbacks,
        class_weight=CONFIG['class_weight']
    )

    # Plot training history
    plot_training_history(history)

    # Evaluate on test set
    test_loss, test_acc, test_precision, test_recall, test_auc = model.evaluate(X_test, y_test)

    # Get predictions as a flat vector (predict returns one column)
    test_pred = model.predict(X_test).ravel()

    # Plot confusion matrix
    plot_confusion_matrix(y_test, test_pred)

    # Generate classification report
    binary_predictions = (test_pred > 0.5).astype(int)
    report = classification_report(y_test, binary_predictions, output_dict=True)

    # Save test results
    test_results = {
        'test_loss': float(test_loss),
        'test_accuracy': float(test_acc),
        'test_precision': float(test_precision),
        'test_recall': float(test_recall),
        'test_auc': float(test_auc),
        'classification_report': report
    }

    with open(os.path.join(CONFIG['results_dir'], 'test_results.json'), 'w') as f:
        json.dump(test_results, f, indent=4)

    print(f"Test results: {test_results}")

    # Save model in SavedModel format for deployment. Keras 3 rejects paths
    # without a '.keras'/'.h5' extension in model.save() with a ValueError and
    # offers model.export() for SavedModel output instead, so fall back to it.
    saved_model_path = os.path.join(CONFIG['model_dir'], 'final_model_saved')
    try:
        model.save(saved_model_path)
    except ValueError:
        if not hasattr(model, 'export'):
            raise
        model.export(saved_model_path)

    print("Final model training complete!")

    return model, test_results

In [13]:
def main():
    """Run the whole pipeline end to end and return the final trained model.

    Steps: create the output folders, build the windowed dataset, store the
    feature column names in <model_dir>/feature_columns.json, run k-fold
    cross-validation, train the final model and print a short summary.
    """
    create_directories()

    X, y, feature_cols = load_and_preprocess_data()

    # Keep the feature column names next to the models for future reference
    columns_path = os.path.join(CONFIG['model_dir'], 'feature_columns.json')
    with open(columns_path, 'w') as f:
        json.dump(feature_cols, f)

    kfold_results = train_with_kfold(X, y)
    final_model, test_results = train_final_model(X, y)

    summary_lines = (
        "\nTraining pipeline complete!",
        f"Overall AUC across folds: {kfold_results['overall_auc']:.4f}",
        f"Final model test accuracy: {test_results['test_accuracy']:.4f}",
        f"Final model test AUC: {test_results['test_auc']:.4f}",
    )
    for line in summary_lines:
        print(line)

    return final_model

In [None]:
if __name__ == "__main__":
    # Run the pipeline; on failure print a one-line summary first, then
    # re-raise so the full traceback is still shown.
    try:
        main()
    except Exception as exc:
        print(f"Error during execution: {exc}")
        raise

Loading and preprocessing data...
Found 1211 CSV files
Found 96 feature columns
Created 467903 sequences
Class distribution: Counter({0: 286124, 1: 181779})
Accident type distribution:
  Reactor Scram: 660
Starting 5-fold cross-validation...

Training fold 1/5
