In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam
from pytorch_forecasting.models import TemporalFusionTransformer
import torch
import torch.nn as nn
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from prophet import Prophet
import warnings
warnings.filterwarnings('ignore')



In [10]:
class DataProcessor:
    """
    Handles data ingestion, cleaning, and feature engineering for AIX server metrics
    """
    
    def __init__(self):
        self.scalers = {}
        self.feature_columns = []
        self.temporal_features = ['hour', 'day_of_week', 'day_of_month', 'is_weekend']
        
    def load_and_merge_data(self, vmstat_df, iostat_df, netstat_df, process_df):
        """
        Merge all metric dataframes on timestamp and id
        """
        # Convert timestamps to datetime
        for df in [vmstat_df, iostat_df, netstat_df, process_df]:
            df['timestamp'] = pd.to_datetime(df['timestamp'])
        
        # Aggregate process metrics by timestamp and id
        process_agg = process_df.groupby(['id', 'timestamp']).agg({
            'cpu': ['mean', 'max', 'std'],
            'mem': ['mean', 'max', 'std'],
            'pid': 'count'  # number of processes
        }).reset_index()
        
        process_agg.columns = ['id', 'timestamp', 'cpu_mean', 'cpu_max', 'cpu_std',
                              'mem_mean', 'mem_max', 'mem_std', 'process_count']
        
        # Aggregate iostat metrics by timestamp and id
        iostat_agg = iostat_df.groupby(['id', 'timestamp']).agg({
            'tps': 'sum',
            'kb_read': 'sum', 
            'kb_wrtn': 'sum',
            'service_time': 'mean'
        }).reset_index()
        
        # Aggregate netstat metrics by timestamp and id
        netstat_agg = netstat_df.groupby(['id', 'timestamp']).agg({
            'ipkts_rate': 'sum',
            'opkts_rate': 'sum',
            'ierrs_rate': 'sum',
            'oerrs_rate': 'sum'
        }).reset_index()
        
        # Merge all dataframes
        merged_df = vmstat_df.merge(iostat_agg, on=['id', 'timestamp'], how='left')
        merged_df = merged_df.merge(netstat_agg, on=['id', 'timestamp'], how='left')
        merged_df = merged_df.merge(process_agg, on=['id', 'timestamp'], how='left')
        
        return merged_df.fillna(0)
    
    def create_temporal_features(self, df):
        """
        Create time-based features
        """
        df = df.copy()
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['day_of_month'] = df['timestamp'].dt.day
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        
        return df
    
    def create_statistical_features(self, df, window_sizes=[5, 15, 30]):
        """
        Create rolling statistical features
        """
        df = df.copy()
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['id', 'hour', 'day_of_week', 'day_of_month', 'is_weekend']]
        
        for window in window_sizes:
            for col in numeric_cols:
                df[f'{col}_rolling_mean_{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).mean())
                df[f'{col}_rolling_std_{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).std())
                df[f'{col}_rolling_max_{window}'] = df.groupby('id')[col].transform(lambda x: x.rolling(window, min_periods=1).max())
        
        return df.fillna(0)
    
    def create_cross_metric_features(self, df):
        """
        Create features that capture relationships between different metrics
        """
        df = df.copy()
        
        # CPU-Memory correlation
        df['cpu_mem_ratio'] = df['us'] / (df['mem_mean'] + 1e-6)
        
        # IO efficiency
        df['io_efficiency'] = (df['kb_read'] + df['kb_wrtn']) / (df['tps'] + 1e-6)
        
        # Network error rates
        df['net_error_rate'] = (df['ierrs_rate'] + df['oerrs_rate']) / (df['ipkts_rate'] + df['opkts_rate'] + 1e-6)
        
        # System load indicator
        df['system_load'] = df['us'] + df['sy'] + (df['r'] * 10)  # Running processes weighted higher
        
        return df.fillna(0)
    
    def detect_and_handle_outliers(self, df, method='iqr', threshold=3):
        """
        Detect and handle outliers
        """
        df = df.copy()
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['id', 'hour', 'day_of_week', 'day_of_month', 'is_weekend']]
        
        for col in numeric_cols:
            if method == 'iqr':
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
            
            elif method == 'zscore':
                z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
                df[col] = df[col].where(z_scores <= threshold, df[col].median())
        
        return df
    
    def normalize_features(self, df, method='robust'):
        """
        Normalize features for ML models
        """
        df = df.copy()
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_cols = [col for col in numeric_cols if col not in ['id', 'hour', 'day_of_week', 'day_of_month', 'is_weekend']]
        
        if method == 'robust':
            scaler = RobustScaler()
        else:
            scaler = StandardScaler()
        
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        self.scalers['main_scaler'] = scaler
        self.feature_columns = numeric_cols
        
        return df
    
    def process_data(self, vmstat_df, iostat_df, netstat_df, process_df):
        """
        Complete data processing pipeline
        """
        # Merge data
        df = self.load_and_merge_data(vmstat_df, iostat_df, netstat_df, process_df)
        
        # Feature engineering
        df = self.create_temporal_features(df)
        df = self.create_statistical_features(df)
        df = self.create_cross_metric_features(df)
        
        # Handle outliers
        df = self.detect_and_handle_outliers(df)
        
        # Normalize
        df = self.normalize_features(df)
        
        return df



In [11]:

class ForecastingEngine:
    """
    Multi-model forecasting engine with ensemble capabilities
    """
    
    def __init__(self, sequence_length=50):
        self.sequence_length = sequence_length
        self.models = {}
        self.model_weights = {}
        self.feature_columns = []
        
    def create_lstm_model(self, input_shape, horizon=24):
        """
        Create LSTM model for long-term forecasting
        """
        model = Sequential([
            LSTM(128, return_sequences=True, input_shape=input_shape),
            Dropout(0.2),
            LSTM(64, return_sequences=True),
            Dropout(0.2),
            LSTM(32, return_sequences=False),
            Dropout(0.2),
            Dense(horizon)
        ])
        
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
        return model
    
    def create_transformer_model(self, input_shape, horizon=12):
        """
        Create Transformer model for medium-term forecasting
        """
        inputs = Input(shape=input_shape)
        
        # Multi-head attention
        attention = tf.keras.layers.MultiHeadAttention(
            num_heads=8, key_dim=64
        )(inputs, inputs)
        
        # Add & Norm
        attention = tf.keras.layers.LayerNormalization()(inputs + attention)
        
        # Feed Forward
        ff = tf.keras.layers.Dense(128, activation='relu')(attention)
        ff = tf.keras.layers.Dense(input_shape[-1])(ff)
        
        # Add & Norm
        output = tf.keras.layers.LayerNormalization()(attention + ff)
        
        # Output layer
        output = tf.keras.layers.GlobalAveragePooling1D()(output)
        output = tf.keras.layers.Dense(horizon)(output)
        
        model = Model(inputs=inputs, outputs=output)
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
        
        return model
    
    def create_sequences(self, data, target_col, horizon=1):
        """
        Create sequences for time series models
        """
        X, y = [], []
        
        for i in range(len(data) - self.sequence_length - horizon + 1):
            X.append(data[i:(i + self.sequence_length)])
            y.append(data[i + self.sequence_length:i + self.sequence_length + horizon, target_col])
        
        return np.array(X), np.array(y)
    
    def fit_arima_model(self, series, order=(5, 1, 0), seasonal_order=(1, 1, 1, 24)):
        """
        Fit ARIMA model for short-term forecasting
        """
        try:
            model = ARIMA(series, order=order, seasonal_order=seasonal_order)
            fitted_model = model.fit()
            return fitted_model
        except:
            # Fallback to simpler ARIMA
            model = ARIMA(series, order=(1, 1, 1))
            fitted_model = model.fit()
            return fitted_model
    
    def fit_prophet_model(self, df, target_col):
        """
        Fit Prophet model for long-term forecasting with seasonality
        """
        prophet_df = df[['timestamp', target_col]].copy()
        prophet_df.columns = ['ds', 'y']
        
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=True,
            daily_seasonality=True,
            changepoint_prior_scale=0.05
        )
        
        model.fit(prophet_df)
        return model
    
    def train_ensemble(self, df, target_columns):
        """
        Train all forecasting models
        """
        self.feature_columns = [col for col in df.columns if col not in ['id', 'timestamp']]
        
        for target_col in target_columns:
            print(f"Training models for {target_col}...")
            
            # Prepare data
            numeric_data = df[self.feature_columns].values
            target_idx = self.feature_columns.index(target_col)
            
            # Create sequences
            X_lstm, y_lstm_24 = self.create_sequences(numeric_data, target_idx, horizon=24)
            X_trans, y_trans_12 = self.create_sequences(numeric_data, target_idx, horizon=12)
            
            # Split data
            split_idx = int(0.8 * len(X_lstm))
            X_train_lstm, X_test_lstm = X_lstm[:split_idx], X_lstm[split_idx:]
            y_train_lstm, y_test_lstm = y_lstm_24[:split_idx], y_lstm_24[split_idx:]
            
            # Train LSTM
            lstm_model = self.create_lstm_model(X_train_lstm.shape[1:], horizon=24)
            lstm_model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, verbose=0)
            self.models[f'{target_col}_lstm'] = lstm_model
            
            # Train Transformer
            if len(X_trans) > 0:
                trans_model = self.create_transformer_model(X_trans.shape[1:], horizon=12)
                X_train_trans = X_trans[:int(0.8 * len(X_trans))]
                y_train_trans = y_trans_12[:int(0.8 * len(y_trans_12))]
                trans_model.fit(X_train_trans, y_train_trans, epochs=30, batch_size=32, verbose=0)
                self.models[f'{target_col}_transformer'] = trans_model
            
            # Train ARIMA
            series = df[target_col].values
            arima_model = self.fit_arima_model(series)
            self.models[f'{target_col}_arima'] = arima_model
            
            # Train Prophet
            prophet_model = self.fit_prophet_model(df, target_col)
            self.models[f'{target_col}_prophet'] = prophet_model
            
            # Initialize equal weights
            self.model_weights[target_col] = {'lstm': 0.25, 'transformer': 0.25, 'arima': 0.25, 'prophet': 0.25}
    
    def predict(self, df, target_col, horizon=24):
        """
        Generate ensemble predictions
        """
        predictions = {}
        
        # LSTM prediction
        if f'{target_col}_lstm' in self.models:
            numeric_data = df[self.feature_columns].values
            if len(numeric_data) >= self.sequence_length:
                X_pred = numeric_data[-self.sequence_length:].reshape(1, self.sequence_length, -1)
                lstm_pred = self.models[f'{target_col}_lstm'].predict(X_pred, verbose=0)[0]
                predictions['lstm'] = lstm_pred[:horizon]
        
        # ARIMA prediction
        if f'{target_col}_arima' in self.models:
            arima_pred = self.models[f'{target_col}_arima'].forecast(steps=horizon)
            predictions['arima'] = arima_pred
        
        # Prophet prediction
        if f'{target_col}_prophet' in self.models:
            future = self.models[f'{target_col}_prophet'].make_future_dataframe(periods=horizon, freq='H')
            prophet_pred = self.models[f'{target_col}_prophet'].predict(future)
            predictions['prophet'] = prophet_pred['yhat'].tail(horizon).values
        
        # Ensemble prediction
        if predictions:
            weights = self.model_weights.get(target_col, {})
            ensemble_pred = np.zeros(horizon)
            total_weight = 0
            
            for model_name, pred in predictions.items():
                if len(pred) == horizon:
                    weight = weights.get(model_name, 0.25)
                    ensemble_pred += pred * weight
                    total_weight += weight
            
            if total_weight > 0:
                ensemble_pred /= total_weight
            
            return ensemble_pred, predictions
        
        return None, {}




In [12]:
class AnomalyDetector:
    """
    Multi-method anomaly detection framework
    """
    
    def __init__(self, sequence_length=50):
        self.sequence_length = sequence_length
        self.models = {}
        self.thresholds = {
            'low': 0.3,
            'medium': 0.5,
            'high': 0.7,
            'critical': 0.9
        }
        self.feature_columns = []
        
    def create_lstm_autoencoder(self, input_shape):
        """
        Create LSTM Autoencoder for sequential anomaly detection
        """
        # Encoder
        encoder_inputs = Input(shape=input_shape)
        encoder = LSTM(64, return_sequences=True)(encoder_inputs)
        encoder = LSTM(32, return_sequences=False)(encoder)
        
        # Decoder
        decoder = RepeatVector(input_shape[0])(encoder)
        decoder = LSTM(32, return_sequences=True)(decoder)
        decoder = LSTM(64, return_sequences=True)(decoder)
        decoder_outputs = TimeDistributed(Dense(input_shape[1]))(decoder)
        
        autoencoder = Model(encoder_inputs, decoder_outputs)
        autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
        
        return autoencoder
    
    def create_sequences_for_autoencoder(self, data):
        """
        Create sequences for autoencoder training
        """
        X = []
        for i in range(len(data) - self.sequence_length + 1):
            X.append(data[i:(i + self.sequence_length)])
        return np.array(X)
    
    def train_detectors(self, df, contamination=0.05):
        """
        Train all anomaly detection models
        """
        self.feature_columns = [col for col in df.columns if col not in ['id', 'timestamp']]
        numeric_data = df[self.feature_columns].values
        
        # Train Isolation Forest
        iso_forest = IsolationForest(
            contamination=contamination,
            n_estimators=100,
            random_state=42
        )
        iso_forest.fit(numeric_data)
        self.models['isolation_forest'] = iso_forest
        
        # Train One-Class SVM
        oc_svm = OneClassSVM(kernel='rbf', nu=contamination)
        oc_svm.fit(numeric_data)
        self.models['oneclass_svm'] = oc_svm
        
        # Train LSTM Autoencoder
        sequences = self.create_sequences_for_autoencoder(numeric_data)
        if len(sequences) > 0:
            autoencoder = self.create_lstm_autoencoder((self.sequence_length, len(self.feature_columns)))
            autoencoder.fit(sequences, sequences, epochs=50, batch_size=32, verbose=0)
            self.models['lstm_autoencoder'] = autoencoder
            
            # Calculate reconstruction threshold
            reconstructions = autoencoder.predict(sequences, verbose=0)
            mse = np.mean(np.power(sequences - reconstructions, 2), axis=(1, 2))
            self.thresholds['reconstruction'] = np.percentile(mse, 95)
    
    def detect_anomalies(self, df):
        """
        Detect anomalies using ensemble approach
        """
        numeric_data = df[self.feature_columns].values
        anomaly_scores = np.zeros(len(df))
        detection_details = {}
        
        # Isolation Forest
        if 'isolation_forest' in self.models:
            iso_scores = self.models['isolation_forest'].decision_function(numeric_data)
            iso_scores = (iso_scores - iso_scores.min()) / (iso_scores.max() - iso_scores.min())
            anomaly_scores += iso_scores * 0.33
            detection_details['isolation_forest'] = iso_scores
        
        # One-Class SVM
        if 'oneclass_svm' in self.models:
            svm_scores = self.models['oneclass_svm'].decision_function(numeric_data)
            svm_scores = (svm_scores - svm_scores.min()) / (svm_scores.max() - svm_scores.min())
            anomaly_scores += svm_scores * 0.33
            detection_details['oneclass_svm'] = svm_scores
        
        # LSTM Autoencoder
        if 'lstm_autoencoder' in self.models and len(numeric_data) >= self.sequence_length:
            sequences = self.create_sequences_for_autoencoder(numeric_data)
            if len(sequences) > 0:
                reconstructions = self.models['lstm_autoencoder'].predict(sequences, verbose=0)
                mse = np.mean(np.power(sequences - reconstructions, 2), axis=(1, 2))
                
                # Pad the scores to match original length
                ae_scores = np.zeros(len(numeric_data))
                ae_scores[self.sequence_length-1:] = mse
                ae_scores = (ae_scores - ae_scores.min()) / (ae_scores.max() - ae_scores.min() + 1e-8)
                anomaly_scores += ae_scores * 0.34
                detection_details['lstm_autoencoder'] = ae_scores
        
        # Classify severity
        severity = np.where(anomaly_scores >= self.thresholds['critical'], 'Critical',
                   np.where(anomaly_scores >= self.thresholds['high'], 'High',
                   np.where(anomaly_scores >= self.thresholds['medium'], 'Medium',
                   np.where(anomaly_scores >= self.thresholds['low'], 'Low', 'Normal'))))
        
        return anomaly_scores, severity, detection_details
    
    def update_thresholds(self, feedback_data):
        """
        Adapt thresholds based on feedback
        """
        # This would be implemented based on expert feedback
        # For now, we'll use a simple adaptive approach
        if len(feedback_data) > 0:
            true_anomalies = feedback_data[feedback_data['is_anomaly'] == True]['score']
            if len(true_anomalies) > 0:
                self.thresholds['low'] = np.percentile(true_anomalies, 25)
                self.thresholds['medium'] = np.percentile(true_anomalies, 50)
                self.thresholds['high'] = np.percentile(true_anomalies, 75)
                self.thresholds['critical'] = np.percentile(true_anomalies, 90)




In [13]:
class ActiveLearningComponent:
    """
    Active learning for continuous model improvement
    """
    
    def __init__(self, buffer_size=10000):
        self.buffer_size = buffer_size
        self.experience_buffer = []
        self.feedback_history = []
        self.model_performance = {}
        
    def collect_feedback(self, predictions, actual_values, expert_annotations=None):
        """
        Collect feedback from various sources
        """
        feedback = {
            'timestamp': pd.Timestamp.now(),
            'predictions': predictions,
            'actual_values': actual_values,
            'expert_annotations': expert_annotations,
            'error': np.abs(predictions - actual_values) if actual_values is not None else None
        }
        
        self.feedback_history.append(feedback)
        
        # Maintain buffer size
        if len(self.feedback_history) > self.buffer_size:
            self.feedback_history = self.feedback_history[-self.buffer_size:]
    
    def select_samples_for_labeling(self, uncertainty_scores, n_samples=10):
        """
        Select most uncertain samples for expert labeling
        """
        # Select samples with highest uncertainty
        uncertain_indices = np.argsort(uncertainty_scores)[-n_samples:]
        return uncertain_indices
    
    def update_models(self, forecasting_engine, anomaly_detector, new_data):
        """
        Incremental model updates using transfer learning
        """
        # This would implement incremental learning
        # For now, we'll track performance and suggest retraining
        
        current_performance = self.evaluate_model_performance(new_data)
        
        if self.should_retrain(current_performance):
            return True  # Signal for retraining
        
        return False
    
    def evaluate_model_performance(self, data):
        """
        Evaluate current model performance
        """
        # Calculate performance metrics
        performance = {
            'timestamp': pd.Timestamp.now(),
            'accuracy': 0.0,  # Would be calculated from actual vs predicted
            'precision': 0.0,
            'recall': 0.0,
            'f1_score': 0.0
        }
        
        self.model_performance[performance['timestamp']] = performance
        return performance
    
    def should_retrain(self, current_performance, threshold=0.8):
        """
        Decide if models need retraining
        """
        if len(self.model_performance) < 2:
            return False
        
        # Compare with historical performance
        recent_performances = list(self.model_performance.values())[-5:]
        avg_performance = np.mean([p['accuracy'] for p in recent_performances])
        
        return current_performance['accuracy'] < avg_performance * threshold




In [14]:
class ExplainableDecisionSupport:
    """
    Provides explanations and decision support for predictions
    """
    
    def __init__(self):
        self.feature_importance = {}
        self.explanation_templates = {
            'anomaly': "Anomaly detected due to unusual patterns in {features}. Confidence: {confidence:.2f}",
            'forecast': "Forecast based on historical trends in {features}. Confidence interval: [{lower:.2f}, {upper:.2f}]"
        }
    
    def explain_anomaly(self, anomaly_score, feature_values, feature_names, detection_details):
        """
        Generate explanation for anomaly detection
        """
        # Find most contributing features
        if 'isolation_forest' in detection_details:
            # For simplicity, use variance-based importance
            feature_importance = np.var(feature_values.reshape(1, -1), axis=0)
            top_features_idx = np.argsort(feature_importance)[-3:]
            top_features = [feature_names[i] for i in top_features_idx]
        else:
            top_features = feature_names[:3]  # Fallback
        
        explanation = self.explanation_templates['anomaly'].format(
            features=', '.join(top_features),
            confidence=anomaly_score
        )
        
        return {
            'explanation': explanation,
            'contributing_features': top_features,
            'confidence': anomaly_score,
            'severity': self._get_severity_level(anomaly_score),
            'recommendations': self._get_recommendations(anomaly_score, top_features)
        }
    
    def explain_forecast(self, forecast_values, confidence_intervals, contributing_factors):
        """
        Generate explanation for forecasts
        """
        explanation = self.explanation_templates['forecast'].format(
            features=', '.join(contributing_factors),
            lower=confidence_intervals[0],
            upper=confidence_intervals[1]
        )
        
        return {
            'explanation': explanation,
            'forecast_trend': 'increasing' if forecast_values[-1] > forecast_values[0] else 'decreasing',
            'confidence_interval': confidence_intervals,
            'key_factors': contributing_factors
        }
    
    def _get_severity_level(self, score):
        """
        Map anomaly score to severity level
        """
        if score >= 0.9:
            return 'Critical'
        elif score >= 0.7:
            return 'High'
        elif score >= 0.5:
            return 'Medium'
        elif score >= 0.3:
            return 'Low'
        else:
            return 'Normal'
    
    def _get_recommendations(self, score, features):
        """
        Generate recommendations based on anomaly
        """
        recommendations = []
        
        if score >= 0.7:
            recommendations.append("Immediate investigation required")
            recommendations.append("Check system logs for errors")
        
        if 'cpu' in str(features).lower():
            recommendations.append("Monitor CPU-intensive processes")
        
        if 'memory' in str(features).lower():
            recommendations.append("Check memory usage and potential leaks")
        
        if 'io' in str(features).lower():
            recommendations.append("Investigate disk I/O performance")
        
        return recommendations




In [15]:
class AIXMonitoringTrainer:
    """
    Main training class that orchestrates all components
    """
    
    def __init__(self):
        self.data_processor = DataProcessor()
        self.forecasting_engine = ForecastingEngine()
        self.anomaly_detector = AnomalyDetector()
        self.active_learner = ActiveLearningComponent()
        self.explainer = ExplainableDecisionSupport()
        
        self.is_trained = False
        self.training_history = []
    
    def train(self, vmstat_df, iostat_df, netstat_df, process_df, 
              target_columns=['us', 'sy', 'mem_mean', 'tps'], 
              test_size=0.2):
        """
        Complete training pipeline
        """
        print("Starting AIX Monitoring System Training...")
        
        # 1. Data Processing
        print("1. Processing data...")
        processed_df = self.data_processor.process_data(
            vmstat_df, iostat_df, netstat_df, process_df
        )
        
        # 2. Split data
        train_df, test_df = train_test_split(
            processed_df, test_size=test_size, shuffle=False
        )
        
        # 3. Train Forecasting Models
        print("2. Training forecasting models...")
        self.forecasting_engine.train_ensemble(train_df, target_columns)
        
        # 4. Train Anomaly Detection Models
        print("3. Training anomaly detection models...")
        self.anomaly_detector.train_detectors(train_df)
        
        # 5. Evaluate on test set
        print("4. Evaluating models...")
        evaluation_results = self.evaluate_models(test_df, target_columns)
        
        # 6. Store training history
        training_record = {
            'timestamp': pd.Timestamp.now(),
            'train_size': len(train_df),
            'test_size': len(test_df),
            'target_columns': target_columns,
            'evaluation_results': evaluation_results
        }
        self.training_history.append(training_record)
        
        self.is_trained = True
        print("Training completed successfully!")
        
        return evaluation_results
    
    def evaluate_models(self, test_df, target_columns):
        """
        Comprehensive model evaluation
        """
        results = {
            'forecasting': {},
            'anomaly_detection': {},
            'overall_performance': {}
        }
        
        # Evaluate Forecasting
        for target_col in target_columns:
            if target_col in test_df.columns:
                # Generate predictions
                forecast_pred, model_preds = self.forecasting_engine.predict(
                    test_df.head(50), target_col, horizon=24
                )
                
                if forecast_pred is not None and len(forecast_pred) > 0:
                    # Calculate metrics (simplified - would need actual future values)
                    results['forecasting'][target_col] = {
                        'mae': np.mean(np.abs(forecast_pred)),
                        'mse': np.mean(forecast_pred ** 2),
                        'model_contributions': list(model_preds.keys())
                    }
        
        # Evaluate Anomaly Detection
        anomaly_scores, severity, detection_details = self.anomaly_detector.detect_anomalies(test_df)
        
        results['anomaly_detection'] = {
            'total_anomalies': np.sum(severity != 'Normal'),
            'severity_distribution': {
                severity_level: np.sum(severity == severity_level) 
                for severity_level in ['Normal', 'Low', 'Medium', 'High', 'Critical']
            },
            'average_anomaly_score': np.mean(anomaly_scores),
            'detection_methods_used': list(detection_details.keys())
        }
        
        # Overall Performance
        results['overall_performance'] = {
            'training_time': pd.Timestamp.now(),
            'data_quality_score': self._calculate_data_quality_score(test_df),
            'model_complexity': self._calculate_model_complexity(),
            'memory_usage_mb': self._estimate_memory_usage()
        }
        
        return results
    
    def predict_and_detect(self, vmstat_df, iostat_df, netstat_df, process_df, 
                          target_columns=['us', 'sy', 'mem_mean', 'tps'],
                          forecast_horizon=24):
        """
        Generate predictions and detect anomalies on new data
        """
        if not self.is_trained:
            raise ValueError("Models must be trained before making predictions")
        
        # Process new data
        processed_df = self.data_processor.process_data(
            vmstat_df, iostat_df, netstat_df, process_df
        )
        
        results = {
            'forecasts': {},
            'anomalies': {},
            'explanations': {},
            'recommendations': []
        }
        
        # Generate Forecasts
        for target_col in target_columns:
            if target_col in processed_df.columns:
                forecast_pred, model_preds = self.forecasting_engine.predict(
                    processed_df, target_col, horizon=forecast_horizon
                )
                
            # Inside the target_col loop in predict_and_detect()
                if forecast_pred is not None:
                    std_dev = np.std(forecast_pred)
    
                # Calculate full confidence bands
                    confidence_band_lower = forecast_pred - 1.96 * std_dev
                    confidence_band_upper = forecast_pred + 1.96 * std_dev
    
                     # Get LAST confidence interval for explanation
                    last_ci = [confidence_band_lower[-1], confidence_band_upper[-1]]
    
                    results['forecasts'][target_col] = {
                        'values': forecast_pred.tolist(),
                        'confidence_intervals': [confidence_band_lower.tolist(), 
                                confidence_band_upper.tolist()],
                        'model_contributions': model_preds
    }
    
                    # Pass single interval to explainer
                    explanation = self.explainer.explain_forecast(
                        forecast_pred, last_ci, [target_col]  # Now using last_ci
                    )
                    results['explanations'][f'forecast_{target_col}'] = explanation
        
        # Detect Anomalies
        anomaly_scores, severity, detection_details = self.anomaly_detector.detect_anomalies(processed_df)
        
        # Find anomalous points
        anomalous_indices = np.where(severity != 'Normal')[0]
        
        results['anomalies'] = {
            'scores': anomaly_scores.tolist(),
            'severity': severity.tolist(),
            'anomalous_points': len(anomalous_indices),
            'detection_details': {k: v.tolist() if isinstance(v, np.ndarray) else v 
                                for k, v in detection_details.items()}
        }
        
        # Generate explanations for significant anomalies
        for idx in anomalous_indices[:5]:  # Explain top 5 anomalies
            if severity[idx] in ['High', 'Critical']:
                feature_values = processed_df.iloc[idx][self.anomaly_detector.feature_columns].values
                explanation = self.explainer.explain_anomaly(
                    anomaly_scores[idx],
                    feature_values,
                    self.anomaly_detector.feature_columns,
                    detection_details
                )
                results['explanations'][f'anomaly_{idx}'] = explanation
                results['recommendations'].extend(explanation['recommendations'])
        
        return results
    
    def update_models_with_feedback(self, feedback_data):
        """
        Update models with new feedback data
        """
        if not self.is_trained:
            raise ValueError("Models must be trained before updating")
        
        # Collect feedback
        self.active_learner.collect_feedback(
            predictions=feedback_data.get('predictions'),
            actual_values=feedback_data.get('actual_values'),
            expert_annotations=feedback_data.get('expert_annotations')
        )
        
        # Check if retraining is needed
        should_retrain = self.active_learner.update_models(
            self.forecasting_engine,
            self.anomaly_detector,
            feedback_data
        )
        
        if should_retrain:
            print("Model performance degraded. Retraining recommended.")
            return {'status': 'retraining_recommended', 'reason': 'performance_degradation'}
        else:
            # Update thresholds based on feedback
            if 'anomaly_feedback' in feedback_data:
                self.anomaly_detector.update_thresholds(feedback_data['anomaly_feedback'])
            
            return {'status': 'updated', 'reason': 'incremental_learning'}
    
    def get_model_status(self):
        """
        Get current status of all models
        """
        if not self.is_trained:
            return {'status': 'not_trained', 'models': {}}
        
        status = {
            'status': 'trained',
            'training_history': len(self.training_history),
            'last_training': self.training_history[-1]['timestamp'] if self.training_history else None,
            'models': {
                'forecasting': {
                    'lstm_models': len([k for k in self.forecasting_engine.models.keys() if 'lstm' in k]),
                    'transformer_models': len([k for k in self.forecasting_engine.models.keys() if 'transformer' in k]),
                    'arima_models': len([k for k in self.forecasting_engine.models.keys() if 'arima' in k]),
                    'prophet_models': len([k for k in self.forecasting_engine.models.keys() if 'prophet' in k]),
                },
                'anomaly_detection': {
                    'isolation_forest': 'isolation_forest' in self.anomaly_detector.models,
                    'oneclass_svm': 'oneclass_svm' in self.anomaly_detector.models,
                    'lstm_autoencoder': 'lstm_autoencoder' in self.anomaly_detector.models,
                },
                'active_learning': {
                    'feedback_samples': len(self.active_learner.feedback_history),
                    'performance_records': len(self.active_learner.model_performance)
                }
            },
            'memory_usage_estimate': self._estimate_memory_usage()
        }
        
        return status
    
    def save_models(self, filepath):
        """
        Save trained models to disk
        """
        import pickle
        import os
        
        if not self.is_trained:
            raise ValueError("No trained models to save")
        
        os.makedirs(filepath, exist_ok=True)
        
        # Save data processor
        with open(os.path.join(filepath, 'data_processor.pkl'), 'wb') as f:
            pickle.dump(self.data_processor, f)
        
        # Save forecasting models (non-neural network components)
        forecasting_state = {
            'model_weights': self.forecasting_engine.model_weights,
            'feature_columns': self.forecasting_engine.feature_columns,
            'sequence_length': self.forecasting_engine.sequence_length
        }
        
        # Save ARIMA and Prophet models
        arima_models = {k: v for k, v in self.forecasting_engine.models.items() if 'arima' in k}
        prophet_models = {k: v for k, v in self.forecasting_engine.models.items() if 'prophet' in k}
        
        with open(os.path.join(filepath, 'forecasting_classical.pkl'), 'wb') as f:
            pickle.dump({'arima': arima_models, 'prophet': prophet_models, 'state': forecasting_state}, f)
        
        # Save neural network models separately
        for model_name, model in self.forecasting_engine.models.items():
            if 'lstm' in model_name or 'transformer' in model_name:
                model.save(os.path.join(filepath, f'{model_name}.h5'))
        
        # Save anomaly detection models
        anomaly_state = {
            'thresholds': self.anomaly_detector.thresholds,
            'feature_columns': self.anomaly_detector.feature_columns,
            'sequence_length': self.anomaly_detector.sequence_length
        }
        
        # Save sklearn models
        sklearn_models = {k: v for k, v in self.anomaly_detector.models.items() 
                         if k in ['isolation_forest', 'oneclass_svm']}
        
        with open(os.path.join(filepath, 'anomaly_detection.pkl'), 'wb') as f:
            pickle.dump({'sklearn_models': sklearn_models, 'state': anomaly_state}, f)
        
        # Save LSTM autoencoder
        if 'lstm_autoencoder' in self.anomaly_detector.models:
            self.anomaly_detector.models['lstm_autoencoder'].save(
                os.path.join(filepath, 'lstm_autoencoder.h5')
            )
        
        # Save active learning component
        with open(os.path.join(filepath, 'active_learning.pkl'), 'wb') as f:
            pickle.dump(self.active_learner, f)
        
        # Save training history
        with open(os.path.join(filepath, 'training_history.pkl'), 'wb') as f:
            pickle.dump(self.training_history, f)
        
        print(f"Models saved successfully to {filepath}")
    
    def load_models(self, filepath):
        """
        Load trained models from disk
        """
        import pickle
        import os
        from tensorflow.keras.models import load_model
        
        if not os.path.exists(filepath):
            raise ValueError(f"Model directory {filepath} does not exist")
        
        # Load data processor
        with open(os.path.join(filepath, 'data_processor.pkl'), 'rb') as f:
            self.data_processor = pickle.load(f)
        
        # Load forecasting models
        with open(os.path.join(filepath, 'forecasting_classical.pkl'), 'rb') as f:
            forecasting_data = pickle.load(f)
            
        self.forecasting_engine.model_weights = forecasting_data['state']['model_weights']
        self.forecasting_engine.feature_columns = forecasting_data['state']['feature_columns']
        self.forecasting_engine.sequence_length = forecasting_data['state']['sequence_length']
        
        # Load classical models
        self.forecasting_engine.models.update(forecasting_data['arima'])
        self.forecasting_engine.models.update(forecasting_data['prophet'])
        
        # Load neural network models
        for filename in os.listdir(filepath):
            if filename.endswith('.h5') and ('lstm' in filename or 'transformer' in filename):
                model_name = filename.replace('.h5', '')
                if 'autoencoder' not in filename:  # Skip autoencoder here
                    self.forecasting_engine.models[model_name] = load_model(
                        os.path.join(filepath, filename)
                    )
        
        # Load anomaly detection models
        with open(os.path.join(filepath, 'anomaly_detection.pkl'), 'rb') as f:
            anomaly_data = pickle.load(f)
        
        self.anomaly_detector.thresholds = anomaly_data['state']['thresholds']
        self.anomaly_detector.feature_columns = anomaly_data['state']['feature_columns']
        self.anomaly_detector.sequence_length = anomaly_data['state']['sequence_length']
        self.anomaly_detector.models.update(anomaly_data['sklearn_models'])
        
        # Load LSTM autoencoder
        autoencoder_path = os.path.join(filepath, 'lstm_autoencoder.h5')
        if os.path.exists(autoencoder_path):
            self.anomaly_detector.models['lstm_autoencoder'] = load_model(autoencoder_path)
        
        # Load active learning component
        with open(os.path.join(filepath, 'active_learning.pkl'), 'rb') as f:
            self.active_learner = pickle.load(f)
        
        # Load training history
        with open(os.path.join(filepath, 'training_history.pkl'), 'rb') as f:
            self.training_history = pickle.load(f)
        
        self.is_trained = True
        print(f"Models loaded successfully from {filepath}")
    
    def _calculate_data_quality_score(self, df):
        """
        Calculate a simple data quality score
        """
        numeric_df = df.select_dtypes(include=[np.number])
        
        # Check for missing values
        missing_ratio = numeric_df.isnull().sum().sum() / (len(numeric_df) * len(numeric_df.columns))
        
        # Check for constant columns
        constant_cols = (numeric_df.nunique() == 1).sum()
        constant_ratio = constant_cols / len(numeric_df.columns)
        
        # Calculate quality score (0-1, higher is better)
        quality_score = (1 - missing_ratio) * (1 - constant_ratio)
        
        return max(0, min(1, quality_score))
    
    def _calculate_model_complexity(self):
        """
        Estimate model complexity
        """
        complexity = 0
        
        # Count forecasting models
        complexity += len(self.forecasting_engine.models) * 10
        
        # Count anomaly detection models
        complexity += len(self.anomaly_detector.models) * 5
        
        # Add feature complexity
        complexity += len(self.data_processor.feature_columns)
        
        return complexity
    
    def _estimate_memory_usage(self):
        """
        Estimate memory usage in MB
        """
        import sys
        
        total_size = 0
        
        # Estimate size of main components
        total_size += sys.getsizeof(self.data_processor)
        total_size += sys.getsizeof(self.forecasting_engine)
        total_size += sys.getsizeof(self.anomaly_detector)
        total_size += sys.getsizeof(self.active_learner)
        
        # Convert to MB
        return total_size / (1024 * 1024)




In [None]:
# Example usage and testing
if __name__ == "__main__":
    # Create sample data for testing
    np.random.seed(42)
    
    # Sample vmstat data
    vmstat_data = {
        'id': ['server1'] * 1000,
        'timestamp': pd.date_range('2024-01-01', periods=1000, freq='H'),
        'r': np.random.poisson(2, 1000),
        'b': np.random.poisson(1, 1000),
        'avm': np.random.normal(50000, 5000, 1000),
        'fre': np.random.normal(20000, 2000, 1000),
        'pi': np.random.poisson(10, 1000),
        'po': np.random.poisson(5, 1000),
        'fr': np.random.poisson(100, 1000),
        'in': np.random.poisson(1000, 1000),
        'cs': np.random.poisson(500, 1000),
        'us': np.random.normal(30, 10, 1000),
        'sy': np.random.normal(20, 5, 1000),
        'idle': 100 - np.random.normal(50, 15, 1000)
    }
    vmstat_df = pd.DataFrame(vmstat_data)
    
    # Sample iostat data
    iostat_data = {
        'id': ['server1'] * 2000,
        'timestamp': np.repeat(pd.date_range('2024-01-01', periods=1000, freq='H'), 2),
        'disk': ['sda', 'sdb'] * 1000,
        'tps': np.random.exponential(10, 2000),
        'kb_read': np.random.exponential(1000, 2000),
        'kb_wrtn': np.random.exponential(500, 2000),
        'service_time': np.random.exponential(5, 2000)
    }
    iostat_df = pd.DataFrame(iostat_data)
    
    # Sample netstat data
    netstat_data = {
        'id': ['server1'] * 2000,
        'timestamp': np.repeat(pd.date_range('2024-01-01', periods=1000, freq='H'), 2),
        'interface': ['eth0', 'eth1'] * 1000,
        'ipkts': np.random.poisson(1000, 2000),
        'ierrs': np.random.poisson(1, 2000),
        'opkts': np.random.poisson(800, 2000),
        'oerrs': np.random.poisson(1, 2000),
        'time': np.random.normal(1, 0.1, 2000),
        'ipkts_rate': np.random.exponential(100, 2000),
        'opkts_rate': np.random.exponential(80, 2000),
        'ierrs_rate': np.random.exponential(0.1, 2000),
        'oerrs_rate': np.random.exponential(0.1, 2000)
    }
    netstat_df = pd.DataFrame(netstat_data)
    
    # Sample process data
    process_data = {
        'id': ['server1'] * 5000,
        'timestamp': np.repeat(pd.date_range('2024-01-01', periods=1000, freq='H'), 5),
        'pid': np.random.randint(1000, 9999, 5000),
        'user': np.random.choice(['root', 'apache', 'mysql', 'user1'], 5000),
        'cpu': np.random.exponential(2, 5000),
        'mem': np.random.exponential(1, 5000),
        'command': np.random.choice(['httpd', 'mysqld', 'python', 'java'], 5000)
    }
    process_df = pd.DataFrame(process_data)
    
    # Initialize and train the system
    trainer = AIXMonitoringTrainer()
    
    print("Testing AIX Monitoring System...")
    print("=" * 50)
    
    # Train the system
    evaluation_results = trainer.train(
        vmstat_df, iostat_df, netstat_df, process_df,
        target_columns=['us', 'sy', 'mem_mean', 'tps']
    )
    
    print("\nEvaluation Results:")
    print(f"Forecasting Models: {evaluation_results['forecasting']}")
    print(f"Anomaly Detection: {evaluation_results['anomaly_detection']}")
    
    # Test prediction
    print("\nTesting predictions...")
    results = trainer.predict_and_detect(
        vmstat_df.tail(100), iostat_df.tail(200), 
        netstat_df.tail(200), process_df.tail(500)
    )
    
    print(f"Forecasts generated for: {list(results['forecasts'].keys())}")
    print(f"Anomalies detected: {results['anomalies']['anomalous_points']}")
    print(f"Explanations provided: {len(results['explanations'])}")
    
    # Check model status
    status = trainer.get_model_status()
    print(f"\nModel Status: {status['status']}")
    print(f"Memory Usage: {status['memory_usage_estimate']:.2f} MB")
    
    print("\nTesting completed successfully!")

Testing AIX Monitoring System...
Starting AIX Monitoring System Training...
1. Processing data...
2. Training forecasting models...
Training models for us...


14:39:39 - cmdstanpy - INFO - Chain [1] start processing
14:39:39 - cmdstanpy - INFO - Chain [1] done processing


Training models for sy...


14:41:58 - cmdstanpy - INFO - Chain [1] start processing
14:41:58 - cmdstanpy - INFO - Chain [1] done processing


Training models for mem_mean...


14:44:05 - cmdstanpy - INFO - Chain [1] start processing
14:44:05 - cmdstanpy - INFO - Chain [1] done processing


Training models for tps...


14:46:32 - cmdstanpy - INFO - Chain [1] start processing
14:46:32 - cmdstanpy - INFO - Chain [1] done processing


3. Training anomaly detection models...
4. Evaluating models...
Training completed successfully!

Evaluation Results:
Forecasting Models: {'us': {'mae': np.float64(0.09533888515187061), 'mse': np.float64(0.01462145429894438), 'model_contributions': ['lstm', 'arima', 'prophet']}, 'sy': {'mae': np.float64(0.0893235372102365), 'mse': np.float64(0.012338707622326748), 'model_contributions': ['lstm', 'arima', 'prophet']}, 'mem_mean': {'mae': np.float64(0.10391441250900058), 'mse': np.float64(0.014249267913659087), 'model_contributions': ['lstm', 'arima', 'prophet']}, 'tps': {'mae': np.float64(0.3021251441609195), 'mse': np.float64(0.1036135803830096), 'model_contributions': ['lstm', 'arima', 'prophet']}}
Anomaly Detection: {'total_anomalies': np.int64(193), 'severity_distribution': {'Normal': np.int64(7), 'Low': np.int64(56), 'Medium': np.int64(64), 'High': np.int64(71), 'Critical': np.int64(2)}, 'average_anomaly_score': np.float64(0.6056006809675158), 'detection_methods_used': ['isolation_

In [None]:
# Load real data
print("Loading real AIX server metrics...")

# Load real data
vmstat_df = pd.read_csv("D:\\projet\\exports\\vmstat_metrics.csv")
iostat_df = pd.read_csv("D:\\projet\\exports\\iostat_metrics.csv")
netstat_df = pd.read_csv("D:\\projet\\exports\\netstat_metrics.csv")
process_df = pd.read_csv("D:\\projet\\exports\\process_metrics.csv")




# Process timestamps with robust parsing
for df in [vmstat_df, iostat_df, netstat_df, process_df]:
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601', errors='coerce')
    # Drop rows with invalid timestamps if any
    df.dropna(subset=['timestamp'], inplace=True)


# Initialize trainer
trainer = AIXMonitoringTrainer()

# Define target columns based on critical metrics
target_columns = [
    # CPU metrics (vmstat)
    'us',    # User CPU %
    'sy',    # System CPU %
    'idle',  # Idle CPU %
    
    # Memory metrics (vmstat)
    'fre',   # Free memory
    
    # Disk metrics (iostat)
    'tps',   # Transactions per second
    'service_time',  # Disk service time
    
    # Network metrics (netstat)
    'ipkts_rate',  # Input packets rate
    'oerrs_rate',  # Output error rate
    
    # Process metrics (process)
    'cpu'    # Process CPU usage
]

# Train the model
print("\nTraining models for:", target_columns)
evaluation_results = trainer.train(
    vmstat_df, 
    iostat_df, 
    netstat_df, 
    process_df,
    target_columns=target_columns,
    test_size=0.2
)

# Print detailed evaluation results
def print_evaluation(results):
    print("\n" + "="*50)
    print("FORECASTING PERFORMANCE (MAE/MSE)")
    print("="*50)
    for metric, scores in results['forecasting'].items():
        print(f"{metric.upper():<15} MAE: {scores['mae']:.4f} | MSE: {scores['mse']:.4f}")
        print(f"    Models used: {', '.join(scores['model_contributions'])}")
    
    print("\n" + "="*50)
    print("ANOMALY DETECTION RESULTS")
    print("="*50)
    anomalies = results['anomaly_detection']
    print(f"Total anomalies detected: {anomalies['total_anomalies']}")
    print("Severity distribution:")
    for severity, count in anomalies['severity_distribution'].items():
        print(f"  {severity:<8}: {count}")
    print(f"Average anomaly score: {anomalies['average_anomaly_score']:.2f}")
    print(f"Detection methods: {', '.join(anomalies['detection_methods_used'])}")
    
    print("\n" + "="*50)
    print("SYSTEM PERFORMANCE METRICS")
    print("="*50)
    perf = results['overall_performance']
    print(f"Training time: {perf['training_time']}")
    print(f"Data quality score: {perf['data_quality_score']:.2f}/1.0")
    print(f"Model complexity: {perf['model_complexity']} (relative units)")
    print(f"Memory usage: {perf['memory_usage_mb']:.2f} MB")

print_evaluation(evaluation_results)

# Sample predictions
print("\n" + "="*50)
print("GENERATING SAMPLE PREDICTIONS")
print("="*50)
sample_results = trainer.predict_and_detect(
    vmstat_df.tail(100),
    iostat_df.tail(200),
    netstat_df.tail(200),
    process_df.tail(500)
)

print(f"\nGenerated forecasts for: {list(sample_results['forecasts'].keys())}")
print(f"Detected {sample_results['anomalies']['anomalous_points']} anomalies")
print(f"Top anomaly explanation:")
first_anomaly_key = [k for k in sample_results['explanations'].keys() if 'anomaly_' in k][0]
print(sample_results['explanations'][first_anomaly_key]['explanation'])

# Model status
print("\n" + "="*50)
print("MODEL STATUS SUMMARY")
print("="*50)
status = trainer.get_model_status()
print(f"Status: {status['status']}")
print(f"Last trained: {status['last_training']}")
print(f"Forecasting models: {status['models']['forecasting']}")
print(f"Anomaly detectors: {status['models']['anomaly_detection']}")
print(f"Memory usage: {status['memory_usage_estimate']:.2f} MB")

Loading real AIX server metrics...


MemoryError: Unable to allocate 1.00 MiB for an array with shape (131072,) and data type int64

: 