In [1]:
{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "1566b4c7",
      "metadata": {
        "id": "1566b4c7"
      },
      "source": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Library Imports"
      ],
      "metadata": {
        "id": "uCSRIld0tnWz"
      },
      "id": "uCSRIld0tnWz"
    },
    {
      "cell_type": "code",
      "source": [
        "# !pip install pytorch_forecasting polars"
      ],
      "metadata": {
        "id": "KGhO5A8QnQop"
      },
      "id": "KGhO5A8QnQop",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "import polars as pl\n",
        "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
        "from sklearn.ensemble import IsolationForest\n",
        "from sklearn.svm import OneClassSVM\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import precision_recall_fscore_support, mean_absolute_error\n",
        "import tensorflow as tf\n",
        "from tensorflow.keras.models import Sequential, Model\n",
        "from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, RepeatVector, TimeDistributed\n",
        "from tensorflow.keras.optimizers import Adam\n",
        "from statsmodels.tsa.arima.model import ARIMA\n",
        "from statsmodels.tsa.holtwinters import SimpleExpSmoothing\n",
        "from prophet import Prophet\n",
        "import warnings\n",
        "import sys\n",
        "warnings.filterwarnings('ignore')"
      ],
      "metadata": {
        "id": "abb6PFxrmGk3"
      },
      "id": "abb6PFxrmGk3",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class DataProcessor:\n",
        "    def __init__(self):\n",
        "        self.scalers = {}\n",
        "        self.feature_columns = []\n",
        "        self.temporal_features = ['hour', 'day_of_week', 'day_of_month', 'is_weekend']\n",
        "        self.target_columns = []\n",
        "\n",
        "    def create_temporal_features(self, df):\n",
        "        \"\"\"Create time-based features (optimized)\"\"\"\n",
        "        df = df.copy()\n",
        "        df['hour'] = df['timestamp'].dt.hour\n",
        "        df['day_of_week'] = df['timestamp'].dt.dayofweek\n",
        "        df['day_of_month'] = df['timestamp'].dt.day\n",
        "        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)\n",
        "        return df\n",
        "\n",
        "    def create_statistical_features(self, df, window_sizes=[5]):\n",
        "        \"\"\"Optimized rolling features with minimal windows\"\"\"\n",
        "        df = df.copy()\n",
        "        # Only process target columns and key metrics\n",
        "        numeric_cols = self.target_columns + ['r', 'b', 'avm', 'fre', 'fr', 'in', 'cs']\n",
        "        numeric_cols = [col for col in numeric_cols if col in df.columns]\n",
        "\n",
        "        for window in window_sizes:\n",
        "            for col in numeric_cols:\n",
        "                # Only calculate rolling mean (skip std/max)\n",
        "                df[f'{col}_rolling_mean_{window}'] = df.groupby('id')[col].transform(\n",
        "                    lambda x: x.rolling(window, min_periods=1).mean()\n",
        "                )\n",
        "        return df.fillna(0)\n",
        "\n",
        "    def create_cross_metric_features(self, df):\n",
        "        \"\"\"Optimized cross-metric features\"\"\"\n",
        "        df = df.copy()\n",
        "        if 'us' in df and 'mem_mean' in df:\n",
        "            df['cpu_mem_ratio'] = df['us'] / (df['mem_mean'] + 1e-6)\n",
        "        if 'kb_read' in df and 'kb_wrtn' in df and 'tps' in df:\n",
        "            df['io_efficiency'] = (df['kb_read'] + df['kb_wrtn']) / (df['tps'] + 1e-6)\n",
        "        if 'ipkts_rate' in df and 'opkts_rate' in df and 'ierrs_rate' in df and 'oerrs_rate' in df:\n",
        "            df['net_error_rate'] = (df['ierrs_rate'] + df['oerrs_rate']) / (df['ipkts_rate'] + df['opkts_rate'] + 1e-6)\n",
        "        if 'us' in df and 'sy' in df and 'r' in df:\n",
        "            df['system_load'] = df['us'] + df['sy'] + (df['r'] * 10)\n",
        "        return df.fillna(0)\n",
        "\n",
        "    def detect_and_handle_outliers(self, df):\n",
        "        \"\"\"Optimized outlier handling for target columns only\"\"\"\n",
        "        df = df.copy()\n",
        "        for col in self.target_columns:\n",
        "            if col not in df.columns:\n",
        "                continue\n",
        "            # Vectorized IQR calculation\n",
        "            q1 = df[col].quantile(0.25)\n",
        "            q3 = df[col].quantile(0.75)\n",
        "            iqr = q3 - q1\n",
        "            lower_bound = q1 - 1.5 * iqr\n",
        "            upper_bound = q3 + 1.5 * iqr\n",
        "            df[col] = df[col].clip(lower_bound, upper_bound)\n",
        "        return df\n",
        "\n",
        "    def normalize_features(self, df, method='robust'):\n",
        "        \"\"\"Optimized normalization\"\"\"\n",
        "        df = df.copy()\n",
        "        numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
        "        numeric_cols = [col for col in numeric_cols if col not in ['id', 'hour', 'day_of_week', 'day_of_month', 'is_weekend']]\n",
        "\n",
        "        if method == 'robust':\n",
        "            scaler = RobustScaler()\n",
        "        else:\n",
        "            scaler = StandardScaler()\n",
        "\n",
        "        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])\n",
        "        self.scalers['main_scaler'] = scaler\n",
        "        self.feature_columns = numeric_cols\n",
        "\n",
        "        return df\n",
        "\n",
        "    def process_data(self, merged_df, target_columns):\n",
        "        \"\"\"Optimized processing pipeline\"\"\"\n",
        "        # Validate target columns\n",
        "        valid_columns = [col for col in target_columns if col in merged_df.columns]\n",
        "        missing = set(target_columns) - set(valid_columns)\n",
        "        \n",
        "        if missing:\n",
        "            print(f\"Warning: Missing target columns: {missing}\")\n",
        "            \n",
        "        self.target_columns = valid_columns\n",
        "        df = self.create_temporal_features(merged_df)\n",
        "        df = self.create_statistical_features(df, window_sizes=[5])  # Single window size\n",
        "        df = self.create_cross_metric_features(df)\n",
        "        df = self.detect_and_handle_outliers(df)\n",
        "        df = self.normalize_features(df)\n",
        "        return df"
      ],
      "metadata": {
        "id": "XquWKbn9mHAN"
      },
      "id": "XquWKbn9mHAN",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class ForecastingEngine:\n",
        "    \"\"\"\n",
        "    Multi-model forecasting engine with ensemble capabilities\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, sequence_length=50):\n",
        "        self.sequence_length = sequence_length\n",
        "        self.models = {}\n",
        "        self.model_weights = {}\n",
        "        self.feature_columns = []\n",
        "\n",
        "    def create_lstm_model(self, input_shape, horizon=24):\n",
        "        \"\"\"\n",
        "        Create LSTM model for long-term forecasting\n",
        "        \"\"\"\n",
        "        model = Sequential([\n",
        "            LSTM(128, return_sequences=True, input_shape=input_shape),\n",
        "            Dropout(0.2),\n",
        "            LSTM(64, return_sequences=True),\n",
        "            Dropout(0.2),\n",
        "            LSTM(32, return_sequences=False),\n",
        "            Dropout(0.2),\n",
        "            Dense(horizon)\n",
        "        ])\n",
        "\n",
        "        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])\n",
        "        return model\n",
        "\n",
        "    def create_transformer_model(self, input_shape, horizon=12):\n",
        "        \"\"\"\n",
        "        Create Transformer model for medium-term forecasting\n",
        "        \"\"\"\n",
        "        inputs = Input(shape=input_shape)\n",
        "\n",
        "        # Multi-head attention\n",
        "        attention = tf.keras.layers.MultiHeadAttention(\n",
        "            num_heads=8, key_dim=64\n",
        "        )(inputs, inputs)\n",
        "\n",
        "        # Add & Norm\n",
        "        attention = tf.keras.layers.LayerNormalization()(inputs + attention)\n",
        "\n",
        "        # Feed Forward\n",
        "        ff = tf.keras.layers.Dense(128, activation='relu')(attention)\n",
        "        ff = tf.keras.layers.Dense(input_shape[-1])(ff)\n",
        "\n",
        "        # Add & Norm\n",
        "        output = tf.keras.layers.LayerNormalization()(attention + ff)\n",
        "\n",
        "        # Output layer\n",
        "        output = tf.keras.layers.GlobalAveragePooling1D()(output)\n",
        "        output = tf.keras.layers.Dense(horizon)(output)\n",
        "\n",
        "        model = Model(inputs=inputs, outputs=output)\n",
        "        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])\n",
        "\n",
        "        return model\n",
        "\n",
        "    def create_sequences(self, data, target_col, horizon=1):\n",
        "        \"\"\"\n",
        "        Create sequences for time series models\n",
        "        \"\"\"\n",
        "        X, y = [], []\n",
        "        max_start = len(data) - self.sequence_length - horizon\n",
        "        \n",
        "        if max_start < 0:\n",
        "            # Pad with zeros if insufficient data\n",
        "            padded = np.pad(data, ((0, -max_start), 'constant')\n",
        "            X.append(padded[:self.sequence_length])\n",
        "            y.append(padded[self.sequence_length:self.sequence_length + horizon, target_col])\n",
        "        else:\n",
        "            for i in range(max_start + 1):\n",
        "                X.append(data[i:(i + self.sequence_length)])\n",
        "                y.append(data[i + self.sequence_length:i + self.sequence_length + horizon, target_col])\n",
        "\n",
        "        return np.array(X), np.array(y)\n",
        "\n",
        "    def fit_arima_model(self, series, order=(5, 1, 0), seasonal_order=(1, 1, 1, 24)):\n",
        "        \"\"\"\n",
        "        Fit ARIMA model for short-term forecasting\n",
        "        \"\"\"\n",
        "        try:\n",
        "            model = ARIMA(series, order=order, seasonal_order=seasonal_order)\n",
        "            fitted_model = model.fit()\n",
        "            return fitted_model\n",
        "        except Exception as e:\n",
        "            print(f\"ARIMA failed: {str(e)}\")\n",
        "            # Fallback to simple exponential smoothing\n",
        "            model = SimpleExpSmoothing(series)\n",
        "            fitted_model = model.fit()\n",
        "            return fitted_model\n",
        "\n",
        "    def fit_prophet_model(self, df, target_col):\n",
        "        \"\"\"\n",
        "        Fit Prophet model for long-term forecasting with seasonality\n",
        "        \"\"\"\n",
        "        prophet_df = df[['timestamp', target_col]].copy()\n",
        "        prophet_df.columns = ['ds', 'y']\n",
        "\n",
        "        model = Prophet(\n",
        "            yearly_seasonality=True,\n",
        "            weekly_seasonality=True,\n",
        "            daily_seasonality=True,\n",
        "            changepoint_prior_scale=0.05\n",
        "        )\n",
        "\n",
        "        model.fit(prophet_df)\n",
        "        return model\n",
        "\n",
        "    def train_ensemble(self, df, target_columns):\n",
        "        \"\"\"\n",
        "        Train all forecasting models\n",
        "        \"\"\"\n",
        "        self.feature_columns = [col for col in df.columns if col not in ['id', 'timestamp']]\n",
        "\n",
        "        for target_col in target_columns:\n",
        "            print(f\"Training models for {target_col}...\")\n",
        "\n",
        "            # Prepare data\n",
        "            numeric_data = df[self.feature_columns].values\n",
        "            target_idx = self.feature_columns.index(target_col)\n",
        "\n",
        "            # Create sequences\n",
        "            X_lstm, y_lstm_24 = self.create_sequences(numeric_data, target_idx, horizon=24)\n",
        "            X_trans, y_trans_12 = self.create_sequences(numeric_data, target_idx, horizon=12)\n",
        "\n",
        "            # Split data\n",
        "            split_idx = int(0.8 * len(X_lstm))\n",
        "            X_train_lstm, X_test_lstm = X_lstm[:split_idx], X_lstm[split_idx:]\n",
        "            y_train_lstm, y_test_lstm = y_lstm_24[:split_idx], y_lstm_24[split_idx:]\n",
        "\n",
        "            # Train LSTM\n",
        "            lstm_model = self.create_lstm_model(X_train_lstm.shape[1:], horizon=24)\n",
        "            lstm_model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, verbose=0)\n",
        "            self.models[f'{target_col}_lstm'] = lstm_model\n",
        "\n",
        "            # Train Transformer\n",
        "            if len(X_trans) > 0:\n",
        "                trans_model = self.create_transformer_model(X_trans.shape[1:], horizon=12)\n",
        "                X_train_trans = X_trans[:int(0.8 * len(X_trans))]\n",
        "                y_train_trans = y_trans_12[:int(0.8 * len(y_trans_12))]\n",
        "                trans_model.fit(X_train_trans, y_train_trans, epochs=30, batch_size=32, verbose=0)\n",
        "                self.models[f'{target_col}_transformer'] = trans_model\n",
        "\n",
        "            # Train ARIMA\n",
        "            series = df[target_col].values\n",
        "            arima_model = self.fit_arima_model(series)\n",
        "            self.models[f'{target_col}_arima'] = arima_model\n",
        "\n",
        "            # Train Prophet\n",
        "            prophet_model = self.fit_prophet_model(df, target_col)\n",
        "            self.models[f'{target_col}_prophet'] = prophet_model\n",
        "\n",
        "            # Initialize equal weights\n",
        "            self.model_weights[target_col] = {'lstm': 0.25, 'transformer': 0.25, 'arima': 0.25, 'prophet': 0.25}\n",
        "\n",
        "    def predict(self, df, target_col, horizon=24):\n",
        "        \"\"\"\n",
        "        Generate ensemble predictions\n",
        "        \"\"\"\n",
        "        predictions = {}\n",
        "\n",
        "        # LSTM prediction\n",
        "        if f'{target_col}_lstm' in self.models:\n",
        "            numeric_data = df[self.feature_columns].values\n",
        "            if len(numeric_data) >= self.sequence_length:\n",
        "                X_pred = numeric_data[-self.sequence_length:].reshape(1, self.sequence_length, -1)\n",
        "                lstm_pred = self.models[f'{target_col}_lstm'].predict(X_pred, verbose=0)[0]\n",
        "                predictions['lstm'] = lstm_pred[:horizon]\n",
        "\n",
        "        # ARIMA prediction\n",
        "        if f'{target_col}_arima' in self.models:\n",
        "            try:\n",
        "                arima_pred = self.models[f'{target_col}_arima'].forecast(steps=horizon)\n",
        "                predictions['arima'] = arima_pred\n",
        "            except:\n",
        "                predictions['arima'] = np.full(horizon, df[target_col].iloc[-1])\n",
        "\n",
        "        # Prophet prediction\n",
        "        if f'{target_col}_prophet' in self.models:\n",
        "            future = self.models[f'{target_col}_prophet'].make_future_dataframe(periods=horizon, freq='H')\n",
        "            prophet_pred = self.models[f'{target_col}_prophet'].predict(future)\n",
        "            predictions['prophet'] = prophet_pred['yhat'].tail(horizon).values\n",
        "\n",
        "        # Ensemble prediction\n",
        "        if predictions:\n",
        "            weights = self.model_weights.get(target_col, {})\n",
        "            ensemble_pred = np.zeros(horizon)\n",
        "            total_weight = 0\n",
        "\n",
        "            for model_name, pred in predictions.items():\n",
        "                if len(pred) == horizon:\n",
        "                    weight = weights.get(model_name, 0.25)\n",
        "                    ensemble_pred += pred * weight\n",
        "                    total_weight += weight\n",
        "\n",
        "            if total_weight > 0:\n",
        "                ensemble_pred /= total_weight\n",
        "\n",
        "            return ensemble_pred, predictions\n",
        "\n",
        "        return None, {}"
      ],
      "metadata": {
        "id": "vklvAK7SmKhV"
      },
      "id": "vklvAK7SmKhV",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class AnomalyDetector:\n",
        "    \"\"\"\n",
        "    Multi-method anomaly detection framework\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, sequence_length=50):\n",
        "        self.sequence_length = sequence_length\n",
        "        self.models = {}\n",
        "        self.thresholds = {\n",
        "            'low': 0.3,\n",
        "            'medium': 0.5,\n",
        "            'high': 0.7,\n",
        "            'critical': 0.9\n",
        "        }\n",
        "        self.feature_columns = []\n",
        "\n",
        "    def create_lstm_autoencoder(self, input_shape):\n",
        "        \"\"\"\n",
        "        Create LSTM Autoencoder for sequential anomaly detection\n",
        "        \"\"\"\n",
        "        # Encoder\n",
        "        encoder_inputs = Input(shape=input_shape)\n",
        "        encoder = LSTM(64, return_sequences=True)(encoder_inputs)\n",
        "        encoder = LSTM(32, return_sequences=False)(encoder)\n",
        "\n",
        "        # Decoder\n",
        "        decoder = RepeatVector(input_shape[0])(encoder)\n",
        "        decoder = LSTM(32, return_sequences=True)(decoder)\n",
        "        decoder = LSTM(64, return_sequences=True)(decoder)\n",
        "        decoder_outputs = TimeDistributed(Dense(input_shape[1]))(decoder)\n",
        "\n",
        "        autoencoder = Model(encoder_inputs, decoder_outputs)\n",
        "        autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')\n",
        "\n",
        "        return autoencoder\n",
        "\n",
        "    def create_sequences_for_autoencoder(self, data):\n",
        "        \"\"\"\n",
        "        Create sequences for autoencoder training\n",
        "        \"\"\"\n",
        "        X = []\n",
        "        for i in range(len(data) - self.sequence_length + 1):\n",
        "            X.append(data[i:(i + self.sequence_length)])\n",
        "        return np.array(X)\n",
        "\n",
        "    def train_detectors(self, df, contamination=0.05):\n",
        "        \"\"\"\n",
        "        Train all anomaly detection models\n",
        "        \"\"\"\n",
        "        self.feature_columns = [col for col in df.columns if col not in ['id', 'timestamp']]\n",
        "        numeric_data = df[self.feature_columns].values\n",
        "\n",
        "        # Train Isolation Forest\n",
        "        iso_forest = IsolationForest(\n",
        "            contamination=contamination,\n",
        "            n_estimators=100,\n",
        "            random_state=42\n",
        "        )\n",
        "        iso_forest.fit(numeric_data)\n",
        "        self.models['isolation_forest'] = iso_forest\n",
        "\n",
        "        # Train One-Class SVM\n",
        "        oc_svm = OneClassSVM(kernel='rbf', nu=contamination)\n",
        "        oc_svm.fit(numeric_data)\n",
        "        self.models['oneclass_svm'] = oc_svm\n",
        "\n",
        "        # Train LSTM Autoencoder\n",
        "        sequences = self.create_sequences_for_autoencoder(numeric_data)\n",
        "        if len(sequences) > 0:\n",
        "            autoencoder = self.create_lstm_autoencoder((self.sequence_length, len(self.feature_columns)))\n",
        "            autoencoder.fit(sequences, sequences, epochs=50, batch_size=32, verbose=0)\n",
        "            self.models['lstm_autoencoder'] = autoencoder\n",
        "\n",
        "            # Calculate reconstruction threshold\n",
        "            reconstructions = autoencoder.predict(sequences, verbose=0)\n",
        "            mse = np.mean(np.power(sequences - reconstructions, 2), axis=(1, 2))\n",
        "            self.thresholds['reconstruction'] = np.percentile(mse, 95)\n",
        "\n",
        "    def detect_anomalies(self, df):\n",
        "        \"\"\"\n",
        "        Detect anomalies using ensemble approach\n",
        "        \"\"\"\n",
        "        numeric_data = df[self.feature_columns].values\n",
        "        anomaly_scores = np.zeros(len(df))\n",
        "        detection_details = {}\n",
        "\n",
        "        # Isolation Forest\n",
        "        if 'isolation_forest' in self.models:\n",
        "            iso_scores = self.models['isolation_forest'].decision_function(numeric_data)\n",
        "            iso_scores = (iso_scores - iso_scores.min()) / (iso_scores.max() - iso_scores.min())\n",
        "            anomaly_scores += iso_scores * 0.33\n",
        "            detection_details['isolation_forest'] = iso_scores\n",
        "\n",
        "        # One-Class SVM\n",
        "        if 'oneclass_svm' in self.models:\n",
        "            svm_scores = self.models['oneclass_svm'].decision_function(numeric_data)\n",
        "            svm_scores = (svm_scores - svm_scores.min()) / (svm_scores.max() - svm_scores.min())\n",
        "            anomaly_scores += svm_scores * 0.33\n",
        "            detection_details['oneclass_svm'] = svm_scores\n",
        "\n",
        "        # LSTM Autoencoder\n",
        "        if 'lstm_autoencoder' in self.models and len(numeric_data) >= self.sequence_length:\n",
        "            sequences = self.create_sequences_for_autoencoder(numeric_data)\n",
        "            if len(sequences) > 0:\n",
        "                reconstructions = self.models['lstm_autoencoder'].predict(sequences, verbose=0)\n",
        "                mse = np.mean(np.power(sequences - reconstructions, 2), axis=(1, 2))\n",
        "\n",
        "                # Pad the scores to match original length\n",
        "                ae_scores = np.zeros(len(numeric_data))\n",
        "                ae_scores[self.sequence_length-1:] = mse\n",
        "                ae_scores = (ae_scores - ae_scores.min()) / (ae_scores.max() - ae_scores.min() + 1e-8)\n",
        "                anomaly_scores += ae_scores * 0.34\n",
        "                detection_details['lstm_autoencoder'] = ae_scores\n",
        "\n",
        "        # Classify severity\n",
        "        severity = np.where(anomaly_scores >= self.thresholds['critical'], 'Critical',\n",
        "                   np.where(anomaly_scores >= self.thresholds['high'], 'High',\n",
        "                   np.where(anomaly_scores >= self.thresholds['medium'], 'Medium',\n",
        "                   np.where(anomaly_scores >= self.thresholds['low'], 'Low', 'Normal'))))\n",
        "\n",
        "        return anomaly_scores, severity, detection_details\n",
        "\n",
        "    def update_thresholds(self, feedback_data):\n",
        "        \"\"\"\n",
        "        Adapt thresholds based on feedback\n",
        "        \"\"\"\n",
        "        # This would be implemented based on expert feedback\n",
        "        # For now, we'll use a simple adaptive approach\n",
        "        if len(feedback_data) > 0:\n",
        "            true_anomalies = feedback_data[feedback_data['is_anomaly'] == True]['score']\n",
        "            if len(true_anomalies) > 0:\n",
        "                self.thresholds['low'] = np.percentile(true_anomalies, 25)\n",
        "                self.thresholds['medium'] = np.percentile(true_anomalies, 50)\n",
        "                self.thresholds['high'] = np.percentile(true_anomalies, 75)\n",
        "                self.thresholds['critical'] = np.percentile(true_anomalies, 90)"
      ],
      "metadata": {
        "id": "C7jCyoZKmN8_"
      },
      "id": "C7jCyoZKmN8_",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class ActiveLearningComponent:\n",
        "    \"\"\"\n",
        "    Active learning for continuous model improvement\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self, buffer_size=10000):\n",
        "        self.buffer_size = buffer_size\n",
        "        self.experience_buffer = []\n",
        "        self.feedback_history = []\n",
        "        self.model_performance = {}\n",
        "\n",
        "    def collect_feedback(self, predictions, actual_values, expert_annotations=None):\n",
        "        \"\"\"\n",
        "        Collect feedback from various sources\n",
        "        \"\"\"\n",
        "        feedback = {\n",
        "            'timestamp': pd.Timestamp.now(),\n",
        "            'predictions': predictions,\n",
        "            'actual_values': actual_values,\n",
        "            'expert_annotations': expert_annotations,\n",
        "            'error': np.abs(predictions - actual_values) if actual_values is not None else None\n",
        "        }\n",
        "\n",
        "        self.feedback_history.append(feedback)\n",
        "\n",
        "        # Maintain buffer size\n",
        "        if len(self.feedback_history) > self.buffer_size:\n",
        "            self.feedback_history = self.feedback_history[-self.buffer_size:]\n",
        "\n",
        "    def select_samples_for_labeling(self, uncertainty_scores, n_samples=10):\n",
        "        \"\"\"\n",
        "        Select most uncertain samples for expert labeling\n",
        "        \"\"\"\n",
        "        # Select samples with highest uncertainty\n",
        "        uncertain_indices = np.argsort(uncertainty_scores)[-n_samples:]\n",
        "        return uncertain_indices\n",
        "\n",
        "    def update_models(self, forecasting_engine, anomaly_detector, new_data):\n",
        "        \"\"\"\n",
        "        Incremental model updates using transfer learning\n",
        "        \"\"\"\n",
        "        # This would implement incremental learning\n",
        "        # For now, we'll track performance and suggest retraining\n",
        "\n",
        "        current_performance = self.evaluate_model_performance(new_data)\n",
        "\n",
        "        if self.should_retrain(current_performance):\n",
        "            return True  # Signal for retraining\n",
        "\n",
        "        return False\n",
        "\n",
        "    def evaluate_model_performance(self, data):\n",
        "        \"\"\"\n",
        "        Evaluate current model performance\n",
        "        \"\"\"\n",
        "        # Calculate performance metrics\n",
        "        performance = {\n",
        "            'timestamp': pd.Timestamp.now(),\n",
        "            'accuracy': 0.0,  # Would be calculated from actual vs predicted\n",
        "            'precision': 0.0,\n",
        "            'recall': 0.0,\n",
        "            'f1_score': 0.0\n",
        "        }\n",
        "\n",
        "        self.model_performance[performance['timestamp']] = performance\n",
        "        return performance\n",
        "\n",
        "    def should_retrain(self, current_performance, threshold=0.8):\n",
        "        \"\"\"\n",
        "        Decide if models need retraining\n",
        "        \"\"\"\n",
        "        if len(self.model_performance) < 2:\n",
        "            return False\n",
        "\n",
        "        # Compare with historical performance\n",
        "        recent_performances = list(self.model_performance.values())[-5:]\n",
        "        avg_performance = np.mean([p['accuracy'] for p in recent_performances])\n",
        "\n",
        "        return current_performance['accuracy'] < avg_performance * threshold"
      ],
      "metadata": {
        "id": "EW6z6m_0mPGe"
      },
      "id": "EW6z6m_0mPGe",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class ExplainableDecisionSupport:\n",
        "    \"\"\"\n",
        "    Provides explanations and decision support for predictions\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        self.feature_importance = {}\n",
        "        self.explanation_templates = {\n",
        "            'anomaly': \"Anomaly detected due to unusual patterns in {features}. Confidence: {confidence:.2f}\",\n",
        "            'forecast': \"Forecast based on historical trends in {features}. Confidence interval: [{lower:.2f}, {upper:.2f}]\"\n",
        "        }\n",
        "\n",
        "    def explain_anomaly(self, anomaly_score, feature_values, feature_names, detection_details):\n",
        "        \"\"\"\n",
        "        Generate explanation for anomaly detection\n",
        "        \"\"\"\n",
        "        # Find most contributing features\n",
        "        if 'isolation_forest' in detection_details:\n",
        "            # For simplicity, use variance-based importance\n",
        "            feature_importance = np.var(feature_values.reshape(1, -1), axis=0)\n",
        "            top_features_idx = np.argsort(feature_importance)[-3:]\n",
        "            top_features = [feature_names[i] for i in top_features_idx]\n",
        "        else:\n",
        "            top_features = feature_names[:3]  # Fallback\n",
        "\n",
        "        explanation = self.explanation_templates['anomaly'].format(\n",
        "            features=', '.join(top_features),\n",
        "            confidence=anomaly_score\n",
        "        )\n",
        "\n",
        "        return {\n",
        "            'explanation': explanation,\n",
        "            'contributing_features': top_features,\n",
        "            'confidence': anomaly_score,\n",
        "            'severity': self._get_severity_level(anomaly_score),\n",
        "            'recommendations': self._get_recommendations(anomaly_score, top_features)\n",
        "        }\n",
        "\n",
        "    def explain_forecast(self, forecast_values, confidence_intervals, contributing_factors):\n",
        "        \"\"\"\n",
        "        Generate explanation for forecasts\n",
        "\n",
        "        Args:\n",
        "            forecast_values: Sequence of forecast values; only the first and last\n",
        "                entries are compared to derive the trend.\n",
        "            confidence_intervals: Pair whose items 0 and 1 are formatted as the\n",
        "                lower and upper bound of the explanation text.\n",
        "            contributing_factors: Iterable of strings joined into the text.\n",
        "\n",
        "        Returns:\n",
        "            dict with 'explanation', 'forecast_trend' ('increasing',\n",
        "            'decreasing' or 'stable'), 'confidence_interval' and 'key_factors'.\n",
        "        \"\"\"\n",
        "        explanation = self.explanation_templates['forecast'].format(\n",
        "            features=', '.join(contributing_factors),\n",
        "            lower=confidence_intervals[0],\n",
        "            upper=confidence_intervals[1]\n",
        "        )\n",
        "\n",
        "        # A flat (or empty) forecast is neither rising nor falling; report it as\n",
        "        # 'stable' instead of mislabelling it 'decreasing' or raising IndexError.\n",
        "        if len(forecast_values) == 0 or forecast_values[-1] == forecast_values[0]:\n",
        "            forecast_trend = 'stable'\n",
        "        elif forecast_values[-1] > forecast_values[0]:\n",
        "            forecast_trend = 'increasing'\n",
        "        else:\n",
        "            forecast_trend = 'decreasing'\n",
        "\n",
        "        return {\n",
        "            'explanation': explanation,\n",
        "            'forecast_trend': forecast_trend,\n",
        "            'confidence_interval': confidence_intervals,\n",
        "            'key_factors': contributing_factors\n",
        "        }\n",
        "\n",
        "    def _get_severity_level(self, score):\n",
        "        \"\"\"\n",
        "        Translate an anomaly score into a severity label.\n",
        "\n",
        "        The bands are checked from the highest lower bound down and the first\n",
        "        one the score reaches wins; anything below 0.3 is 'Normal'.\n",
        "        \"\"\"\n",
        "        severity_bands = (\n",
        "            (0.9, 'Critical'),\n",
        "            (0.7, 'High'),\n",
        "            (0.5, 'Medium'),\n",
        "            (0.3, 'Low'),\n",
        "        )\n",
        "        for lower_bound, label in severity_bands:\n",
        "            if score >= lower_bound:\n",
        "                return label\n",
        "        return 'Normal'\n",
        "\n",
        "    def _get_recommendations(self, score, features):\n",
        "        \"\"\"\n",
        "        Build the list of follow-up actions for an anomaly.\n",
        "\n",
        "        Scores of 0.7 and above add two generic escalation actions. Further\n",
        "        actions are added when the lower-cased text of `features` contains\n",
        "        'cpu', 'memory' or 'io' (plain substring match).\n",
        "        \"\"\"\n",
        "        actions = []\n",
        "\n",
        "        if score >= 0.7:\n",
        "            actions.extend([\n",
        "                \"Immediate investigation required\",\n",
        "                \"Check system logs for errors\",\n",
        "            ])\n",
        "\n",
        "        # Keyword -> action, checked in this order against str(features)\n",
        "        keyword_actions = (\n",
        "            ('cpu', \"Monitor CPU-intensive processes\"),\n",
        "            ('memory', \"Check memory usage and potential leaks\"),\n",
        "            ('io', \"Investigate disk I/O performance\"),\n",
        "        )\n",
        "        features_text = str(features).lower()\n",
        "        for keyword, action in keyword_actions:\n",
        "            if keyword in features_text:\n",
        "                actions.append(action)\n",
        "\n",
        "        return actions"
      ],
      "metadata": {
        "id": "Rlz5XXAImQY9"
      },
      "id": "Rlz5XXAImQY9",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "class AIXMonitoringTrainer:\n",
        "    \"\"\"\n",
        "    Main training class that orchestrates all components\n",
        "    \"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        \"\"\"Create the pipeline components and start in the untrained state.\"\"\"\n",
        "        # Bookkeeping: nothing is trained and no run has been recorded yet\n",
        "        self.training_history = []\n",
        "        self.is_trained = False\n",
        "\n",
        "        # One instance of every component the trainer orchestrates\n",
        "        self.data_processor = DataProcessor()\n",
        "        self.forecasting_engine = ForecastingEngine()\n",
        "        self.anomaly_detector = AnomalyDetector()\n",
        "        self.active_learner = ActiveLearningComponent()\n",
        "        self.explainer = ExplainableDecisionSupport()\n",
        "\n",
        "    def merge_data(self, vmstat_df, iostat_df, netstat_df, process_df):\n",
        "        \"\"\"\n",
        "        Merge data from multiple sources\n",
        "\n",
        "        Polars frames are converted to pandas first. The four frames are then\n",
        "        outer-joined on ['id', 'timestamp'] (clashing column names from the later\n",
        "        frames get the suffix '_iostat', '_netstat' or '_process') and every\n",
        "        missing value in the result is replaced by 0.\n",
        "        \"\"\"\n",
        "        def as_pandas(frame):\n",
        "            # NOTE(review): date_unit is forwarded to polars' to_pandas as-is;\n",
        "            # confirm the installed polars/pyarrow versions accept this keyword.\n",
        "            if isinstance(frame, pl.DataFrame):\n",
        "                return frame.to_pandas(date_unit='ms')\n",
        "            return frame\n",
        "\n",
        "        frames = [as_pandas(frame) for frame in (vmstat_df, iostat_df, netstat_df, process_df)]\n",
        "        suffixes = ('_iostat', '_netstat', '_process')\n",
        "\n",
        "        # Fold the remaining frames into the vmstat frame, left to right\n",
        "        merged_df = frames[0]\n",
        "        for other_df, suffix in zip(frames[1:], suffixes):\n",
        "            merged_df = merged_df.merge(\n",
        "                other_df, on=['id', 'timestamp'], how='outer', suffixes=('', suffix)\n",
        "            )\n",
        "\n",
        "        # Fill missing values with 0\n",
        "        merged_df.fillna(0, inplace=True)\n",
        "        return merged_df\n",
        "\n",
        "    def train(self, vmstat_df, iostat_df, netstat_df, process_df,\n",
        "              target_columns=['us', 'sy', 'mem_mean', 'tps'],\n",
        "              test_size=0.2):\n",
        "        \"\"\"\n",
        "        Complete training pipeline\n",
        "\n",
        "        Merges the four metric frames, processes them, splits the result without\n",
        "        shuffling, trains the forecasting and anomaly-detection models on the\n",
        "        first part and evaluates them on the rest.\n",
        "\n",
        "        Args:\n",
        "            vmstat_df, iostat_df, netstat_df, process_df: Raw metric frames,\n",
        "                passed unchanged to merge_data().\n",
        "            target_columns: Columns to forecast.\n",
        "            test_size: Passed to train_test_split as the held-out share.\n",
        "\n",
        "        Returns:\n",
        "            The dict produced by evaluate_models(); it is also appended to\n",
        "            self.training_history together with sizes and a timestamp.\n",
        "        \"\"\"\n",
        "        # Work on a private copy: the list is stored in training_history below,\n",
        "        # and without the copy that record would alias the shared default list\n",
        "        # (or the caller's list) and change whenever it is mutated later.\n",
        "        target_columns = list(target_columns)\n",
        "\n",
        "        print(\"Starting AIX Monitoring System Training...\")\n",
        "\n",
        "        # 0. Merge data\n",
        "        print(\"0. Merging data...\")\n",
        "        merged_df = self.merge_data(vmstat_df, iostat_df, netstat_df, process_df)\n",
        "\n",
        "        # 1. Data Processing\n",
        "        print(\"1. Processing data...\")\n",
        "        processed_df = self.data_processor.process_data(merged_df, target_columns)\n",
        "\n",
        "        # Split data; shuffle=False keeps the row order, so the test part is the\n",
        "        # tail of the processed frame\n",
        "        train_df, test_df = train_test_split(\n",
        "            processed_df, test_size=test_size, shuffle=False\n",
        "        )\n",
        "\n",
        "        # 2. Train Forecasting Models\n",
        "        print(\"2. Training forecasting models...\")\n",
        "        self.forecasting_engine.train_ensemble(train_df, target_columns)\n",
        "\n",
        "        # 3. Train Anomaly Detection Models\n",
        "        print(\"3. Training anomaly detection models...\")\n",
        "        self.anomaly_detector.train_detectors(train_df)\n",
        "\n",
        "        # 4. Evaluate on test set\n",
        "        print(\"4. Evaluating models...\")\n",
        "        evaluation_results = self.evaluate_models(test_df, target_columns)\n",
        "\n",
        "        # Store training history\n",
        "        training_record = {\n",
        "            'timestamp': pd.Timestamp.now(),\n",
        "            'train_size': len(train_df),\n",
        "            'test_size': len(test_df),\n",
        "            'target_columns': target_columns,\n",
        "            'evaluation_results': evaluation_results\n",
        "        }\n",
        "        self.training_history.append(training_record)\n",
        "\n",
        "        self.is_trained = True\n",
        "        print(\"Training completed successfully!\")\n",
        "\n",
        "        return evaluation_results\n",
        "\n",
        "    def evaluate_models(self, test_df, target_columns, context_rows=50, horizon=24):\n",
        "        \"\"\"\n",
        "        Comprehensive model evaluation\n",
        "\n",
        "        Args:\n",
        "            test_df: Processed held-out frame.\n",
        "            target_columns: Columns to score; columns missing from test_df are\n",
        "                skipped.\n",
        "            context_rows: Number of leading test rows handed to the forecasting\n",
        "                engine as history.\n",
        "            horizon: Number of steps requested from the forecasting engine.\n",
        "\n",
        "        Returns:\n",
        "            dict with 'forecasting' (per target: 'mae', 'mse', 'n_evaluated',\n",
        "            'model_contributions'), 'anomaly_detection' and\n",
        "            'overall_performance'.\n",
        "        \"\"\"\n",
        "        results = {\n",
        "            'forecasting': {},\n",
        "            'anomaly_detection': {},\n",
        "            'overall_performance': {}\n",
        "        }\n",
        "\n",
        "        # Evaluate Forecasting\n",
        "        history_df = test_df.head(context_rows)\n",
        "        for target_col in target_columns:\n",
        "            if target_col not in test_df.columns:\n",
        "                continue\n",
        "\n",
        "            # Generate predictions\n",
        "            forecast_pred, model_preds = self.forecasting_engine.predict(\n",
        "                history_df, target_col, horizon=horizon\n",
        "            )\n",
        "            if forecast_pred is None or len(forecast_pred) == 0:\n",
        "                continue\n",
        "\n",
        "            # Score the forecast against the rows that follow the history window\n",
        "            # rather than reporting the magnitude of the forecast itself.\n",
        "            # NOTE(review): assumes predict() forecasts the steps immediately\n",
        "            # after the supplied history, on the same scale as test_df - confirm.\n",
        "            forecast = np.asarray(forecast_pred, dtype=float).ravel()\n",
        "            actual = test_df[target_col].iloc[\n",
        "                context_rows:context_rows + len(forecast)\n",
        "            ].to_numpy(dtype=float)\n",
        "            n_evaluated = len(actual)\n",
        "\n",
        "            if n_evaluated > 0:\n",
        "                errors = forecast[:n_evaluated] - actual\n",
        "                mae = float(np.mean(np.abs(errors)))\n",
        "                mse = float(np.mean(errors ** 2))\n",
        "            else:\n",
        "                # No held-out rows behind the history window: nothing to score\n",
        "                mae = mse = float('nan')\n",
        "\n",
        "            results['forecasting'][target_col] = {\n",
        "                'mae': mae,\n",
        "                'mse': mse,\n",
        "                'n_evaluated': n_evaluated,\n",
        "                'model_contributions': list(model_preds.keys())\n",
        "            }\n",
        "\n",
        "        # Evaluate Anomaly Detection\n",
        "        anomaly_scores, severity, detection_details = self.anomaly_detector.detect_anomalies(test_df)\n",
        "\n",
        "        results['anomaly_detection'] = {\n",
        "            'total_anomalies': np.sum(severity != 'Normal'),\n",
        "            'severity_distribution': {\n",
        "                severity_level: np.sum(severity == severity_level)\n",
        "                for severity_level in ['Normal', 'Low', 'Medium', 'High', 'Critical']\n",
        "            },\n",
        "            'average_anomaly_score': np.mean(anomaly_scores),\n",
        "            'detection_methods_used': list(detection_details.keys())\n",
        "        }\n",
        "\n",
        "        # Overall Performance\n",
        "        results['overall_performance'] = {\n",
        "            # Wall-clock time at which the evaluation finished (not a duration)\n",
        "            'training_time': pd.Timestamp.now(),\n",
        "            'data_quality_score': self._calculate_data_quality_score(test_df),\n",
        "            'model_complexity': self._calculate_model_complexity(),\n",
        "            'memory_usage_mb': self._estimate_memory_usage()\n",
        "        }\n",
        "\n",
        "        return results\n",
        "\n",
        "    def predict_and_detect(self, vmstat_df, iostat_df, netstat_df, process_df,\n",
        "                          target_columns=['us', 'sy', 'mem_mean', 'tps'],\n",
        "                          forecast_horizon=24):\n",
        "        \"\"\"\n",
        "        Generate predictions and detect anomalies on new data\n",
        "\n",
        "        Args:\n",
        "            vmstat_df, iostat_df, netstat_df, process_df: Raw metric frames,\n",
        "                passed unchanged to merge_data().\n",
        "            target_columns: Columns to forecast; columns missing from the\n",
        "                processed frame are skipped.\n",
        "            forecast_horizon: Passed to the forecasting engine as `horizon`.\n",
        "\n",
        "        Returns:\n",
        "            dict with 'forecasts', 'anomalies', 'explanations' and\n",
        "            'recommendations' (de-duplicated, in first-seen order).\n",
        "\n",
        "        Raises:\n",
        "            ValueError: If the models have not been trained or loaded yet.\n",
        "        \"\"\"\n",
        "        if not self.is_trained:\n",
        "            raise ValueError(\"Models must be trained before making predictions\")\n",
        "\n",
        "        # Process new data\n",
        "        merged_df = self.merge_data(vmstat_df, iostat_df, netstat_df, process_df)\n",
        "        processed_df = self.data_processor.process_data(merged_df, target_columns)\n",
        "\n",
        "        results = {\n",
        "            'forecasts': {},\n",
        "            'anomalies': {},\n",
        "            'explanations': {},\n",
        "            'recommendations': []\n",
        "        }\n",
        "\n",
        "        # Generate Forecasts\n",
        "        for target_col in target_columns:\n",
        "            if target_col not in processed_df.columns:\n",
        "                continue\n",
        "\n",
        "            forecast_pred, model_preds = self.forecasting_engine.predict(\n",
        "                processed_df, target_col, horizon=forecast_horizon\n",
        "            )\n",
        "            # An empty forecast has no last element to build the explanation\n",
        "            # interval from, so it is skipped just like a missing one.\n",
        "            if forecast_pred is None or len(forecast_pred) == 0:\n",
        "                continue\n",
        "\n",
        "            forecast_pred = np.asarray(forecast_pred)\n",
        "            # Constant-width band: +/- 1.96 standard deviations of the forecast\n",
        "            std_dev = np.std(forecast_pred)\n",
        "            confidence_band_lower = forecast_pred - 1.96 * std_dev\n",
        "            confidence_band_upper = forecast_pred + 1.96 * std_dev\n",
        "\n",
        "            results['forecasts'][target_col] = {\n",
        "                'values': forecast_pred.tolist(),\n",
        "                'confidence_intervals': [confidence_band_lower.tolist(),\n",
        "                                       confidence_band_upper.tolist()],\n",
        "                'model_contributions': model_preds\n",
        "            }\n",
        "\n",
        "            # The explainer takes one [lower, upper] pair: use the last step\n",
        "            last_ci = [confidence_band_lower[-1], confidence_band_upper[-1]]\n",
        "            explanation = self.explainer.explain_forecast(\n",
        "                forecast_pred, last_ci, [target_col]\n",
        "            )\n",
        "            results['explanations'][f'forecast_{target_col}'] = explanation\n",
        "\n",
        "        # Detect Anomalies\n",
        "        anomaly_scores, severity, detection_details = self.anomaly_detector.detect_anomalies(processed_df)\n",
        "        anomaly_scores = np.asarray(anomaly_scores)\n",
        "        severity = np.asarray(severity)\n",
        "\n",
        "        # Find anomalous points\n",
        "        anomalous_indices = np.where(severity != 'Normal')[0]\n",
        "\n",
        "        results['anomalies'] = {\n",
        "            'scores': anomaly_scores.tolist(),\n",
        "            'severity': severity.tolist(),\n",
        "            'anomalous_points': len(anomalous_indices),\n",
        "            'detection_details': {k: v.tolist() if isinstance(v, np.ndarray) else v\n",
        "                                for k, v in detection_details.items()}\n",
        "        }\n",
        "\n",
        "        # Explain the five highest-scoring High/Critical anomalies. The severity\n",
        "        # filter runs before the cut to five; cutting first would drop severe\n",
        "        # anomalies that come after five milder ones.\n",
        "        significant = np.where(np.isin(severity, ['High', 'Critical']))[0]\n",
        "        by_score = np.argsort(-anomaly_scores[significant], kind='stable')\n",
        "        feature_columns = self.anomaly_detector.feature_columns\n",
        "        for idx in significant[by_score][:5]:\n",
        "            feature_values = processed_df.iloc[idx][feature_columns].values\n",
        "            explanation = self.explainer.explain_anomaly(\n",
        "                anomaly_scores[idx],\n",
        "                feature_values,\n",
        "                feature_columns,\n",
        "                detection_details\n",
        "            )\n",
        "            results['explanations'][f'anomaly_{idx}'] = explanation\n",
        "            # Keep each recommendation once, in first-seen order\n",
        "            for recommendation in explanation['recommendations']:\n",
        "                if recommendation not in results['recommendations']:\n",
        "                    results['recommendations'].append(recommendation)\n",
        "\n",
        "        return results\n",
        "\n",
        "    def update_models_with_feedback(self, feedback_data):\n",
        "        \"\"\"\n",
        "        Feed new feedback to the active learner and react to its verdict.\n",
        "\n",
        "        Returns a status dict: 'retraining_recommended' when the active learner\n",
        "        asks for a retrain, otherwise 'updated' (after applying any\n",
        "        'anomaly_feedback' to the detector thresholds).\n",
        "\n",
        "        Raises:\n",
        "            ValueError: If the models have not been trained yet.\n",
        "        \"\"\"\n",
        "        if not self.is_trained:\n",
        "            raise ValueError(\"Models must be trained before updating\")\n",
        "\n",
        "        # Hand the three optional feedback fields to the active learner\n",
        "        feedback_fields = ('predictions', 'actual_values', 'expert_annotations')\n",
        "        self.active_learner.collect_feedback(\n",
        "            **{field: feedback_data.get(field) for field in feedback_fields}\n",
        "        )\n",
        "\n",
        "        # Let the active learner decide whether a retrain is due\n",
        "        retrain_needed = self.active_learner.update_models(\n",
        "            self.forecasting_engine,\n",
        "            self.anomaly_detector,\n",
        "            feedback_data\n",
        "        )\n",
        "        if retrain_needed:\n",
        "            print(\"Model performance degraded. Retraining recommended.\")\n",
        "            return {'status': 'retraining_recommended', 'reason': 'performance_degradation'}\n",
        "\n",
        "        # No retrain: only adjust thresholds when anomaly feedback was supplied\n",
        "        if 'anomaly_feedback' in feedback_data:\n",
        "            self.anomaly_detector.update_thresholds(feedback_data['anomaly_feedback'])\n",
        "\n",
        "        return {'status': 'updated', 'reason': 'incremental_learning'}\n",
        "\n",
        "    def get_model_status(self):\n",
        "        \"\"\"\n",
        "        Summarise what is currently trained.\n",
        "\n",
        "        Returns {'status': 'not_trained', 'models': {}} before training;\n",
        "        afterwards a dict with the training-run count, the timestamp of the\n",
        "        last run, per-family model counts, detector availability flags,\n",
        "        active-learning counters and the memory estimate.\n",
        "        \"\"\"\n",
        "        if not self.is_trained:\n",
        "            return {'status': 'not_trained', 'models': {}}\n",
        "\n",
        "        def count_forecasters(tag):\n",
        "            # Number of forecasting models whose key contains `tag`\n",
        "            return sum(1 for key in self.forecasting_engine.models.keys() if tag in key)\n",
        "\n",
        "        history = self.training_history\n",
        "        detectors = self.anomaly_detector.models\n",
        "\n",
        "        return {\n",
        "            'status': 'trained',\n",
        "            'training_history': len(history),\n",
        "            'last_training': history[-1]['timestamp'] if history else None,\n",
        "            'models': {\n",
        "                'forecasting': {\n",
        "                    'lstm_models': count_forecasters('lstm'),\n",
        "                    'transformer_models': count_forecasters('transformer'),\n",
        "                    'arima_models': count_forecasters('arima'),\n",
        "                    'prophet_models': count_forecasters('prophet'),\n",
        "                },\n",
        "                'anomaly_detection': {\n",
        "                    'isolation_forest': 'isolation_forest' in detectors,\n",
        "                    'oneclass_svm': 'oneclass_svm' in detectors,\n",
        "                    'lstm_autoencoder': 'lstm_autoencoder' in detectors,\n",
        "                },\n",
        "                'active_learning': {\n",
        "                    'feedback_samples': len(self.active_learner.feedback_history),\n",
        "                    'performance_records': len(self.active_learner.model_performance)\n",
        "                }\n",
        "            },\n",
        "            'memory_usage_estimate': self._estimate_memory_usage()\n",
        "        }\n",
        "\n",
        "    def save_models(self, filepath):\n",
        "        \"\"\"\n",
        "        Persist every trained component into the directory `filepath`.\n",
        "\n",
        "        The directory is created if needed. Written files: data_processor.pkl,\n",
        "        forecasting_classical.pkl, one '<name>.h5' per forecasting model whose\n",
        "        name contains 'lstm' or 'transformer', anomaly_detection.pkl,\n",
        "        lstm_autoencoder.h5 (only when that detector exists),\n",
        "        active_learning.pkl and training_history.pkl.\n",
        "\n",
        "        Raises:\n",
        "            ValueError: If no model has been trained yet.\n",
        "        \"\"\"\n",
        "        import pickle\n",
        "        import os\n",
        "\n",
        "        if not self.is_trained:\n",
        "            raise ValueError(\"No trained models to save\")\n",
        "\n",
        "        os.makedirs(filepath, exist_ok=True)\n",
        "\n",
        "        def write_pickle(filename, payload):\n",
        "            # Pickle `payload` into <filepath>/<filename>\n",
        "            with open(os.path.join(filepath, filename), 'wb') as handle:\n",
        "                pickle.dump(payload, handle)\n",
        "\n",
        "        engine = self.forecasting_engine\n",
        "        detector = self.anomaly_detector\n",
        "\n",
        "        # Data processor\n",
        "        write_pickle('data_processor.pkl', self.data_processor)\n",
        "\n",
        "        # Classical forecasters (ARIMA / Prophet) plus the engine settings\n",
        "        forecasting_state = {\n",
        "            'model_weights': engine.model_weights,\n",
        "            'feature_columns': engine.feature_columns,\n",
        "            'sequence_length': engine.sequence_length\n",
        "        }\n",
        "        arima_models = {name: model for name, model in engine.models.items() if 'arima' in name}\n",
        "        prophet_models = {name: model for name, model in engine.models.items() if 'prophet' in name}\n",
        "        write_pickle(\n",
        "            'forecasting_classical.pkl',\n",
        "            {'arima': arima_models, 'prophet': prophet_models, 'state': forecasting_state}\n",
        "        )\n",
        "\n",
        "        # Neural forecasters are stored through their own save() method\n",
        "        neural_tags = ('lstm', 'transformer')\n",
        "        for model_name, model in engine.models.items():\n",
        "            if any(tag in model_name for tag in neural_tags):\n",
        "                model.save(os.path.join(filepath, f'{model_name}.h5'))\n",
        "\n",
        "        # Anomaly detection: sklearn detectors plus thresholds and settings\n",
        "        anomaly_state = {\n",
        "            'thresholds': detector.thresholds,\n",
        "            'feature_columns': detector.feature_columns,\n",
        "            'sequence_length': detector.sequence_length\n",
        "        }\n",
        "        sklearn_names = ['isolation_forest', 'oneclass_svm']\n",
        "        sklearn_models = {name: model for name, model in detector.models.items()\n",
        "                          if name in sklearn_names}\n",
        "        write_pickle('anomaly_detection.pkl', {'sklearn_models': sklearn_models, 'state': anomaly_state})\n",
        "\n",
        "        # LSTM autoencoder, when one was trained\n",
        "        if 'lstm_autoencoder' in detector.models:\n",
        "            detector.models['lstm_autoencoder'].save(\n",
        "                os.path.join(filepath, 'lstm_autoencoder.h5')\n",
        "            )\n",
        "\n",
        "        # Active learning component and training history\n",
        "        write_pickle('active_learning.pkl', self.active_learner)\n",
        "        write_pickle('training_history.pkl', self.training_history)\n",
        "\n",
        "        print(f\"Models saved successfully to {filepath}\")\n",
        "\n",
        "    def load_models(self, filepath):\n",
        "        \"\"\"\n",
        "        Restore the components from the files found in the directory `filepath`.\n",
        "\n",
        "        Reads data_processor.pkl, forecasting_classical.pkl,\n",
        "        anomaly_detection.pkl, active_learning.pkl and training_history.pkl,\n",
        "        plus every '.h5' file whose name contains 'lstm' or 'transformer'.\n",
        "        Marks the trainer as trained afterwards.\n",
        "\n",
        "        Raises:\n",
        "            ValueError: If `filepath` does not exist.\n",
        "        \"\"\"\n",
        "        import pickle\n",
        "        import os\n",
        "        from tensorflow.keras.models import load_model\n",
        "\n",
        "        if not os.path.exists(filepath):\n",
        "            raise ValueError(f\"Model directory {filepath} does not exist\")\n",
        "\n",
        "        def read_pickle(filename):\n",
        "            # WARNING: unpickling can execute arbitrary code; only load model\n",
        "            # directories that come from a trusted source.\n",
        "            with open(os.path.join(filepath, filename), 'rb') as handle:\n",
        "                return pickle.load(handle)\n",
        "\n",
        "        engine = self.forecasting_engine\n",
        "        detector = self.anomaly_detector\n",
        "\n",
        "        # Data processor\n",
        "        self.data_processor = read_pickle('data_processor.pkl')\n",
        "\n",
        "        # Forecasting settings and classical models\n",
        "        forecasting_data = read_pickle('forecasting_classical.pkl')\n",
        "        for attribute in ('model_weights', 'feature_columns', 'sequence_length'):\n",
        "            setattr(engine, attribute, forecasting_data['state'][attribute])\n",
        "        for family in ('arima', 'prophet'):\n",
        "            engine.models.update(forecasting_data[family])\n",
        "\n",
        "        # Neural forecasters: every matching .h5 file except the autoencoder,\n",
        "        # which belongs to the anomaly detector and is loaded further down\n",
        "        for filename in os.listdir(filepath):\n",
        "            if not filename.endswith('.h5'):\n",
        "                continue\n",
        "            if 'lstm' not in filename and 'transformer' not in filename:\n",
        "                continue\n",
        "            if 'autoencoder' in filename:\n",
        "                continue\n",
        "            model_name = filename.replace('.h5', '')\n",
        "            engine.models[model_name] = load_model(os.path.join(filepath, filename))\n",
        "\n",
        "        # Anomaly detection settings and sklearn detectors\n",
        "        anomaly_data = read_pickle('anomaly_detection.pkl')\n",
        "        for attribute in ('thresholds', 'feature_columns', 'sequence_length'):\n",
        "            setattr(detector, attribute, anomaly_data['state'][attribute])\n",
        "        detector.models.update(anomaly_data['sklearn_models'])\n",
        "\n",
        "        # LSTM autoencoder is optional\n",
        "        autoencoder_path = os.path.join(filepath, 'lstm_autoencoder.h5')\n",
        "        if os.path.exists(autoencoder_path):\n",
        "            detector.models['lstm_autoencoder'] = load_model(autoencoder_path)\n",
        "\n",
        "        # Active learning component and training history\n",
        "        self.active_learner = read_pickle('active_learning.pkl')\n",
        "        self.training_history = read_pickle('training_history.pkl')\n",
        "\n",
        "        self.is_trained = True\n",
        "        print(f\"Models loaded successfully from {filepath}\")\n",
        "\n",
        "    def _calculate_data_quality_score(self, df):\n",
        "        \"\"\"\n",
        "        Calculate a simple data quality score\n",
        "\n",
        "        Only numeric columns are considered. The score is\n",
        "        (1 - share of missing cells) * (1 - share of constant columns), clamped\n",
        "        to [0, 1]; higher is better.\n",
        "\n",
        "        Returns:\n",
        "            float in [0, 1]; 0.0 when there is no numeric data to judge.\n",
        "        \"\"\"\n",
        "        numeric_df = df.select_dtypes(include=[np.number])\n",
        "\n",
        "        # Without rows or numeric columns both ratios would divide by zero;\n",
        "        # such a frame carries no usable data, so it scores 0.\n",
        "        n_rows, n_cols = numeric_df.shape\n",
        "        if n_rows == 0 or n_cols == 0:\n",
        "            return 0.0\n",
        "\n",
        "        # Check for missing values\n",
        "        missing_ratio = numeric_df.isnull().sum().sum() / (n_rows * n_cols)\n",
        "\n",
        "        # Check for constant columns\n",
        "        constant_cols = (numeric_df.nunique() == 1).sum()\n",
        "        constant_ratio = constant_cols / n_cols\n",
        "\n",
        "        # Calculate quality score (0-1, higher is better)\n",
        "        quality_score = (1 - missing_ratio) * (1 - constant_ratio)\n",
        "\n",
        "        return float(max(0, min(1, quality_score)))\n",
        "\n",
        "    def _calculate_model_complexity(self):\n",
        "        \"\"\"\n",
        "        Return a relative complexity figure.\n",
        "\n",
        "        Each forecasting model counts 10, each anomaly detection model 5 and\n",
        "        each data-processor feature column 1.\n",
        "        \"\"\"\n",
        "        weighted_collections = (\n",
        "            (self.forecasting_engine.models, 10),\n",
        "            (self.anomaly_detector.models, 5),\n",
        "            (self.data_processor.feature_columns, 1),\n",
        "        )\n",
        "        return sum(len(items) * weight for items, weight in weighted_collections)\n",
        "\n",
        "    def _estimate_memory_usage(self):\n",
        "        \"\"\"\n",
        "        Estimate memory usage in MB\n",
        "\n",
        "        Rough figure only: adds the shallow sys.getsizeof() of every attribute\n",
        "        of the four components (container contents are not followed) and\n",
        "        4 bytes per parameter for every model that exposes count_params().\n",
        "        \"\"\"\n",
        "        components = [\n",
        "            self.data_processor,\n",
        "            self.forecasting_engine,\n",
        "            self.anomaly_detector,\n",
        "            self.active_learner\n",
        "        ]\n",
        "\n",
        "        # Shallow size of each attribute of each component\n",
        "        total_size = sum(\n",
        "            sys.getsizeof(value)\n",
        "            for component in components\n",
        "            for value in vars(component).values()\n",
        "        )\n",
        "\n",
        "        # Add model sizes. The anomaly detector's models are included as well,\n",
        "        # so a neural detector is counted like the forecasting networks.\n",
        "        model_registries = (self.forecasting_engine.models, self.anomaly_detector.models)\n",
        "        for registry in model_registries:\n",
        "            for model in registry.values():\n",
        "                if hasattr(model, 'count_params'):\n",
        "                    total_size += model.count_params() * 4  # 4 bytes per float\n",
        "\n",
        "        return total_size / (1024 * 1024)  # Convert to MB"
      ],
      "metadata": {
        "id": "KYh8rWTumRni"
      },
      "id": "KYh8rWTumRni",
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load data with Polars\n",
        "def load_and_fix_data():\n",
        "    \"\"\"\n",
        "    Read the four metric CSV files with Polars and parse their timestamps.\n",
        "\n",
        "    Returns:\n",
        "        (vmstat_df, iostat_df, netstat_df, process_df) as Polars frames.\n",
        "    \"\"\"\n",
        "    print(\"Loading real AIX server metrics with Polars...\")\n",
        "\n",
        "    def fix_timestamps(df):\n",
        "        timestamp = pl.col(\"timestamp\")\n",
        "\n",
        "        # Offsets cut down to '+00' / '-00' get '00' appended so that the\n",
        "        # '%z' format below can match them\n",
        "        short_offset = timestamp.str.ends_with(\"+00\") | timestamp.str.ends_with(\"-00\")\n",
        "        df = df.with_columns(\n",
        "            pl.when(short_offset).then(timestamp + \"00\").otherwise(timestamp)\n",
        "        )\n",
        "\n",
        "        # First format that parses wins (non-strict parsing, then coalesce)\n",
        "        candidate_formats = (\n",
        "            \"%Y-%m-%d %H:%M:%S%.f%z\",\n",
        "            \"%Y-%m-%d %H:%M:%S\",\n",
        "            \"%Y-%m-%d %H\",\n",
        "        )\n",
        "        parsed = [\n",
        "            timestamp.str.to_datetime(strict=False, format=fmt)\n",
        "            for fmt in candidate_formats\n",
        "        ]\n",
        "        return df.with_columns(pl.coalesce(*parsed))\n",
        "\n",
        "    # Read every file first, then normalise the timestamps of each frame\n",
        "    csv_files = (\n",
        "        \"vmstat_metrics.csv\",\n",
        "        \"iostat_metrics.csv\",\n",
        "        \"netstat_metrics.csv\",\n",
        "        \"process_metrics.csv\",\n",
        "    )\n",
        "    raw_frames = [pl.read_csv(csv_file) for csv_file in csv_files]\n",
        "    vmstat_df, iostat_df, netstat_df, process_df = [\n",
        "        fix_timestamps(frame) for frame in raw_frames\n",
        "    ]\n",
        "\n",
        "    return vmstat_df, iostat_df, netstat_df, process_df\n",
        "\n",
        "# Read the four metric frames\n",
        "vmstat_df, iostat_df, netstat_df, process_df = load_and_fix_data()\n",
        "\n",
        "# Fresh, untrained pipeline\n",
        "trainer = AIXMonitoringTrainer()\n",
        "\n",
        "# Columns to forecast, grouped by the source they come from\n",
        "target_columns = [\n",
        "    'us', 'sy', 'idle',          # vmstat CPU: user %, system %, idle %\n",
        "    'fre',                       # vmstat memory: free memory\n",
        "    'tps', 'service_time',       # iostat disk: transactions per second, service time\n",
        "    'ipkts_rate', 'oerrs_rate',  # netstat: input packets rate, output error rate\n",
        "    'cpu',                       # process: process CPU usage\n",
        "]\n",
        "\n",
        "# Run the full training pipeline, holding out the last 20% for evaluation\n",
        "print(\"\\nTraining models for:\", target_columns)\n",
        "evaluation_results = trainer.train(\n",
        "    vmstat_df=vmstat_df,\n",
        "    iostat_df=iostat_df,\n",
        "    netstat_df=netstat_df,\n",
        "    process_df=process_df,\n",
        "    target_columns=target_columns,\n",
        "    test_size=0.2,\n",
        ")\n",
        "\n",
        "# Print detailed evaluation results\n",
        "def print_evaluation(results):\n",
        "    \"\"\"\n",
        "    Print an evaluation report in three sections.\n",
        "\n",
        "    Expects the keys 'forecasting', 'anomaly_detection' and\n",
        "    'overall_performance' in `results`.\n",
        "    \"\"\"\n",
        "    rule = \"=\" * 50\n",
        "\n",
        "    def banner(title):\n",
        "        # Blank line, rule, title, rule\n",
        "        print(\"\\n\" + rule)\n",
        "        print(title)\n",
        "        print(rule)\n",
        "\n",
        "    banner(\"FORECASTING PERFORMANCE (MAE/MSE)\")\n",
        "    for metric, scores in results['forecasting'].items():\n",
        "        print(f\"{metric.upper():<15} MAE: {scores['mae']:.4f} | MSE: {scores['mse']:.4f}\")\n",
        "        print(f\"    Models used: {', '.join(scores['model_contributions'])}\")\n",
        "\n",
        "    banner(\"ANOMALY DETECTION RESULTS\")\n",
        "    anomalies = results['anomaly_detection']\n",
        "    print(f\"Total anomalies detected: {anomalies['total_anomalies']}\")\n",
        "    print(\"Severity distribution:\")\n",
        "    for severity, count in anomalies['severity_distribution'].items():\n",
        "        print(f\"  {severity:<8}: {count}\")\n",
        "    print(f\"Average anomaly score: {anomalies['average_anomaly_score']:.2f}\")\n",
        "    print(f\"Detection methods: {', '.join(anomalies['detection_methods_used'])}\")\n",
        "\n",
        "    banner(\"SYSTEM PERFORMANCE METRICS\")\n",
        "    perf = results['overall_performance']\n",
        "    print(f\"Training time: {perf['training_time']}\")\n",
        "    print(f\"Data quality score: {perf['data_quality_score']:.2f}/1.0\")\n",
        "    print(f\"Model complexity: {perf['model_complexity']} (relative units)\")\n",
        "    print(f\"Memory usage: {perf['memory_usage_mb']:.2f} MB\")\n",
        "\n",
        "print_evaluation(evaluation_results)\n",
        "\n",
        "# Sample predictions: run the trained pipeline on the most recent rows of each frame\n",
        "print(\"\\n\" + \"=\"*50)\n",
        "print(\"GENERATING SAMPLE PREDICTIONS\")\n",
        "print(\"=\"*50)\n",
        "sample_results = trainer.predict_and_detect(\n",
        "    vmstat_df.tail(100),\n",
        "    iostat_df.tail(200),\n",
        "    netstat_df.tail(200),\n",
        "    process_df.tail(500)\n",
        ")\n",
        "\n",
        "print(f\"\\nGenerated forecasts for: {list(sample_results['forecasts'].keys())}\")\n",
        "print(f\"Detected {sample_results['anomalies']['anomalous_points']} anomalies\")\n",
        "\n",
        "# The explanations may contain no 'anomaly_' key at all; next() with a default\n",
        "# avoids the IndexError that indexing [0] on an empty list would raise.\n",
        "first_anomaly_key = next(\n",
        "    (k for k in sample_results['explanations'].keys() if 'anomaly_' in k),\n",
        "    None\n",
        ")\n",
        "if first_anomaly_key is None:\n",
        "    print(\"No anomaly explanation available for this sample.\")\n",
        "else:\n",
        "    print(\"Top anomaly explanation:\")\n",
        "    print(sample_results['explanations'][first_anomaly_key]['explanation'])\n",
        "\n",
        "# Model status\n",
        "# Banner (blank line, rule, title, rule) emitted by a single print via sep.\n",
        "print(\"\\n\" + \"=\" * 50, \"MODEL STATUS SUMMARY\", \"=\" * 50, sep=\"\\n\")\n",
        "\n",
        "# Query the trainer once, then report the fields of the returned status.\n",
        "status = trainer.get_model_status()\n",
        "print(f\"Status: {status['status']}\")\n",
        "print(f\"Last trained: {status['last_training']}\")\n",
        "print(f\"Forecasting models: {status['models']['forecasting']}\")\n",
        "print(f\"Anomaly detectors: {status['models']['anomaly_detection']}\")\n",
        "print(f\"Memory usage: {status['memory_usage_estimate']:.2f} MB\")\n",
        "\n",
        "# Save the models via the trainer, passing 'aix_monitoring_models' as the target name\n",
        "trainer.save_models(\"aix_monitoring_models\")"
      ],
      "metadata": {
        "id": "WRfSmnYwmTOZ"
      },
      "id": "WRfSmnYwmTOZ",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.11"
    },
    "colab": {
      "provenance": []
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}

NameError: name 'null' is not defined

In [None]:
"import pandas as pd\n",
        "import numpy as np\n",
        "import polars as pl\n",
        "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
        "from sklearn.ensemble import IsolationForest\n",
        "from sklearn.svm import OneClassSVM\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import precision_recall_fscore_support, mean_absolute_error\n",
        "import tensorflow as tf\n",
        "from tensorflow.keras.models import Sequential, Model\n",
        "from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, RepeatVector, TimeDistributed\n",
        "from tensorflow.keras.optimizers import Adam\n",
        "from statsmodels.tsa.arima.model import ARIMA\n",
        "from statsmodels.tsa.holtwinters import SimpleExpSmoothing\n",
        "from prophet import Prophet\n",
        "import warnings\n",
        "import sys\n",
        "warnings.filterwarnings('ignore')"