In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Attention, Concatenate

# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

2025-04-15 17:03:12.054841: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-15 17:03:12.606995: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-15 17:03:13.182833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744736593.580866 1211236 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744736593.729790 1211236 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744736594.797732 1211236 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [3]:
def load_parquet_data(file_path, source_name):
    """Load a parquet file and return a DataFrame"""
    if not os.path.exists(file_path):
        print(f"Warning: {source_name} file not found: {file_path}")
        return None

    print(f"Loading {source_name} data from: {file_path}")
    table = pq.read_table(file_path)
    df = table.to_pandas()
    print(f"{source_name} data shape: {df.shape}")
    print(f"{source_name} columns: {df.columns.tolist()}")

    # Convert object columns to numeric if possible
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
                print(f"Converted column {col} from object to numeric")
            except Exception as e:
                print(f"Could not convert column {col} to numeric: {e}")

    return df


In [4]:
def normalize_timestamps(df, time_col='time_sec'):
    """Normalize timestamps to start from zero."""
    if df is None or df.empty:
        return df
    min_time = df[time_col].min()
    if pd.isna(min_time):
        print(f"Warning: No valid timestamps in {time_col}")
        return df
    df[f'{time_col}_normalized'] = df[time_col] - min_time
    print(f"Normalized timestamps: min={min_time}, range=[0, {df[time_col].max() - min_time}]")
    return df

In [5]:
def process_rss_data(rss_df, page_size_mb):
    """Process RSS data with appropriate page size conversion"""
    if rss_df is None or rss_df.empty:
        print("Warning: RSS DataFrame is empty")
        return None, []

    # Convert timestamp from ns to sec
    if 'ts_ns' in rss_df.columns:
        rss_df['time_sec'] = rss_df['ts_ns'] / 1e9
        print("Converted timestamp from ns to sec")

    # Sort by timestamp
    rss_df = rss_df.sort_values('time_sec')

    # Normalize timestamps to start from 0
    rss_df = normalize_timestamps(rss_df)

    # Add page size as a feature
    rss_df['page_size_mb'] = page_size_mb

    # Identify target columns (memory metrics)
    rss_cols = []
    for col in ['anon', 'file', 'swap', 'shmem']:
        if col in rss_df.columns:
            if rss_df[col].std() > 0:  # Check for variance
                rss_cols.append(col)
                print(f"Found RSS column with variance: {col}")

    # Convert page counts to MB using the correct page size
    for col in rss_cols:
        new_col = f"{col}_mb"
        rss_df[new_col] = rss_df[col] * page_size_mb
        print(f"Converted {col} to MB using {page_size_mb}MB page size")

    return rss_df, rss_cols


In [6]:
def process_tlb_data(tlb_df, tlb_type):
    """Process TLB miss data"""
    if tlb_df is None or tlb_df.empty:
        print(f"Warning: {tlb_type} DataFrame is empty")
        return None, []

    # Convert timestamp to seconds
    if 'ts_uptime_us' in tlb_df.columns:
        tlb_df['time_sec'] = tlb_df['ts_uptime_us'] / 1e6
        print(f"Converted {tlb_type} timestamp from us to sec")

    # Sort by timestamp
    tlb_df = tlb_df.sort_values('time_sec')

    # Normalize timestamps to start from 0
    tlb_df = normalize_timestamps(tlb_df)

    # Find TLB miss columns
    tlb_cols = []
    for col in tlb_df.columns:
        if 'miss' in col.lower():
            if tlb_df[col].std() > 0:  # Check for variance
                tlb_cols.append(col)
    print(f"Found {len(tlb_cols)} {tlb_type} columns with variance")

    # Choose main TLB miss column (prefer cumulative/total)
    main_tlb_col = None
    for col in tlb_cols:
        if 'cumulative' in col.lower() or 'total' in col.lower():
            main_tlb_col = col
            break

    if main_tlb_col is None and tlb_cols:
        main_tlb_col = tlb_cols[0]

    if main_tlb_col:
        print(f"Selected {tlb_type} column: {main_tlb_col}")
        tlb_df[f'{tlb_type}_misses'] = tlb_df[main_tlb_col]

        # Add derivative features
        tlb_df[f'{tlb_type}_misses_rate'] = tlb_df[f'{tlb_type}_misses'].diff() / tlb_df['time_sec_normalized'].diff()
        tlb_df[f'{tlb_type}_misses_rate'].fillna(0, inplace=True)

        # Add rolling average
        tlb_df[f'{tlb_type}_misses_avg'] = tlb_df[f'{tlb_type}_misses'].rolling(window=5, min_periods=1).mean()

        return tlb_df, [f'{tlb_type}_misses', f'{tlb_type}_misses_rate', f'{tlb_type}_misses_avg']
    else:
        print(f"Warning: No suitable {tlb_type} column found")
        return tlb_df, []


In [7]:
def create_memory_change_dataset(kb2_data, kb4_data):
    """
    Create a dataset that aligns memory changes with the most recent TLB measurements.

    Returns:
      - A dictionary with integrated datasets and a list of available memory metrics.
    """
    print("Creating memory-change aligned dataset...")

    # Extract DataFrames
    kb2_rss = kb2_data.get('rss')
    kb2_dtlb = kb2_data.get('dtlb')
    kb2_itlb = kb2_data.get('itlb')

    kb4_rss = kb4_data.get('rss')
    kb4_dtlb = kb4_data.get('dtlb')
    kb4_itlb = kb4_data.get('itlb')

    if kb2_rss is None or kb4_rss is None:
        raise ValueError("Both 2MB and 4KB RSS data are required")

    print(f"2MB RSS time range: {kb2_rss['time_sec_normalized'].min()} to {kb2_rss['time_sec_normalized'].max()}")
    print(f"4KB RSS time range: {kb4_rss['time_sec_normalized'].min()} to {kb4_rss['time_sec_normalized'].max()}")

    if kb2_dtlb is not None:
        print(f"2MB DTLB time range: {kb2_dtlb['time_sec_normalized'].min()} to {kb2_dtlb['time_sec_normalized'].max()}")
    if kb2_itlb is not None:
        print(f"2MB ITLB time range: {kb2_itlb['time_sec_normalized'].min()} to {kb2_itlb['time_sec_normalized'].max()}")
    if kb4_dtlb is not None:
        print(f"4KB DTLB time range: {kb4_dtlb['time_sec_normalized'].min()} to {kb4_dtlb['time_sec_normalized'].max()}")
    if kb4_itlb is not None:
        print(f"4KB ITLB time range: {kb4_itlb['time_sec_normalized'].min()} to {kb4_itlb['time_sec_normalized'].max()}")

    print("Checking for duplicate timestamps...")
    def deduplicate_timestamps(df, time_col='time_sec_normalized'):
        if df is None:
            return None
        before_count = len(df)
        if df[time_col].duplicated().any():
            print(f"  Found duplicate timestamps in {time_col}; removing duplicates...")
            df = df.sort_values(time_col).drop_duplicates(subset=[time_col], keep='last')
            print(f"  Removed {before_count - len(df)} duplicates")
        return df

    for name, df in [('2MB RSS', kb2_rss), ('2MB DTLB', kb2_dtlb), ('2MB ITLB', kb2_itlb),
                     ('4KB RSS', kb4_rss), ('4KB DTLB', kb4_dtlb), ('4KB ITLB', kb4_itlb)]:
        if df is not None:
            df = deduplicate_timestamps(df)
            if name == '2MB RSS': kb2_rss = df
            elif name == '2MB DTLB': kb2_dtlb = df
            elif name == '2MB ITLB': kb2_itlb = df
            elif name == '4KB RSS': kb4_rss = df
            elif name == '4KB DTLB': kb4_dtlb = df
            elif name == '4KB ITLB': kb4_itlb = df

    print("Identifying memory change points...")
    def find_memory_changes(df, memory_cols, threshold=0.01):
        if df is None:
            return pd.DataFrame()
        change_points = []
        for col in memory_cols:
            col_mb = f"{col}_mb"
            if col_mb in df.columns:
                diff = df[col_mb].diff().abs()
                changes = df[diff > threshold].copy()
                if not changes.empty:
                    changes['memory_metric'] = col_mb
                    changes['memory_value'] = changes[col_mb]
                    change_points.append(changes[['time_sec_normalized', 'memory_metric', 'memory_value']])
        if not change_points:
            return pd.DataFrame()
        result = pd.concat(change_points).sort_values('time_sec_normalized')
        return result

    kb2_memory_candidates = [col for col in ['anon', 'file'] if col in kb2_rss.columns]
    kb4_memory_candidates = [col for col in ['anon', 'file'] if col in kb4_rss.columns]

    kb2_changes = find_memory_changes(kb2_rss, kb2_memory_candidates)
    kb4_changes = find_memory_changes(kb4_rss, kb4_memory_candidates)

    print(f"Found {len(kb2_changes)} memory change points in 2MB data")
    print(f"Found {len(kb4_changes)} memory change points in 4KB data")

    print("Aligning memory changes with TLB measurements using merge_asof...")

    def align_with_tlb(changes_df, dtlb_df, itlb_df):
        if changes_df is None or changes_df.empty:
            return changes_df

        result = changes_df.copy()

        if dtlb_df is not None:
            dtlb_df = dtlb_df.sort_values('time_sec_normalized')
            dtlb_columns = [col for col in dtlb_df.columns
                           if 'time_sec' not in col and ('miss' in col.lower() or 'rate' in col.lower() or 'avg' in col.lower())]

            result = pd.merge_asof(
                result,
                dtlb_df[['time_sec_normalized'] + dtlb_columns],
                on='time_sec_normalized',
                direction='backward',
                suffixes=("", "_dtlb")
            )

        if itlb_df is not None:
            itlb_df = itlb_df.sort_values('time_sec_normalized')
            itlb_columns = [col for col in itlb_df.columns
                           if 'time_sec' not in col and ('miss' in col.lower() or 'rate' in col.lower() or 'avg' in col.lower())]

            result = pd.merge_asof(
                result,
                itlb_df[['time_sec_normalized'] + itlb_columns],
                on='time_sec_normalized',
                direction='backward',
                suffixes=("", "_itlb")
            )

        return result

    kb2_df = align_with_tlb(kb2_changes, kb2_dtlb, kb2_itlb)
    kb4_df = align_with_tlb(kb4_changes, kb4_dtlb, kb4_itlb)

    print(f"Created 2MB integrated dataset with {len(kb2_df)} rows and {len(kb2_df.columns)} columns")
    print(f"Created 4KB integrated dataset with {len(kb4_df)} rows and {len(kb4_df.columns)} columns")

    kb2_memory_metrics = kb2_df['memory_metric'].unique().tolist() if not kb2_df.empty else []
    kb4_memory_metrics = kb4_df['memory_metric'].unique().tolist() if not kb4_df.empty else []

    print(f"2MB memory metrics available: {kb2_memory_metrics}")
    print(f"4KB memory metrics available: {kb4_memory_metrics}")

    return {
        'kb2_df': kb2_df,
        'kb4_df': kb4_df,
        'kb2_memory_metrics': kb2_memory_metrics,
        'kb4_memory_metrics': kb4_memory_metrics
    }


In [8]:
def create_regular_intervals(kb2_df, kb4_df, n_bins=100):
    """
    Create regular time intervals for both datasets to align them on a
    common time scale regardless of actual times.
    """
    print(f"Creating {n_bins} regular time intervals...")

    if kb2_df.empty or kb4_df.empty:
        print("Warning: One or both datasets are empty")
        return kb2_df, kb4_df, None

    # Get the range of normalized timestamps for both datasets
    kb2_max_time = kb2_df['time_sec_normalized'].max()
    kb4_max_time = kb4_df['time_sec_normalized'].max()

    print(f"2MB max normalized time: {kb2_max_time}")
    print(f"4KB max normalized time: {kb4_max_time}")

    # Create bins that cover both datasets
    max_time = max(kb2_max_time, kb4_max_time)
    bin_edges = np.linspace(0, max_time, n_bins + 1)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

    # Assign bins to each dataset
    kb2_df['time_bin'] = pd.cut(kb2_df['time_sec_normalized'], bins=bin_edges, labels=False)
    kb4_df['time_bin'] = pd.cut(kb4_df['time_sec_normalized'], bins=bin_edges, labels=False)

    # Filter out NaN bins (points outside bin range)
    kb2_valid = kb2_df.dropna(subset=['time_bin'])
    kb4_valid = kb4_df.dropna(subset=['time_bin'])

    print(f"Valid 2MB points after binning: {len(kb2_valid)}/{len(kb2_df)}")
    print(f"Valid 4KB points after binning: {len(kb4_valid)}/{len(kb4_df)}")

    return kb2_valid, kb4_valid, bin_centers


In [9]:
def prepare_cross_config_data(integrated_data, target_metric=None, window_size=1.0, step_size=0.5):
    """
    Prepare data for training a cross-configuration model using a sliding window approach.
    
    Uses the integrated 4KB data filtered by the specified memory metric as the target
    and the 2MB integrated data as features. Also extracts 2MB RSS for visualization.
    
    Args:
        integrated_data: Dictionary with integrated datasets.
        target_metric: Memory metric to use as target (e.g., 'anon_mb').
        window_size: Size of the sliding window in seconds.
        step_size: Step size for the sliding window in seconds.
    
    Returns:
        DataFrame with aligned data ready for sequence preparation.
    """
    kb2_df = integrated_data.get('kb2_df')
    kb4_df = integrated_data.get('kb4_df')
    
    if kb2_df is None or kb2_df.empty or kb4_df is None or kb4_df.empty:
        raise ValueError("Both 2MB and 4KB integrated data are required and must not be empty")
    
    available_metrics = kb4_df['memory_metric'].unique()
    if target_metric is None:
        target_metric = available_metrics[0] if len(available_metrics) > 0 else None
        print(f"No target metric specified. Using first available: {target_metric}")
    else:
        if target_metric not in available_metrics:
            raise ValueError(f"Target metric {target_metric} not found in 4KB data. Available: {available_metrics}")
    
    print(f"Using memory metric {target_metric} as target column")
    kb4_targets = kb4_df[kb4_df['memory_metric'] == target_metric][['time_sec_normalized', 'memory_value']].copy()
    if kb4_targets.empty:
        raise ValueError(f"No rows found for target metric {target_metric} in 4KB data")
    kb4_targets = kb4_targets.rename(columns={'memory_value': '4kb_target'})
    print(f"Extracted {len(kb4_targets)} target rows from 4KB data")
    
    # Extract 2MB RSS for visualization (same metric as target)
    kb2_rss = kb2_df[kb2_df['memory_metric'] == target_metric][['time_sec_normalized', 'memory_value']].copy()
    kb2_rss = kb2_rss.rename(columns={'memory_value': '2mb_rss'})
    print(f"Extracted {len(kb2_rss)} 2MB RSS rows for metric {target_metric}")
    
    # Sort both datasets by time
    kb4_targets = kb4_targets.sort_values('time_sec_normalized')
    kb2_df = kb2_df.sort_values('time_sec_normalized')
    kb2_rss = kb2_rss.sort_values('time_sec_normalized')
    
    # Define sliding window parameters
    min_time = min(kb4_targets['time_sec_normalized'].min(), kb2_df['time_sec_normalized'].min())
    max_time = max(kb4_targets['time_sec_normalized'].max(), kb2_df['time_sec_normalized'].max())
    if pd.isna(min_time) or pd.isna(max_time):
        raise ValueError("Invalid time range detected in datasets")
    window_starts = np.arange(min_time, max_time - window_size + step_size, step_size)
    
    print(f"Creating sliding windows: window_size={window_size}s, step_size={step_size}s")
    print(f"Time range: {min_time} to {max_time}, {len(window_starts)} windows")
    
    # Initialize lists to store windowed data
    window_data = []
    kb2_features = [col for col in kb2_df.columns 
                   if col not in ['time_sec_normalized', 'memory_metric', 'memory_value']]
    
    for start in window_starts:
        end = start + window_size
        # Filter 2MB data within the window
        kb2_window = kb2_df[(kb2_df['time_sec_normalized'] >= start) & 
                           (kb2_df['time_sec_normalized'] < end)]
        # Filter 4KB target within the window
        kb4_window = kb4_targets[(kb4_targets['time_sec_normalized'] >= start) & 
                                (kb4_targets['time_sec_normalized'] < end)]
        # Filter 2MB RSS within the window
        kb2_rss_window = kb2_rss[(kb2_rss['time_sec_normalized'] >= start) & 
                                (kb2_rss['time_sec_normalized'] < end)]
        
        # Aggregate 2MB features (mean within the window)
        if not kb2_window.empty:
            kb2_agg = kb2_window[kb2_features].mean().to_dict()
        else:
            # If no 2MB data, fill with NaN
            kb2_agg = {col: np.nan for col in kb2_features}
        
        # Get 4KB target (use the last value in the window if available)
        if not kb4_window.empty:
            kb4_target = kb4_window['4kb_target'].iloc[-1]
        else:
            kb4_target = np.nan
        
        # Get 2MB RSS (use the last value in the window if available)
        if not kb2_rss_window.empty:
            kb2_rss_value = kb2_rss_window['2mb_rss'].iloc[-1]
        else:
            kb2_rss_value = np.nan
        
        # Store the window data
        window_data.append({
            'window_start': start,
            'window_end': end,
            'time_sec': (start + end) / 2,  # Use midpoint for reference
            '4kb_target': kb4_target,
            '2mb_rss': kb2_rss_value,
            **kb2_agg
        })
    
    # Convert to DataFrame
    cross_config_df = pd.DataFrame(window_data)
    
    # Drop rows where the target is missing
    initial_rows = len(cross_config_df)
    cross_config_df = cross_config_df.dropna(subset=['4kb_target'])
    print(f"Dropped {initial_rows - len(cross_config_df)} rows with missing 4KB targets")
    
    if cross_config_df.empty:
        raise ValueError("No valid data remains after dropping rows with missing 4KB targets")
    
    # Check for NaN values in features
    nan_counts = cross_config_df[kb2_features].isna().sum()
    print(f"NaN counts in features:\n{nan_counts}")
    cross_config_df[kb2_features] = cross_config_df[kb2_features].fillna(0)  # Fill NaNs with 0 for features only
    
    print(f"Final cross-configuration dataset has {len(cross_config_df)} rows and {len(cross_config_df.columns)} columns")
    
    return cross_config_df

In [10]:
def augment_sequences(X, y, noise_level=0.05, n_augmentations=3):
    """
    Augment training data with slightly noisy versions to increase dataset size.

    Args:
        X: Input sequences
        y: Target values
        noise_level: Standard deviation of noise to add
        n_augmentations: Number of augmented sequences to create per original

    Returns:
        Augmented X and y arrays
    """
    print(f"Augmenting {len(X)} sequences with {n_augmentations} variations (noise level: {noise_level})...")
    X_aug = X.copy()
    y_aug = y.copy()

    # Add small random noise to create new samples
    for i in range(n_augmentations):
        noise = np.random.normal(0, noise_level, X.shape)
        X_noisy = X + noise
        X_aug = np.vstack([X_aug, X_noisy])
        y_aug = np.vstack([y_aug, y])

    print(f"After augmentation: {len(X_aug)} sequences")
    return X_aug, y_aug

In [11]:
def prepare_lstm_sequences(df, target_col, feature_cols, seq_length=10):
    """
    Prepare sequences for the LSTM model.

    Args:
        df: DataFrame with aligned data.
        target_col: Target column name.
        feature_cols: List of feature column names.
        seq_length: Sequence length.

    Returns:
        X, y, scalers, valid_feature_cols.
    """
    print(f"Preparing sequences with target: {target_col}")
    print(f"Using {len(feature_cols)} features")
    valid_feature_cols = []
    for col in feature_cols:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            if df[col].std() > 0:
                valid_feature_cols.append(col)
            else:
                print(f"Skipping constant column: {col}")
    print(f"Using {len(valid_feature_cols)} valid features after filtering")

    if not valid_feature_cols:
        raise ValueError("No valid feature columns with non-zero variance")

    scalers = {}
    scaled_data = {}
    for col in valid_feature_cols + [target_col]:
        try:
            scaler = MinMaxScaler()
            data = df[col].values.reshape(-1, 1)
            if np.all(np.isnan(data)):
                raise ValueError(f"Column {col} contains only NaN values")
            scaled_data[col] = scaler.fit_transform(data)
            scalers[col] = scaler
        except Exception as e:
            print(f"Error scaling {col}: {e}")
            if col in valid_feature_cols:
                valid_feature_cols.remove(col)

    if not scalers:
        raise ValueError("No columns could be scaled successfully")

    X, y = [], []
    for i in range(len(df) - seq_length):
        features_seq = []
        for col in valid_feature_cols:
            features_seq.append(scaled_data[col][i:i+seq_length])
        X.append(np.hstack(features_seq))
        y.append(scaled_data[target_col][i+seq_length])

    X, y = np.array(X), np.array(y)
    X = X.reshape((X.shape[0], seq_length, len(valid_feature_cols)))
    print(f"Created {len(X)} sequences with shape {X.shape}")

    # Check for NaN values in sequences
    if np.any(np.isnan(X)) or np.any(np.isnan(y)):
        raise ValueError("NaN values detected in prepared sequences")

    return X, y, scalers, valid_feature_cols


In [12]:
def build_cross_page_model(input_shape):
    """
    Build a model to predict 4KB memory usage from 2MB metrics.

    Args:
        input_shape: Tuple with input shape (sequence_length, n_features).

    Returns:
        A compiled Keras model.
    """
    print(f"Building model with input shape: {input_shape}")

    # Input layer
    inputs = Input(shape=input_shape)

    # LSTM layers with attention
    lstm1 = LSTM(64, activation='relu', return_sequences=True)(inputs)
    lstm1 = Dropout(0.2)(lstm1)

    lstm2 = LSTM(32, activation='relu', return_sequences=True)(lstm1)
    lstm2 = Dropout(0.2)(lstm2)

    # Self-attention mechanism
    attention = Attention()([lstm2, lstm2])

    # Final LSTM layer
    lstm_out = LSTM(16, activation='relu')(attention)
    lstm_out = Dropout(0.2)(lstm_out)

    # Output layer
    outputs = Dense(1)(lstm_out)

    # Create and compile model
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')
    model.summary()

    return model


In [13]:
def cross_validate_model(X, y, input_shape, n_splits=5, epochs=30, batch_size=16):
    """
    Perform time-series cross-validation for the model.

    Args:
        X: Input sequences
        y: Target values
        input_shape: Shape for model input
        n_splits: Number of cross-validation folds
        epochs: Training epochs per fold
        batch_size: Batch size for training

    Returns:
        List of metrics for each fold and the final trained model
    """
    print(f"Performing {n_splits}-fold time series cross-validation...")

    tscv = TimeSeriesSplit(n_splits=n_splits)
    fold_metrics = []
    best_model = None
    best_mse = float('inf')

    for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
        print(f"\nFold {fold+1}/{n_splits}")
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Add data augmentation for training set
        X_train_aug, y_train_aug = augment_sequences(X_train, y_train, noise_level=0.03)

        # Build and train model
        model = build_cross_page_model(input_shape)
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', patience=10, restore_best_weights=True
        )

        history = model.fit(
            X_train_aug, y_train_aug,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )

        # Evaluate on test set
        y_pred = model.predict(X_test)
        if np.any(np.isnan(y_pred)):
            print(f"Warning: NaN predictions in fold {fold+1}")
            continue
        mse = np.mean(np.square(y_pred - y_test))
        rmse = np.sqrt(mse)

        print(f"Fold {fold+1} - MSE: {mse:.4f}, RMSE: {rmse:.4f}")
        fold_metrics.append({
            'fold': fold+1,
            'mse': mse,
            'rmse': rmse,
            'train_samples': len(X_train_aug),
            'test_samples': len(X_test)
        })

        # Save the best model
        if mse < best_mse:
            best_mse = mse
            best_model = model

    if not fold_metrics:
        raise ValueError("No valid folds completed due to prediction issues")

    # Calculate average metrics
    avg_mse = np.mean([m['mse'] for m in fold_metrics])
    avg_rmse = np.mean([m['rmse'] for m in fold_metrics])
    print(f"\nAverage MSE across folds: {avg_mse:.4f}")
    print(f"Average RMSE across folds: {avg_rmse:.4f}")

    return fold_metrics, best_model



In [14]:
def visualize_predictions(actual_values, predicted_values, kb2_rss_values, time_steps=None, save_path='output/prediction_comparison.png'):
    """
    Create detailed visualizations comparing predicted vs actual 4KB memory usage, including 2MB RSS.

    Args:
        actual_values: Array of actual 4KB memory values
        predicted_values: Array of predicted 4KB memory values
        kb2_rss_values: Array of 2MB RSS values
        time_steps: Optional array of time points for x-axis (if None, will use indices)
        save_path: Path to save the visualization
    """
    # Make sure inputs are numpy arrays
    actual = np.array(actual_values).flatten()
    predicted = np.array(predicted_values).flatten()
    kb2_rss = np.array(kb2_rss_values).flatten()

    if np.all(np.isnan(actual)) or np.all(np.isnan(predicted)):
        print(f"Warning: All actual or predicted 4KB values are NaN. Skipping visualization at {save_path}")
        return {
            'mse': np.nan,
            'rmse': np.nan,
            'mae': np.nan,
            'mape': np.nan,
            'r2': np.nan,
            'correlation': np.nan
        }

    # Create x-axis (either time steps or indices)
    x_values = time_steps if time_steps is not None else np.arange(len(actual))
    x_label = 'Time (seconds)' if time_steps is not None else 'Sample Index'

    # Calculate error metrics
    error = actual - predicted
    mse = np.nanmean(np.square(error))
    rmse = np.sqrt(mse) if not np.isnan(mse) else np.nan
    mae = np.nanmean(np.abs(error))
    mape = np.nanmean(np.abs(error / (actual + 1e-10))) * 100
    r2 = 1 - (np.nansum(np.square(error)) / np.nansum(np.square(actual - np.nanmean(actual)))) if not np.all(np.isnan(actual)) else np.nan
    corr = np.corrcoef(actual, predicted)[0, 1] if not (np.all(np.isnan(actual)) or np.all(np.isnan(predicted))) else np.nan

    # Create figure with multiple subplots
    plt.figure(figsize=(20, 16))

    # 1. Main comparison plot
    plt.subplot(3, 2, (1, 2))
    plt.plot(x_values, actual, 'b-', linewidth=2, label='Actual 4KB Memory')
    plt.plot(x_values, predicted, 'r--', linewidth=2, label='Predicted 4KB Memory')
    plt.plot(x_values, kb2_rss, 'g-.', linewidth=2, label='Actual 2MB Memory')
    plt.title('Memory Usage Comparison: 4KB (Actual vs Predicted) and 2MB', fontsize=16)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel('Memory Usage (MB)', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend(fontsize=12)

    # Add metrics as text annotation
    metrics_text = f"MSE: {mse:.4f}\nRMSE: {rmse:.4f}\nMAE: {mae:.4f}\nMAPE: {mape:.2f}%\nR²: {r2:.4f}"
    plt.annotate(metrics_text, xy=(0.02, 0.85), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8),
                 fontsize=12)

    # 2. Error plot
    plt.subplot(3, 2, 3)
    plt.bar(x_values, error, color='purple', alpha=0.7)
    plt.axhline(y=0, color='black', linestyle='-')
    plt.title('Prediction Error (Actual - Predicted 4KB)', fontsize=14)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel('Error (MB)', fontsize=12)
    plt.grid(True, alpha=0.3)

    # 3. Scatter plot
    plt.subplot(3, 2, 4)
    plt.scatter(actual, predicted, alpha=0.7, c='blue')
    # Add scatter for 2MB vs 4KB actual
    plt.scatter(kb2_rss, actual, alpha=0.7, c='green', marker='^', label='2MB vs 4KB Actual')

    # Add perfect prediction line
    valid_actual = actual[~np.isnan(actual) & ~np.isnan(predicted)]
    valid_predicted = predicted[~np.isnan(actual) & ~np.isnan(predicted)]
    if len(valid_actual) > 0:
        min_val = min(np.min(valid_actual), np.min(valid_predicted))
        max_val = max(np.max(valid_actual), np.max(valid_predicted))
        plt.plot([min_val, max_val], [min_val, max_val], 'r--')

    plt.title('Predicted vs Actual 4KB Memory Usage', fontsize=14)
    plt.xlabel('Actual 4KB Memory (MB)', fontsize=12)
    plt.ylabel('Predicted 4KB Memory (MB)', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.legend()

    # Add correlation annotation
    plt.annotate(f"Correlation: {corr:.4f}", xy=(0.02, 0.95), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
                 fontsize=12)

    # 4. Relative error histogram
    plt.subplot(3, 2, 5)
    rel_error = (error / (actual + 1e-10)) * 100  # Percentage error
    valid_rel_error = rel_error[~np.isnan(rel_error)]
    if len(valid_rel_error) > 0:
        plt.hist(valid_rel_error, bins=20, alpha=0.7, color='green')
    else:
        plt.text(0.5, 0.5, 'No valid data for histogram', horizontalalignment='center', verticalalignment='center')
    plt.title('Distribution of Percentage Error (4KB)', fontsize=14)
    plt.xlabel('Percentage Error (%)', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.grid(True, alpha=0.3)

    # 5. Cumulative error
    plt.subplot(3, 2, 6)
    cum_error = np.nancumsum(error)
    plt.plot(x_values, cum_error, 'g-', linewidth=2)
    plt.axhline(y=0, color='black', linestyle='-')
    plt.title('Cumulative Error Over Time (4KB)', fontsize=14)
    plt.xlabel(x_label, fontsize=12)
    plt.ylabel('Cumulative Error (MB)', fontsize=12)
    plt.grid(True, alpha=0.3)

    # Add overall title and adjust layout
    plt.suptitle('4KB Memory Prediction Analysis with 2MB RSS', fontsize=20)
    plt.tight_layout(rect=[0, 0, 1, 0.97])

    # Save the figure
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Visualization saved to {save_path}")

    return {
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mape': mape,
        'r2': r2,
        'correlation': corr
    }


In [15]:
def feature_importance_analysis(model, X_test, y_test, feature_names, target_scaler):
    """
    Analyze feature importance by permutation and feature ablation.

    Args:
        model: Trained model.
        X_test: Test features.
        y_test: Test targets.
        feature_names: List of feature names.
        target_scaler: Scaler for the target.

    Returns:
        DataFrame with feature importance scores.
    """
    print("Analyzing feature importance...")

    # Get baseline performance
    baseline_pred = model.predict(X_test)
    if np.all(np.isnan(baseline_pred)):
        print("Warning: All baseline predictions are NaN. Skipping feature importance analysis.")
        return pd.DataFrame()

    baseline_mse = np.mean(np.square(
        target_scaler.inverse_transform(y_test.reshape(-1, 1)) -
        target_scaler.inverse_transform(baseline_pred)
    ))
    print(f"Baseline MSE: {baseline_mse:.4f}")

    # Permutation importance
    importances = []
    for i in range(X_test.shape[2]):
        # Permute one feature at a time
        X_permuted = X_test.copy()
        X_permuted[:, :, i] = np.random.permutation(X_permuted[:, :, i])

        # Get predictions with permuted feature
        permuted_pred = model.predict(X_permuted)

        # Calculate MSE with permuted feature
        permuted_mse = np.mean(np.square(
            target_scaler.inverse_transform(y_test.reshape(-1, 1)) -
            target_scaler.inverse_transform(permuted_pred)
        ))

        # Calculate importance as increase in error
        importance = permuted_mse - baseline_mse
        relative_importance = (permuted_mse / baseline_mse) - 1  # % increase in error

        importances.append({
            'feature': feature_names[i],
            'importance': importance,
            'relative_importance': relative_importance * 100,  # as percentage
            'permuted_mse': permuted_mse
        })

    # Convert to DataFrame and sort
    importance_df = pd.DataFrame(importances).sort_values('importance', ascending=False)

    # Visualize feature importance
    plt.figure(figsize=(12, 8))
    plt.barh(importance_df['feature'], importance_df['relative_importance'])
    plt.title('Feature Importance (% Increase in Error When Feature is Permuted)')
    plt.xlabel('% Increase in MSE')
    plt.ylabel('Feature')
    plt.tight_layout()
    plt.savefig('output/feature_importance.png')
    plt.close()
    print("Feature importance plot saved to output/feature_importance.png")

    # Feature correlation analysis
    original_predictions = baseline_pred.flatten()

    # For each pair of features, analyze correlation in prediction impact
    feature_impact_corr = np.zeros((len(feature_names), len(feature_names)))

    for i in range(X_test.shape[2]):
        X_i = X_test.copy()
        X_i[:, :, i] = np.random.permutation(X_i[:, :, i])
        pred_i = model.predict(X_i).flatten()
        impact_i = pred_i - original_predictions

        for j in range(i+1, X_test.shape[2]):
            X_j = X_test.copy()
            X_j[:, :, j] = np.random.permutation(X_j[:, :, j])
            pred_j = model.predict(X_j).flatten()
            impact_j = pred_j - original_predictions

            # Correlation between impact of permuting feature i and feature j
            correlation = np.corrcoef(impact_i, impact_j)[0, 1]
            feature_impact_corr[i, j] = correlation
            feature_impact_corr[j, i] = correlation

    # Set diagonal to 1.0
    np.fill_diagonal(feature_impact_corr, 1.0)

    # Visualize feature impact correlations
    plt.figure(figsize=(12, 10))
    plt.imshow(feature_impact_corr, cmap='coolwarm', vmin=-1, vmax=1)
    plt.colorbar(label='Correlation')
    plt.title('Feature Impact Correlation Matrix')
    plt.xticks(range(len(feature_names)), feature_names, rotation=90)
    plt.yticks(range(len(feature_names)), feature_names)
    plt.tight_layout()
    plt.savefig('output/feature_impact_correlation.png')
    plt.close()
    print("Feature impact correlation plot saved to feature_impact_correlation.png")

    return importance_df


In [16]:
def main():
    """Main function to run the cross-page prediction model."""
    try:

        # Add this at the beginning of your script
        base_dir = os.path.expanduser("~/KernMLOps")

        kb2_id = "2d801a0a-e668-4398-b7c3-6332c90607be"
        # Then update your paths
        kb2_paths = {
            'rss': os.path.join(base_dir, f"data/curated/gap/{kb2_id}/mm_rss_stat.end.parquet"),
            'dtlb': os.path.join(base_dir, f"data/curated/gap/{kb2_id}/dtlb_misses.end.parquet"),
            'itlb': os.path.join(base_dir, f"data/curated/gap/{kb2_id}/itlb_misses.end.parquet")
        }

        kb4_id = "57f484a8-a7a4-48a3-969b-ba86f524065a"
        kb4_paths = {
            'rss': os.path.join(base_dir, f"data/curated/gap/{kb4_id}/mm_rss_stat.end.parquet"),
            'dtlb': os.path.join(base_dir, f"data/curated/gap/{kb4_id}/dtlb_misses.end.parquet"),
            'itlb': os.path.join(base_dir, f"data/curated/gap/{kb4_id}/itlb_misses.end.parquet")
        }
        
        # Create output directory if it doesn't exist
        os.makedirs('output', exist_ok=True)
        
        print("="*50)
        print("CROSS-PAGE MEMORY PREDICTION MODEL (2MB to 4KB)")
        print("="*50)
        
        print("\n=== Loading 2MB Page Data ===")
        kb2_data = {}
        kb2_data['rss'] = load_parquet_data(kb2_paths['rss'], "2MB RSS")
        kb2_data['dtlb'] = load_parquet_data(kb2_paths['dtlb'], "2MB DTLB")
        kb2_data['itlb'] = load_parquet_data(kb2_paths['itlb'], "2MB ITLB")
        
        if kb2_data['rss'] is not None:
            kb2_data['rss'], kb2_rss_cols = process_rss_data(kb2_data['rss'], page_size_mb=0.004)
        else:
            raise ValueError("Failed to load 2MB RSS data")
        if kb2_data['dtlb'] is not None:
            kb2_data['dtlb'], kb2_dtlb_cols = process_tlb_data(kb2_data['dtlb'], "2MB DTLB")
        if kb2_data['itlb'] is not None:
            kb2_data['itlb'], kb2_itlb_cols = process_tlb_data(kb2_data['itlb'], "2MB ITLB")
        
        print("\n=== Loading 4KB Page Data ===")
        kb4_data = {}
        kb4_data['rss'] = load_parquet_data(kb4_paths['rss'], "4KB RSS")
        kb4_data['dtlb'] = load_parquet_data(kb4_paths['dtlb'], "4KB DTLB")
        kb4_data['itlb'] = load_parquet_data(kb4_paths['itlb'], "4KB ITLB")
        
        if kb4_data['rss'] is not None:
            kb4_data['rss'], kb4_rss_cols = process_rss_data(kb4_data['rss'], page_size_mb=0.004)
        else:
            raise ValueError("Failed to load 4KB RSS data")
        if kb4_data['dtlb'] is not None:
            kb4_data['dtlb'], kb4_dtlb_cols = process_tlb_data(kb4_data['dtlb'], "4KB DTLB")
        if kb4_data['itlb'] is not None:
            kb4_data['itlb'], kb4_itlb_cols = process_tlb_data(kb4_data['itlb'], "4KB ITLB")
        
        print("\n=== Creating Integrated Dataset ===")
        integrated_data = create_memory_change_dataset(kb2_data, kb4_data)
        
        # Set desired target memory metric (default is usually "anon_mb")
        target_metric = "anon_mb"
        
        print(f"\n=== Preparing Cross-Configuration Data with target {target_metric} ===")
        # Use sliding window instead of binning
        cross_config_df = prepare_cross_config_data(
            integrated_data, 
            target_metric, 
            window_size=1.0,  # 1-second window
            step_size=0.5     # 0.5-second step
        )
        
        # Save the prepared dataset
        cross_config_df.to_csv('output/cross_config_dataset.csv', index=False)
        print("Prepared dataset saved to output/cross_config_dataset.csv")
        
        # Define target and feature columns
        target_column = '4kb_target'
        feature_columns = [col for col in cross_config_df.columns 
                          if col not in ['window_start', 'window_end', 'time_sec', target_column, '2mb_rss']]
        
        print("\n=== Preparing LSTM Sequences ===")
        seq_length = 5  # Adjust based on temporal dynamics
        X, y, scalers, valid_features = prepare_lstm_sequences(
            cross_config_df, target_column, feature_columns, seq_length=seq_length
        )
        if len(X) < 10:
            print(f"Not enough data for training. Only {len(X)} sequences created.")
            return None, None, None
        
        print("\n=== Splitting Data into Train/Test Sets ===")
        # Use time-based split (not random) to preserve temporal structure
        test_size = 0.2
        split_idx = int(len(X) * (1 - test_size))
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")
        
        print("\n=== Cross Validation ===")
        # Perform cross-validation
        cv_metrics, best_model = cross_validate_model(
            X_train, y_train, 
            input_shape=(X.shape[1], X.shape[2]),
            n_splits=min(5, len(X_train) // 10),  # Adjust based on data size
            epochs=50,
            batch_size=min(32, len(X_train) // 5)
        )
        
        # Save cross-validation results
        pd.DataFrame(cv_metrics).to_csv('output/cross_validation_results.csv', index=False)
        print("Cross-validation results saved to output/cross_validation_results.csv")
        
        # Save the best model with proper extension
        best_model.save('output/cross_page_prediction_model.keras')
        print("Best model saved to output/cross_page_prediction_model.keras")
        
        # Alternatively, save as H5 format
        best_model.save('output/cross_page_prediction_model.h5')
        print("Best model also saved in H5 format to output/cross_page_prediction_model.h5")
        
        print("\n=== Model Evaluation ===")
        # Generate predictions
        y_pred = best_model.predict(X_test)
        
        # Inverse transform to original scale
        y_test_orig = scalers[target_column].inverse_transform(y_test.reshape(-1, 1)).flatten()
        y_pred_orig = scalers[target_column].inverse_transform(y_pred).flatten()
        
        # Check for NaN predictions
        if np.all(np.isnan(y_test_orig)) or np.all(np.isnan(y_pred_orig)):
            print("Error: All test predictions or actual values are NaN. Skipping evaluation.")
            return None, None, None
        
        # Get 2MB RSS values for test set
        start_idx = len(X_train) + seq_length
        end_idx = start_idx + len(y_test)
        kb2_rss_test = cross_config_df['2mb_rss'].values[start_idx:end_idx]
        
        # Get time steps if available
        time_values = None
        if 'time_sec' in cross_config_df.columns:
            time_values = cross_config_df['time_sec'].values[start_idx:end_idx]
        
        # Visualize predictions
        metrics = visualize_predictions(
            actual_values=y_test_orig,
            predicted_values=y_pred_orig,
            kb2_rss_values=kb2_rss_test,
            time_steps=time_values,
            save_path='output/test_prediction_comparison.png'
        )
        
        # Save metrics
        with open('output/evaluation_metrics.txt', 'w') as f:
            for k, v in metrics.items():
                f.write(f"{k}: {v}\n")
        print("Evaluation metrics saved to output/evaluation_metrics.txt")
        
        print("\n=== Feature Importance Analysis ===")
        # Analyze feature importance
        importance_df = feature_importance_analysis(
            best_model, X_test, y_test, 
            valid_features, scalers[target_column]
        )
        
        # Save feature importance
        importance_df.to_csv('output/feature_importance.csv', index=False)
        print("Feature importance results saved to output/feature_importance.csv")
        
        print("\n=== Generating Full Prediction Comparison ===")
        all_X = np.vstack([X_train, X_test])
        all_y = np.vstack([y_train, y_test])
        
        # Get predictions for all data
        all_pred = best_model.predict(all_X)
        
        # Inverse transform to original scale
        all_y_orig = scalers[target_column].inverse_transform(all_y)
        all_pred_orig = scalers[target_column].inverse_transform(all_pred)
        
        # Get 2MB RSS values for full dataset
        kb2_rss_full = cross_config_df['2mb_rss'].values[seq_length:seq_length+len(all_y_orig)]
        
        # Create time array
        if 'time_sec' in cross_config_df.columns:
            time_values = cross_config_df['time_sec'].values[seq_length:seq_length+len(all_y_orig)]
        else:
            time_values = None
        
        # Visualize full comparison with enhanced details
        metrics_full = visualize_predictions(
            actual_values=all_y_orig,
            predicted_values=all_pred_orig,
            kb2_rss_values=kb2_rss_full,
            time_steps=time_values,
            save_path='output/full_prediction_comparison.png'
        )
        print("Full prediction comparison saved to output/full_prediction_comparison.png")
        
        print("\n=== Analysis Complete ===")
        print(f"Model achieved RMSE of {metrics['rmse']:.4f} ({metrics['mape']:.2f}% MAPE)")
        print(f"Top 3 important features: {', '.join(importance_df['feature'].head(3).tolist())}")
        
        return best_model, metrics, importance_df

    except Exception as e:
        print(f"Error in main function: {e}")
        import traceback
        traceback.print_exc()
        return None, None, None
if __name__ == "__main__":
    model, metrics, importance = main()

CROSS-PAGE MEMORY PREDICTION MODEL (2MB to 4KB)

=== Loading 2MB Page Data ===
Loading 2MB RSS data from: /home/anish/KernMLOps/data/curated/gap/2d801a0a-e668-4398-b7c3-6332c90607be/mm_rss_stat.end.parquet
2MB RSS data shape: (1341019, 6)
2MB RSS columns: ['pid', 'tgid', 'ts_ns', 'member', 'count', 'collection_id']
Converted column member from object to numeric
Converted column collection_id from object to numeric
Loading 2MB DTLB data from: /home/anish/KernMLOps/data/curated/gap/2d801a0a-e668-4398-b7c3-6332c90607be/dtlb_misses.end.parquet
2MB DTLB data shape: (933014, 8)
2MB DTLB columns: ['cpu', 'pid', 'tgid', 'ts_uptime_us', 'cumulative_dtlb_misses', 'pmu_enabled_time_us', 'pmu_running_time_us', 'collection_id']
Converted column collection_id from object to numeric
Loading 2MB ITLB data from: /home/anish/KernMLOps/data/curated/gap/2d801a0a-e668-4398-b7c3-6332c90607be/itlb_misses.end.parquet
2MB ITLB data shape: (7507, 8)
2MB ITLB columns: ['cpu', 'pid', 'tgid', 'ts_uptime_us', 'cumu

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tlb_df[f'{tlb_type}_misses_rate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tlb_df[f'{tlb_type}_misses_rate'].fillna(0, inplace=True)


4KB RSS data shape: (1511096, 6)
4KB RSS columns: ['pid', 'tgid', 'ts_ns', 'member', 'count', 'collection_id']
Converted column member from object to numeric
Converted column collection_id from object to numeric
Loading 4KB DTLB data from: /home/anish/KernMLOps/data/curated/gap/57f484a8-a7a4-48a3-969b-ba86f524065a/dtlb_misses.end.parquet
4KB DTLB data shape: (830993, 8)
4KB DTLB columns: ['cpu', 'pid', 'tgid', 'ts_uptime_us', 'cumulative_dtlb_misses', 'pmu_enabled_time_us', 'pmu_running_time_us', 'collection_id']
Converted column collection_id from object to numeric
Loading 4KB ITLB data from: /home/anish/KernMLOps/data/curated/gap/57f484a8-a7a4-48a3-969b-ba86f524065a/itlb_misses.end.parquet
4KB ITLB data shape: (8888, 8)
4KB ITLB columns: ['cpu', 'pid', 'tgid', 'ts_uptime_us', 'cumulative_itlb_misses', 'pmu_enabled_time_us', 'pmu_running_time_us', 'collection_id']
Converted column collection_id from object to numeric
Converted timestamp from ns to sec
Normalized timestamps: min=245676

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tlb_df[f'{tlb_type}_misses_rate'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  tlb_df[f'{tlb_type}_misses_rate'].fillna(0, inplace=True)


  Removed 63 duplicates
  Found duplicate timestamps in time_sec_normalized; removing duplicates...
  Removed 23473 duplicates
  Found duplicate timestamps in time_sec_normalized; removing duplicates...
  Removed 35 duplicates
  Found duplicate timestamps in time_sec_normalized; removing duplicates...
  Removed 61 duplicates
  Found duplicate timestamps in time_sec_normalized; removing duplicates...
  Removed 18669 duplicates
  Found duplicate timestamps in time_sec_normalized; removing duplicates...
  Removed 10 duplicates
Identifying memory change points...
Found 0 memory change points in 2MB data
Found 0 memory change points in 4KB data
Aligning memory changes with TLB measurements using merge_asof...
Created 2MB integrated dataset with 0 rows and 0 columns
Created 4KB integrated dataset with 0 rows and 0 columns
2MB memory metrics available: []
4KB memory metrics available: []

=== Preparing Cross-Configuration Data with target anon_mb ===
Error in main function: Both 2MB and 4KB i

Traceback (most recent call last):
  File "/tmp/ipykernel_1211236/1745304503.py", line 68, in main
    cross_config_df = prepare_cross_config_data(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1211236/3606396847.py", line 21, in prepare_cross_config_data
    raise ValueError("Both 2MB and 4KB integrated data are required and must not be empty")
ValueError: Both 2MB and 4KB integrated data are required and must not be empty
