In [30]:
import os
import random
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import StandardScaler

# --- Set seeds and environment for reproducibility ---
# NOTE: Python reads PYTHONHASHSEED only at interpreter start-up, so assigning
# it here cannot change hashing in the already-running process. It is still
# exported so that any child process started from here inherits a fixed seed.
os.environ['PYTHONHASHSEED'] = '42'
# Legacy switch, kept for TensorFlow releases that still consult it.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# TensorFlow 2.8+ replaces TF_DETERMINISTIC_OPS with an explicit API call.
# The getattr guard keeps this cell runnable on older releases without it.
_enable_op_determinism = getattr(
    tf.config.experimental, 'enable_op_determinism', None
)
if _enable_op_determinism is not None:
    _enable_op_determinism()

# --- Read the CSV results file into a DataFrame ---
df = pd.read_csv('ABN_MetroExodus_0.results.1.dx12-g3_._test_wishGranted.csv')

# --- Keep only rows whose TestMarker equals 0.1, renumbered from zero ---
marker_matches = df['TestMarker'] == 0.1
filtered_df = df.loc[marker_matches].reset_index(drop=True)

# --- Columns used as model inputs (order defines the matrix column order) ---
features = [
    'GT Effective Freq',
    'GPU_Busy',
    'GTI_Busy',
    'PKG Reported Temp',
    'PKG Reported Power',
    'GTI Rd BW',
    'GTI Wr BW',
    'GTI Total BW',
]

# --- Extract the feature matrix and standardise it column-wise ---
X = filtered_df[features].to_numpy()
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# --- Autoencoder with one bottleneck half the input width (at least 1 unit) ---
input_dim = X_scaled.shape[1]
encoding_dim = max(input_dim // 2, 1)

input_layer = Input(shape=(input_dim,))
encoder = Dense(units=encoding_dim, activation="relu")(input_layer)
decoder = Dense(units=input_dim, activation="linear")(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(loss='mse', optimizer='adam')

# --- Fit the network to reproduce its own (scaled) input ---
training_options = dict(epochs=50, batch_size=32, shuffle=True, verbose=1)
autoencoder.fit(X_scaled, X_scaled, **training_options)

# --- Reconstruct the inputs and take the per-feature residuals ---
reconstructions = autoencoder.predict(X_scaled)
diff = X_scaled - reconstructions

# Column position of 'GT Effective Freq' inside the feature matrix
freq_idx = features.index('GT Effective Freq')

# Signed residual (input minus reconstruction) for that column, and its square
freq_diff = diff[:, freq_idx]
freq_recon_error = np.square(freq_diff)

# Rows above the 90th percentile of squared error count as anomalous
freq_threshold = np.percentile(freq_recon_error, 90)
exceeds_threshold = freq_recon_error > freq_threshold

# Negative residual -> dip, positive residual -> spike
dip_mask = exceeds_threshold & (freq_diff < 0)
spike_mask = exceeds_threshold & (freq_diff > 0)

# Helper function to get continuous index ranges
def get_ranges(indexes):
    """Collapse a collection of integer indexes into inclusive consecutive runs.

    The input is de-duplicated and sorted first, so order and repeats do not
    matter.

    Args:
        indexes: Array-like of integer indexes; may be empty.

    Returns:
        A list of ``(start, end)`` tuples, one per run of consecutive values,
        with both ends inclusive. An empty input yields an empty list.
    """
    ordered = np.sort(np.unique(indexes))
    spans = []
    run_start = None
    previous = None
    for value in ordered:
        if run_start is None:
            # First element opens the first run.
            run_start = value
        elif value != previous + 1:
            # Gap found: close the current run and open a new one.
            spans.append((run_start, previous))
            run_start = value
        previous = value
    if run_start is not None:
        # Close the run that was still open when the data ended.
        spans.append((run_start, previous))
    return spans

# np.where on a 1-D mask returns a one-element tuple; [0] extracts the actual
# array of row positions. Both masks are unpacked the same way so that
# dip_indexes and spike_indexes have the same type.
dip_indexes = np.where(dip_mask)[0]
spike_indexes = np.where(spike_mask)[0]

# Group the flagged row positions into inclusive consecutive runs
dip_ranges = get_ranges(dip_indexes)
spike_ranges = get_ranges(spike_indexes)

# Format ranges as "start-end" strings for printing
clean_dip_ranges = [f"{start}-{end}" for start, end in dip_ranges]
clean_spike_ranges = [f"{start}-{end}" for start, end in spike_ranges]

print("Dip ranges:", clean_dip_ranges)
print("Spike ranges:", clean_spike_ranges)

# --- Feature influence analysis ---
def feature_influence_per_range(df, feature_list, ranges, target_feature):
    """Rank the features that deviate most from their overall mean in each range.

    For every range, the mean of each feature over the range's rows is compared
    with that feature's mean over all rows of ``df`` and expressed in units of
    the feature's overall standard deviation (a z-score). The target feature is
    excluded and the three largest deviations by absolute value are reported.

    Args:
        df: DataFrame containing every column named in ``feature_list``.
        feature_list: Column names to analyse; must include ``target_feature``.
        ranges: Iterable of inclusive ``(start, end)`` row positions. They are
            interpreted positionally, so the result does not depend on the
            index labels of ``df``.
        target_feature: Column left out of the ranking.

    Returns:
        A DataFrame with columns ``"Range"`` and ``"Top Influencers"``, one row
        per input range. The columns are present even when ``ranges`` is empty.

    Raises:
        KeyError: If ``target_feature`` is not in ``feature_list`` (only when
            at least one range is processed).
    """
    data = df[feature_list]
    overall_mean = data.mean()
    overall_std = data.std()

    results = []
    for start, end in ranges:
        # Ranges are positional offsets with an inclusive end, hence iloc and
        # the +1 on the (exclusive) slice stop.
        subset = data.iloc[start:end + 1]

        z_scores = (subset.mean() - overall_mean) / overall_std

        # Exclude the target feature itself
        influencers = z_scores.drop(target_feature)

        # Top 3 influencers by absolute z-score, reported with their sign
        top_influencers = influencers.abs().sort_values(ascending=False).index[:3]
        top_vals = influencers.loc[top_influencers].round(2)

        results.append({
            "Range": f"{start}-{end}",
            "Top Influencers": ", ".join(
                f"{feat} ({val:+.2f} z)" for feat, val in top_vals.items()
            ),
        })

    # Explicit columns keep the schema stable when no ranges were supplied.
    return pd.DataFrame(results, columns=["Range", "Top Influencers"])

# Build one influence table per anomaly type, excluding the frequency column
dip_influencer_table = feature_influence_per_range(
    df=filtered_df,
    feature_list=features,
    ranges=dip_ranges,
    target_feature='GT Effective Freq',
)
spike_influencer_table = feature_influence_per_range(
    df=filtered_df,
    feature_list=features,
    ranges=spike_ranges,
    target_feature='GT Effective Freq',
)

# Bare expression so the notebook renders the spike table as the cell output
spike_influencer_table

Epoch 1/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.2320   
Epoch 2/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.1133 
Epoch 3/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.0247 
Epoch 4/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.9550 
Epoch 5/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.8938 
Epoch 6/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.8374 
Epoch 7/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7855 
Epoch 8/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7360 
Epoch 9/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6887 
Epoch 10/50
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.644

Unnamed: 0,Range,Top Influencers
0,0-42,"PKG Reported Temp (-2.78 z), PKG Reported Powe..."
1,48-49,"PKG Reported Temp (-2.44 z), GPU_Busy (-1.01 z..."
2,57-57,"PKG Reported Temp (-1.43 z), PKG Reported Powe..."
3,59-59,"PKG Reported Temp (-1.43 z), PKG Reported Powe..."
4,64-64,"PKG Reported Temp (-1.77 z), PKG Reported Powe..."
5,71-73,"PKG Reported Temp (-1.32 z), PKG Reported Powe..."
6,401-401,"GTI Rd BW (-0.95 z), GTI Total BW (-0.90 z), G..."
7,403-403,"GTI Rd BW (-0.69 z), GTI Total BW (-0.65 z), G..."
8,409-409,"GTI Rd BW (-0.88 z), GTI Total BW (-0.82 z), G..."
9,414-414,"GTI Rd BW (-0.93 z), GTI Total BW (-0.88 z), G..."
