# In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import load_model
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt


# Load Pretrained VAE Model
# -------------------------------
def sampling(args):
    """Draw a latent vector from the VAE posterior (reparameterization step).

    Args:
        args: A pair ``(z_mean, z_log_var)`` of backend tensors. ``z_log_var``
            is used as a log-variance: it is halved and exponentiated to get
            the scale applied to the noise.

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * noise`` where ``noise`` is drawn
        with ``K.random_normal`` using the batch size and second dimension
        of ``z_mean``.
    """
    z_mean, z_log_var = args
    # Batch size is read dynamically; the latent width is read statically.
    n_rows = K.shape(z_mean)[0]
    latent_dim = K.int_shape(z_mean)[1]
    noise = K.random_normal(shape=(n_rows, latent_dim))
    std_dev = K.exp(0.5 * z_log_var)
    return z_mean + std_dev * noise

# Map the name 'sampling' to the function defined above so that load_model can
# resolve it when rebuilding the saved model.
custom_objects = {'sampling': sampling}
# `auto` is the loaded model used for prediction further down; the path is hard-coded.
auto = load_model('/home/model-6', custom_objects=custom_objects)


# Utility Functions
# -------------------------------
def print_clusters(df, cluster_labels):
    """Print the column names of ``df`` grouped by cluster label.

    Args:
        df: DataFrame whose columns were clustered.
        cluster_labels: Sequence of labels; entry ``i`` is the cluster of
            ``df.columns[i]``.

    Prints one line per cluster in the form ``Cluster <label>: a, b, c``.
    Clusters appear in the order their label is first seen, and columns keep
    their order within each cluster.

    Raises:
        IndexError: If there are more labels than columns in ``df``.
    """
    clusters = {}
    for idx, lbl in enumerate(cluster_labels):
        clusters.setdefault(lbl, []).append(df.columns[idx])
    for lbl, feats in clusters.items():
        # Column labels may be non-strings (e.g. integers); convert before
        # joining so the function does not raise TypeError on such frames.
        print(f"Cluster {lbl}: {', '.join(str(feat) for feat in feats)}")


def create_clustered_dataframes(df, cluster_labels):
    """Split ``df`` column-wise into one sub-DataFrame per cluster label.

    Args:
        df: Source DataFrame.
        cluster_labels: Sequence of labels paired positionally with
            ``df.columns`` (pairing stops at the shorter of the two).

    Returns:
        dict mapping each distinct label (as returned by ``np.unique``, so in
        sorted order) to ``df`` restricted to the columns carrying that label,
        in their original column order.
    """
    column_label_pairs = list(zip(df.columns, cluster_labels))
    return {
        label: df[[name for name, assigned in column_label_pairs if assigned == label]]
        for label in np.unique(cluster_labels)
    }


def create_sequences(clustered_dfs, seq_length):
    """Build overlapping fixed-length windows for every cluster DataFrame.

    Args:
        clustered_dfs: dict mapping a cluster label to a DataFrame whose rows
            are time steps.
        seq_length: Number of consecutive rows in each window.

    Returns:
        dict mapping each label to an array of shape
        ``(len(df) - seq_length, seq_length, n_features_cluster)``. Window
        ``i`` holds rows ``i .. i + seq_length - 1``; the window starting at
        row ``len(df) - seq_length`` is not produced.

    Raises:
        ValueError: If a DataFrame has ``seq_length`` rows or fewer, so that
            no window can be built.
    """
    sequences = {}
    for lbl, df in clustered_dfs.items():
        arr = df.values  # shape: (time_steps, n_features_cluster)
        n_windows = len(arr) - seq_length
        if n_windows <= 0:
            # np.stack on an empty list fails with an unhelpful message;
            # report which cluster is too short instead.
            raise ValueError(
                f"Cluster {lbl!r} has {len(arr)} rows; more than "
                f"seq_length={seq_length} rows are required to build a sequence."
            )
        sequences[lbl] = np.stack(
            [arr[start:start + seq_length] for start in range(n_windows)]
        )
    return sequences


def sequences_to_original(X_seq, seq_length):
    """Flatten windowed data back into a 2-D (row, feature) array.

    Args:
        X_seq: Array of shape ``(n_samples, window_len, n_feats)``.
        seq_length: Number of rows appended after the per-window rows; the
            last window must broadcast into that many rows.

    Returns:
        Float array of shape ``(n_samples + seq_length, n_feats)``. Row ``i``
        for ``i < n_samples`` is the first step of window ``i``; the remaining
        rows are a copy of the whole last window.

    NOTE(review): row ``n_samples`` therefore repeats the first step of the
    last window (already stored at row ``n_samples - 1``) — confirm this tail
    layout is intended.
    """
    n_windows, _, n_features = X_seq.shape
    flat = np.zeros((n_windows + seq_length, n_features))
    # First time step of every window, written in one slice assignment.
    flat[:n_windows] = X_seq[:, 0]
    # Tail is filled from the final window.
    flat[n_windows:] = X_seq[-1]
    return flat


# Prepare Data and Clustering
# -------------------------------
# NOTE: `train` and `test` must already be loaded pandas DataFrames with matching columns.
# The labels computed from `train` are applied to `test` by column position
# (create_clustered_dataframes zips columns with labels), so both frames must
# also share the same column order.

# Ensure feature names are strings
train.columns = train.columns.astype(str)
test.columns  = test.columns.astype(str)

# Number of feature clusters. `optimal_cluster` is not defined in this section;
# it must already exist in the session before this code runs.
n_clusters = optimal_cluster  # e.g., 4

# Compute correlation matrix on training set
corr_mat = train.corr()

# Cluster features based on correlation: each row of the correlation matrix is
# the vector that KMeans groups.
# NOTE(review): assumes `corr_mat` has one row per column of `train` and contains
# no NaN values — confirm `train` holds only numeric, non-constant columns.
clustering = KMeans(n_clusters=n_clusters, random_state=3).fit(corr_mat)
cluster_labels = clustering.labels_

print("Feature clusters (training data):")
print_clusters(train, cluster_labels)

# Create Clustered DataFrames & Sequences
# -------------------------------
# The same label vector splits both frames, so cluster k refers to the same
# column positions in train and test.
clustered_train_dfs = create_clustered_dataframes(train, cluster_labels)
clustered_test_dfs  = create_clustered_dataframes(test, cluster_labels)

# Sequence length
T = 14
# Seeds NumPy's global random generator.
# NOTE(review): nothing visible below draws from NumPy's RNG, and `sampling`
# uses the Keras backend RNG — confirm whether this seed has any effect.
np.random.seed(60)

# Build rolling sequences for each cluster
train_sequences = create_sequences(clustered_train_dfs, T)
test_sequences  = create_sequences(clustered_test_dfs, T)


# Feature Scoring
# -------------------------------
# For each column of `test` in turn: overwrite that column with its median,
# run the modified data through the model, and store the per-row mean absolute
# difference between the model output and the modified input.
# Initialize DataFrame to hold these scores per sample, per feature.
# NOTE(review): rows are labelled with test.index[T:], while entry i of `mad`
# below is computed from row i of the data (first step of window i) —
# confirm the intended alignment.
feature_scores = pd.DataFrame(index=test.index[T:], columns=test.columns)

for feature in test.columns:
    # Work on a copy so that only one feature is neutralized per iteration.
    test_variant = test.copy()
    median_value = test[feature].median()
    test_variant[feature] = median_value

    # Re-split and re-window the modified data with the training cluster labels.
    clustered_variant = create_clustered_dataframes(test_variant, cluster_labels)
    variant_sequences = create_sequences(clustered_variant, T)

    # One input array per cluster, in ascending label order.
    # NOTE(review): assumes the model's inputs were defined in this order — confirm.
    input_list = [variant_sequences[lbl] for lbl in sorted(variant_sequences.keys())]
    pred_seq = auto.predict(input_list, verbose=0)
    # NOTE(review): assumes `pred_seq` is a single 3-D array whose last axis
    # follows the concatenated cluster column order used below — confirm.
    pred_recon = sequences_to_original(pred_seq, T)

    # Reference values are the median-replaced data (not the untouched `test`),
    # with columns concatenated cluster by cluster in ascending label order.
    actual_concat = pd.concat(
        [clustered_variant[lbl] for lbl in sorted(clustered_variant.keys())],
        axis=1
    ).values

    # Only the first n_samples rows are compared, i.e. the rows that
    # sequences_to_original filled from the first step of each window.
    n_samples = pred_seq.shape[0]
    # Mean over the feature axis gives one score per row.
    mad = np.mean(np.abs(pred_recon[:n_samples] - actual_concat[:n_samples]), axis=1)

    feature_scores[feature] = mad

    print(f"Feature: {feature}, mean MAD (over all samples): {mad.mean():.6f}")


# Identify Top "Most Contributive" Feature per Time Step
# -------------------------------
# We compare each feature's score to the 'Loss_mae' column; features exceeding that are candidates.
# This requires `feature_scores` (and therefore `test`) to contain a 'Loss_mae'
# column; otherwise the lookup below raises KeyError.
max_value_cols = []
for idx, row in feature_scores.iterrows():
    main_val = row['Loss_mae']
    # Filter features whose score > main_val
    higher = {col: row[col] for col in feature_scores.columns if col != 'Loss_mae' and row[col] > main_val}
    # Candidate with the largest score (first one in column order on ties);
    # None when no feature exceeds main_val.
    max_col = max(higher, key=higher.get) if higher else None
    max_value_cols.append(max_col)

# Added after the loop so the new column is never treated as a feature above.
feature_scores['max_value_column'] = max_value_cols

# Save and Plot Results
# -------------------------------
# Writes all per-feature scores plus 'max_value_column' to a hard-coded path.
feature_scores.to_csv('/home/feature_score-6.csv')

# Plot the top 20 features by appearance in max_value_column
# NOTE(review): rows where no feature was selected hold None and are presumably
# dropped by value_counts' default missing-value handling — confirm.
value_counts = feature_scores['max_value_column'].value_counts().sort_values(ascending=False)
plt.figure(figsize=(10, 8))
# Reverse the top-20 slice so the most frequent feature is drawn last, which
# places it at the top of the horizontal bar chart.
plt.barh(
    value_counts[:20].index[::-1],
    value_counts[:20].values[::-1]
)
plt.xlabel("Counts")
plt.ylabel("Feature")
plt.title("Top 20 Features Appearing in 'max_value_column'")
plt.show()
