In [12]:
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Directories
input_dir = "../filtered_workloads_1s_stats_max_waittime"

# List CSV files in the directory
csv_files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith(".csv")]

# Initialize a list to store workload summaries
workload_data = []

# Read and aggregate data from all CSV files
for file in csv_files:
    df = pd.read_csv(file)
    # Extract metrics for the workload
    mean_power_draw = df.loc[df["column_name"] == "power_draw_W", "mean"].values[0]
    total_length_seconds = df.loc[df["column_name"] == "power_draw_W", "total_length_seconds"].values[0]
    max_waittime = df.loc[df["column_name"] == "power_draw_W", "max_waittime"].values[0]
    
    # Skip samples where max_waittime <= 0
    if max_waittime <= 0:
        continue
    
    # Calculate energy (mean_power_draw * total_length_seconds)
    energy = mean_power_draw * total_length_seconds
    
    workload_summary = {
        "workload": os.path.basename(file),  # File name as workload identifier
        "energy": energy,
        "max_waittime": max_waittime,
        "total_length_seconds": total_length_seconds,
    }
    workload_data.append(workload_summary)

# Convert to DataFrame
workloads_df = pd.DataFrame(workload_data)

# Select features for clustering
features = ["energy", "max_waittime", "total_length_seconds"]

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(workloads_df[features])

# Perform k-means clustering
optimal_k = 3  # Number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
workloads_df["cluster"] = kmeans.fit_predict(scaled_features)

# Apply PCA to reduce dimensions to 2D
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(scaled_features)

# Scatter plot for clusters in PCA-reduced space
plt.figure(figsize=(10, 7))
for cluster in workloads_df["cluster"].unique():
    cluster_data = reduced_features[workloads_df["cluster"] == cluster]
    plt.scatter(
        cluster_data[:, 0],
        cluster_data[:, 1],
        label=f"Cluster {cluster}",
        alpha=0.7
    )

plt.title("Clusters of Workloads (PCA-Reduced Features)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.show()


KeyError: "None of [Index(['energy', 'max_waittime', 'total_length_seconds'], dtype='object')] are in the [columns]"