In [None]:
# Step 1: Drop constant features.
# The mask is computed from the variance of X's columns but applied to nX.
# NOTE(review): presumably nX is a transformed copy of X with the same
# columns -- confirm. Because X is reassigned here, running this code a
# second time derives the mask from the already-filtered X.
X = nX.loc[:, X.var() != 0]

# Step 2: Turn feature-to-feature correlation into a distance.
# Each row of X.T is one feature, so corrcoef yields a feature-by-feature
# correlation matrix. Distance is 1 - correlation: 0 for perfectly
# correlated features, up to 2 for perfectly anti-correlated ones.
# Undefined (NaN) correlations are assigned distance 1.0.
feature_distances = np.corrcoef(X.T)
distance_matrix = np.nan_to_num(1 - feature_distances, nan=1.0)

# Step 3: Single-linkage (SLINK) hierarchical clustering via scipy.
# linkage() is given the strict upper triangle (k=1 skips the diagonal)
# flattened row by row, i.e. a condensed distance vector.
flattened_distance_matrix = distance_matrix[
    np.triu_indices_from(distance_matrix, k=1)
]
Z = linkage(flattened_distance_matrix, method='single')

# Step 4: Cut the hierarchy into flat clusters.
# With criterion='maxclust', num_clusters is the maximum number of clusters
# that fcluster may form.
num_clusters = 238
clusters = fcluster(Z, t=num_clusters, criterion='maxclust')

# Step 5: Aggregate features within each cluster
# fcluster(..., criterion='maxclust') forms AT MOST num_clusters flat
# clusters, not necessarily exactly that many. Looping over the labels that
# were actually assigned (instead of range(1, num_clusters + 1)) avoids
# emitting an all-NaN column for every label that has no member features.
clustered_features = {}  # Maps 'cluster_<label>' to the row-wise mean of that cluster's features
for cluster_num in np.unique(clusters).tolist():  # sorted labels, as plain Python ints
    feature_indices = np.where(clusters == cluster_num)[0]
    clustered_features[f'cluster_{cluster_num}'] = X.iloc[:, feature_indices].mean(axis=1)

# Build the DataFrame in one step rather than inserting columns one at a time
X_reduced = pd.DataFrame(clustered_features)

# Step 6 (optional): Visualise the hierarchy as a dendrogram.
# truncate_mode='lastp' with p=num_clusters condenses the tree so that only
# the last num_clusters merged groups are drawn as leaves.
plt.figure(figsize=(12, 8))
dendrogram(Z, p=num_clusters, truncate_mode='lastp')
plt.xlabel('Cluster')
plt.ylabel('Distance')
plt.title('Dendrogram of Feature Clustering using SLINK')
plt.show()

# Report how many aggregated feature columns X_reduced contains
print("Reduced DataFrame with", X_reduced.shape[1], "features")