In [None]:
# # Example: Conformational Landscape Analysis using KDE

# This notebook demonstrates how to analyze the conformational landscape sampled in an MD simulation using 2D Kernel Density Estimation (KDE) and minima finding. The KDE often serves as a proxy for the free energy landscape (high density implies low free energy), so locating the minima of that landscape helps identify stable conformational states.

# **Workflow:**
# 1. Import necessary libraries.
# 2. Load 2D data representing conformational coordinates. This can be:
#     *   Principal component projections (e.g., PC1 & PC2 from PCA).
#     *   Two key distances or dihedral angles relevant to the motion of interest.
# 3. Calculate the 2D KDE using `calculate_kde_2d`.
# 4. Find local minima in the KDE using `find_kde_minima_2d`.
# 5. Visualize the KDE landscape and the identified minima.
# 6. Find representative frames closest to each minimum using logic similar to `find_closest_frames_to_centroids`.
# 7. (Optional) Save the representative structures.


In [None]:
# Import necessary libraries
import md_analysis_tools # Our custom library
import MDAnalysis as mda
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # For potentially nicer plotting
import os
import sys # For checking errors

# Configure plotting style (optional).
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; older
# releases only ship 'seaborn-poster'. Pick whichever name this installation
# knows instead of letting plt.style.use() raise OSError and abort the setup cell.
poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster') if name in plt.style.available),
    None,
)
if poster_style is not None:
    plt.style.use(poster_style)
else:
    print("Warning: no seaborn poster style found in this Matplotlib; using the default style.", file=sys.stderr)

# Check if scikit-image is available (needed for minima finding).
# Only a warning is issued here: the minima-finding cell tests the same flag
# and skips itself, so the rest of the notebook can still run.
if not md_analysis_tools._SKIMAGE_AVAILABLE:
    print("Warning: scikit-image not found. KDE minima finding will not work.", file=sys.stderr)


In [None]:
# ## 1. Load/Prepare 2D Data for Landscape Analysis

# We need two variables (collective coordinates). Common choices include PC1/PC2 from PCA or key distances/dihedrals.

# **ACTION:** Choose **ONE** data source method below:
#   A. Load pre-calculated 2D data (e.g., PCA projections, distances) from a file.
#   B. Calculate two specific distances directly from the trajectory using `calculate_distances`.

# Set paths for topology/trajectory files - needed for Method B and for saving structures later.


In [None]:
# --- User Input ---

# Results (figures, representative PDB files) are written below this directory.
output_dir = "landscape_analysis_output" # Directory for saving results
os.makedirs(output_dir, exist_ok=True)

# Structure files: required by Method B and, later, for saving representative structures.
topology_file = "placeholder.prmtop" # <-- Needs path if calculating distances or saving structures
trajectory_file = "placeholder.dcd"   # <-- Needs path if calculating distances or saving structures

# Data source switch: False -> Method A (read data_file_2d), True -> Method B (compute distances).
calculate_distances_now = False # Set to True to use Method B

# Method A: pre-computed 2D data read from a file.
data_file_2d = "pca_analysis_output/pca_projections_calculated.csv" # <-- REPLACE or set to None

# Method B: the two atom-selection pairs whose distances become the X and Y coordinates.
selection_pairs_for_kde = [
    ("resid 10 and name CA", "resid 50 and name CA"), # <-- REPLACE with relevant distance 1
    ("protein and chainID A", "resname LIG"),         # <-- REPLACE with relevant distance 2
]

# Column positions used when reading the file (Method A).
col_x_index = 0 # Index (if loading file) or first distance pair (if calculating)
col_y_index = 1 # Index (if loading file) or second distance pair (if calculating)

# Axis labels for the plots; replaced by real column names once the data is loaded.
col_x_label = "PC1 / Distance 1" # Label for plotting X-axis (will be updated)
col_y_label = "PC2 / Distance 2" # Label for plotting Y-axis (will be updated)

# --- Load or Calculate Data ---
# Fills the variables consumed by the later cells:
#   data_x, data_y    : values of the two collective coordinates
#   frame_indices_kde : frame index belonging to each data point
#   u                 : MDAnalysis Universe (only created by Method B; reused when saving structures)
data_x = None
data_y = None
frame_indices_kde = None # Store frame indices corresponding to data_x/data_y
u = None # Reset on every run so a Universe left over from an earlier execution is never reused by mistake

print("Preparing 2D data for KDE...")

if calculate_distances_now:
    # Method B: Calculate distances
    print("Calculating distances to use for KDE...")
    try:
        if not os.path.exists(topology_file) or not os.path.exists(trajectory_file):
            raise FileNotFoundError("Topology/Trajectory files not found for distance calculation.")
        u = mda.Universe(topology_file, trajectory_file)
        print(f"Universe loaded with {len(u.trajectory)} frames.")

        df_distances = md_analysis_tools.calculate_distances(
            universe=u,
            selection_pairs=selection_pairs_for_kde
            # Add start/stop/step if needed
        )

        # NOTE(review): this assumes calculate_distances returns a DataFrame laid out as
        # [Frame, <distance 1>, <distance 2>, ...] — confirm against md_analysis_tools.
        if df_distances is not None and len(df_distances.columns) >= 3: # Need Frame + 2 dist columns
            dist_col_1 = df_distances.columns[1] # First distance column name
            dist_col_2 = df_distances.columns[2] # Second distance column name
            data_x = df_distances[dist_col_1].values
            data_y = df_distances[dist_col_2].values
            frame_indices_kde = df_distances['Frame'].values
            # Update labels
            col_x_label = dist_col_1
            col_y_label = dist_col_2
            print(f"Calculated distances for X ('{col_x_label}') and Y ('{col_y_label}'). Shape: {data_x.shape}")
        else:
            print("Distance calculation failed or did not return expected columns.", file=sys.stderr)

    except Exception as e:
        # Best effort: report the problem and leave data_x/data_y as None so later cells skip themselves.
        print(f"Error loading Universe or calculating distances: {e}", file=sys.stderr)

elif data_file_2d and os.path.exists(data_file_2d):
    # Method A: Load from file
    print(f"Loading 2D data from file: {data_file_2d}")
    try:
        df_data = pd.read_csv(data_file_2d) # Adjust sep= if needed
        n_columns = len(df_data.columns)
        if not (col_x_index < n_columns and col_y_index < n_columns):
            raise IndexError(f"Column index {col_x_index} or {col_y_index} out of bounds.")

        x_name = df_data.columns[col_x_index]
        y_name = df_data.columns[col_y_index]
        if 'Frame' in (x_name, y_name):
            # A frame counter is almost never a meaningful coordinate; most likely the indices are off by one.
            print("Warning: the 'Frame' column is selected as a KDE coordinate; check col_x_index/col_y_index.", file=sys.stderr)

        x_values = df_data.iloc[:, col_x_index].values
        y_values = df_data.iloc[:, col_y_index].values
        if len(x_values) == 0:
            raise ValueError("2D data file contains no data rows.")

        # Use the 'Frame' column when present, otherwise fall back to row positions.
        frame_values = df_data['Frame'].values if 'Frame' in df_data.columns else np.arange(len(x_values))

        # Publish the results only after every check passed, so a failure never leaves partial state behind.
        data_x, data_y, frame_indices_kde = x_values, y_values, frame_values
        # Update labels from column names
        col_x_label = x_name if isinstance(x_name, str) else f"Column_{col_x_index}"
        col_y_label = y_name if isinstance(y_name, str) else f"Column_{col_y_index}"
        print(f"Loaded data for X ('{col_x_label}') and Y ('{col_y_label}'). Shape: {data_x.shape}")
    except Exception as e:
        # Best effort: report the problem and leave data_x/data_y as None so later cells skip themselves.
        print(f"Error loading 2D data from file: {e}", file=sys.stderr)

elif data_file_2d:
    # A path was configured but does not exist: say so explicitly instead of claiming no source was given.
    print(f"Error: 2D data file not found: {data_file_2d}", file=sys.stderr)
else:
    print("Error: No valid data source specified (set data_file_2d or calculate_distances_now=True).", file=sys.stderr)

# --- Verify Data ---
# Later cells test data_x / data_y against None and skip themselves, so a
# message is enough here. If the coordinates exist but no frame indices were
# recorded, fall back to the position of each point in the data arrays.
if any(coordinate is None for coordinate in (data_x, data_y)):
    print("\nKDE analysis cannot proceed due to data loading/calculation errors.", file=sys.stderr)
elif frame_indices_kde is None:
    print("\nWarning: Frame indices not available. Representative structure saving will use relative indices.", file=sys.stderr)
    frame_indices_kde = np.arange(len(data_x)) # Assign default indices



In [None]:
# ## 2. Calculate 2D Kernel Density Estimate (KDE)

# We use the `calculate_kde_2d` function on our loaded/calculated 2D data.


In [None]:
# --- Calculate KDE ---
# kde_Z / kde_X / kde_Y stay None unless the calculation succeeds; the
# following cells use `kde_Z is not None` as their "KDE available" test.
kde_Z = None
kde_X = None
kde_Y = None

if data_x is None or data_y is None:
    print("Skipping KDE calculation due to data loading errors.", file=sys.stderr)
else:
    print("\nCalculating 2D KDE...")
    kde_result_tuple = md_analysis_tools.calculate_kde_2d(
        x_data=data_x,
        y_data=data_y,
        grid_size=100j,
        bandwidth=None
    )
    # A falsy result is treated as failure; otherwise it is unpacked as (Z, X, Y).
    if not kde_result_tuple:
        print("KDE calculation failed.", file=sys.stderr)
    else:
        kde_Z, kde_X, kde_Y = kde_result_tuple
        print("KDE calculation successful.")


In [None]:
# ## 3. Find Local Minima in KDE

# We use `find_kde_minima_2d` to identify the low-energy basins. Remember to check if `scikit-image` is available. Parameters like `sigma`, `min_distance`, and `threshold_rel` might need tuning based on the landscape.


In [None]:
# --- Find Minima ---
# minima_coords stays None when the search is skipped or fails; on success it
# holds one (x, y) pair per minimum found by find_kde_minima_2d.
minima_coords = None

if not md_analysis_tools._SKIMAGE_AVAILABLE:
    print("\nSkipping minima finding because scikit-image is not installed.", file=sys.stderr)
elif kde_Z is None:
    print("\nSkipping minima finding due to KDE calculation errors.", file=sys.stderr)
else:
    print("\nFinding KDE local minima...")
    minima_coords = md_analysis_tools.find_kde_minima_2d(
        Z=kde_Z, X=kde_X, Y=kde_Y,
        sigma=2.0,      # Adjust smoothing
        min_distance=5, # Adjust minimum separation
        threshold_rel=0.05 # Adjust relative threshold
    )
    if minima_coords is None:
        print("Minima finding failed.", file=sys.stderr)
    else:
        print(f"\nFound {len(minima_coords)} minima coordinates:")
        for i, minimum_xy in enumerate(minima_coords):
            mx, my = minimum_xy
            print(f"  Minimum {i}: ({mx:.2f}, {my:.2f})")


In [None]:
# ## 4. Visualize Landscape and Minima

# Plot the KDE heatmap/contour and overlay the found minima.


In [None]:
# --- Plot Landscape ---
# Draws the KDE as filled contours with thin white contour lines on top, marks
# the minima (if any were found) and saves the figure into output_dir.
if kde_Z is not None:
    print("\nPlotting KDE landscape...")
    levels = 15 # Number of contour levels; the representatives plot further down reuses this value
    fig, ax = plt.subplots(figsize=(9, 8))
    filled_contours = ax.contourf(kde_X, kde_Y, kde_Z, levels=levels, cmap='viridis')
    fig.colorbar(filled_contours, ax=ax, label='Probability Density')
    ax.contour(kde_X, kde_Y, kde_Z, levels=levels, colors='white', linewidths=0.5, alpha=0.5)
    # Explicit None/len test: `if minima_coords:` would raise ValueError if the
    # minima are returned as a multi-element NumPy array rather than a list.
    if minima_coords is not None and len(minima_coords) > 0:
        min_x_coords = [m[0] for m in minima_coords]
        min_y_coords = [m[1] for m in minima_coords]
        ax.scatter(min_x_coords, min_y_coords, marker='X', s=200, c='red', edgecolor='black', label='Minima', zorder=10)
        ax.legend()
    ax.set_title('2D KDE Landscape Analysis')
    ax.set_xlabel(col_x_label) # Use updated labels
    ax.set_ylabel(col_y_label) # Use updated labels
    ax.grid(True, linestyle=':', alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "kde_landscape_minima.png"), dpi=300)
    plt.show()
else:
    print("Skipping landscape plotting due to KDE calculation errors.", file=sys.stderr)


In [None]:
# ## 5. Find Representative Frames Near Minima

# Find the actual trajectory frames closest to the identified KDE minima coordinates using the original 2D data (e.g., PC1/PC2 or distances).


In [None]:
# --- Find Representatives Near Minima ---
# For every KDE minimum, pick the sampled data point closest to it (Euclidean
# distance in the 2D coordinate space) and record the frame it belongs to.
#
# Results:
#   representative_indices_kde : list of ORIGINAL frame indices, one per minimum (None if skipped)
#   data_for_search            : (n_points, 2) array built from data_x / data_y (None if skipped)
representative_indices_kde = None
data_for_search = None
representative_rows = [] # Row positions in data_for_search, parallel to representative_indices_kde

# Explicit None/len test: `if minima_coords:` would raise ValueError if the
# minima are returned as a multi-element NumPy array rather than a list.
has_minima = minima_coords is not None and len(minima_coords) > 0

if has_minima and data_x is not None and data_y is not None:
    print("\nFinding representative frames closest to KDE minima...")
    data_for_search = np.column_stack([data_x, data_y])
    minima_np = np.asarray(minima_coords, dtype=float)

    # Distances between every minimum and every data point, computed with plain
    # NumPy broadcasting (the previous cdist call had no matching import and
    # raised NameError). Resulting shape: (n_minima, n_points).
    offsets = data_for_search[np.newaxis, :, :] - minima_np[:, np.newaxis, :]
    distances = np.linalg.norm(offsets, axis=2)

    # Row of the nearest data point per minimum, then its original frame index.
    representative_rows = distances.argmin(axis=1).tolist()
    representative_indices_kde = [frame_indices_kde[row] for row in representative_rows]

    for i, original_frame_index in enumerate(representative_indices_kde):
        print(f"  Minimum {i} ({minima_np[i][0]:.2f}, {minima_np[i][1]:.2f}): Closest Frame Index = {original_frame_index}")
elif not has_minima:
    print("Skipping representative finding as no minima were identified.", file=sys.stderr)
else:
    print("Skipping representative finding due to data loading errors.", file=sys.stderr)

# --- Optional: Plot landscape with representatives highlighted ---
# The row positions found above are used directly. Translating frame indices
# back to rows would select the wrong point whenever a frame index occurs more
# than once in frame_indices_kde.
if representative_rows and kde_Z is not None:
    rep_data = data_for_search[representative_rows]
    fig, ax = plt.subplots(figsize=(9, 8))
    # `levels` is the contour-level count set in the landscape plotting cell.
    filled_contours = ax.contourf(kde_X, kde_Y, kde_Z, levels=levels, cmap='viridis')
    fig.colorbar(filled_contours, ax=ax, label='Probability Density')
    ax.contour(kde_X, kde_Y, kde_Z, levels=levels, colors='white', linewidths=0.5, alpha=0.5)
    min_x_coords = [m[0] for m in minima_coords]
    min_y_coords = [m[1] for m in minima_coords]
    ax.scatter(min_x_coords, min_y_coords, marker='X', s=200, c='red', edgecolor='black', label='Minima', zorder=10)
    ax.scatter(rep_data[:, 0], rep_data[:, 1], marker='o', s=150, c='yellow', edgecolor='black', label='Representatives', zorder=11)
    ax.set_title('KDE Landscape with Minima and Representatives')
    ax.set_xlabel(col_x_label)
    ax.set_ylabel(col_y_label)
    ax.legend()
    ax.grid(True, linestyle=':', alpha=0.3)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, "kde_landscape_representatives.png"), dpi=300)
    plt.show()



In [None]:
# ## 6. (Optional) Save Representative Structures

# Save the identified representative frames as PDB files.


In [None]:
# --- Save Structures ---
# Writes one PDB file per KDE minimum, taken from the trajectory frame listed
# in representative_indices_kde (these are ORIGINAL frame indices).
if representative_indices_kde:
    try:
        # Reuse the Universe created for the distance calculation (Method B) if
        # there is one; otherwise (Method A, or a failed load) read the files now.
        u_struct = globals().get('u')
        if not isinstance(u_struct, mda.Universe):
            print("\nReloading Universe to save structures...")
            if not (os.path.exists(topology_file) and os.path.exists(trajectory_file)):
                raise FileNotFoundError("Topology/Trajectory files not found for saving structures.")
            u_struct = mda.Universe(topology_file, trajectory_file)

        print("\nSaving representative structures for KDE minima...")
        all_atoms = u_struct.select_atoms("all")

        for minima_id, frame_idx in enumerate(representative_indices_kde):
            if frame_idx is None:
                continue
            try:
                # Frame indices read from a CSV may arrive as NumPy ints or floats;
                # coerce to a plain int and reject values that are not whole numbers.
                frame_number = int(frame_idx)
                if frame_number != frame_idx:
                    raise ValueError(f"non-integer frame index {frame_idx!r}")
                u_struct.trajectory[frame_number] # Go to the correct frame index
                pdb_filename = os.path.join(output_dir, f"representative_kde_minimum_{minima_id}_frame_{frame_number}.pdb")
                all_atoms.write(pdb_filename)
                print(f"  Saved: {pdb_filename}")
            except IndexError:
                print(f"  Error: Frame index {frame_idx} out of bounds for trajectory. Cannot save structure for minimum {minima_id}.", file=sys.stderr)
            except Exception as e:
                # Keep going: one failed structure should not prevent saving the others.
                print(f"  Error saving structure for minimum {minima_id} (frame {frame_idx}): {e}", file=sys.stderr)

    except Exception as e:
        print(f"Error during structure saving setup: {e}", file=sys.stderr)
else:
    print("Skipping structure saving as no representative indices were found.", file=sys.stderr)



In [None]:
#%% md