In [None]:
import os
import mdtraj as md
import numpy as np

# Folder containing the PDB files.
# The original location is kept as the default so existing runs behave the same; set the
# MPRO_PDB_DIR environment variable to point the notebook at another copy of the data
# without editing this cell.
folder_path = os.environ.get(
    "MPRO_PDB_DIR",
    "/Users/kendalllemons/Downloads/asap-datasets/full_frag_prepped_mpro_20230603/prepped_structures/splits/splits",
)

In [None]:
# Full paths of every "*.pdb" file directly inside folder_path.
# os.listdir returns entries in arbitrary order, so the names are sorted: this keeps the
# structure indices (and which structure is loaded first) stable from run to run.
pdb_paths = [
    os.path.join(folder_path, file_name)
    for file_name in sorted(os.listdir(folder_path))
    if file_name.endswith(".pdb")
]

In [None]:
# Accumulator for the loaded structures that pass the atom-count filter.
filtered_trajectories = list()

In [None]:
# Load every PDB file and keep only the structures whose atom count matches the first
# structure that was loaded.
# The list is rebuilt from scratch here so that re-running this cell does not append
# every structure a second time (which would silently double the RMSD matrix size).
filtered_trajectories = []
for pdb_path in pdb_paths:
    traj = md.load(pdb_path)

    # The first structure is always kept and becomes the atom-count reference for the rest.
    # NOTE(review): an equal atom count does not by itself guarantee identical atom
    # ordering across files — confirm the inputs share one topology.
    if not filtered_trajectories or traj.n_atoms == filtered_trajectories[0].n_atoms:
        filtered_trajectories.append(traj)

In [None]:
# Allocate a square, zero-filled matrix with one row and one column per kept structure.
num_structures = len(filtered_trajectories)
matrix_shape = (num_structures, num_structures)
rmsd_matrix = np.zeros(matrix_shape)

In [None]:
# Fill the pairwise RMSD matrix for the filtered structures.
# Only pairs with j > i are computed; each value is mirrored into the lower triangle and
# the diagonal keeps the zeros it was initialised with.
for i in range(num_structures):
    for j in range(i + 1, num_structures):
        # md.rmsd returns an array with one value per frame of the target trajectory,
        # measured against the chosen frame of the reference. Pick frame 0 of both
        # explicitly so a plain scalar is stored, instead of relying on implicit
        # size-1 array-to-scalar conversion (which also fails for multi-frame files).
        rmsd = float(md.rmsd(filtered_trajectories[i], filtered_trajectories[j], frame=0)[0])
        rmsd_matrix[i, j] = rmsd
        rmsd_matrix[j, i] = rmsd  # RMSD is symmetric

In [None]:
# Display the pairwise RMSD matrix as the cell output (symmetric, with a zero diagonal).
rmsd_matrix

In [None]:
# Shape of the matrix: (num_structures, num_structures).
rmsd_matrix.shape

In [None]:
# Largest pairwise RMSD among the kept structures.
np.max(rmsd_matrix)

In [None]:
# Smallest entry of the matrix. The diagonal is never written after the zero
# initialisation, so this includes the self-comparison zeros.
np.min(rmsd_matrix)

In [None]:
# Sanity check: every entry must match its transposed counterpart to within 1e-5.
# The tolerance comparison has to happen element-wise *before* the reduction. Reducing
# the raw difference first and then comparing the resulting boolean with the tolerance
# is always True here, because the zero diagonal makes np.all(...) False and False < 1e-5.
np.all(np.abs(rmsd_matrix - rmsd_matrix.T) < 0.00001)

In [None]:
import plotly.express as px
import plotly.graph_objects as go

# Render the pairwise RMSD matrix as a heatmap; both axes are structure indices.
heatmap_trace = go.Heatmap(z=rmsd_matrix, colorscale='Viridis')
fig_heatmap = go.Figure(data=heatmap_trace)
fig_heatmap.update_layout(
    title='RMSD Heatmap',
    xaxis_title='Structure Index',
    yaxis_title='Structure Index',
)
fig_heatmap.show()

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Fixed seed so the projection and the cluster assignment are reproducible across
# kernel restarts.
RANDOM_SEED = 0

# Perform dimensionality reduction (PCA): each row of the RMSD matrix (one structure's
# distances to all others) is treated as a feature vector and projected onto 2 components.
# random_state only matters if scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=RANDOM_SEED)
reduced_rmsd = pca.fit_transform(rmsd_matrix)

# Cluster the reduced data using K-means.
# n_init is set explicitly because its default differs between scikit-learn releases;
# together with random_state this makes the labels independent of library version and run.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=RANDOM_SEED)  # Adjust the number of clusters as needed
cluster_labels = kmeans.fit_predict(reduced_rmsd)

# Create a scatter plot of the clustered RMSD data using Plotly
scatter_data = {'PC1': reduced_rmsd[:, 0], 'PC2': reduced_rmsd[:, 1], 'Cluster': cluster_labels}
df = pd.DataFrame(scatter_data)
fig_scatter = px.scatter(df, x='PC1', y='PC2', color='Cluster', title='Clustered RMSD Data')
fig_scatter.show()