### Kinetic Analysis of Alanine Dipeptide Simulation Datasets using VAMPnets and MSMs

In [12]:
# Import dependencies
import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import seaborn as sns
import os

import mdtraj as md
from msmbuilder.featurizer import AtomPairsFeaturizer

from sklearn.decomposition import PCA
from scipy.linalg import eigh

### Step 1 of MSM and VAMPnets: Generate Features

<img src="fig/alanine_dipeptide.png" width="500" align="center"/>

In [13]:
# Initialize the AtomPairsFeaturizer with atom pairs loaded from the 'ala2_atom_pairs' file
pairs_feat = AtomPairsFeaturizer(np.loadtxt("project2-md-dataset/ala2-xtc-1ps/ala2_atom_pairs"))

# Ensure the features directory exists
os.makedirs("project2-md-dataset/features", exist_ok=True)

# Process each trajectory file for indices 0 to 9
for i in range(10):
    # Load the trajectory with mdtraj
    traj = md.load(f"project2-md-dataset/ala2-xtc-1ps/ala2-1ps-0{i}.xtc", top="project2-md-dataset/ala2-xtc-1ps/ala2.pdb")
    # Transform the trajectory to features using the featurizer
    features = pairs_feat.transform([traj])
    # Save the features to a binary file in NumPy '.npy' format
    np.save(f"project2-md-dataset/features/ftraj_{i}", features[0])

# Repeat the process for indices 10 to 99
for i in range(10, 100):
    # Load the trajectory with mdtraj
    traj = md.load(f"project2-md-dataset/ala2-xtc-1ps/ala2-1ps-{i}.xtc", top="project2-md-dataset/ala2-xtc-1ps/ala2.pdb")
    # Transform the trajectory to features using the featurizer
    features = pairs_feat.transform([traj])
    # Save the features to a binary file in NumPy '.npy' format
    np.save(f"project2-md-dataset/features/ftraj_{i}", features[0])

### Step 2 of MSM: Generate TICA

In [15]:

def compute_covariance_matrices(features, lag_time):
    """
    Compute the covariance and time-lagged covariance matrices.
    
    Parameters:
    - features: The input features, typically a list of 2D arrays.
    - lag_time: The lag time for the time-lagged covariance matrix.
    
    Returns:
    - C_0: Covariance matrix at time t.
    - C_tau: Time-lagged covariance matrix between t and t + tau.
    """
    # Concatenate all features to compute overall covariance
    concatenated_features = np.concatenate(features)
    # Center the features
    mean_features = np.mean(concatenated_features, axis=0)
    concatenated_features -= mean_features
    # Compute covariance matrix at time t
    C_0 = np.cov(concatenated_features.T)
    
    # Compute time-lagged covariance matrix
    concatenated_shifted_features = np.concatenate([f[lag_time:] for f in features])
    concatenated_shifted_features -= mean_features  # ensure the mean is removed
    C_tau = concatenated_features[:len(concatenated_shifted_features)].T @ concatenated_shifted_features / (len(concatenated_shifted_features) - 1)
    
    return C_0, C_tau

def perform_tica(C_0, C_tau, n_components):
    """
    Perform TICA given covariance matrices.
    
    Parameters:
    - C_0: Covariance matrix at time t.
    - C_tau: Time-lagged covariance matrix between t and t + tau.
    - n_components: Number of TICs to return.
    
    Returns:
    - eigenvalues: The top eigenvalues.
    - eigenvectors: The top eigenvectors corresponding to TICs.
    """
    # Solve the generalized eigenvalue problem
    eigenvalues, eigenvectors = eigh(C_tau, C_0)
    
    # Sort by largest eigenvalues
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    
    # Select the top components
    eigenvalues = eigenvalues[:n_components]
    eigenvectors = eigenvectors[:, :n_components]
    
    return eigenvalues, eigenvectors

# Define your lag time and the number of components you want to keep
lag_time = 10  # This could be in the units of your time steps, for example, 1ps
n_components = 3  # Number of TICs

# Assuming `features` is a list of arrays containing the pairwise distances of heavy atoms for each trajectory
C_0, C_tau = compute_covariance_matrices(features, lag_time)
eigenvalues, eigenvectors = perform_tica(C_0, C_tau, n_components)

# Print the top eigenvalues
print('Top eigenvalues:', eigenvalues)

# We now have the TICs, let's project the data onto the TIC space
# First, recalculate the mean features for the transformation step
mean_features = np.mean(np.concatenate(features), axis=0)

# Transform the original features into the TIC space for all trajectories
tic_features_list = [(f - mean_features).dot(eigenvectors) for f in features]


Top eigenvalues: [10.25878265  2.35384048  1.93254931]
