# Apply msmbuilder API to WT ff14SB cTN

MD datasets are usually quite large. It doesn't make sense to load everything into memory at once. The `dataset` object lazily-loads trajectories as they are needed. Below, we create a dataset out of all the trajectories we have at the moment.

In [1]:
from msmbuilder.dataset import dataset

# Build a dataset over every run-8 trajectory matched by the glob, using the
# PDB file as topology. Per the notes above, trajectories are loaded lazily
# as they are needed rather than all at once.
xyz = dataset(
    "./run8/05*nc",
    topology="./test.pdb",
)

## Featurization
The raw (x, y, z) coordinates from the simulation do not respect the translational and rotational symmetry of our problem. A Featurizer transforms Cartesian coordinates into other representations.

### Dihedrals
Here we use the `DihedralFeaturizer` to turn our data into phi and psi dihedral angles.
Observe that the 6812*3-dimensional space is reduced substantially.

In [None]:
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.utils import load

# Featurize every trajectory with phi/psi dihedrals and store the result in
# the "diheds" directory (one .npy-backed entry per trajectory).
featurizer = DihedralFeaturizer(types=['phi', 'psi'])
diheds = xyz.fit_transform_with(featurizer, "diheds", fmt='dir-npy')

# Alternative: reload an already-featurized dataset instead of recomputing.
# NOTE(review): the call above writes to "diheds", not 'diheds_phi_psi/' --
# confirm which directory is the current one before uncommenting.
# diheds = dataset('diheds_phi_psi/')

# Compare the raw coordinate shape of the first trajectory with its
# dihedral-feature shape.
for shape in (xyz[0].xyz.shape, diheds[0].shape):
    print(shape)

### Contact Featurizer
Featurizer based on residue-residue distances

This featurizer transforms a dataset containing MD trajectories into a vector dataset by representing each frame in each of the MD trajectories by a vector of the distances between pairs of amino-acid residues.

The exact method for computing the distance between two residues is configurable with the `scheme` parameter. In this case we use `"ca"`, which defines the distance between two residues as the distance between their alpha carbons.

In [None]:
from msmbuilder.featurizer import ContactFeaturizer

# Residue-residue distance features over "all" contacts, measured between
# alpha carbons (scheme="ca"); results are stored in the "contacts" directory.
featurizer_contact = ContactFeaturizer("all", scheme="ca")
contacts = xyz.fit_transform_with(
    featurizer_contact,
    "contacts",
    fmt="dir-npy"
)

# Compare the raw coordinate shape of the first trajectory with its
# contact-feature shape.
for shape in (xyz[0].xyz.shape, contacts[0].shape):
    print(shape)

## Intermediate kinetic model: tICA
`tICA` is similar to PCA. Note the reduction to just 4 dimensions.

In [None]:
from msmbuilder.decomposition import tICA

# tICA lag time (presumably in trajectory frames -- confirm against the
# saving interval of the simulations).
lag = 10
# Keep four components, as described in the text above.
tica_model = tICA(lag_time=lag, n_components=4)

# Dihedral features: fit the estimator, then project every trajectory and
# store the projections under 'tica_diheds_lag<lag>/'.
tica_model_diheds = diheds.fit_with(tica_model)
tica_trajs_diheds = diheds.transform_with(tica_model, 'tica_diheds_lag%s/' % lag, fmt='dir-npy')

# Contact features: same procedure, stored under 'tica_contacts_lag<lag>/'.
# NOTE(review): the SAME estimator object is re-fitted here, so after this
# cell `tica_model` reflects the contacts fit, not the dihedral fit. The
# dihedral projections above were already written before the re-fit.
# Consider one tICA instance per feature set if the fitted models are needed
# later; also confirm what `fit_with` returns before relying on
# `tica_model_diheds` / `tica_model_contacts`.
tica_model_contacts = contacts.fit_with(tica_model)
tica_trajs_contacts = contacts.transform_with(tica_model, 'tica_contacts_lag%s/' % lag, fmt='dir-npy')


# tica_trajs = dataset('./ticas_diheds/')
# Shapes of the first trajectory before and after the tICA projections.
print(diheds[0].shape)
print(tica_trajs_diheds[0].shape)
# Fixed: the original printed the undefined name `tica_contacts_diheds`.
print(tica_trajs_contacts[0].shape)

### tICA Heatmap
We can histogram our data by projecting it onto the first two tICs (the two slowest degrees of freedom found by tICA), and likewise onto tICs 3 and 4.

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
txx = np.concatenate(tica_trajs)

#fig = plt.figure(figsize=(10,10))
plt.subplot(1, 2, 1)
plt.hexbin(txx[:,0], txx[:,1], bins='log', mincnt=1)
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.subplot(1,2,2)
plt.hexbin(txx[:,2], txx[:,3], bins='log', mincnt=1)
plt.xlabel('tIC 3')
plt.ylabel('tIC 4')
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.tight_layout()
plt.savefig('tICA_fromDiheds_API.eps', dpi = 300, format = 'eps')

In [None]:
# Stand-alone log-density map of the first two tIC columns of `txx`,
# exported as a PNG.
plt.hexbin(*txx[:, :2].T, bins='log', mincnt=1)
plt.ylabel('tIC 2')
plt.xlabel('tIC 1')
cb = plt.colorbar()
cb.set_label('log10(N)')
plt.savefig('tICA_fromDiheds_tics1-2.png', format='png', dpi=100)

In [None]:
# Time series of the first two tICs along a single trajectory.
# Fixed: the original called bare pylab names (`subplot2grid`, `plot`, ...)
# that are never imported here, and indexed an undefined `Y`.
# NOTE(review): `Y` is assumed to have meant the projection of the first
# trajectory; the dihedral-based one is used below -- confirm.
first_traj = tica_trajs_diheds[0]

plt.subplot2grid((2, 1), (0, 0))
plt.plot(first_traj[:, 0])
plt.ylabel('ind. comp. 1')

plt.subplot2grid((2, 1), (1, 0))
plt.plot(first_traj[:, 1])
plt.ylabel('ind. comp. 2')
# NOTE(review): the x axis is the frame index; the "10 ns" unit is carried
# over from the original label -- confirm it matches the frame spacing.
plt.xlabel('time (10 ns)')

## Clustering
Conformations need to be clustered into states (sometimes written as microstates). We cluster based on the tICA projections to group conformations that interconvert rapidly. Note that we transform our trajectories from the 4-dimensional tICA space into a 1-dimensional cluster index.

In [None]:
from msmbuilder.cluster import MiniBatchKMeans

# Cluster the dihedral-based tICA projections into 100 states and store the
# per-frame cluster labels under 'kmeans_diheds_100/'.
# Fixed: the original used `tica_trajs`, which is never defined in this
# notebook; the dihedral projections live in `tica_trajs_diheds`.
clusterer = MiniBatchKMeans(n_clusters=100)
clustered_trajs = tica_trajs_diheds.fit_transform_with(clusterer,
                                                       'kmeans_diheds_100/',
                                                       fmt='dir-npy')

#clustered_trajs = dataset('./kmeans_diheds_100/')
# Shape of the first trajectory in tICA space vs. as a sequence of labels.
print(tica_trajs_diheds[0].shape)
print(clustered_trajs[0].shape)

In [None]:
# Display the dimensions of the fitted cluster-centre array.
np.shape(clusterer.cluster_centers_)

In [None]:
# Overlay the cluster centres (white markers) on the tICA density maps.
fig = plt.figure(figsize=(10, 10))

# Upper-left panel: columns 0 and 1 (tIC 1 vs tIC 2).
plt.subplot(2, 2, 1)
plt.hexbin(*txx[:, :2].T, bins='log', mincnt=1)
plt.scatter(*clusterer.cluster_centers_[:, :2].T, s=100, c='w')
plt.xlabel('tIC 1')
plt.ylabel('tIC 2')

# Upper-right panel: columns 2 and 3 (tIC 3 vs tIC 4).
plt.subplot(2, 2, 2)
plt.hexbin(*txx[:, 2:4].T, bins='log', mincnt=1)
plt.scatter(*clusterer.cluster_centers_[:, 2:4].T, s=100, c='w')
plt.xlabel('tIC 3')
plt.ylabel('tIC 4')

plt.show()

## MSM
We can construct an MSM from the labeled trajectories.

In [None]:
from msmbuilder.utils import dump
from msmbuilder.msm import MarkovStateModel

# Fit a Markov state model with lag_time=5 on the cluster-label trajectories.
msm = MarkovStateModel(lag_time=5)
msm.fit(clustered_trajs)

# Report how many states the fitted model has and the shape of its
# left-eigenvector array.
print("The MSM has %s states.\n" % (msm.n_states_,))
print(msm.left_eigenvectors_.shape)

In [None]:
# Sanity check: these two shapes are printed side by side because the
# scatter plot below pairs one eigenvector entry with one cluster centre.
for shape in (msm.left_eigenvectors_[:, 1].shape,
              clusterer.cluster_centers_[:, 0].shape):
    print(shape)

In [None]:
# Grey log-density background over tIC 1 / tIC 2.
plt.hexbin(*txx[:, :2].T, bins='log', mincnt=1, cmap="Greys")

# Cluster centres on top: marker size follows the MSM populations and marker
# colour follows column 1 of the left eigenvectors.
# NOTE(review): this assumes the MSM kept one state per cluster centre; if
# the arrays differ in length the scatter call will fail -- see the shape
# check printed in the previous cell.
plt.scatter(
    *clusterer.cluster_centers_[:, :2].T,
    s=1e4 * msm.populations_,
    c=msm.left_eigenvectors_[:, 1],
    cmap="RdBu"
)
plt.colorbar(label='First dynamical eigenvector')
plt.ylabel('tIC 2')
plt.xlabel('tIC 1')
plt.tight_layout()

## Macrostate model

In [None]:
from msmbuilder.lumping import PCCAPlus

# Build a 5-macrostate PCCA+ model from the fitted MSM.
pcca = PCCAPlus.from_msm(
    msm,
    n_macrostates=5,
)

# Transform the cluster-label trajectories with the lumping model.
macro_trajs = pcca.transform(clustered_trajs)

In [None]:
# Same shape check as before the MSM scatter plot: eigenvector column vs.
# cluster-centre column.
for shape in (msm.left_eigenvectors_[:, 1].shape,
              clusterer.cluster_centers_[:, 0].shape):
    print(shape)

In [None]:
# Grey log-density background over tIC 1 / tIC 2.
plt.hexbin(*txx[:, :2].T, bins='log', mincnt=1, cmap="Greys")

# Cluster centres coloured by `pcca.microstate_mapping_` (presumably the
# macrostate assigned to each microstate -- confirm against the PCCAPlus docs).
plt.scatter(
    *clusterer.cluster_centers_[:, :2].T,
    s=100,
    c=pcca.microstate_mapping_
)
plt.ylabel('tIC 2')
plt.xlabel('tIC 1')