In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO

# Principal component analysis

#### 1) Loading trajectory

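We will compute PCA across all dimensions of all atoms, so we can reshape the trajectory from shape `(timesteps, atoms, physical dimensions)` into `(timesteps, all coordinates)`, where "all coordinates" is every atom's coordinates laid out along a single axis.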

In [None]:
def load_traj(filename, frame_header="128\n generated by VMD\n  "):
    """Load a trajectory as a single array with frames along axis 0.

    Parameters
    ----------
    filename : str or path-like
        Trajectory file. If the text after the last dot is ``npy`` the file is
        read with ``np.load``; anything else is parsed as a text file made of
        concatenated frames.
    frame_header : str, optional
        Exact text that precedes the first atom line of every frame in the
        text format (atom count line, VMD comment line, and the leading
        indent of the first atom line). The default matches 128-atom frames.

    Returns
    -------
    numpy.ndarray
        For text input, the ``x, y, z`` columns of every frame stacked along a
        new leading axis. For ``npy`` input, the array stored in the file.

    Raises
    ------
    ValueError
        If a text file does not contain ``frame_header`` at all, i.e. no
        frame could be extracted.
    """
    if str(filename).split('.')[-1] == 'npy':
        # Load the file that was requested rather than a hard-coded name.
        return np.load(filename)

    with open(filename, "r") as f:
        docu = f.read()

    # Everything before the first header is not frame data and is dropped.
    frames = docu.split(frame_header)[1:]
    if not frames:
        raise ValueError(
            "no frames found in {!r}: frame header {!r} does not occur".format(
                filename, frame_header
            )
        )

    columns_names = ['type', 'x', 'y', 'z']
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    arrays = [
        pd.read_csv(
            StringIO(frame), names=columns_names, sep=r'\s+', header=None
        )[['x', 'y', 'z']].values
        for frame in frames
    ]
    return np.stack(arrays)

In [None]:
# Read the C-alpha trajectory and report its original shape.
traj = load_traj("trajectory_calpha.npy")
print(traj.shape)

# PCA searches for an optimal projection of the coordinates, so the
# per-atom cartesian grouping is not needed: keep the frame axis and
# flatten everything else into one coordinate axis.
traj = traj.reshape((len(traj), -1))
print(traj.shape)

#### 2) Mean-free trajectory

In [None]:
# Centre every coordinate column on zero by removing its average over frames.
mean_free_traj = traj - traj.mean(axis=0)

#### 3) Covariance matrix

In [None]:
# Sample covariance of the centred data: C = X^T X / (T - 1), T = frame count.
t_n = len(mean_free_traj)
cov = (t_n - 1)**-1 * np.einsum(
    'ij,ik->jk',  # contract the frame index i; result is indexed by column pairs (j, k)
    mean_free_traj,
    mean_free_traj,
)
print(cov.shape)

In [None]:
# Sanity check: NumPy's own covariance (variables in columns) should match
# the hand-computed matrix within floating-point tolerance.
print(np.allclose(np.cov(mean_free_traj, rowvar=False), cov))

#### 4) Eigenvalues and eigenvectors

In [None]:
# The covariance matrix is real and symmetric, so use the symmetric solver:
# it returns real eigenvalues and orthonormal eigenvectors, whereas the
# general solver `eig` may return complex values and gives no ordering.
evals, evecs = np.linalg.eigh(cov)

# `eigh` sorts eigenvalues in ascending order; reverse so that index 0 is the
# largest eigenvalue and column i of `evecs` still pairs with `evals[i]`.
evals, evecs = evals[::-1], evecs[:, ::-1]

In [None]:
# Two stacked panels sharing the rank axis: raw eigenvalues on top,
# their running total underneath.
fig, ax = plt.subplots(2, 1, sharex=True, figsize=(15, 5))

for panel, values, label in zip(
    ax,
    (evals, np.cumsum(evals)),
    ('Eigenvalue magnitude', 'Eigenvalue cumulative sum'),
):
    panel.scatter(range(len(evals)), values)
    panel.set_ylabel(label)

ax[0].set_title('Eigenvalues and cumulative sum versus eigenvalue rank')
ax[1].set_xlabel('Eigenvalue rank')

#### 5) Projection

#### 6) Visualization

#### 7) Which molecular structure corresponds with the highest maximum?