In [1]:
import os
import glob
import json
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import MDAnalysis as mda
from tqdm import tqdm
from numpy import linalg as LA
from MDAnalysis.analysis import align
from matplotlib.ticker import FuncFormatter
# %run ~/.mpl_set.py

import networkx as nx

from sknetwork.clustering import Louvain

In [2]:
# Load the per-frame table. The rendered output below shows its columns:
# sys_name, frame, n_res, dist, n_contacts, rmsd, rmsd_nsp10, rmsd_nsp16 and
# labels (an integer cluster id per frame).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('df_comp_kmeans.pkl')
df

Unnamed: 0,sys_name,frame,n_res,dist,n_contacts,rmsd,rmsd_nsp10,rmsd_nsp16,labels
0,comp_0,0,415,32.605477,143,1.270958,1.157980,1.061589,453
1,comp_0,1,415,32.323373,144,1.266247,1.135788,1.180746,228
2,comp_0,2,415,31.932924,156,1.251338,1.298619,1.130007,448
3,comp_0,3,415,32.521470,149,1.232570,1.067588,1.155031,59
4,comp_0,4,415,32.193099,157,1.497461,1.652155,1.348938,183
...,...,...,...,...,...,...,...,...,...
83995,comp_100,3995,415,39.647390,107,10.644501,5.009146,2.698992,283
83996,comp_100,3996,415,39.943104,114,10.607467,4.521076,2.744293,102
83997,comp_100,3997,415,39.952250,113,10.840326,4.824111,2.710605,72
83998,comp_100,3998,415,40.436388,108,10.780112,4.630009,2.768572,72


In [3]:
# One discrete trajectory (array of per-frame cluster labels) per system,
# collected in sorted order of the system name.
dtrajs = [
    df.loc[df['sys_name'] == name, 'labels'].values
    for name in sorted(df['sys_name'].unique())
]

In [4]:
def get_trans_count(dtrajs, lag=1):
    """Count transitions between discrete states and symmetrise the result.

    Parameters
    ----------
    dtrajs : iterable of 1-D sequences of non-negative ints
        One sequence of state labels per trajectory. Transitions are only
        counted inside a trajectory, never across two of them.
    lag : int, default 1
        Number of steps between the two members of a counted pair.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_states, n_states)`` with
        ``n_states = max label + 1``. Entry ``[i, j]`` is the number of
        observed ``i -> j`` pairs plus the number of ``j -> i`` pairs, so
        self-transitions on the diagonal are counted twice. An empty
        ``(0, 0)`` array is returned when no labels are supplied.

    Raises
    ------
    ValueError
        If ``lag`` is negative or any label is negative (both would
        otherwise wrap around through negative indexing).
    """
    if lag < 0:
        raise ValueError('lag must be non-negative, got %r' % (lag,))

    trajs = [np.asarray(dtraj) for dtraj in dtrajs]
    trajs = [traj for traj in trajs if traj.size]
    if not trajs:
        return np.zeros((0, 0))

    if min(traj.min() for traj in trajs) < 0:
        raise ValueError('state labels must be non-negative integers')

    # Size the matrix by the largest label rather than by the number of
    # distinct labels: with a gap in the label set (e.g. {0, 2}) the latter is
    # too small and indexing by label runs out of bounds.
    n_states = int(max(traj.max() for traj in trajs)) + 1
    trans_count = np.zeros((n_states, n_states))

    for traj in trajs:
        n_pairs = len(traj) - lag
        if n_pairs <= 0:
            # Trajectory too short to contain a single pair at this lag.
            continue
        # np.add.at accumulates repeated (from, to) pairs, which plain fancy
        # index assignment would not.
        np.add.at(trans_count, (traj[:n_pairs], traj[lag:]), 1)

    return trans_count + trans_count.T

def get_trans_mat(dtrajs, lag=1):
    """Row-normalised transition matrix estimated from discrete trajectories.

    Parameters
    ----------
    dtrajs : iterable of 1-D sequences of ints
        Passed unchanged to ``get_trans_count``.
    lag : int, default 1
        Passed unchanged to ``get_trans_count``.

    Returns
    -------
    numpy.ndarray
        Square float array in which entry ``[i, j]`` is the count for
        ``(i, j)`` divided by the total count of row ``i``, so every row that
        has counts sums to 1. Rows without any counts are left as zeros
        rather than NaN.
    """
    trans_count = get_trans_count(dtrajs, lag=lag)

    # keepdims keeps the totals as a column vector so that row i is divided by
    # the total of row i. Dividing by ``sum(axis=0)`` instead broadcasts the
    # totals along the last axis and scales column j by total j, which yields
    # the transpose of the intended matrix.
    row_totals = trans_count.sum(axis=1, keepdims=True)
    trans_mat = np.divide(
        trans_count,
        row_totals,
        out=np.zeros_like(trans_count, dtype=float),
        where=row_totals > 0,  # skip 0/0 for states with no counts
    )
    return trans_mat

In [5]:
# Transition matrix between the per-frame cluster labels collected in
# `dtrajs`, estimated at the default lag of one frame.
trans_mat_norm = get_trans_mat(dtrajs)

In [6]:
# Directed graph with one node per matrix index; every non-zero entry [i, j]
# becomes an edge i -> j carrying that entry as its 'weight'.
# `nx.from_numpy_matrix` was removed in NetworkX 3.0; `from_numpy_array` is the
# replacement for a plain ndarray and is also available in NetworkX 2.x.
G = nx.from_numpy_array(trans_mat_norm, create_using=nx.DiGraph())

In [7]:
# Per-label summary: number of frames plus mean / standard deviation of every
# measured quantity, one row per cluster label.
stat_cols = ('dist', 'rmsd', 'rmsd_nsp10', 'rmsd_nsp16', 'n_contacts')

rows = []
for label in df['labels'].unique():
    sub_df = df[df['labels'] == label]
    row = {'label': int(label), 'count': sub_df['dist'].count()}
    for col in stat_cols:
        row[f'{col}_mean'] = sub_df[col].mean()
        row[f'{col}_std'] = sub_df[col].std()
    rows.append(row)

# Order the rows by label and renumber them 0..n-1.
cluster_info = (
    pd.DataFrame(rows)
    .sort_values('label')
    .reset_index(drop=True)
)
cluster_info

Unnamed: 0,label,count,dist_mean,dist_std,rmsd_mean,rmsd_std,rmsd_nsp10_mean,rmsd_nsp10_std,rmsd_nsp16_mean,rmsd_nsp16_std,n_contacts_mean,n_contacts_std
0,0,211,33.804700,0.517098,6.759526,0.578152,4.425174,0.749414,2.425710,0.183806,72.009479,8.328946
1,1,178,31.232320,0.318628,4.239073,0.367239,5.287994,0.619525,3.499962,0.374306,151.191011,8.754500
2,2,87,40.157197,0.713022,11.127746,0.616297,4.590900,1.208197,2.163707,0.304617,69.367816,6.806486
3,3,122,31.161196,0.224260,5.507611,0.150200,3.832075,0.393595,5.756104,0.140445,166.565574,6.019236
4,4,126,30.952763,0.253477,5.014740,0.191235,4.263786,0.210250,2.922146,0.077618,127.825397,8.425750
...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,148,33.126107,0.304478,6.576859,0.218415,6.026360,0.127292,5.198477,0.140730,94.614865,8.714304
496,496,245,31.854926,0.216680,3.332788,0.323787,3.356417,0.853381,3.143672,0.212188,161.942857,5.532764
497,497,192,31.118335,0.221192,4.043443,0.131623,3.723211,0.211565,3.733414,0.136929,201.302083,7.733953
498,498,128,31.762294,0.302404,3.736390,0.322002,5.155250,0.776469,2.592428,0.332706,152.242188,5.691195


In [8]:
# Copy every summary column onto the matching graph node.
# Rows are looked up by their 'label' value rather than by row position, so a
# node can never receive the statistics of a different label if the table has
# gaps or is re-ordered. Nodes without a summary row are left untouched.
info_by_label = cluster_info.set_index('label', drop=False)
col_names = sorted(cluster_info.columns)
for node in sorted(G.nodes()):
    if node not in info_by_label.index:
        continue
    for col in col_names:
        G.nodes[node][col] = info_by_label.at[node, col]

In [9]:
# Louvain community detection on the symmetric transition-count matrix.
trans_count = get_trans_count(dtrajs)
louvain = Louvain(modularity='newman', random_state=42)

# Use fit() + labels_ instead of fit_transform(): older scikit-network
# releases return the label vector from fit_transform(), newer ones return a
# membership matrix there, while labels_ holds the per-node labels in both.
louvain.fit(trans_count)
labels = louvain.labels_
len(set(labels))

35

In [10]:
# Tag every node of the cluster graph with the module it was assigned to.
for node, attrs in G.nodes(data=True):
    attrs['mod'] = labels[node]

In [11]:
# Look the module id up through each row's own 'label' value instead of
# relying on row position, so the column stays correct even if the table is
# filtered or re-ordered.
cluster_info['mod'] = labels[cluster_info['label'].to_numpy()]

In [12]:
# Export the annotated cluster graph as GEXF and as a pickle.
nx.write_gexf(G, 'nsp10_16_kmeans.gexf')
# `nx.write_gpickle` was removed in NetworkX 3.0; the standard pickle module
# is the documented replacement.
with open('comp_kmeans.pkl', 'wb') as fh:
    pickle.dump(G, fh, pickle.HIGHEST_PROTOCOL)

In [13]:
# Map every frame's cluster label to its module id with a single vectorised
# array lookup instead of a per-row Python loop over the whole table.
df['mod'] = labels[df['labels'].to_numpy()]

In [14]:
# Persist the per-frame table, which now also carries the 'mod' column.
df.to_pickle('./df_comp_kmeans_mod.pkl')

In [15]:
# One discrete trajectory of module ids per system, collected in sorted order
# of the system name.
dtrajs_mod = [
    df.loc[df['sys_name'] == name, 'mod'].values
    for name in sorted(df['sys_name'].unique())
]

In [16]:
# Transition matrix between module ids (default lag of one frame); the shape
# is displayed as a quick check of the number of modules.
trans_mat_mod = get_trans_mat(dtrajs_mod)
trans_mat_mod.shape

(35, 35)

In [17]:
# Directed graph with one node per matrix index; every non-zero entry [i, j]
# becomes an edge i -> j carrying that entry as its 'weight'.
# `nx.from_numpy_matrix` was removed in NetworkX 3.0; `from_numpy_array` is the
# replacement for a plain ndarray and is also available in NetworkX 2.x.
G_mod = nx.from_numpy_array(trans_mat_mod, create_using=nx.DiGraph())

In [18]:
# Per-module summary: number of frames plus mean / standard deviation of every
# measured quantity, one row per module id.
stat_cols = ('dist', 'rmsd', 'rmsd_nsp10', 'rmsd_nsp16', 'n_contacts')

rows = []
for label in df['mod'].unique():
    sub_df = df[df['mod'] == label]
    row = {'mod': int(label), 'count': sub_df['dist'].count()}
    for col in stat_cols:
        row[f'{col}_mean'] = sub_df[col].mean()
        row[f'{col}_std'] = sub_df[col].std()
    rows.append(row)

# Order the rows by module id and renumber them 0..n-1.
cluster_info = (
    pd.DataFrame(rows)
    .sort_values('mod')
    .reset_index(drop=True)
)

In [19]:
# Copy every summary column onto the matching module node.
# Rows are looked up by their 'mod' value rather than by row position, so a
# node can never receive the statistics of a different module if the table has
# gaps or is re-ordered. Nodes without a summary row are left untouched.
info_by_mod = cluster_info.set_index('mod', drop=False)
col_names = sorted(cluster_info.columns)
for node in sorted(G_mod.nodes()):
    if node not in info_by_mod.index:
        continue
    for col in col_names:
        G_mod.nodes[node][col] = info_by_mod.at[node, col]

In [20]:
# Export the annotated module graph as GEXF and as a pickle.
nx.write_gexf(G_mod, 'nsp10_16_kmeans_mod.gexf')
# `nx.write_gpickle` was removed in NetworkX 3.0; the standard pickle module
# is the documented replacement.
with open('comp_mod.pkl', 'wb') as fh:
    pickle.dump(G_mod, fh, pickle.HIGHEST_PROTOCOL)

In [21]:
# Persist the summary table currently bound to `cluster_info` (the per-module
# table when the notebook is run top to bottom).
cluster_info.to_pickle('./cluster_comp_kmeans_mod.pkl')