In [1]:
import os
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import MDAnalysis as mda
from tqdm import tqdm
from numpy import linalg as LA
from MDAnalysis.analysis import align
from matplotlib.ticker import FuncFormatter

%run ~/.mpl_set.py

In [2]:
# Load the per-frame metrics table, then discard every column whose name
# begins with 'lat' before displaying the result.
df = pd.read_pickle("df_comp.pkl")
lat_columns = [name for name in df.columns if name.startswith('lat')]
df = df.drop(columns=lat_columns)
df

Unnamed: 0,sys_name,frame,n_res,dist,n_contacts,rmsd,rmsd_nsp10,rmsd_nsp16
0,comp_0,0,415,32.605477,143,1.270958,1.157980,1.061589
1,comp_0,1,415,32.323373,144,1.266247,1.135788,1.180746
2,comp_0,2,415,31.932924,156,1.251338,1.298619,1.130007
3,comp_0,3,415,32.521470,149,1.232570,1.067588,1.155031
4,comp_0,4,415,32.193099,157,1.497461,1.652155,1.348938
...,...,...,...,...,...,...,...,...
83995,comp_100,3995,415,39.647390,107,10.644501,5.009146,2.698992
83996,comp_100,3996,415,39.943104,114,10.607467,4.521076,2.744293
83997,comp_100,3997,415,39.952250,113,10.840326,4.824111,2.710605
83998,comp_100,3998,415,40.436388,108,10.780112,4.630009,2.768572


In [3]:
# Load the saved k-means results. As displayed below, the table has one row
# per `n_cluster` setting, with `labels` and `centers` columns.
cluster_df = pd.read_pickle('./cluster_kmeans.pkl')
cluster_df

Unnamed: 0,n_cluster,labels,centers
0,30,"[14, 15, 25, 15, 15, 15, 15, 15, 20, 25, 20, 2...","[[0.108596206, 0.3566435, -0.91065025, 1.11021..."
1,50,"[14, 35, 40, 14, 40, 14, 40, 45, 45, 35, 35, 2...","[[0.5283964, 0.18438724, -0.31090447, -0.19644..."
2,100,"[86, 86, 56, 86, 66, 56, 45, 66, 42, 45, 45, 6...","[[0.048754055, 0.64428294, 0.19046207, 0.00058..."
3,200,"[95, 95, 175, 139, 20, 139, 20, 18, 151, 84, 1...","[[1.3147413, 1.0601119, 1.1008426, -0.17935225..."
4,250,"[200, 84, 139, 139, 113, 124, 95, 115, 84, 95,...","[[0.43860808, 0.852178, -0.5610412, 0.45407733..."
5,500,"[453, 228, 448, 59, 183, 298, 473, 248, 371, 1...","[[-1.550877, -1.1808438, -1.5491257, 0.7485294..."
6,1000,"[679, 679, 259, 679, 642, 96, 8, 799, 280, 799...","[[-2.2851336, -1.1689458, -2.2658885, -0.14017..."
7,1500,"[782, 1316, 44, 260, 1485, 471, 525, 600, 441,...","[[-0.5123599, 1.2032837, 0.2667237, -0.9375480..."
8,3000,"[1287, 1287, 287, 1386, 1098, 2419, 1120, 1931...","[[-0.08806394, -0.5946998, 0.23943464, 1.86635..."
9,5000,"[1451, 2102, 1090, 1101, 3470, 3669, 4432, 941...","[[-0.1820339, 0.55776024, 1.1501688, -0.017369..."


In [4]:
# Sanity check: print how many distinct labels each clustering actually uses.
# A value below the corresponding n_cluster means some clusters were never
# assigned to any sample.
for assigned in cluster_df['labels']:
    n_distinct = len(set(assigned))
    print(n_distinct)

30
50
100
200
250
500
1000
1497
2996
4997


In [5]:
# Attach the k-means assignment of one clustering run to every row of `df`,
# then summarise each cluster: its size and the mean / sample standard
# deviation of every per-frame metric.
N_CLUSTERS = 500  # which clustering (row of cluster_df, selected by n_cluster) to use

labels = cluster_df.loc[cluster_df.n_cluster == N_CLUSTERS, 'labels']
if len(labels) != 1:
    raise ValueError(
        "expected exactly one clustering with n_cluster == {}, found {}".format(
            N_CLUSTERS, len(labels)
        )
    )

# Assign positionally and fail loudly on a size mismatch, instead of relying
# on index alignment (which would silently fill missing rows with NaN).
# NOTE(review): assumes the stored labels follow the row order of `df` -- confirm
# against the notebook that produced cluster_kmeans.pkl.
frame_labels = np.asarray(labels.iloc[0]).astype(np.int64)
if len(frame_labels) != len(df):
    raise ValueError(
        "label count ({}) does not match number of rows in df ({})".format(
            len(frame_labels), len(df)
        )
    )
df['labels'] = frame_labels  # integer dtype rather than object

# Build the aggregation spec: cluster size first, then <metric>_mean and
# <metric>_std for each metric, in the column order used downstream.
metrics = ('dist', 'rmsd', 'rmsd_nsp10', 'rmsd_nsp16', 'n_contacts')
agg_spec = {'count': ('dist', 'count')}
for metric in metrics:
    agg_spec[metric + '_mean'] = (metric, 'mean')
    agg_spec[metric + '_std'] = (metric, 'std')

# A single groupby pass replaces one boolean-mask scan of `df` per label.
# sort=False keeps clusters in order of first appearance, like Series.unique().
cluster_info = (
    df.groupby('labels', sort=False)
      .agg(**agg_spec)
      .reset_index()
      .rename(columns={'labels': 'label'})
)
cluster_info.describe()

Unnamed: 0,label,count,dist_mean,dist_std,rmsd_mean,rmsd_std,rmsd_nsp10_mean,rmsd_nsp10_std,rmsd_nsp16_mean,rmsd_nsp16_std,n_contacts_mean,n_contacts_std
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,249.5,168.0,33.29574,0.388743,5.38995,0.397809,4.023296,0.587343,2.808204,0.346029,125.74674,7.536586
std,144.481833,76.622072,2.623128,0.222669,3.119314,0.19252,0.936571,0.337614,0.770209,0.179035,36.096989,2.259605
min,0.0,25.0,30.488922,0.161984,2.410827,0.097996,1.89079,0.105851,1.513343,0.05563,7.139456,4.319459
25%,124.75,110.75,31.731854,0.279208,3.34623,0.26092,3.460403,0.301748,2.418524,0.191974,99.001888,5.880477
50%,249.5,159.0,32.019628,0.319511,4.014389,0.364627,3.986518,0.492898,2.628962,0.338017,136.996077,7.007462
75%,374.25,214.0,33.652225,0.417066,6.305726,0.515279,4.638893,0.858869,2.960729,0.477665,152.054341,8.651935
max,499.0,503.0,41.402289,2.949512,16.321236,2.40948,6.207892,1.37113,5.856205,1.103461,201.302083,19.29382


In [10]:
# Show the per-frame table again to inspect the `labels` column alongside the metrics.
df

Unnamed: 0,sys_name,frame,n_res,dist,n_contacts,rmsd,rmsd_nsp10,rmsd_nsp16,labels
0,comp_0,0,415,32.605477,143,1.270958,1.157980,1.061589,453
1,comp_0,1,415,32.323373,144,1.266247,1.135788,1.180746,228
2,comp_0,2,415,31.932924,156,1.251338,1.298619,1.130007,448
3,comp_0,3,415,32.521470,149,1.232570,1.067588,1.155031,59
4,comp_0,4,415,32.193099,157,1.497461,1.652155,1.348938,183
...,...,...,...,...,...,...,...,...,...
83995,comp_100,3995,415,39.647390,107,10.644501,5.009146,2.698992,283
83996,comp_100,3996,415,39.943104,114,10.607467,4.521076,2.744293,102
83997,comp_100,3997,415,39.952250,113,10.840326,4.824111,2.710605,72
83998,comp_100,3998,415,40.436388,108,10.780112,4.630009,2.768572,72


In [11]:
# Write the current `df` to disk as a pickle under a new file name.
df.to_pickle('df_comp_kmeans.pkl')