# This notebook contains an example of a spectral clustering algorithm (in this case, normalized hard spectral clustering) applied to a Protein Contact Network (PCN) of the SARS-CoV-2 spike protein.

In [2]:
# Handle different path separators.
import os
from sys import platform  # kept in scope: later cells branch on `platform`

# os.sep is "\\" on Windows and "/" on POSIX systems, i.e. the same values the
# former if/elif chain assigned for linux, darwin and win32. Unlike that
# chain, it is also defined on every other platform (cygwin, freebsd, ...),
# where add_slash_to_path used to stay unset and raised a NameError later on.
add_slash_to_path = os.sep

In [3]:
import os

# Import PCN-Miner: prefer the pip-installed package and fall back to the
# sources of a git-cloned checkout.
try:
    from pcn.pcn_miner import pcn_miner, pcn_pymol_scripts  # installed with pip
except ImportError:
    import sys  # git cloned

    # In the cloned layout the "pcn" directory is looked up inside the
    # grandparent of the current working directory.
    cwd = os.getcwd()
    exd = os.path.abspath(os.path.join(cwd, os.pardir))
    # os.path.join inserts the separator of the running OS by itself.
    pcnd = os.path.join(os.path.abspath(os.path.join(exd, os.pardir)), "pcn")
    sys.path.append(pcnd)
    try:
        from pcn_miner import pcn_miner, pcn_pymol_scripts
    except ImportError as err:
        # Only import failures are translated; the original cause stays
        # attached so the traceback shows which module could not be found.
        raise ImportError("PCN-Miner is not correctly installed.") from err
        
import subprocess
import numpy as np
import networkx as nx

In [11]:
# Path prefixes used by the following cells.
# output_path is handed to the pcn_miner / pcn_pymol_scripts calls below.
output_path = ""
# proteins_path is prepended verbatim to the PDB file name, so an empty string
# means the file is looked up in the current working directory.
proteins_path = ""
# Folder prefix with a trailing separator. os.path.join("Adj", "") yields
# "Adj\\" on Windows (the previous hard-coded value) and "Adj/" elsewhere.
adj_path = os.path.join("Adj", "")

In [12]:
# Read the PDB file of the protein and extract its residue coordinates.
protein = "6vxx"
# File name is "<proteins_path><protein>.pdb"; the prefix is used as-is.
protein_path = proteins_path + protein + ".pdb"
atoms = pcn_miner.readPDBFile(protein_path)  # read
coordinates = pcn_miner.getResidueCoordinates(atoms)
coordinates  # notebook display of the resulting array

Start Reading PDB

2022-04-19 10:56:06.247153
End Reading PDB

2022-04-19 10:56:06.297019


array([['ALA27 A',
        array([' 172.298', ' 252.181', ' 223.613'], dtype='<U8')],
       ['TYR28 A',
        array([' 174.968', ' 250.129', ' 221.763'], dtype='<U8')],
       ['THR29 A',
        array([' 177.648', ' 250.850', ' 219.220'], dtype='<U8')],
       ...,
       ['LEU1145 C',
        array([' 213.878', ' 212.413', ' 120.414'], dtype='<U8')],
       ['ASP1146 C',
        array([' 216.645', ' 214.563', ' 118.918'], dtype='<U8')],
       ['SER1147 C',
        array([' 218.576', ' 211.497', ' 117.584'], dtype='<U8')]],
      dtype=object)

In [13]:
# Build the residue-name lookup returned by pcn_miner and turn its
# (key, value) pairs into a two-column NumPy array.
dict_residue_name = pcn_miner.associateResidueName(coordinates)
residue_names = np.array([*dict_residue_name.items()])
residue_names  # notebook display of the resulting array

array([['0', 'ALA27 A'],
       ['1', 'TYR28 A'],
       ['2', 'THR29 A'],
       ...,
       ['2913', 'LEU1145 C'],
       ['2914', 'ASP1146 C'],
       ['2915', 'SER1147 C']], dtype='<U9')

In [14]:
# Compute the adjacency matrix of the protein contact network.
# NOTE(review): 4 and 8 are presumably the lower/upper distance cut-offs for a
# contact -- confirm against the pcn_miner.adjacent_matrix signature.
A = pcn_miner.adjacent_matrix(
    output_path,
    coordinates,
    protein,
    4,
    8,
)
A  # notebook display of the resulting matrix

saved distances matrix████████████████████████████████████████████████████████████████████████████████| Current progress: 100.00%
saved edge list
saved adj matrix


array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [15]:
# Run normalized hard spectral clustering on the adjacency matrix A.
# k is forwarded to norm_hsc and reused by the plotting and session cells;
# presumably it is the number of clusters -- confirm against pcn_miner.norm_hsc.
k = 14
norm_hsc_labels = pcn_miner.norm_hsc(A, k)
# Display the label array (one integer label per row of A, per the output).
norm_hsc_labels

array([12, 12, 12, ...,  4,  4,  4])

In [16]:
# Hand the computed labels and the residue names to pcn_miner.save_labels,
# tagged with the method name "norm_hsc".
pcn_miner.save_labels(
    output_path,
    norm_hsc_labels,
    residue_names,
    protein,
    method="norm_hsc",
)

0 256
1 367
2 141
3 300
4 304
5 131
6 169
7 292
8 166
9 131
10 125
11 235
12 168
13 131
256
Clusters 0:  ['GLN321 A', 'PRO322 A', 'THR323 A', 'GLU324 A', 'SER325 A', 'ILE326 A', 'VAL327 A', 'ARG328 A', 'PHE329 A', 'PRO330 A', 'ASN331 A', 'ILE332 A', 'THR333 A', 'ASN334 A', 'LEU335 A', 'CYS336 A', 'PRO337 A', 'PHE338 A', 'GLY339 A', 'GLU340 A', 'VAL341 A', 'PHE342 A', 'ASN343 A', 'ALA344 A', 'THR345 A', 'ARG346 A', 'PHE347 A', 'ALA348 A', 'SER349 A', 'VAL350 A', 'TYR351 A', 'ALA352 A', 'TRP353 A', 'ASN354 A', 'ARG355 A', 'LYS356 A', 'ARG357 A', 'ILE358 A', 'SER359 A', 'ASN360 A', 'CYS361 A', 'VAL362 A', 'ALA363 A', 'ASP364 A', 'TYR365 A', 'SER366 A', 'VAL367 A', 'LEU368 A', 'TYR369 A', 'ASN370 A', 'SER371 A', 'ALA372 A', 'SER373 A', 'PHE374 A', 'SER375 A', 'THR376 A', 'PHE377 A', 'LYS378 A', 'CYS379 A', 'TYR380 A', 'GLY381 A', 'VAL382 A', 'SER383 A', 'PRO384 A', 'THR385 A', 'LYS386 A', 'LEU387 A', 'ASN388 A', 'ASP389 A', 'LEU390 A', 'CYS391 A', 'PHE392 A', 'THR393 A', 'ASN394 A', 'VAL39

{'ALA27 A': 12,
 'TYR28 A': 12,
 'THR29 A': 12,
 'ASN30 A': 12,
 'SER31 A': 12,
 'PHE32 A': 12,
 'THR33 A': 12,
 'ARG34 A': 12,
 'GLY35 A': 12,
 'VAL36 A': 12,
 'TYR37 A': 12,
 'TYR38 A': 11,
 'PRO39 A': 11,
 'ASP40 A': 11,
 'LYS41 A': 11,
 'VAL42 A': 11,
 'PHE43 A': 11,
 'ARG44 A': 11,
 'SER45 A': 11,
 'SER46 A': 11,
 'VAL47 A': 11,
 'LEU48 A': 13,
 'HIS49 A': 13,
 'SER50 A': 13,
 'THR51 A': 13,
 'GLN52 A': 13,
 'ASP53 A': 13,
 'LEU54 A': 12,
 'PHE55 A': 12,
 'LEU56 A': 12,
 'PRO57 A': 12,
 'PHE58 A': 12,
 'PHE59 A': 12,
 'SER60 A': 12,
 'ASN61 A': 12,
 'VAL62 A': 12,
 'THR63 A': 12,
 'TRP64 A': 12,
 'PHE65 A': 12,
 'HIS66 A': 12,
 'ALA67 A': 12,
 'ILE68 A': 12,
 'HIS69 A': 12,
 'ASP80 A': 12,
 'ASN81 A': 12,
 'PRO82 A': 12,
 'VAL83 A': 12,
 'LEU84 A': 12,
 'PRO85 A': 12,
 'PHE86 A': 12,
 'ASN87 A': 12,
 'ASP88 A': 12,
 'GLY89 A': 12,
 'VAL90 A': 12,
 'TYR91 A': 12,
 'PHE92 A': 12,
 'ALA93 A': 12,
 'SER94 A': 12,
 'THR95 A': 12,
 'GLU96 A': 12,
 'LYS97 A': 12,
 'SER98 A': 12,
 'ASN99 

In [17]:
# Ask pcn_pymol_scripts to plot the "Clusters" result of the "norm_hsc" run
# for the current protein and k.
pcn_pymol_scripts.pymol_plot(
    protein_path,
    output_path,
    "Clusters",
    "norm_hsc",
    k,
)

ALA27 A chartreuse
TYR28 A chartreuse
THR29 A chartreuse
ASN30 A chartreuse
SER31 A chartreuse
PHE32 A chartreuse
THR33 A chartreuse
ARG34 A chartreuse
GLY35 A chartreuse
VAL36 A chartreuse
TYR37 A chartreuse
TYR38 A orange
PRO39 A orange
ASP40 A orange
LYS41 A orange
VAL42 A orange
PHE43 A orange
ARG44 A orange
SER45 A orange
SER46 A orange
VAL47 A orange
LEU48 A limegreen
HIS49 A limegreen
SER50 A limegreen
THR51 A limegreen
GLN52 A limegreen
ASP53 A limegreen
LEU54 A chartreuse
PHE55 A chartreuse
LEU56 A chartreuse
PRO57 A chartreuse
PHE58 A chartreuse
PHE59 A chartreuse
SER60 A chartreuse
ASN61 A chartreuse
VAL62 A chartreuse
THR63 A chartreuse
TRP64 A chartreuse
PHE65 A chartreuse
HIS66 A chartreuse
ALA67 A chartreuse
ILE68 A chartreuse
HIS69 A chartreuse
ASP80 A chartreuse
ASN81 A chartreuse
PRO82 A chartreuse
VAL83 A chartreuse
LEU84 A chartreuse
PRO85 A chartreuse
PHE86 A chartreuse
ASN87 A chartreuse
ASP88 A chartreuse
GLY89 A chartreuse
VAL90 A chartreuse
TYR91 A chartreuse
P

In [20]:
# Open the PyMOL session file of the norm_hsc clustering.
# The path is assembled with os.path.join so that it uses the separator of the
# running OS. The former single string literal hard-coded backslashes, which
# (a) produced a wrong path for the non-Windows branch below and (b) relied on
# unrecognised escape sequences that newer Python versions warn about.
# NOTE(review): the path is relative to the working directory and does not
# include output_path (empty in this notebook) -- confirm where pymol_plot
# writes the session if output_path is ever changed.
filepath = os.path.join(
    "norm_hsc",
    "Sessions",
    "{}_Clusters_norm_hsc_k{}_session.pse".format(protein, k),
)
if platform == "win32":
    # os.startfile only exists on Windows; it opens the file with the
    # application associated with its extension.
    os.startfile(filepath)
else:
    # Elsewhere, launch the `pymol` executable found on PATH with the session.
    subprocess.run(["pymol", filepath])