# THIS NOTEBOOK CONTAINS AN EXAMPLE OF AN EMBEDDING + CLUSTERING ALGORITHM, IN THIS CASE LAPLACIAN EIGENMAPS + K-MEANS, APPLIED TO A PROTEIN CONTACT NETWORK OF THE SARS-COV-2 SPIKE PROTEIN

In [10]:
from pcn_miner import pcn_miner, pcn_pymol_scripts
import numpy as np
from sys import platform
import subprocess
import networkx as nx
import os

In [11]:
# Path prefixes used by the notebook.
# output_path is handed to the pcn_miner / pcn_pymol_scripts calls in later cells.
# NOTE(review): the empty string presumably means "relative to the current
# working directory" — confirm against pcn_miner.
output_path = ""
# Joining with an empty last component appends the platform's own separator,
# so this is still "Adj\\" on Windows and becomes "Adj/" on Linux/macOS
# instead of a hard-coded backslash.
adj_path = os.path.join("Adj", "")

In [12]:
# Load the structure: parse the PDB file of the chosen entry, then derive the
# per-residue coordinate records from the parsed atoms.
# The PDB file is looked up next to the notebook as "<protein>.pdb".
protein = "6vxx"
protein_path = f"{protein}.pdb"
atoms = pcn_miner.readPDBFile(protein_path)
coordinates = pcn_miner.getResidueCoordinates(atoms)
# Echo the array of (residue label, xyz) records as the cell output.
coordinates

Start Reading PDB

2022-04-19 10:57:16.728825
End Reading PDB

2022-04-19 10:57:16.785673


array([['ALA27 A',
        array([' 172.298', ' 252.181', ' 223.613'], dtype='<U8')],
       ['TYR28 A',
        array([' 174.968', ' 250.129', ' 221.763'], dtype='<U8')],
       ['THR29 A',
        array([' 177.648', ' 250.850', ' 219.220'], dtype='<U8')],
       ...,
       ['LEU1145 C',
        array([' 213.878', ' 212.413', ' 120.414'], dtype='<U8')],
       ['ASP1146 C',
        array([' 216.645', ' 214.563', ' 118.918'], dtype='<U8')],
       ['SER1147 C',
        array([' 218.576', ' 211.497', ' 117.584'], dtype='<U8')]],
      dtype=object)

In [13]:
# Build the residue mapping from the coordinate records, then flatten it into
# a two-column array of [key, value] rows; this array is passed to
# save_labels in a later cell.
dict_residue_name = pcn_miner.associateResidueName(coordinates)
residue_names = np.array(
    [[key, value] for key, value in dict_residue_name.items()]
)
residue_names

array([['0', 'ALA27 A'],
       ['1', 'TYR28 A'],
       ['2', 'THR29 A'],
       ...,
       ['2913', 'LEU1145 C'],
       ['2914', 'ASP1146 C'],
       ['2915', 'SER1147 C']], dtype='<U9')

In [14]:
# Compute the adjacency matrix of the contact network from the residue
# coordinates. According to the recorded output, the call also saves the
# distance matrix, the edge list and the adjacency matrix.
# NOTE(review): 4 and 8 are presumably the lower/upper distance cut-offs for a
# contact — confirm against pcn_miner.adjacent_matrix.
A = pcn_miner.adjacent_matrix(
    output_path,
    coordinates,
    protein,
    4,
    8,
)
A

saved distances matrix████████████████████████████████████████████████████████████████████████████████| Current progress: 100.00%
saved edge list
saved adj matrix


array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [15]:
# Embed the network and cluster the embedding; the result holds one cluster
# label per row of A.
# k is the number of clusters (the saved labels run from 0 to 13).
# NOTE(review): d is presumably the embedding dimension — confirm.
# NOTE(review): no random seed is set in this notebook; if the clustering is
# stochastic, the labels may differ between runs — confirm.
k, d = 14, 128
lem_km_labels = pcn_miner.kmeans_laplacianeigenmaps(
    A,
    k,
    d,
)
lem_km_labels

Laplacian matrix recon. error (low rank): 57.350107


array([ 4,  4,  3, ..., 12, 12, 12])

In [16]:
# Save the cluster labels for this protein / algorithm / dimension.
# The call prints the size and members of each cluster, and its return value
# (a residue -> label dict) is echoed as the cell output, which is very long.
pcn_miner.save_labels(
    output_path,
    lem_km_labels,
    residue_names,
    protein,
    "kmeans_laplacianeigenmaps",
    d,
)

0 406
1 186
2 132
3 98
4 136
5 425
6 169
7 181
8 110
9 97
10 229
11 155
12 423
13 169
406
ClustersEmbeddings 0:  ['ARG319 A', 'VAL320 A', 'SER591 A', 'PHE592 A', 'THR618 A', 'GLU619 A', 'SER689 A', 'LEU699 A', 'GLY700 A', 'ALA701 A', 'GLU702 A', 'ASN703 A', 'SER704 A', 'VAL705 A', 'ALA706 A', 'TYR707 A', 'SER708 A', 'ASN709 A', 'ASN710 A', 'SER711 A', 'ILE712 A', 'ALA713 A', 'ILE714 A', 'PRO715 A', 'THR716 A', 'ASN717 A', 'THR961 A', 'PHE970 A', 'GLY971 A', 'ILE973 A', 'ARG1039 A', 'VAL1040 A', 'ASP1041 A', 'PHE1042 A', 'LYS1045 A', 'PRO1069 A', 'ALA1070 A', 'GLN1071 A', 'GLU1072 A', 'LYS1073 A', 'ASN1074 A', 'PHE1075 A', 'THR1076 A', 'THR1077 A', 'ALA1078 A', 'PRO1079 A', 'ALA1080 A', 'ILE1081 A', 'CYS1082 A', 'HIS1083 A', 'ASP1084 A', 'GLY1085 A', 'LYS1086 A', 'ALA1087 A', 'HIS1088 A', 'PHE1089 A', 'PRO1090 A', 'ARG1091 A', 'GLU1092 A', 'GLY1093 A', 'VAL1094 A', 'PHE1095 A', 'VAL1096 A', 'SER1097 A', 'ASN1098 A', 'GLY1099 A', 'THR1100 A', 'HIS1101 A', 'TRP1102 A', 'PHE1103 A', 'VAL11

{'ALA27 A': 4,
 'TYR28 A': 4,
 'THR29 A': 3,
 'ASN30 A': 4,
 'SER31 A': 3,
 'PHE32 A': 4,
 'THR33 A': 4,
 'ARG34 A': 3,
 'GLY35 A': 3,
 'VAL36 A': 4,
 'TYR37 A': 4,
 'TYR38 A': 4,
 'PRO39 A': 7,
 'ASP40 A': 11,
 'LYS41 A': 4,
 'VAL42 A': 7,
 'PHE43 A': 7,
 'ARG44 A': 7,
 'SER45 A': 4,
 'SER46 A': 4,
 'VAL47 A': 7,
 'LEU48 A': 4,
 'HIS49 A': 7,
 'SER50 A': 4,
 'THR51 A': 4,
 'GLN52 A': 4,
 'ASP53 A': 4,
 'LEU54 A': 4,
 'PHE55 A': 4,
 'LEU56 A': 3,
 'PRO57 A': 4,
 'PHE58 A': 4,
 'PHE59 A': 4,
 'SER60 A': 4,
 'ASN61 A': 4,
 'VAL62 A': 3,
 'THR63 A': 3,
 'TRP64 A': 3,
 'PHE65 A': 3,
 'HIS66 A': 4,
 'ALA67 A': 4,
 'ILE68 A': 7,
 'HIS69 A': 11,
 'ASP80 A': 4,
 'ASN81 A': 4,
 'PRO82 A': 4,
 'VAL83 A': 3,
 'LEU84 A': 4,
 'PRO85 A': 4,
 'PHE86 A': 3,
 'ASN87 A': 4,
 'ASP88 A': 4,
 'GLY89 A': 3,
 'VAL90 A': 3,
 'TYR91 A': 3,
 'PHE92 A': 3,
 'ALA93 A': 3,
 'SER94 A': 3,
 'THR95 A': 3,
 'GLU96 A': 3,
 'LYS97 A': 3,
 'SER98 A': 4,
 'ASN99 A': 4,
 'ILE100 A': 3,
 'ILE101 A': 3,
 'ARG102 A': 3,
 'GLY

In [17]:
# Colour the structure by cluster in PyMOL; the call prints one
# "<residue> <colour>" line per residue.
# NOTE(review): presumably this also writes the .pse session that the next
# cell opens — confirm against pcn_pymol_scripts.
pcn_pymol_scripts.pymol_plot_embeddings(
    protein_path,
    output_path,
    "ClustersEmbeddings",
    "kmeans_laplacianeigenmaps",
    k,
    d,
)

ALA27 A yellow
TYR28 A yellow
THR29 A cyan
ASN30 A yellow
SER31 A cyan
PHE32 A yellow
THR33 A yellow
ARG34 A cyan
GLY35 A cyan
VAL36 A yellow
TYR37 A yellow
TYR38 A yellow
PRO39 A salmon
ASP40 A orange
LYS41 A yellow
VAL42 A salmon
PHE43 A salmon
ARG44 A salmon
SER45 A yellow
SER46 A yellow
VAL47 A salmon
LEU48 A yellow
HIS49 A salmon
SER50 A yellow
THR51 A yellow
GLN52 A yellow
ASP53 A yellow
LEU54 A yellow
PHE55 A yellow
LEU56 A cyan
PRO57 A yellow
PHE58 A yellow
PHE59 A yellow
SER60 A yellow
ASN61 A yellow
VAL62 A cyan
THR63 A cyan
TRP64 A cyan
PHE65 A cyan
HIS66 A yellow
ALA67 A yellow
ILE68 A salmon
HIS69 A orange
ASP80 A yellow
ASN81 A yellow
PRO82 A yellow
VAL83 A cyan
LEU84 A yellow
PRO85 A yellow
PHE86 A cyan
ASN87 A yellow
ASP88 A yellow
GLY89 A cyan
VAL90 A cyan
TYR91 A cyan
PHE92 A cyan
ALA93 A cyan
SER94 A cyan
THR95 A cyan
GLU96 A cyan
LYS97 A cyan
SER98 A yellow
ASN99 A yellow
ILE100 A cyan
ILE101 A cyan
ARG102 A cyan
GLY103 A cyan
TRP104 A cyan
ILE105 A cyan
PHE106 A cy

In [18]:
# Open the PyMOL session file for this protein / d / k combination.
# The path is assembled with os.path.join so it is valid on every platform:
# the previous literal used backslashes, which are separators only on Windows
# (and "\S" / "\{" are invalid string escapes that newer Python versions warn
# about), so the non-Windows branch was handed a path that does not exist.
# NOTE(review): assumes the session is stored under
# <output_path>/kmeans_laplacianeigenmaps/Sessions/ — confirm against
# pcn_pymol_scripts.pymol_plot_embeddings.
session_name = "{}_kmeans_laplacianeigenmaps_d{}_k{}_session.pse".format(protein, d, k)
filepath = os.path.join(output_path, "kmeans_laplacianeigenmaps", "Sessions", session_name)
if platform == "win32":
    # os.startfile exists only on Windows; it opens the file with the
    # application associated with its extension.
    os.startfile(filepath)
else:
    # Elsewhere, launch the `pymol` executable (must be on PATH) on the session.
    subprocess.run(["pymol", filepath])