In [1]:
# Locations of the Surfacome inputs/outputs used by this notebook, all under one root.
_surfacome_root = '/work/lpdi/users/khakzad/Surfacome'

# Serialized prediction dataset (read with torch.load in a later cell).
prediction_data = _surfacome_root + '/database/dataset_surfaces.pt'
# Directory holding the "<acc>_cropped.pdb" structure files.
pdb_data = _surfacome_root + '/pdbs/target_list_1_chopped/'
# Point-cloud directory; not referenced by the cells visible in this notebook.
pointcloud_results = _surfacome_root + '/pdbs/target_list_1_pointcloud/'

In [2]:
import nglview as ng
import ipywidgets as widgets
import numpy as np
import torch
import os
from nglview.color import ColormakerRegistry
from pdbparser.pdbparser import pdbparser



In [3]:
# Deserialize the prediction dataset, mapping every stored tensor onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load files from a trusted source.
data = torch.load(prediction_data, map_location='cpu')

## Test on all proteins

In [6]:
import sys
sys.path.append("/work/lpdi/users/khakzad/Surfacome/")
from data import Protein
from tqdm import tqdm
from IPython.utils import io

# Build one summary record per protein: sequence, SASA, point-to-residue map,
# clustering output, and per-cluster area / residue composition / centroid residue.
MAX_PROTEINS = 10            # only this many entries of `data` are processed
NUM_POINTS_THRESHOLD = 200   # clusters with this many surface points or fewer are skipped

metrics = []
for protein_idx, item in enumerate(data):
    if protein_idx >= MAX_PROTEINS:
        # Stop here instead of idling through the rest of the dataset.
        break

    acc = item['acc']
    coords = item['coords']
    embeddings = item['embeddings']
    labels = item['label']

    target_pdb = os.path.join(pdb_data, acc + "_cropped.pdb")

    # Anything printed while building and querying the Protein is captured
    # rather than shown in the cell output.
    with io.capture_output() as captured:
        # NOTE(review): 0.7 and 'cpu' are passed positionally; presumably a
        # threshold and a device, confirm against data.Protein.
        protein = Protein(acc, coords, embeddings, labels, target_pdb, 0.7, 'cpu')

        sequence = protein.get_sequence()
        sasa = protein.get_sasa()
        point2residue, residue2coords, residue_number = protein.point2residue()
        no_binding_sites, filtered_site_points, normalized_labels, cluster_for_all_points = protein.clustering_labels()

    ## Calculating and adding the area based on above information
    AREA_A2 = []
    AREA_P = []
    SEQ_COMP = []
    centroids = []
    # Cluster ids 1 .. no_binding_sites-1 are examined.
    # NOTE(review): presumably id 0 marks points outside any cluster; confirm
    # in Protein.clustering_labels.
    for cluster_id in range(1, no_binding_sites):
        cluster_points = torch.where(cluster_for_all_points == cluster_id)[0]
        if len(cluster_points) <= NUM_POINTS_THRESHOLD:
            continue

        # Unique residue indices touched by this cluster, and their coordinates.
        INDEX = torch.unique(point2residue[cluster_points])
        clustercoords = residue2coords[INDEX]

        # calculate the centroid (mean coordinate of the cluster's residues)
        centroid_of_cluster = np.mean(clustercoords.numpy(), axis=0)
        centroid = torch.as_tensor(centroid_of_cluster).unsqueeze(0)
        dist = torch.cdist(clustercoords, centroid, p=2)
        # the cluster residue coordinate closest to the centroid
        centroid_point = clustercoords[torch.argmin(dist)]
        # the residue (over all residues) closest to that coordinate
        distances_allvsall = torch.norm(centroid_point - residue2coords, dim=-1)
        closest_residue = torch.argmin(distances_allvsall, dim=0)
        centroids.append(residue_number[closest_residue.item()])

        area = 0
        seq_composition = []
        for idx in INDEX:
            # NOTE(review): SASA is read at idx + 1 while residue_number and
            # sequence are read at idx; confirm the offset get_sasa() expects.
            area += sasa[idx + 1]
            seq_composition.append(f'{residue_number[idx]}_{sequence[idx]}')

        AREA_A2.append(area)
        AREA_P.append(len(cluster_points))
        # Key the composition by the same cluster id used to select the points.
        SEQ_COMP.append({f"cluster_{cluster_id}": seq_composition})

    print(acc, 'centroids: ', centroids)
    metrics.append({'acc': acc,
                    'sequence': sequence,
                    'sasa': sasa,
                    'point2residue': point2residue.numpy(),
                    'no_binding_sites': no_binding_sites,
                    'filtered_site_points': filtered_site_points.numpy(),
                    'normalized_labels': normalized_labels,
                    'cluster_for_all_points': cluster_for_all_points.numpy(),
                    'area_angstrom2': AREA_A2,
                    'area_points': AREA_P,
                    'sequence_compositions_all_clusters': SEQ_COMP,
                    'centroids': centroids})

    #     tt = cluster_for_all_points.numpy()
    #     norm_label_for_presentation = (tt-min(tt))/(max(tt)-min(tt))


P26992 centroids:  [201, 135, 86]
P15509 centroids:  [87, 338]
Q99062 centroids:  [636, 616, 507, 409, 262]
P19235 centroids:  [296, 260, 153, 38]
P10912 centroids:  [213, 59, 7, 277]
Q14626 centroids:  [379, 144, 254]
P42701 centroids:  [561, 347, 280, 107]
Q99665 centroids:  [637, 498, 411, 253, 15, 119]
P78552 centroids:  [385, 359, 276, 89, 45]
Q14627 centroids:  [97, 146, 352]


In [None]:
# Optional export, left disabled: writes `metrics` to result_metrics.csv, the file
# that later cells read back with pd.read_csv. Uncomment to (re)generate it.
# import pandas as pd
# pd.DataFrame(metrics).to_csv('/work/lpdi/users/khakzad/Surfacome/database/result_metrics.csv', index=False)

## How many binding sites does each protein have, and what is their area?

In [None]:
import torch
# Recompute, from the stored `metrics` records, the area of every sufficiently
# large cluster of each protein.
NUM_POINTS_THRESHOLD = 200  # clusters with this many points or fewer are ignored
metrics_area = []
for eachProtein in metrics:
    AREA_A2 = []
    AREA_P = []

    accP = eachProtein['acc']
    # `metrics` stores these two as NumPy arrays, but torch.where / torch.unique
    # only accept tensors, so wrap them first (a no-op if they already are tensors).
    XX = torch.as_tensor(eachProtein['cluster_for_all_points'])
    YY = torch.as_tensor(eachProtein['point2residue'])
    SASA = eachProtein['sasa']

    NUM_CLUSTERS = eachProtein['no_binding_sites']
    # Cluster ids 1 .. NUM_CLUSTERS-1 are examined.
    for cluster_id in range(1, NUM_CLUSTERS):
        cluster_points = torch.where(XX == cluster_id)[0]
        if len(cluster_points) <= NUM_POINTS_THRESHOLD:
            continue

        # Unique residue indices touched by this cluster.
        INDEX = torch.unique(YY[cluster_points])
        area = 0
        for idx in INDEX:
            # NOTE(review): SASA is read at idx + 1; confirm the offset is intended.
            area += SASA[idx+1]

        print("area by summing up SASA:", area, "A^2") 
        print("area by considering number of points:", len(cluster_points))

        AREA_A2.append(area)
        AREA_P.append(len(cluster_points))

    metrics_area.append({'acc': accP,
                        'area_angstrom2': AREA_A2,
                        'area_points':AREA_P})

In [None]:
# Display the list of per-protein area records (acc, area_angstrom2, area_points).
metrics_area

In [None]:
# pandas is imported here because no earlier active cell imports it; without this
# the export raises NameError on a fresh kernel.
import pandas as pd

# Write the per-protein cluster areas to CSV, one row per protein, without the index column.
pd.DataFrame(metrics_area).to_csv('/work/lpdi/users/khakzad/Surfacome/database/result_metrics_area.csv', index=False)

## Loading the file from drive and making some plots

In [None]:
import pandas as pd
# Read the saved metrics table back from disk.
# NOTE(review): in this notebook the file is only written by the commented-out
# export cell; confirm it exists before running.
result_area = pd.read_csv(
    filepath_or_buffer='/work/lpdi/users/khakzad/Surfacome/database/result_metrics.csv'
)

In [None]:
# Display the DataFrame read from result_metrics.csv.
result_area

In [None]:
# `ast` is imported here because the only other import of it sits in a later cell.
import ast


def _parse_list_cell(value):
    """Parse a CSV cell such as "[1.0, 2.0]" into a Python object; return non-string values unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# read_csv returns the list-valued columns as strings; turn them back into lists.
# Already-parsed values pass through untouched, so re-running this cell is harmless.
result_area['area_angstrom2'] = result_area['area_angstrom2'].apply(_parse_list_cell)
result_area['area_points'] = result_area['area_points'].apply(_parse_list_cell)

In [None]:
## Keep only the clusters whose point count exceeds a minimum size; smaller ones are dropped.
## (A size threshold is now applied in the main analysis as well.)
import ast

all_area_angstrom = []
all_area_points = []
all_area_BI_no = []
for index, row in result_area.iterrows():
    # Positions of this protein's clusters that pass the size filter.
    kept = [i for i, item in enumerate(row['area_points']) if item > 100]
    all_area_points.extend(row['area_points'][i] for i in kept)
    all_area_angstrom.extend(row['area_angstrom2'][i] for i in kept)
    # Number of clusters retained for this protein.
    all_area_BI_no.append(len(kept))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter

# Histogram of the retained cluster areas, using 50 evenly spaced bin edges on [0, 10000].
fig, axs = plt.subplots(nrows=1, ncols=1, sharey=True, tight_layout=True)
axs.hist(all_area_angstrom, bins=np.linspace(start=0, stop=10000, num=50))

In [None]:
# Distribution of the number of retained binding sites per protein.
# The counts are integers, so use one unit-width bin centred on each value rather
# than bins=len(all_area_BI_no), which ties the bin count to the number of proteins
# and raises ValueError when the list is empty (bins=0).
max_sites = max(all_area_BI_no, default=0)
site_bin_edges = np.arange(-0.5, max_sites + 1.5, 1.0)

fig, axs = plt.subplots(1, 1, sharey=True, tight_layout=True)
axs.set_xlabel('binding sites per protein')
axs.set_ylabel('number of proteins')
axs.hist(all_area_BI_no, bins=site_bin_edges)

## Representing some clustering results in the form of pdb

In [None]:
import pandas as pd
# Load the saved per-protein metrics table into a DataFrame.
result_metrics = pd.read_csv(
    filepath_or_buffer='/work/lpdi/users/khakzad/Surfacome/database/result_metrics.csv'
)

In [None]:
# Display the DataFrame read from result_metrics.csv.
result_metrics

In [None]:
import csv
 
filename = '/work/lpdi/users/khakzad/Surfacome/database/result_metrics.csv'

# Open the CSV in a context manager so the handle is closed once the loop ends, and
# bind it to its own name instead of `data`, which holds the dataset loaded with
# torch.load near the top of the notebook.
# newline='' is what the csv module asks for when it is handed a file object.
with open(filename, 'r', newline='') as metrics_file:
    metrics_data = csv.DictReader(metrics_file)
    for item in metrics_data:
        # Only the accession is read; after the loop `acc` holds the last row's value.
        acc = item['acc']

In [None]:
# Scratch check: joining two lists yields a new list with a's items followed by b's.
a = list(range(1, 6))
b = list(range(6, 11))
c = [*a, *b]
print(c)

In [None]:
# Display the concatenated list.
c