In [1]:
#!/usr/bin/env python

import sys
import time
from pathlib import Path

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [2]:
start_time = time.time()
# Load the per-region electrostatic potential of each PDB structure. These
# vectors are the input for the pairwise similarity matrices computed below
# (which are intended to be plotted as matrix heatmaps).

## Load the data ##

pdbs = []        # PDB IDs, in the order their 'region_1' row is encountered
potentials = {}  # PDB ID -> np.array of the potentials collected for it
data_folder = Path('data')

# The context manager guarantees the file is closed even when a row fails
# to parse (the previous explicit close() was skipped on any exception).
with open(data_folder / 'all_spike_strs_regions_pot.csv', 'r') as csv_file:
    for line in csv_file:
        fields = line.split(',')
        # Only three-column rows are data; the header row starts with 'PDB ID'.
        if len(fields) != 3 or fields[0] == 'PDB ID':
            continue
        pdb_id, region_name, raw_value = fields

        if region_name == 'region_1':
            # 'region_1' starts a fresh record for a structure.
            pdbs.append(pdb_id)
            temp_potential = [float(raw_value)]
        else:
            # Any other region extends the record currently being built.
            # NOTE(review): assumes the rows of one structure are contiguous
            # and start with 'region_1' -- confirm against the CSV producer.
            temp_potential.append(float(raw_value))

        # 'region_21' completes the record, so store it under its PDB ID.
        if region_name == 'region_21':
            potentials[pdb_id] = np.array(temp_potential)

In [3]:
import pandas as pd
# A dict of arrays becomes one column per key (PDB ID); transposing gives one
# row per PDB ID and one column per position in its potential vector.
df = pd.DataFrame(potentials).transpose()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
6nb3,-37.007494,38.017644,0.000000,-64.447512,13.548643,-24.700141,-61.442738,39.701019,-9.367816,7.047386,...,0.000000,0.000000,-14.829590,-4.907621,-24.906685,20.196743,91.989530,-92.915405,-93.861211,-29.920780
6nb4,-39.309666,4.351932,-8.529892,-58.231302,4.455779,-24.904879,-70.019377,-15.160095,-88.910999,13.506284,...,0.000000,0.000000,-15.353076,2.676513,6.772544,41.345764,80.547941,-34.812359,-23.880191,-7.523575
6nb7,29.718320,17.444863,17.196672,50.635741,23.528359,16.992476,0.000000,-10.101497,3.839521,-4.793793,...,43.327603,-9.240390,-7.107809,0.339340,-3.329914,23.550809,0.000000,3.704321,9.623276,12.266870
6xcn,-11.623877,-11.438894,26.102622,28.683797,-15.384182,-1.396089,-24.297922,0.000000,2.181448,20.503412,...,1.221186,103.059929,11.172447,-43.979687,29.796868,-125.641598,-2.617514,4.560883,-23.991658,14.585877
6xe1,-2.818537,-59.844341,67.660837,98.990223,-37.931578,43.758384,5.777396,0.000000,14.990841,64.889319,...,-28.920803,34.941593,33.933554,41.089594,-81.166401,-93.837709,0.502137,-24.445485,59.725633,33.060191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7nd9,-1.439270,18.180753,45.639846,63.558292,-38.279262,35.410320,27.748688,0.000000,31.648613,-4.784029,...,-5.558738,57.336610,39.456045,-4.649641,12.673177,17.701092,7.585118,55.644950,21.406670,17.830524
7ndb,8.309010,-13.676192,5.470143,56.340189,-17.502078,-4.297498,41.588344,0.000000,29.713759,8.546395,...,-2.746451,-1.227531,-3.267499,-50.508540,19.557913,-81.474540,13.407211,12.373330,23.544920,10.555678
7ndc,67.673109,22.182180,1.127871,47.733260,-16.856332,4.768456,-15.643784,0.000000,40.157224,52.942158,...,9.951634,-63.130610,25.638484,29.779071,12.699822,1.805031,1.068793,-37.986890,43.624990,53.290844
7ndd,38.132149,10.315195,1.608883,91.226035,-34.970719,24.451178,4.902896,0.000000,47.742092,27.270429,...,-7.017262,-22.836347,26.832999,60.132157,30.846530,-23.816268,12.003750,-10.547584,27.930232,17.292850


In [4]:
def lower_triangle(df):
    """Keep only the entries strictly below the main diagonal of ``df``.

    Applied to a symmetric pairwise matrix, this leaves each unordered pair
    exactly once and removes the self-comparisons on the diagonal.

    Parameters
    ----------
    df : pandas.DataFrame
        Two-dimensional table, e.g. a square pairwise distance matrix.

    Returns
    -------
    pandas.DataFrame
        A frame with the same shape, index and columns as ``df`` in which
        every entry on or above the diagonal is replaced by NaN. ``df``
        itself is not modified.
    """
    # k=-1 shifts the triangle one step below the diagonal, excluding it.
    # The boolean array has the same shape as df, so it is applied
    # position by position.
    below_diagonal = np.tril(np.ones(df.shape, dtype=bool), k=-1)
    return df.where(below_diagonal)

In [5]:
# Distance metrics to compare ('jaccard' is left out because it is meant for binary data).
metrics = ['cosine', 'euclidean', 'l2', 'manhattan', 'l1', 'hamming', 'chebyshev']

dict_dist = {}
for _metric in metrics:
    # Square distance matrix between the rows of df, labelled on both axes
    # with df's index.
    square = pd.DataFrame(
        pairwise_distances(X=df, metric=_metric),
        index=df.index,
        columns=df.index,
    )
    # Mask to the strict lower triangle, then stack into a long Series keyed
    # by the (row label, column label) pair.
    dict_dist[_metric] = lower_triangle(square).stack()

# One column per metric, rows aligned on the shared pair index.
df_metrics = pd.DataFrame(dict_dist)
df_metrics

Unnamed: 0,Unnamed: 1,cosine,euclidean,l2,manhattan,l1,hamming,chebyshev
6nb4,6nb3,0.288190,145.602781,145.602781,433.165570,433.165570,0.904762,79.543183
6nb7,6nb3,1.295634,249.215516,249.215516,882.012713,882.012713,1.000000,115.083253
6nb7,6nb4,1.287162,218.937090,218.937090,777.793951,777.793951,1.000000,108.867043
6xcn,6nb3,1.048786,283.285662,283.285662,1051.969903,1051.969903,1.000000,145.838341
6xcn,6nb4,1.132652,270.118347,270.118347,906.173982,906.173982,1.000000,166.987362
...,...,...,...,...,...,...,...,...
7ntc,7nd8,0.531588,197.639670,197.639670,609.155619,609.155619,0.952381,99.833570
7ntc,7nd9,0.318854,158.510661,158.510661,480.532655,480.532655,0.952381,100.327947
7ntc,7ndb,0.738062,225.223504,225.223504,676.355432,676.355432,0.952381,119.389193
7ntc,7ndc,0.822989,245.366533,245.366533,792.388503,792.388503,0.952381,150.770188
