# Description
This notebook contains basic code to align pharmacophores (based on a recently seen paper). For starters, I will use pharmacophores from align-it (also check the reader code).

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import random
import torch
import sys

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Put the shared project code directory on the import path (only once).
CODEDIR = "../code/common"
if CODEDIR not in sys.path:
    sys.path.append(CODEDIR)
# Project-local reader for pharmacophore sets (expected to be found via CODEDIR).
from alignit import read_pharm_set

In [4]:

# constants

def seed_everything(seed=42):
    """Seed the NumPy, Python ``random`` and PyTorch global RNGs with ``seed``.

    Args:
        seed: Integer seed handed to each of the three generators.

    Returns:
        None.
    """
    for apply_seed in (np.random.seed, random.seed, torch.manual_seed):
        apply_seed(seed)

# Paths, seed and input tables shared by the rest of the notebook.
DATADIR = Path("../data")
if not DATADIR.exists():
  # Data directory is missing: fetch the archive with gdown and unpack it.
  # NOTE(review): tar unpacks into the current working directory; confirm
  # that this actually produces DATADIR ("../data").
  # DATADIR.mkdir(DATADIR)
  !gdown --id 1qnvNxd6SvhwHPxD0huTpmODB270ENs7j
  !tar -xzvf inhibitors_data.tar.gz

# Seed NumPy, `random` and torch through seed_everything.
RANDOM_SEED = 2407
seed_everything(RANDOM_SEED)

# Working directory; created if absent (its parent must already exist).
TMP_DIR = Path("../tmp")
TMP_DIR.mkdir(exist_ok=True)

# The first CSV column is used as the DataFrame index.
train_df = pd.read_csv(DATADIR / "train.csv", index_col=0)
test_df = pd.read_csv(DATADIR / "test.csv", index_col=0)

# train_df['canonical'] = train_df.Smiles.apply(smiles2canonical)
# test_df['canonical'] = test_df.Smiles.apply(smiles2canonical)

# Name of the column holding the SMILES strings; "canonical" is the
# alternative that the commented-out lines above would create.
SMILES_COL = "Smiles"  # "canonical" 
NFOLDS = 5

# Directory with the align-it pharmacophore files. It is listed twice: once
# for display and once to keep the paths for loading.
# NOTE(review): iterdir() raises FileNotFoundError if this directory is absent.
PHARM_DIR = TMP_DIR / "pharmacophores" / "alignit"
print(list(PHARM_DIR.iterdir()))
phar_files = list(PHARM_DIR.iterdir())



[PosixPath('../tmp/pharmacophores/alignit/alignit_test.phar'), PosixPath('../tmp/pharmacophores/alignit/alignit_train.phar')]


In [5]:
# Parse every pharmacophore file into a list, keyed by the file's base name.
file_data = {}
for phar_path in phar_files:
    with phar_path.open() as handle:
        file_data[phar_path.name] = list(read_pharm_set(handle))

In [6]:
# Pick the parsed pharmacophore lists by file name (keys are the .phar base names).
train_pharmacophores = file_data["alignit_train.phar"]
test_pharmacophores = file_data["alignit_test.phar"]

In [125]:
# Use the first two training pharmacophores as the pair to align.
p = train_pharmacophores[0]
q = train_pharmacophores[1]

In [126]:
from scipy.optimize import linear_sum_assignment

def get_distance(p, q):
    """Return the Euclidean distance between two points.

    Args:
        p: Object exposing numeric ``x``, ``y`` and ``z`` attributes.
        q: Object exposing numeric ``x``, ``y`` and ``z`` attributes.

    Returns:
        Square root of the summed squared per-axis differences.
    """
    squared_deltas = []
    for axis in ("x", "y", "z"):
        delta = getattr(p, axis) - getattr(q, axis)
        squared_deltas.append(delta ** 2)
    return np.sqrt(np.sum(squared_deltas))

def describe_pharmacophore_neigborhood(p):
    """Build the pairwise distance table of a pharmacophore's features.

    Entry ``[i][j]`` is ``get_distance(p.phar[i], p.phar[j])``, so row ``i``
    lists the distances from feature ``i`` to every feature, itself included.

    Args:
        p: Object whose ``phar`` attribute is the sequence of features.

    Returns:
        ``np.ndarray`` built from one row of distances per feature.
    """
    features = p.phar
    rows = [
        [get_distance(anchor, other) for other in features]
        for anchor in features
    ]
    return np.asarray(rows)


def compute_distance_matrix(features_p, features_q, p, q):
    """Compare two neighbour-distance profiles entry by entry.

    For every pair ``(k, m)`` the absolute difference between
    ``features_p[k]`` and ``features_q[m]`` is scaled by
    ``ap * aq / (ap + aq)``, where ``ap`` and ``aq`` are the ``a`` attributes
    of ``p.phar[k]`` and ``q.phar[m]``.
    NOTE(review): ``a`` is presumably the align-it Gaussian width; confirm
    against the reader.

    Args:
        features_p: Distances from an anchor feature of ``p`` to each feature
            of ``p`` (index ``k`` corresponds to ``p.phar[k]``).
        features_q: Same for ``q``.
        p: Pharmacophore object with a ``phar`` sequence.
        q: Pharmacophore object with a ``phar`` sequence.

    Returns:
        Tuple ``(distance_matrix, ntypes)`` of arrays. ``ntypes[k][m]`` is
        true when the two features share the same ``name`` and the scaled
        difference is below 1.
    """
    scaled_gaps = []
    compatible = []
    for row, dist_p in enumerate(features_p):
        gap_row, flag_row = [], []
        for col, dist_q in enumerate(features_q):
            alpha_p = p.phar[row].a
            alpha_q = q.phar[col].a
            gap = np.abs(dist_p - dist_q) * alpha_p * alpha_q / (alpha_p + alpha_q)
            gap_row.append(gap)
            same_name = p.phar[row].name == q.phar[col].name
            flag_row.append(same_name and (gap < 1))
        compatible.append(flag_row)
        scaled_gaps.append(gap_row)
    return np.asarray(scaled_gaps), np.asarray(compatible)


def get_neighborhood_diff(p, q):
    """Score how similar the neighbourhoods of each feature pair of p and q are.

    For every feature ``i`` of ``p`` and ``j`` of ``q`` with the same
    ``name``, the distance profiles of the two features (rows of the pairwise
    distance tables) are compared with ``compute_distance_matrix``. The
    compatible entries are then matched one-to-one at minimum total cost, and
    the summed cost of the matched entries is divided by
    ``max(len(p.phar), len(q.phar))``.

    The assignment is solved on a finite matrix: incompatible entries get a
    penalty larger than the sum of all compatible costs instead of ``inf``.
    With ``inf`` entries ``linear_sum_assignment`` raises ``ValueError`` when
    no complete matching exists (e.g. two rows that can only reach the same
    single column). With the penalty it returns the largest possible set of
    compatible matches at minimum cost; penalised pairs are excluded from the
    score. When a complete matching exists the score is the same as with
    ``inf`` masking.

    Args:
        p: Pharmacophore object with a ``phar`` sequence of features.
        q: Pharmacophore object with a ``phar`` sequence of features.

    Returns:
        ``np.ndarray`` with one row per feature of ``p`` and one column per
        feature of ``q``; entries are ``np.inf`` where the feature names differ.
    """
    distances_p = describe_pharmacophore_neigborhood(p)
    distances_q = describe_pharmacophore_neigborhood(q)
    size = max(len(p.phar), len(q.phar))
    table = []
    for i, prop_p in enumerate(p.phar):
        line = []
        for j, prop_q in enumerate(q.phar):
            if prop_p.name != prop_q.name:
                # Features of different types can never be anchors for each other.
                line.append(np.inf)
                continue
            distances, ntypes = compute_distance_matrix(distances_p[i], distances_q[j], p, q)
            compatible = np.asarray(ntypes, dtype=bool)
            # Drop rows/columns that have no compatible partner at all.
            reachable_rows = np.where(compatible.any(1))[0]
            reachable_cols = np.where(compatible.any(0))[0]
            dm = distances[reachable_rows][:, reachable_cols]
            allowed = compatible[reachable_rows][:, reachable_cols]
            # A penalty above the total of all compatible costs makes the solver
            # prefer any real match over a forbidden one.
            penalty = dm[allowed].sum() + 1.0
            row_ind, col_ind = linear_sum_assignment(np.where(allowed, dm, penalty))
            matched = allowed[row_ind, col_ind]
            min_cost = dm[row_ind, col_ind][matched].sum() / size
            line.append(min_cost)
        table.append(line)
    return np.asarray(table)

    
def build_neighborhood_matrix(p, q):
    """Match features of ``p`` to features of ``q`` by neighbourhood similarity.

    The cost table from ``get_neighborhood_diff`` is ``inf`` for pairs that
    cannot be matched. The one-to-one assignment is solved on a finite copy
    where those entries carry a penalty larger than the sum of all finite
    costs. With ``inf`` entries ``linear_sum_assignment`` raises
    ``ValueError`` whenever no complete matching exists, for example when the
    two pharmacophores contain different numbers of features of a given type.
    Penalised pairs are dropped from the result, so only genuine matches are
    returned.

    Args:
        p: Pharmacophore object with a ``phar`` sequence of features.
        q: Pharmacophore object with a ``phar`` sequence of features.

    Returns:
        Tuple ``(p_indices, q_indices, costs)`` of equal-length arrays: the
        matched feature indices into ``p.phar`` and ``q.phar`` and the cost of
        each match, ordered by ascending cost. All matches are returned (the
        result is not truncated). The arrays are empty when nothing can be
        matched.
    """
    table = get_neighborhood_diff(p, q)
    if table.ndim != 2 or table.size == 0:
        # No features on at least one side: nothing to match.
        return np.empty(0, dtype=int), np.empty(0, dtype=int), np.empty(0, dtype=float)

    feasible = table < np.inf
    reachable_rows = np.where(feasible.any(1))[0]
    reachable_cols = np.where(feasible.any(0))[0]
    subtable = table[reachable_rows][:, reachable_cols]
    allowed = feasible[reachable_rows][:, reachable_cols]

    # A penalty above the total of all finite costs makes the solver prefer
    # any real match over a forbidden one.
    penalty = subtable[allowed].sum() + 1.0
    row_ind, col_ind = linear_sum_assignment(np.where(allowed, subtable, penalty))

    # Discard the pairs the solver was forced to place on forbidden entries.
    keep = allowed[row_ind, col_ind]
    row_ind, col_ind = row_ind[keep], col_ind[keep]

    # Best (cheapest) matches first.
    order = np.argsort(subtable[row_ind, col_ind])
    row_ind, col_ind = row_ind[order], col_ind[order]
    return reachable_rows[row_ind], reachable_cols[col_ind], subtable[row_ind, col_ind]

# Match features between p and q and collect the matched features in order.
p_ids, q_ids, costs = build_neighborhood_matrix(p, q)
p_features = [p.phar[idx] for idx in p_ids]
q_features = [q.phar[idx] for idx in q_ids]

# Stack the matched coordinates: one row per feature, columns are x, y, z.
p_matrix = np.asarray([[feat.x, feat.y, feat.z] for feat in p_features])
q_matrix = np.asarray([[feat.x, feat.y, feat.z] for feat in q_features])

# Root-mean-square coordinate difference (mean over all x/y/z entries)
# before any centering or rotation.
print(np.sqrt(((p_matrix - q_matrix) ** 2).mean()))

# Centre both point sets on their own centroids (in place).
p_centroid = p_matrix.mean(axis=0)
q_centroid = q_matrix.mean(axis=0)
p_matrix -= p_centroid
q_matrix -= q_centroid


1.8669588904548011


In [127]:
# Cross-covariance of the centred matched coordinates (columns are x, y, z, so this is 3x3).
covariance_matrix = np.dot(p_matrix.T, q_matrix)

In [128]:
# SVD of the cross-covariance; u and vh are combined into the rotation below.
u, s, vh = np.linalg.svd(covariance_matrix)

In [141]:
# Kabsch reflection guard: det(u @ vh) is -1 when the unconstrained optimum is
# a reflection, so the last axis is flipped to keep a proper rotation.
sign = np.sign(np.linalg.det(u @ vh))
# `s` no longer holds the singular values; it becomes the diagonal (1, 1, sign).
s = np.array([1, 1, sign])
# u * s scales the columns of u, i.e. u @ diag(s). The result is meant to be
# applied to row vectors: p_matrix @ rot_matrix.
rot_matrix = (u * s) @ vh

In [142]:
# Centred coordinates of the matched q features (the alignment target).
q_matrix 

array([[-0.29063933,  1.727593  , -0.11276967],
       [-2.62808733,  0.046882  ,  0.19367033],
       [ 2.91872667, -1.774475  , -0.08090067]])

In [143]:
# RMS coordinate difference after rotating the centred p features onto q
# (mean over all x/y/z entries, not per-point).
np.sqrt(np.mean((np.dot(p_matrix, rot_matrix) - q_matrix)**2))

1.2272055792651013

In [144]:
# from scipy.spatial.transform import Rotation # align_vectors
# Baseline for comparison: RMS coordinate difference of the centred sets with
# no rotation applied.
np.sqrt(np.mean((p_matrix - q_matrix)**2))

1.6694651833343161