In [5]:
%matplotlib inline

import os

import torch
import gpytorch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import linalg
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import PathToSubmol, FindAtomEnvironmentOfRadiusN, MolToSmiles
from rdkit.Chem.AtomPairs import Pairs
from MDAnalysis.analysis.rms import rmsd
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from map4 import MAP4Calculator
import tmap as tm


from mobius import ForceField, VirtualTarget

In [2]:
# Read the CSV file into a pandas DataFrame bound to `mhci`.
# The path is relative, so it resolves against the notebook's working directory.
mhci = pd.read_csv('binding_data_2013/bdata.20130222.mhci.csv')

In [3]:
def fingerprint(sequences, radius=10, nBits=4096):
    """Turn FASTA sequences into a float tensor of Morgan bit-vector fingerprints.

    Each sequence is parsed with ``Chem.rdmolfiles.MolFromFASTA`` and
    fingerprinted with ``AllChem.GetMorganFingerprintAsBitVect`` using
    ``useChirality=True`` and ``useFeatures=True``.

    Args:
        sequences: iterable of FASTA strings.
        radius: forwarded to the fingerprint call as ``radius``.
        nBits: forwarded to the fingerprint call as ``nBits``.

    Returns:
        ``torch.Tensor`` of dtype float32 with one row per input sequence.
    """
    # Resolve the fingerprint function once, before touching any sequence.
    morgan_bits = AllChem.GetMorganFingerprintAsBitVect

    bit_rows = []
    for sequence in sequences:
        mol = Chem.rdmolfiles.MolFromFASTA(sequence)
        bit_vect = morgan_bits(mol, useChirality=True, useFeatures=True, radius=radius, nBits=nBits)
        bit_rows.append(np.array(bit_vect))

    stacked = np.array(bit_rows)
    return torch.from_numpy(stacked).float()

In [4]:
def calculate_unhashed_fps(mols, radii):
    """Build unhashed Morgan fingerprint matrices (binary and counts) for molecules.

    Every molecule is passed to ``AllChem.GetMorganFingerprint`` with
    ``radius=max(radii)``, ``useChirality=True`` and ``useFeatures=True``; the
    ``bitInfo`` dictionary filled by that call maps each substructure identifier
    to the environments in which it was found. An identifier is kept when the
    radius stored in its *first* environment is contained in ``radii``. The kept
    identifiers of all molecules, sorted ascending, define the matrix columns.

    Adapted from
    https://github.com/isidroc/FingerprintCalculator/blob/master/FingerprintCalculator.py

    Args:
        mols: sized sequence of molecules accepted by
            ``AllChem.GetMorganFingerprint`` (one matrix row per molecule).
        radii: non-empty container of radii to keep (must support ``in``).

    Returns:
        Tuple ``(fps_unhashed_binary, fps_unhashed_counts)`` of two integer
        arrays of shape ``(len(mols), n_kept_identifiers)``. The first holds 1
        where an identifier occurs in a molecule, the second holds the number of
        environments recorded for it. Returns ``None`` (after printing a
        message) when no identifier is kept at all, e.g. when ``mols`` is empty.

    Raises:
        ValueError: if ``radii`` is empty (raised by ``max``).
    """
    max_radius = max(radii)

    # Parallel lists with one entry per (molecule, kept identifier) pair.
    row_idxs = []
    substr_ids = []
    counts = []

    for mol_index, mol in enumerate(mols):
        info = {}
        # Only the bitInfo side effect is needed; the returned fingerprint is unused.
        AllChem.GetMorganFingerprint(mol, useChirality=True, useFeatures=True, radius=max_radius, bitInfo=info)

        for substr_id, environments in info.items():
            # Field 1 of an environment entry is its radius; only the first
            # entry decides whether the identifier is kept.
            if environments[0][1] in radii:
                row_idxs.append(mol_index)
                substr_ids.append(substr_id)
                counts.append(len(environments))

    if not substr_ids:
        print("There is no intersection between the substructures \n(i)provided in the reference key set, and\n(ii) the substructures found in the input molecules.")
        return None

    substr_ids = np.array(substr_ids)

    # Sorted unique identifiers give the column order.
    columns = np.unique(substr_ids)
    # `columns` is sorted and contains every identifier, so a binary search
    # returns the exact column of each one (replaces a full scan per identifier).
    col_idxs = np.searchsorted(columns, substr_ids)

    fps_unhashed_binary = np.zeros((len(mols), len(columns)), dtype=int)
    fps_unhashed_counts = np.zeros((len(mols), len(columns)), dtype=int)

    fps_unhashed_binary[row_idxs, col_idxs] = 1
    fps_unhashed_counts[row_idxs, col_idxs] = counts

    return (fps_unhashed_binary, fps_unhashed_counts)

In [None]:
# NOTE(review): X_fps and X_fps_bin are not defined in the visible cells —
# presumably the count and binary matrices from calculate_unhashed_fps; confirm.
# np.unique(..., axis=0, return_counts=True) counts how often each distinct row
# occurs; c[c > 1] keeps the multiplicities of rows occurring more than once, so
# each print shows an array of group sizes rather than a single total.
_, c = np.unique(X_fps, axis=0, return_counts=True)
print('Number duplicates in count fp  : ', c[c > 1])

_, c = np.unique(X_fps_bin, axis=0, return_counts=True)
print('Number duplicates in binary fp : ', c[c > 1])

In [186]:
# `cosine` and `jaccard` are otherwise only imported in a later cell of this
# notebook, so a fresh top-to-bottom run would stop here with a NameError.
# Import them at their first use.
from scipy.spatial.distance import cosine, jaccard

# Hashed Morgan fingerprints (2048 bins, chirality on) of two poly-alanine
# peptides that differ by a single residue.
fp1 = AllChem.GetHashedMorganFingerprint(Chem.rdmolfiles.MolFromFASTA('AAAAAA'), 2, nBits=2048, useChirality=True)
fp2 = AllChem.GetHashedMorganFingerprint(Chem.rdmolfiles.MolFromFASTA('AAAAA'), 2, nBits=2048, useChirality=True)

# Empty int8 buffers; ConvertToNumpyArray fills them in place.
# NOTE(review): int8 tops out at 127 — confirm no bin count exceeds that.
vec1 = np.zeros((0,), dtype=np.int8)
vec2 = np.zeros((0,), dtype=np.int8)

DataStructs.ConvertToNumpyArray(fp1, vec1)
DataStructs.ConvertToNumpyArray(fp2, vec2)

# total of all bins in each fingerprint
print(np.sum(vec1), np.sum(vec2))

# scipy returns distances, so similarity = 1 - distance (cosine, then Jaccard);
# the second line compares vec1 with itself as a sanity check
print( ' * similarity:', 1 - cosine(vec1, vec2), 1 - jaccard(vec1, vec2))
print( ' * similarity:', 1 - cosine(vec1, vec1), 1 - jaccard(vec1, vec1))

79 66
 * similarity: 0.9988946217776751 0.38095238095238093
 * similarity: 1 1.0


In [165]:
# Hashed Morgan fingerprints (2048 bins, chirality on) of two peptides that
# differ by one trailing 'L'.
# NOTE(review): `cosine` and `jaccard` come from scipy.spatial.distance, which
# this notebook only imports in a later cell — they must be in scope first.
fp1 = AllChem.GetHashedMorganFingerprint(Chem.rdmolfiles.MolFromFASTA('KLLKKLLL'), 2, nBits=2048, useChirality=True)
fp2 = AllChem.GetHashedMorganFingerprint(Chem.rdmolfiles.MolFromFASTA('KLLKKLL'), 2, nBits=2048, useChirality=True)

# Empty int8 buffers; ConvertToNumpyArray fills them in place.
vec1 = np.zeros((0,), dtype=np.int8)
vec2 = np.zeros((0,), dtype=np.int8)

DataStructs.ConvertToNumpyArray(fp1, vec1)
DataStructs.ConvertToNumpyArray(fp2, vec2)

# total of all bins in each fingerprint
print(np.sum(vec1), np.sum(vec2))

# similarity = 1 - scipy distance (cosine, then Jaccard); the second line
# compares vec1 with itself as a sanity check
print( ' * similarity:', 1 - cosine(vec1, vec2), 1 - jaccard(vec1, vec2))
print( ' * similarity:', 1 - cosine(vec1, vec1), 1 - jaccard(vec1, vec1))

181 160
 * similarity: 0.8328275265332209 0.5135135135135135
 * similarity: 1 1.0


In [174]:
# Hashed atom-pair fingerprints (2048 bins, chirality included) of two peptides
# that differ by one trailing 'L'.
# NOTE(review): `cosine` and `jaccard` come from scipy.spatial.distance, which
# this notebook only imports in a later cell — they must be in scope first.
fp1 = Pairs.GetHashedAtomPairFingerprint(Chem.rdmolfiles.MolFromFASTA('KLLKKLLL'), nBits=2048, includeChirality=True)
fp2 = Pairs.GetHashedAtomPairFingerprint(Chem.rdmolfiles.MolFromFASTA('KLLKKLL'), nBits=2048, includeChirality=True)

# Empty int8 buffers; ConvertToNumpyArray fills them in place.
# NOTE(review): int8 tops out at 127 — confirm no bin count exceeds that.
vec1 = np.zeros((0,), dtype=np.int8)
vec2 = np.zeros((0,), dtype=np.int8)

DataStructs.ConvertToNumpyArray(fp1, vec1)
DataStructs.ConvertToNumpyArray(fp2, vec2)

# total of all bins in each fingerprint
print(np.sum(vec1), np.sum(vec2))

# similarity = 1 - scipy distance (cosine, then Jaccard); the second line
# compares vec1 with itself as a sanity check.
# NOTE(review): jaccard is applied to count vectors here; scipy documents it for
# boolean vectors — confirm how the installed scipy treats non-boolean input.
print( ' * similarity:', 1 - cosine(vec1, vec2), 1 - jaccard(vec1, vec2))
print( ' * similarity:', 1 - cosine(vec1, vec1), 1 - jaccard(vec1, vec1))

2278 1770
 * similarity: 0.9583043220937854 0.3186180422264875
 * similarity: 1 1.0


In [179]:
# Atom-pair bit-vector fingerprints (default settings) of two peptides that
# differ by one trailing 'L'.
# NOTE(review): `cosine` and `jaccard` come from scipy.spatial.distance, which
# this notebook only imports in a later cell — they must be in scope first.
fp1 = Pairs.GetAtomPairFingerprintAsBitVect(Chem.rdmolfiles.MolFromFASTA('KLLKKLLL'))
fp2 = Pairs.GetAtomPairFingerprintAsBitVect(Chem.rdmolfiles.MolFromFASTA('KLLKKLL'))

# Unlike the hashed variants, the bit vectors are converted directly with np.array.
vec1 = np.array(fp1)
vec2 = np.array(fp2)

# sum of the entries of each vector
print(np.sum(vec1), np.sum(vec2))

# similarity = 1 - scipy distance (cosine, then Jaccard); the second line
# compares vec1 with itself as a sanity check
print( ' * similarity:', 1 - cosine(vec1, vec2), 1 - jaccard(vec1, vec2))
print( ' * similarity:', 1 - cosine(vec1, vec1), 1 - jaccard(vec1, vec1))

530 457
 * similarity: 0.9102946266657893 0.8311688311688312
 * similarity: 1 1.0


In [188]:
# Two peptide sequences (parsed as FASTA below) that differ by one trailing 'L'.
seqs = ['KLLKKLLL', 'KLLKKLL']
# One hashed atom-pair fingerprint (2048 bins, chirality included) per sequence.
fps = [Pairs.GetHashedAtomPairFingerprint(Chem.rdmolfiles.MolFromFASTA(m), nBits=2048, includeChirality=True) for m in seqs]

In [189]:
# One int8 row of 2048 entries per sequence; 2048 matches the nBits used for `fps`.
arr = np.zeros((len(seqs), 2048), dtype = np.int8)

# Copy fingerprint i into row i of `arr` (filled in place through the row view).
for i in range(0,len(seqs)):
    DataStructs.ConvertToNumpyArray(fps[i], arr[i])

In [193]:
# Sum of row 1 of `arr`, i.e. of the fingerprint stored for seqs[1].
np.sum(arr[1])

1770

In [41]:
# Exploratory: print the built-in help text of the imported `Pairs` module.
help(Pairs)

Help on module rdkit.Chem.AtomPairs.Pairs in rdkit.Chem.AtomPairs:

NAME
    rdkit.Chem.AtomPairs.Pairs

DESCRIPTION
    Contains an implementation of Atom-pair fingerprints, as
    described in:
    
    R.E. Carhart, D.H. Smith, R. Venkataraghavan;
    "Atom Pairs as Molecular Features in Structure-Activity Studies:
    Definition and Applications" JCICS 25, 64-73 (1985).
    
    The fingerprints can be accessed through the following functions:
    - GetAtomPairFingerprint
    - GetHashedAtomPairFingerprint (identical to GetAtomPairFingerprint)
    - GetAtomPairFingerprintAsIntVect
    - GetAtomPairFingerprintAsBitVect

FUNCTIONS
    ExplainPairScore(score, includeChirality=False)
        >>> from rdkit import Chem
        >>> m = Chem.MolFromSmiles('C=CC')
        >>> score = pyScorePair(m.GetAtomWithIdx(0),m.GetAtomWithIdx(1),1)
        >>> ExplainPairScore(score)
        (('C', 1, 1), 1, ('C', 2, 1))
        >>> score = pyScorePair(m.GetAtomWithIdx(0),m.GetAtomWithIdx(2),2)
     

In [168]:
# MAP4 fingerprints (2048 dimensions, radius 4, not counted) of two peptides that
# differ by one trailing 'L'.
# NOTE(review): the calculator is named MAP4_unf but is built with
# is_folded=True here — presumably "unf" meant unfolded; confirm the intent.
# NOTE(review): `cosine` and `jaccard` come from scipy.spatial.distance, which
# this notebook only imports in a later cell — they must be in scope first.
MAP4_unf = MAP4Calculator(dimensions=2048, radius=4, is_folded=True, is_counted=False)

vec1 = np.array(MAP4_unf.calculate(mol=Chem.rdmolfiles.MolFromFASTA('KLLKKLLL')))
vec2 = np.array(MAP4_unf.calculate(mol=Chem.rdmolfiles.MolFromFASTA('KLLKKLL')))

# confirm both fingerprints have the same length
print(vec1.shape)
print(vec2.shape)

# similarity = 1 - scipy distance (cosine, then Jaccard); the second line
# compares vec1 with itself as a sanity check
print( ' * similarity:', 1 - cosine(vec1, vec2), 1 - jaccard(vec1, vec2))
print( ' * similarity:', 1 - cosine(vec1, vec1), 1 - jaccard(vec1, vec1))

(2048,)
(2048,)
 * similarity: 0.9540233661553899 0.9116052060737527
 * similarity: 1 1.0


In [169]:
# MAP4 fingerprints with is_folded=False (2048 dimensions, radius 4, not counted).
# Note the sequence order: vec1 is the shorter peptide 'KLLKKLL' in this cell.
# NOTE(review): `cosine` and `jaccard` come from scipy.spatial.distance, which
# this notebook only imports in a later cell — they must be in scope first.
MAP4_unf = MAP4Calculator(dimensions=2048, radius=4, is_folded=False, is_counted=False)

vec1 = np.array(MAP4_unf.calculate(mol=Chem.rdmolfiles.MolFromFASTA('KLLKKLL')))
vec2 = np.array(MAP4_unf.calculate(mol=Chem.rdmolfiles.MolFromFASTA('KLLKKLLL')))

# confirm both fingerprints have the same length
print(vec1.shape)
print(vec2.shape)

# similarity = 1 - scipy distance (cosine, then Jaccard); the second line
# compares vec1 with itself as a sanity check.
# NOTE(review): the entries here are presumably minhash values rather than 0/1
# flags — confirm how the installed scipy's jaccard treats non-boolean input.
print( ' * similarity:', 1 - cosine(vec1, vec2), 1 - jaccard(vec1, vec2))
print( ' * similarity:', 1 - cosine(vec1, vec1), 1 - jaccard(vec1, vec1))

(2048,)
(2048,)
 * similarity: 0.8626098279303226 0.6962890625
 * similarity: 1 1.0


In [25]:
# Exploratory: print the built-in help text of the MAP4Calculator class.
help(MAP4Calculator)

Help on class MAP4Calculator in module map4.map4:

class MAP4Calculator(builtins.object)
 |  MAP4Calculator(dimensions=1024, radius=2, is_counted=False, is_folded=False, return_strings=False)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, dimensions=1024, radius=2, is_counted=False, is_folded=False, return_strings=False)
 |      MAP4 calculator class
 |  
 |  calculate(self, mol)
 |      Calculates the atom pair minhashed fingerprint
 |      
 |      Arguments:
 |          mol -- rdkit mol object
 |      
 |      Returns:
 |          tmap VectorUint -- minhashed fingerprint
 |  
 |  calculate_many(self, mols)
 |      Calculates the atom pair minhashed fingerprint
 |      
 |      Arguments:
 |          mols -- list of mols
 |      
 |      Returns:
 |          list of tmap VectorUint -- minhashed fingerprints list
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance v

In [109]:
from scipy.spatial.distance import cosine, jaccard, dice, rogerstanimoto
from random import Random, randint
import numpy as np

# specify the length of each minhash vector
N = 128
max_val = (2**32)-1

# Fixed seed and a dedicated generator: `perms` is the same after every kernel
# restart, and the global `random` state is left untouched.
PERM_SEED = 0
_perm_rng = Random(PERM_SEED)

# create N (a, b) tuples that will serve as permutation functions
# these permutation values are used to hash all input sets.
# `a` is drawn from [1, max_val]: with a == 0 the function would ignore its
# input and always return b.
perms = [(_perm_rng.randint(1, max_val), _perm_rng.randint(0, max_val)) for _ in range(N)]

# sample minhash vector of length N, all slots at +inf
# (kept as a module-level name; minhash() builds its own local vector)
vec = [float('inf') for i in range(N)]

def minhash(s, prime=4294967311):
    '''
    Compute the minhash signature of the collection `s`.

    Every member of `s` is fed through each permutation function in the
    module-level `perms` list, `(a * member + b) % prime`; slot `i` of the
    result keeps the smallest value produced by the `i`th function. Members
    that are not `int` are first replaced by their built-in `hash()`.

    Returns a list of length `N` (module-level); slots stay at +inf when `s`
    is empty.

    NOTE(review): built-in hash() of str is presumably salted per process
    unless PYTHONHASHSEED is set, so signatures of string sets may differ
    between kernel restarts — confirm if reproducibility matters.
    '''
    # every slot starts at +inf so that any hashed value replaces it
    signature = [float('inf')] * N

    # turn all members into integers up front (ints pass through unchanged)
    members = [m if isinstance(m, int) else hash(m) for m in s]

    for member in members:
        for slot, (mult, offset) in enumerate(perms):
            candidate = (mult * member + offset) % prime
            # keep the running minimum for this permutation function
            signature[slot] = min(signature[slot], candidate)

    return signature

In [62]:
# specify some input sets
data1 = set(['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets'])
data2 = set(['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents'])

# get the minhash vectors for each input set
vec1 = minhash(data1)
vec2 = minhash(data2)

print(vec1)

# divide both vectors by their max values to scale values {0:1}
vec1 = np.array(vec1) / max(vec1)
vec2 = np.array(vec2) / max(vec2)

# measure the similarity between the vectors using cosine similarity
print( ' * similarity:', 1 - cosine(vec1, vec2) )

[778384383, 283156841, 110667589, 126495898, 131556771, 546819345, 8733040, 70672335, 151950879, 484250184, 35809988, 410711128, 333056297, 312470045, 433620984, 51959541, 316558976, 162478280, 381005148, 507395899, 14727460, 9561560, 101367602, 169783082, 79017061, 185462886, 280066231, 769727884, 517629918, 374726555, 287045096, 68457968, 63831624, 420122213, 16547863, 901327943, 430365783, 125968700, 223823555, 172360596, 81087952, 48876891, 504084538, 160462718, 143441963, 186212942, 394433330, 109352890, 624591240, 875428160, 332734024, 385840171, 92433154, 109540907, 30352964, 55148677, 197850771, 399346404, 730179644, 88577121, 263437801, 829173828, 178996576, 809484850, 171088731, 206892498, 265978099, 99606002, 25406150, 79959825, 43231762, 101405667, 738432590, 118103538, 489170516, 525981477, 250172249, 169692935, 319191386, 92585095, 196526829, 556900114, 28249728, 375697637, 68797504, 117397767, 178963810, 199560249, 763710429, 115589879, 203432836, 284748977, 67646670, 25

In [59]:
# Display the current value of vec1 as the cell output.
vec1

array([5.46841913e-01, 1.98927461e-01, 7.77478036e-02, 8.88677373e-02,
       9.24231754e-02, 3.84159476e-01, 6.13526223e-03, 4.96497563e-02,
       1.06750741e-01, 3.40202480e-01, 2.51577534e-02, 2.88538753e-01,
       2.33983552e-01, 2.19520999e-01, 3.04633718e-01, 3.65033722e-02,
       2.22393614e-01, 1.14146603e-01, 2.67669276e-01, 3.56463144e-01,
       1.03465493e-02, 6.71732614e-03, 7.12142415e-02, 1.19278479e-01,
       5.55122145e-02, 1.30294083e-01, 1.96756200e-01, 5.40760424e-01,
       3.63652896e-01, 2.63258347e-01, 2.01659094e-01, 4.80940870e-02,
       4.48439205e-02, 2.95150365e-01, 1.16254453e-02, 6.33214010e-01,
       3.02346827e-01, 8.84973625e-02, 1.57243778e-01, 1.21089272e-01,
       5.69670869e-02, 3.43377041e-02, 3.54136798e-01, 1.12730602e-01,
       1.00772933e-01, 1.30821023e-01, 2.77103037e-01, 7.68241821e-02,
       4.38796919e-01, 6.15018518e-01, 2.33757144e-01, 2.71066046e-01,
       6.49374832e-02, 7.69562705e-02, 2.13240056e-02, 3.87438503e-02,
      