In [2]:
#install dependencies
#%pip install pandas numpy molfeat datamol rdkit ankh torch peptides scikit-learn lightgbm tmap

In [2]:
#%conda install -c tmap tmap 


Collecting package metadata (current_repodata.json): done
Solving environment: unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): done
Solving environment: unsuccessful initial attempt using frozen solve. Retrying with flexible solve.
Solving environment: - 
Found conflicts! Looking for incompatible packages.
This can take several minutes.  Press CTRL-C to abort.
Examining conflict for wheel pyparsing libnghttp2 sympy pillow joblib python_abi- \ - | ^C
                                                                               failed

Note: you may need to restart the kernel to use updated packages.


In [1]:
from rdkit import Chem
import tmap as tm
from map4 import MAP4Calculator

# Number of dimensions shared by the MAP4 calculator and the MinHash encoder
dim = 1024

MAP4 = MAP4Calculator(dimensions=dim)
ENC = tm.Minhash(dim)

# The two example molecules, parsed from their SMILES strings
smiles_a, smiles_b = 'c1ccccc1', 'c1cccc(N)c1'
mol_a = Chem.MolFromSmiles(smiles_a)
mol_b = Chem.MolFromSmiles(smiles_b)

# Fingerprint each molecule with its own call ...
map4_a = MAP4.calculate(mol_a)
map4_b = MAP4.calculate(mol_b)

# ... or use the parallelized version on the whole list at once
fps = MAP4.calculate_many([mol_a, mol_b])

# Print the distance for the one-by-one fingerprints first, then for the batch ones
for fp_left, fp_right in ((map4_a, map4_b), (fps[0], fps[1])):
    print(ENC.get_distance(fp_left, fp_right))

0.7861328125
0.7861328125


In [1]:
#import libraries + load dataset
import pandas as pd
import datamol as dm
from molfeat.calc import FPCalculator
from molfeat.trans import MoleculeTransformer
import numpy as np
import os
import ankh
import torch
import peptides

# Input table for the whole notebook.
# Alternative input that was used before: '../../Data/processed/clean_df.pkl'
dataset_path = '../../Data/processed/clean_df_grouped.pkl'
df = pd.read_pickle(dataset_path)

2023-09-17 17:57:25.562456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-17 17:57:47.569381: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/mic/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/intel64/lib:/opt/intel/compilers_and_libraries_2018.3.222/linux/mpi/mic/lib::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/:/anaconda/envs/azureml_py38/lib/:/usr/local/cuda/lib64:/usr/local/cuda/extras/

### Molecular Fingerprints

In [2]:
#convert Sequence to SMILES
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
def seq_to_smiles(seq):
    """Convert a peptide sequence into a SMILES string using RDKit.

    Parameters
    ----------
    seq : str
        Amino-acid sequence, handed to ``Chem.MolFromFASTA``.

    Returns
    -------
    str
        The SMILES string returned by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises
    ------
    ValueError
        If ``Chem.MolFromFASTA`` yields ``None`` for ``seq`` (i.e. RDKit could
        not build a molecule from it).
    """
    mol = Chem.MolFromFASTA(seq)
    if mol is None:
        # Fail with the offending sequence in the message instead of letting
        # MolToSmiles choke on None with an opaque argument error.
        raise ValueError(f"RDKit could not parse peptide sequence: {seq!r}")
    return Chem.MolToSmiles(mol)

# Build one SMILES string per row from the 'Sequence' column, then attach it as a new column
smiles_column = df['Sequence'].apply(seq_to_smiles)
df['SMILES'] = smiles_column

In [4]:
# List the names of all molecular fingerprints available in the molfeat library.
# The keys view is the last expression of the cell, so it is shown as the cell output.

from molfeat.calc import FP_FUNCS
FP_FUNCS.keys()



dict_keys(['maccs', 'avalon', 'ecfp', 'fcfp', 'topological', 'atompair', 'rdkit', 'pattern', 'layered', 'map4', 'secfp', 'erg', 'estate', 'avalon-count', 'rdkit-count', 'ecfp-count', 'fcfp-count', 'topological-count', 'atompair-count'])

In [7]:
import os
import pickle

# Location of the pickled fingerprint dictionary reused between runs
FINGERPRINT_CACHE = '../Fingerprints/fingerprints_df_grouped.pickle'

# Start from an empty dict so the notebook still runs top to bottom when no
# cache file exists yet; in that case every fingerprint is simply regenerated.
fingerprints = {}
if os.path.exists(FINGERPRINT_CACHE):
    # NOTE: unpickling can execute arbitrary code -- only load cache files
    # that were produced by this notebook / a trusted source.
    with open(FINGERPRINT_CACHE, 'rb') as f:
        fingerprints = pickle.load(f)
else:
    print('no fingerprint cache found at', FINGERPRINT_CACHE, '- starting with an empty dictionary')

In [11]:
# Show the names of the fingerprint sets currently stored in `fingerprints`
fingerprints.keys()

dict_keys(['map4', 'erg', 'atompair-count', 'ecfp', 'layered', 'topological', 'rdkit', 'binary profile of physicochemical property', 'one-hot-encoded-sequence', 'peptide_descriptors', 'ankh_base_embedding', 'mean_embeddings_large', 'maccs', 'avalon', 'fcfp', 'atompair', 'pattern', 'secfp', 'estate', 'avalon-count', 'rdkit-count', 'ecfp-count', 'fcfp-count', 'topological-count'])

In [10]:
# Generate every molfeat fingerprint that is not already in `fingerprints`.
# Smaller selections that were tried before:
#   ["pattern", "ecfp", "maccs", "atompair", "rdkit", "topological", "atompair-count"]
#   ["map4", "erg", "atompair-count", "ecfp", "layered", "topological", "rdkit"]
selection = [
    'maccs', 'avalon', 'ecfp', 'fcfp', 'topological', 'atompair', 'rdkit',
    'pattern', 'layered', 'map4', 'secfp', 'erg', 'estate',
    'avalon-count', 'rdkit-count', 'ecfp-count', 'fcfp-count',
    'topological-count', 'atompair-count',
]

# Only the fingerprints without an existing entry need to be computed
missing = [fp_name for fp_name in selection if fp_name not in fingerprints]

for fp_name in missing:
    print('generating: ', fp_name)
    transformer = MoleculeTransformer(FPCalculator(fp_name))
    # Run the transform with RDKit logging switched off
    with dm.without_rdkit_log():
        fingerprints[fp_name] = transformer.transform(df.SMILES.values)
    print('generated: ', fp_name, len(fingerprints[fp_name]))

generating:  avalon
generated:  avalon 1230
generating:  fcfp
generated:  fcfp 1230
generating:  atompair
generated:  atompair 1230
generating:  pattern
generated:  pattern 1230
generating:  secfp
generated:  secfp 1230
generating:  estate
generated:  estate 1230
generating:  avalon-count
generated:  avalon-count 1230
generating:  rdkit-count
generated:  rdkit-count 1230
generating:  ecfp-count
generated:  ecfp-count 1230
generating:  fcfp-count
generated:  fcfp-count 1230
generating:  topological-count
generated:  topological-count 1230


### Sequence Based Fingerprints

In [7]:
""" From the Pfeature library (physicochemical properties:

This method generates a binary profile for each sequence, which indicates whether a particular physicochemical property is present at each position of the sequence.
A single residue is represented by a vector of length 25, where each value corresponds to a particular physicochemical property:
if the residue has the property, that position is set to 1, otherwise to 0.
Hence, for a sequence of length L, the output vector has size 25*L.
"""

# Lookup table: one-letter residue code -> list of 25 binary (0/1) property flags.
# It has one entry for each of the 20 residue letters A, C, D, ..., Y; any other
# character in a sequence has no entry here.
# NOTE(review): the meaning of each of the 25 columns is not listed in this
# notebook -- presumably it follows the Pfeature property order; confirm there.
profiles = {
    "A": [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0],
    "C": [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0],
    "D": [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0],
    "E": [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
    "F": [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1],
    "G": [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0],
    "H": [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1],
    "I": [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1],
    "K": [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
    "L": [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
    "M": [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1],
    "N": [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0],
    "P": [0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0],
    "Q": [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
    "R": [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
    "S": [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0],
    "T": [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0],
    "V": [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0],
    "W": [0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1],
    "Y": [0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1]
}

sequences = df['Sequence'].values
max_sequence_length = max(len(seq) for seq in sequences)

# Translate each residue into its 25-flag profile and right-pad every sequence
# with all-zero profiles up to the longest sequence, in a single pass.
blank_profile = [0] * 25
padded_profiles = np.array([
    [profiles[aa] for aa in seq] + [blank_profile] * (max_sequence_length - len(seq))
    for seq in sequences
])

# Collapse the (position, property) axes so each sequence becomes one flat row
padded_profiles = padded_profiles.reshape(len(padded_profiles), -1)

fingerprints['binary profile of physicochemical property'] = padded_profiles

In [8]:
# One-hot encoding fingerprint: the residue alphabet, one letter per entry

amino_acids = list("ACDEFGHIKLMNPQRSTVWY")

# For each residue, build a vector of zeros with a single 1 at the residue's alphabet index
profiles = {}
for idx, aa in enumerate(amino_acids):
    unit_vector = [0] * len(amino_acids)
    unit_vector[idx] = 1
    profiles[aa] = unit_vector

sequences = df['Sequence'].values
max_sequence_length = max(len(seq) for seq in sequences)

# Encode each residue as its one-hot vector and right-pad every sequence with
# all-zero vectors up to the longest sequence, in a single pass.
blank_vector = [0] * len(amino_acids)
padded_profiles = np.array([
    [profiles[aa] for aa in seq] + [blank_vector] * (max_sequence_length - len(seq))
    for seq in sequences
])

# Collapse the (position, residue) axes so each sequence becomes one flat row
padded_profiles = padded_profiles.reshape(len(padded_profiles), -1)

fingerprints['one-hot-encoded-sequence'] = padded_profiles

In [9]:
# Descriptor fingerprint: one row of `Peptide.descriptors()` values per sequence
# (computed with the Peptides python library)
sequences = df['Sequence'].values

descriptor_rows = []
for sequence in sequences:
    descriptor_rows.append(peptides.Peptide(sequence).descriptors())

# Tabulate the per-sequence results, then keep only the numeric matrix
descriptors = pd.DataFrame(descriptor_rows).to_numpy()
fingerprints['peptide_descriptors'] = descriptors

### LLM Embedding Fingerprint

In [10]:
# ankh base model, large feature vector: the per-token embeddings of each
# sequence are concatenated into one long row
model, tokenizer = ankh.load_base_model()
model.eval()

# The tokenizer is fed pre-split input: one list of single characters per sequence
sequences = [list(seq) for seq in df['Sequence']]

encode_options = dict(
    add_special_tokens=True,
    padding=True,
    is_split_into_words=True,
    return_tensors="pt",
)
outputs = tokenizer.batch_encode_plus(sequences, **encode_options)

# Inference only, so no gradients are tracked
with torch.no_grad():
    embeddings = model(
        input_ids=outputs['input_ids'],
        attention_mask=outputs['attention_mask'],
    )

token_embeddings = embeddings[0]
print(token_embeddings.shape)

# Flatten everything after the first (per-sequence) axis into a single row
array = np.array(token_embeddings)
reshaped_array = array.reshape(token_embeddings.shape[0], -1)

fingerprints['ankh_base_embedding'] = reshaped_array.tolist()

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

torch.Size([1230, 40, 768])


In [11]:
# ankh large model small feature vector (because averaged instead of appended)
model, tokenizer = ankh.load_large_model()
model.eval()

# The tokenizer is fed pre-split input: one list of single characters per sequence
sequences = [list(seq) for seq in df['Sequence']]

outputs = tokenizer.batch_encode_plus(sequences,
                                  add_special_tokens=True,
                                  padding=True,
                                  is_split_into_words=True,
                                  return_tensors="pt")

# Inference only, so no gradients are tracked
with torch.no_grad():
    embeddings = model(input_ids=outputs['input_ids'], attention_mask=outputs['attention_mask'])

# Average the token embeddings of each sequence, counting only the positions the
# attention mask marks as real tokens. A plain mean over the token axis would
# also average in the vectors at padding positions, so a peptide's fingerprint
# would depend on how much padding the batch added to it.
# NOTE(review): tokens added by add_special_tokens=True are still included in
# the mean, since the attention mask marks them as real -- confirm this is wanted.
token_embeddings = embeddings[0]
token_mask = outputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
# clamp guards against a division by zero for a row with no attended tokens
token_counts = token_mask.sum(dim=1).clamp(min=1)
mean_embeddings_large = ((token_embeddings * token_mask).sum(dim=1) / token_counts).numpy()

print(mean_embeddings_large.shape, "mean_embeddings_large.shape")

fingerprints['mean_embeddings_large'] = mean_embeddings_large

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/7.52G [00:00<?, ?B/s]

(1230, 1536) mean_embeddings_large.shape


### Write fingerprints to pickle file

In [12]:
#save the fingerprint dictionary
import os
import pickle

cache_path = "../Fingerprints/fingerprints_df_grouped.pickle"

# Make sure the target folder exists so the first run on a fresh checkout can save
os.makedirs(os.path.dirname(cache_path), exist_ok=True)

# Write to a temporary file first and swap it in afterwards: if the dump is
# interrupted, the previous cache file stays intact instead of being truncated.
tmp_path = cache_path + ".tmp"
with open(tmp_path, "wb") as f:
    pickle.dump(fingerprints, f)
os.replace(tmp_path, cache_path)