# Preparing Data for On The Fly Modeling

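In this notebook, we prepare the data needed to model arbitrary sets of proteins on the fly: screening hits with fragment and protein promiscuity counts, fragment embeddings and Morgan fingerprints, and protein–protein similarities derived from shared screening hits.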

In [1]:
import pandas as pd
import os
import joblib
import numpy as np
from fragmentembedding import FragmentEmbedder

# Directory, relative to the notebook's working directory, that the cells below
# read their input tables from and write their .joblib outputs to.
DATA_PATH = "../data"

## Preprocessing screening data, especially to calculate promiscuity

In [2]:
import collections

# Load the screening hit table; the columns used below are "accession",
# "fragId" and "l2fc".
df = pd.read_csv(os.path.join(DATA_PATH, "screening_hits.tsv"), sep="\t")

# (accession, fragId) -> l2fc as a float. If a pair occurs on several rows,
# the last row wins.
hits = {
    (accession, frag_id): float(l2fc)
    for accession, frag_id, l2fc in df[["accession", "fragId", "l2fc"]].values
}

# Promiscuity = number of hit rows in which a fragment / protein appears.
# Counter replaces the hand-rolled defaultdict loops; dict() turns the result
# into a plain dict, in first-seen order like the loops it replaces.
fid_prom = dict(collections.Counter(df["fragId"].tolist()))
pid_prom = dict(collections.Counter(df["accession"].tolist()))

# Persist all three mappings together as one tuple.
joblib.dump((hits, fid_prom, pid_prom), os.path.join(DATA_PATH, "hits.joblib"))

['../data/hits.joblib']

## Embeddings for CeMM fragments

In [3]:
# Load the CeMM fragment table; the "fid" and "smiles" columns are used below.
cemm_table_path = os.path.join(DATA_PATH, "cemm_smiles.csv")
fid2smi = pd.read_csv(cemm_table_path)

fe = FragmentEmbedder()

print(fid2smi["smiles"].tolist())

# Embed every CeMM SMILES string with the fragment embedder.
X = fe.transform(fid2smi["smiles"].tolist())
print(X)

# Store fragment ids, SMILES and the embedding output together as one tuple.
cemm_emb_path = os.path.join(DATA_PATH, "cemm_emb.joblib")
joblib.dump(
    (fid2smi["fid"].tolist(), fid2smi["smiles"].tolist(), X),
    cemm_emb_path,
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


['C#CCCC1(CCC(=O)NC2(CCO)CS(=O)(=O)C2)N=N1', 'C#CCCC1(CCC(=O)NC(C(N)=O)c2ccc3ccccc3c2)N=N1', 'C#CCCC1(CCC(=O)Nc2ccc(-c3cn4cccc(C)c4n3)cc2)N=N1', 'C#CCCC1(CCNC(=O)c2cc(=O)[nH]c(C3CCOCC3)n2)N=N1', 'C#CCCC1(CCNC(=O)CC2CS(=O)(=O)c3ccccc3O2)N=N1', 'C#CCCC1(CCNC(=O)c2c[nH]c(C(C)(C)C)n2)N=N1', 'C#CCCC1(CCNC(=O)CC2(C(F)(F)F)CCC(=O)NC2)N=N1', 'C#CCCC1(CCC(=O)Nc2cc(CCc3ccccc3C)n[nH]2)N=N1', 'C#CCCC1(CCNC(=O)c2c(C(F)(F)F)nc3n2CCCC3)N=N1', 'C#CCCC1(CCNC(=O)c2cc(OC)c3c(c2)OCCO3)N=N1', 'C#CCCC1(CCNC(=O)c2ncsc2S(N)(=O)=O)N=N1', 'C#CCCC1(CCNC(=O)c2ccc(S(=O)(=O)c3ccccc3)s2)N=N1', 'C#CCCC1(CCC(=O)NC(Cc2ccccn2)C2COc3ccccc3O2)N=N1', 'C#CCCC1(CCC(=O)Nc2ccc3c(c2)nc(C)n3C2(CO)CCC2)N=N1', 'C#CCCC1(CCC(=O)N2CCCN(S(=O)(=O)c3ccc(Cl)s3)CC2)N=N1', 'C#CCCC1(CCNC(=O)C2=NN(c3ccc(F)cc3)C(C(N)=O)C2)N=N1', 'C#CCCC1(CCC(=O)N2C[C@H]3CN(C)CC[C@@]3(CO)C2)N=N1', 'C#CCCC1(CCC(=O)NCCS(=O)(=O)N2CCOCC2)N=N1', 'C#CCCC1(CCC(=O)N2CCC3(CC2)OCc2ccncc23)N=N1', 'C#CCCC1(CCC(=O)NC2CCc3cccc4cccc2c34)N=N1', 'C#CCCC1(CCC(=O)Nc2cc(C3CC3)nn2

100%|██████████| 407/407 [00:03<00:00, 117.95it/s]

[[ 1.723    1.947   -0.2703  ...  0.4717  -0.269   -0.198  ]
 [-0.1489  -0.2766  -0.0764  ...  0.793   -0.1729   0.9316 ]
 [-0.163   -0.2305   1.099   ... -0.2356  -0.2478  -0.2695 ]
 ...
 [-0.13     0.3564   2.34    ...  1.66    -0.2764   0.3894 ]
 [-0.1133  -0.2778  -0.03015 ...  0.7607  -0.2712   0.3013 ]
 [ 2.9      0.61    -0.2397  ...  0.512   -0.2097   0.635  ]]





['../data/cemm_emb.joblib']

## Embeddings for Enamine Stock Fragments

In [4]:
# Load the Enamine stock table and drop every row whose SMILES string also
# appears (exact string match) in the CeMM fragment table.
enamine_table_path = os.path.join(DATA_PATH, "enamine_stock.csv")
df = pd.read_csv(enamine_table_path)
already_in_cemm = df["smiles"].isin(fid2smi["smiles"])
df = df[~already_in_cemm]
print(df.shape)

fe = FragmentEmbedder()

# Embed the remaining Enamine SMILES strings.
X = fe.transform(df["smiles"].tolist())

# Store catalog ids, SMILES and the embedding output together as one tuple.
enamine_emb_path = os.path.join(DATA_PATH, "enamine_stock_emb.joblib")
joblib.dump(
    (df["catalog_id"].tolist(), df["smiles"].tolist(), X),
    enamine_emb_path,
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


(5663, 2)


100%|██████████| 1024/1024 [00:08<00:00, 124.64it/s]
100%|██████████| 1024/1024 [00:08<00:00, 119.99it/s]
100%|██████████| 1024/1024 [00:08<00:00, 119.35it/s]
100%|██████████| 1024/1024 [00:08<00:00, 116.51it/s]
100%|██████████| 1024/1024 [00:09<00:00, 113.13it/s]
100%|██████████| 543/543 [00:04<00:00, 119.28it/s]


['../data/enamine_stock_emb.joblib']

## Morgan Fingerprints for CeMM fragments

In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem

# Build a 1024-bit Morgan fingerprint (radius 2) for every CeMM fragment SMILES.
# NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect as
# deprecated in favour of rdFingerprintGenerator — confirm against the RDKit
# version this project pins before switching.
R = []
for smi in fid2smi["smiles"].tolist():
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending SMILES instead of an opaque error from the
        # fingerprint call below.
        raise ValueError("RDKit could not parse SMILES: {!r}".format(smi))
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    R.append(fp)

# Integer matrix built from the bit vectors; expected shape is
# (n_fragments, 1024) — TODO confirm.
X = np.array(R, dtype=int)

# Store fragment ids, SMILES and the fingerprint matrix together as one tuple.
joblib.dump(
    (list(fid2smi["fid"]), list(fid2smi["smiles"]), X),
    os.path.join(DATA_PATH, "cemm_morgan.joblib"),
)

['../data/cemm_morgan.joblib']

# Protein name mapping

In [3]:
# Headerless, tab-separated table; its two columns are labelled uniprot_ac and
# gene_name after loading.
# NOTE(review): no later cell visible here reads this frame — confirm it is
# still needed.
pid2name_columns = ["uniprot_ac", "gene_name"]
df = pd.read_csv("../data/pid2name_primary.tsv", header=None, sep="\t")
df.columns = pid2name_columns

# Protein screening similarity maps

In [4]:
import joblib
import collections
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reload the (hits, fid_prom, pid_prom) tuple from disk; this is the same path
# the screening-data preprocessing cell writes via DATA_PATH. joblib.load
# unpickles the file, so only load files produced by this notebook.
hits, fid_prom, pid_prom = joblib.load("../data/hits.joblib")

In [5]:
# Group fragment ids by protein: every (protein id, fragment id) key of `hits`
# contributes its fragment id to that protein's list.
frags_by_pid = collections.defaultdict(list)
for pid, fid in hits:
    frags_by_pid[pid].append(fid)

# One space-separated "document" of fragment ids per protein, ordered by
# sorted protein id so that row i of X belongs to pids[i].
pids = sorted(frags_by_pid)
pid_docs = [" ".join(frags_by_pid[pid]) for pid in pids]

# NOTE(review): TfidfVectorizer's defaults lowercase the text and keep only
# tokens of two or more word characters — confirm every fragment id survives
# as a single token.
X = TfidfVectorizer().fit_transform(pid_docs).toarray()

In [6]:
# Destination for the protein-protein similarity matrix.
file_name = "../data/protein_protein_hit_cosines.joblib"

# Cosine similarity of every row of X against every row of X.
similarities = cosine_similarity(X, Y=X)

# Store the protein ids next to the matrix so its rows and columns can be labelled.
joblib.dump((pids, similarities), file_name)

['../data/protein_protein_hit_cosines.joblib']

In [10]:
import pandas as pd
import os

# Headerless, tab-separated table of (uniprot_ac, family); keep only the rows
# whose family is exactly "SLC" and collect their unique accessions, sorted.
df = pd.read_csv("../data/cemm_interest_protein_class.tsv", header=None, sep="\t")
df.columns = ["uniprot_ac", "family"]
df = df[df["family"] == "SLC"]
slcs = sorted(set(df["uniprot_ac"]))

# Write one accession per line. The file is opened in text mode, which already
# translates "\n" to the platform line ending; writing os.linesep here would
# be translated a second time on Windows and produce "\r\r\n".
with open("../data/examples/slc_cemm_interest.txt", "w") as f:
    for accession in slcs:
        f.write(accession + "\n")