### Adding Fingerprints

In [80]:
#Libraries
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Draw import IPythonConsole
import pandas as pd
from rdkit.Chem import rdMolDescriptors as rdescriptors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from rdkit import RDLogger 
RDLogger.DisableLog('rdApp.*') 

In [81]:
#Loading zinc database from csv 
suppl_csv = pd.read_csv('../data/ZINCFINAL.csv', delimiter = ' ')
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC
...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC


### Calculating Fingerprints for ZINCFINAL.csv 

In [47]:
# List of ECFP of zinc products
ligandm_database = []
for ligand in suppl_csv["Smiles"]:
		ligandm_database.append(Chem.MolFromSmiles(ligand))

In [48]:
db_ECFP = []
for substance in ligandm_database:
		db_ECFP.append(Chem.GetMorganFingerprintAsBitVect(substance, 2, nBits=2048))

In [67]:
suppl_csv['ECFP'] = db_ECFP

In [68]:
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [62]:
bitinfo = {} # dict to store infor about what bits were set by what atoms
db_ECFP1 = []
db_bitinfo = []
for x in suppl_csv.Smiles:
    molecule = Chem.MolFromSmiles(x)
    fp = Chem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048, bitInfo=bitinfo)
    db_ECFP1.append(list(fp.GetOnBits()))
    db_bitinfo.append(bitinfo)

In [71]:
suppl_csv['Bitset'] = db_ECFP1
suppl_csv['Bitinfo'] = db_bitinfo

In [72]:
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP,Bitset,Bitinfo
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 80, 140, 197, 389, 623, 650, 667, 739, 786...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[41, 80, 197, 339, 389, 390, 650, 652, 807, 88...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[80, 187, 190, 207, 305, 307, 389, 469, 650, 8...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[209, 221, 285, 389, 650, 656, 807, 926, 1017,...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[15, 106, 301, 314, 378, 584, 650, 664, 724, 7...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
...,...,...,...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[14, 234, 235, 241, 350, 383, 517, 587, 650, 6...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[80, 94, 114, 121, 140, 208, 235, 249, 289, 29...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[13, 74, 80, 92, 140, 212, 216, 262, 296, 378,...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[80, 99, 165, 216, 237, 255, 294, 295, 333, 36...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."


In [77]:
suppl_csv.to_csv('../data/ZINCECFP.csv', sep=' ', index = False)

In [79]:
#Loading zinc database from csv 
suppl_csv = pd.read_csv('../data/ZINCECFP.csv', delimiter = ' ')
suppl_csv

Unnamed: 0,Zinc_id,Smiles,MW,logP,Inchi,Inchi_s,Name,ECFP,Bitset,Bitinfo
0,1589384,N=C(N)NCC[C@H](N)C(=O)O,160.096026,-1.72853,IFPQOXNWLSRZKX-VKHMYHEASA-N,IFPQOXNWLSRZKX,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[1, 80, 140, 197, 389, 623, 650, 667, 739, 786...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
1,83822513,CN(C)CC(=O)NCC(=O)O,160.084792,-1.25120,HQFNONZTUQSPJS-UHFFFAOYSA-N,HQFNONZTUQSPJS,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[41, 80, 197, 339, 389, 390, 650, 652, 807, 88...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
2,214763687,O=C(O)CN1CCCNS1(=O)=O,194.036128,-1.38890,KCIDTUHJDPZBTQ-UHFFFAOYSA-N,KCIDTUHJDPZBTQ,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[80, 187, 190, 207, 305, 307, 389, 469, 650, 8...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
3,306392345,O=C(O)[C@@H]1C[C@H]2O[C@H]1[C@H](O)[C@H]2O,174.052823,-1.41990,ZZTJOHOETCDWML-VFUOTHLCSA-N,ZZTJOHOETCDWML,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[209, 221, 285, 389, 650, 656, 807, 926, 1017,...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
4,85343607,N[C@@H]1N=C2N=C(O)N=C2C(=O)N1,167.044324,-1.87440,OSXQHYVRCFCLQV-SCSAIBSYSA-N,OSXQHYVRCFCLQV,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[15, 106, 301, 314, 378, 584, 650, 664, 724, 7...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
...,...,...,...,...,...,...,...,...,...,...
379007,97986502,Cc1ccc(C(=O)N(c2ccc(N(C(=O)c3ccc(C)cc3)S(=O)(=...,636.051720,6.49784,KYPDAVIHIYTGQW-UHFFFAOYSA-N,KYPDAVIHIYTGQW,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[14, 234, 235, 241, 350, 383, 517, 587, 650, 6...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
379008,97948800,CCc1c(-c2ccc(OC)c(OC)c2)nn(-c2nc(-c3ccc(C)cc3)...,604.229740,7.58732,AWQWFNRTEBAZJO-UHFFFAOYSA-N,AWQWFNRTEBAZJO,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[80, 94, 114, 121, 140, 208, 235, 249, 289, 29...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
379009,54274624,Cc1cc(N2CCN(CCOc3cccc4ccccc34)CC2)n2nc(C)c(-c3...,511.213888,6.02074,HAHVCBFPDRUVIL-UHFFFAOYSA-N,HAHVCBFPDRUVIL,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[13, 74, 80, 92, 140, 212, 216, 262, 296, 378,...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
379010,2093448,CCOc1cc(CNc2cc(Cl)ccc2C)cc(I)c1OCc1ccccc1Cl,541.007232,7.49622,IYFPBZIIJDRXMB-UHFFFAOYSA-N,IYFPBZIIJDRXMB,ZINC,<rdkit.DataStructs.cDataStructs.ExplicitBitVec...,"[80, 99, 165, 216, 237, 255, 294, 295, 333, 36...","{13: ((2, 1),), 76: ((2, 2),), 80: ((1, 0), (2..."
