In [21]:
from rdkit.Chem import AllChem as ch
from rdkit.Chem import Draw as d
from rdkit import DataStructs
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors as rdescriptors

In [2]:
# Load the DrugBank SD file and keep only entries that parsed into a molecule
# and contain at least one carbon atom (SMARTS '[#6]').
suppl = ch.SDMolSupplier('../data/drugbank.sdf')
organic_carbon = ch.MolFromSmarts('[#6]')
mols = []
for candidate in suppl:
    # The supplier can yield None entries; skip them before touching the API.
    if candidate is None:
        continue
    if candidate.HasSubstructMatch(organic_carbon):
        mols.append(candidate)

In [3]:
# Pair every molecule with its Morgan fingerprint (radius 2) so that the
# molecule and its fingerprint can be looked up by the same index.
mols_fps = []
for mol in mols:
    fingerprint = ch.GetMorganFingerprint(mol, 2)
    mols_fps.append((mol, fingerprint))

In [4]:
# Picker object used below (via LazyPick) to select a subset of the molecules.
picker = MaxMinPicker()

In [5]:
def fp_distance(i, j, mols_fps=mols_fps):
    """Return the Tanimoto distance (1 - similarity) between two fingerprints.

    Parameters:
        i, j: indices into ``mols_fps``.
        mols_fps: sequence of ``(molecule, fingerprint)`` pairs. The default
            is bound when this function is defined, so re-run this cell
            after rebuilding the global ``mols_fps``.
    """
    first_fp = mols_fps[i][1]
    second_fp = mols_fps[j][1]
    similarity = DataStructs.TanimotoSimilarity(first_fp, second_fp)
    return 1 - similarity

In [6]:
# Ask the picker for 200 indices out of the whole fingerprint pool, using
# fp_distance as the pairwise distance; the fixed seed is passed through to
# LazyPick so the call is made with the same arguments on every run.
pool_size = len(mols_fps)
pickIndices = picker.LazyPick(fp_distance, pool_size, 200, seed=666)

In [7]:
# Collect the (molecule, fingerprint) pairs at the picked indices, in the
# order the picker returned them.
filtered_mols_fps = []
for picked_index in pickIndices:
    filtered_mols_fps.append(mols_fps[picked_index])
# `result` is the name the following cells read; it refers to the same list.
result = filtered_mols_fps

In [16]:
def num_hydrogen_bond_acceptors(mol):
    """Return the value of RDKit's ``CalcNumLipinskiHBA`` for ``mol``."""
    acceptor_count = rdescriptors.CalcNumLipinskiHBA(mol)
    return acceptor_count

def num_hydrogen_bond_donors(mol):
    """Return the value of RDKit's ``CalcNumLipinskiHBD`` for ``mol``."""
    donor_count = rdescriptors.CalcNumLipinskiHBD(mol)
    return donor_count

def MW(mol):
    """Return the value of RDKit's ``MolWt`` descriptor for ``mol``."""
    molecular_weight = Descriptors.MolWt(mol)
    return molecular_weight

def logP(mol):
    """Return the value of RDKit's ``MolLogP`` descriptor for ``mol``."""
    log_p = Descriptors.MolLogP(mol)
    return log_p

def TPSA(mol):
    """Return the value of RDKit's ``TPSA`` descriptor for ``mol``."""
    polar_surface_area = Descriptors.TPSA(mol)
    return polar_surface_area

def num_rotatable_bonds(mol):
    """Return the value of RDKit's ``NumRotatableBonds`` descriptor for ``mol``."""
    rotatable_count = Descriptors.NumRotatableBonds(mol)
    return rotatable_count

def num_heavy_atoms(mol):
    """Return ``mol.GetNumHeavyAtoms()``, the molecule's heavy-atom count."""
    heavy_atom_count = mol.GetNumHeavyAtoms()
    return heavy_atom_count

In [22]:
# Build one record per selected molecule:
# (mol, H-bond acceptors, H-bond donors, MW, logP, TPSA, rotatable bonds,
#  heavy atoms). The fingerprint half of each pair is not needed here.
prep_data = []
for mol, _unused_fp in result:
    prep_data.append((
        mol,
        num_hydrogen_bond_acceptors(mol),
        num_hydrogen_bond_donors(mol),
        MW(mol),
        logP(mol),
        TPSA(mol),
        num_rotatable_bonds(mol),
        num_heavy_atoms(mol),
    ))

In [23]:
# Display the first record to check the tuple layout built in the previous cell.
prep_data[0]

(<rdkit.Chem.rdchem.Mol at 0x7f5e7665f8f0>,
 6,
 4,
 294.742,
 1.5508999999999993,
 84.41,
 2,
 20)

In [30]:
# Write the descriptor table as CSV, one row per record in prep_data.
# Column order follows the prep_data tuples: name (DRUGBANK_ID property),
# HA = H-bond acceptors, HD = H-bond donors, mw, lp = logP, tpsa,
# rb = rotatable bonds, ha = heavy atoms.
# NOTE(review): "HA" and "ha" differ only by case; confirm the downstream tool
# treats column names case-sensitively before renaming anything.
# The `with` block closes the file even if a write (or GetProp) raises
# part-way through the loop, so no half-written open handle is left behind.
with open("../out/lipinski.csv", "w") as out:
    out.write("name,HA,HD,mw,lp,tpsa,rb,ha\n")
    for m, HA, HD, mw, lp, tpsa, rb, ha in prep_data:
        out.write("%s,%d,%d,%f,%f,%f,%d,%d\n" % (m.GetProp("DRUGBANK_ID"), HA, HD, mw, lp, tpsa, rb, ha))

In [35]:
# Write the metadata CSV: the DRUGBANK_ID property and the SMILES string of
# each molecule in prep_data, in the same row order as prep_data.
# The `with` block closes the file even if GetProp or MolToSmiles raises
# part-way through the loop.
with open("../out/lipinski_metadata.csv", "w") as out:
    out.write("name,smiles\n")
    for record in prep_data:
        # Each record is a tuple whose first element is the molecule.
        mol = record[0]
        out.write("%s,%s\n" % (mol.GetProp("DRUGBANK_ID"), ch.MolToSmiles(mol)))

Pro spuštění shlukování spusťte v prostředí InCHlib
(http://www.openscreen.cz/software/inchlib/home/) příkaz podobný tomuto:
python inchlib_clust.py ../out/lipinski.csv -dh -n -html ../out/clust/