## `write_to_hdf5` needs InChI keys; this notebook generates them

In [1]:
# From chembl raw data, get our smiles

In [1]:
import numpy as np
import pandas as pd
import os

def get_env_var(handle):
    """Read an environment variable and return it without surrounding single quotes.

    Parameters
    ----------
    handle : str
        Name of the environment variable to look up.

    Returns
    -------
    str
        The variable's value with any leading/trailing ``'`` characters removed.

    Raises
    ------
    LookupError
        If the variable is unset or set to an empty string.
    """
    value = os.environ.get(handle)
    # Treat "unset" and "set but empty" the same way: both are unusable.
    if value is None or len(value) == 0:
        raise LookupError("Environment variable: {} not set.".format(handle))
    return value.strip("'")

In [2]:
# Root of the data tree; must be supplied through the DATA_SAVE_BASE environment variable.
base = get_env_var("DATA_SAVE_BASE")
# os.path.join avoids the doubled "/" that plain string formatting produces
# when DATA_SAVE_BASE ends with a trailing slash.
expt_base = os.path.join(base, "20180525_DM_scrubbing")
# Input: tab-separated, gzip-compressed ChEMBL table.
raw_chembl_name = os.path.join(expt_base, "raw_data", "full_chembl20_cutoff800_dm_scrubbed.csv.gz")
# Output: molecule ID / SMILES table.
# NOTE: this file is written gzip-compressed further down despite its ".smi" extension.
smiles = os.path.join(expt_base, "raw_data", "all_chembl_smiles_mid_mwcutoff800.smi")

In [14]:
# Load the ChEMBL table (tab-separated, gzip-compressed); the first column becomes the index.
df = pd.read_csv(raw_chembl_name, sep="\t", compression="gzip", index_col=0)

In [18]:
# Keep only the molecule ID and SMILES columns.
mid_smiles = df[["ChEMBL_Molecule_ID", "SMILES"]]

In [21]:
# Collapse to one row per distinct (molecule ID, SMILES) pair; the number of
# original rows in each group is kept in a 'count' column.
mid_smiles_unique = (
    mid_smiles
    .groupby(["ChEMBL_Molecule_ID", "SMILES"])
    .size()
    .reset_index(name="count")
)

In [23]:
# Drop the helper 'count' column, leaving only the unique ID / SMILES pairs.
mid_smiles_unique = mid_smiles_unique[["ChEMBL_Molecule_ID", "SMILES"]]

In [29]:
# Write the unique ID / SMILES pairs as a tab-separated file with a header row and no index.
# NOTE: the output is gzip-compressed even though the target path ends in ".smi".
mid_smiles_unique.to_csv(smiles, sep="\t", compression="gzip", header=True, index=False)

In [30]:
!zcat $smiles | head

ChEMBL_Molecule_ID	SMILES
CHEMBL1	COc1ccc2[C@@H]3[C@H](COc2c1)C(C)(C)OC4=C3C(=O)C(=O)C5=C4OC(C)(C)[C@@H]6COc7cc(OC)ccc7[C@H]56
CHEMBL10	C[S+]([O-])c1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4
CHEMBL1000	OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3
CHEMBL10000	Ic1ccc(NC2=Nc3ccccc3C(=O)O2)cc1
CHEMBL100003	CCCC1C(=C(C)N=C(C)/C/1=C(\\O)/OC)C(=O)OCC
CHEMBL100004	CCO\\C(=C\\1/C(C)C(=C(C)N=C1C)C(=O)OCCSc2ccccc2)\\O
CHEMBL100005	COC(=O)C(Cc1ccc2OCOc2c1)c3c4ccccc4nc5ccccc35
CHEMBL100006	COc1cc(C)c(OC)c(CC(C)N)c1
CHEMBL100010	COC(=O)C1=C(CC2CCC1C2)c3ccccc3

gzip: stdout: Broken pipe


## Create INCHI lookup dict

Ran from this dir:

```
./get_inchi.sh
```

In [116]:
import cPickle as pkl
import gzip

In [117]:
# Input: path to the tab-separated, gzip-compressed table read below
# (despite the name, this is a file path, not a dict).
inchi_dict="{}/raw_data/chembl20_MWmax800_smiles2inchi2mid.csv.gz".format(expt_base)
# Output: InChI key / SMILES table, written below as a gzip-compressed, headerless TSV.
inchi2smiles = "{}/raw_data/inchi2smiles.csv.gz".format(expt_base)
# Output: pickled {mid: InChI key} dict.
# NOTE: written through gzip.open below, so the file is gzip-compressed despite the ".pkl" extension.
mid2inchi = "{}/raw_data/mid2inchi.pkl".format(expt_base)

In [118]:
# Load the tab-separated, gzip-compressed table; the cells below use its
# "smiles", "INCHI key" and "mid" columns.
inchi_df = pd.read_csv(inchi_dict, sep="\t", compression="gzip")

In [119]:
# separate out data for convenience & reduce duplicate smiles
# Building a dict keyed on a column collapses repeated keys (the last row wins).
# The column is selected *before* to_dict() so only the mapping we need is
# materialized, instead of converting every column and discarding the rest.
inchi_to_smiles = inchi_df.set_index("INCHI key")["smiles"].to_dict()
mid_to_inchi = inchi_df.set_index("mid")["INCHI key"].to_dict()

# Back to a two-column frame (InChI key, SMILES) so it can be written with to_csv.
# NOTE: this rebinds `inchi_to_smiles` from a dict to a DataFrame.
inchi_to_smiles = pd.DataFrame.from_dict(inchi_to_smiles, orient="index").reset_index()

In [125]:
# save data
# InChI key / SMILES table: headerless, index-free, gzip-compressed TSV.
inchi_to_smiles.to_csv(inchi2smiles, sep="\t", compression="gzip", index=False, header=False)
# mid -> InChI key mapping, pickled (protocol 0) through a gzip stream so the
# molecules can be looked up again later.
with gzip.open(mid2inchi, "wb") as pkl_out:
    pkl.dump(mid_to_inchi, pkl_out, protocol=0)

# Run fingerprint generation (from SMILES)

`fp \t INCHI`

```
./fingerprint_smiles.sh
```

In [3]:
#check output
# Path of the fingerprint file under raw_data
# (presumably produced by ./fingerprint_smiles.sh -- confirm).
fp_file = "{}/raw_data/chembl20_MWmax800_fps.fp.gz".format(expt_base)

In [4]:
fp_file

'/srv/nas/mk1/users/ecaceres//20180525_DM_scrubbing/raw_data/chembl20_MWmax800_fps.fp.gz'

In [6]:
pwd


u'/srv/home/ecaceres/labgits/lab-notebook-caceres/Projects/nnets/20180525_DM_scrubbing'

In [11]:
!zcat $expt_base/raw_data/all_chembl_smiles_mid_mwcutoff800.smi | head

ChEMBL_Molecule_ID	SMILES
CHEMBL1	COc1ccc2[C@@H]3[C@H](COc2c1)C(C)(C)OC4=C3C(=O)C(=O)C5=C4OC(C)(C)[C@@H]6COc7cc(OC)ccc7[C@H]56
CHEMBL10	C[S+]([O-])c1ccc(cc1)c2nc(c3ccc(F)cc3)c([nH]2)c4ccncc4
CHEMBL1000	OC(=O)COCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3
CHEMBL10000	Ic1ccc(NC2=Nc3ccccc3C(=O)O2)cc1
CHEMBL100003	CCCC1C(=C(C)N=C(C)/C/1=C(\\O)/OC)C(=O)OCC
CHEMBL100004	CCO\\C(=C\\1/C(C)C(=C(C)N=C1C)C(=O)OCCSc2ccccc2)\\O
CHEMBL100005	COC(=O)C(Cc1ccc2OCOc2c1)c3c4ccccc4nc5ccccc35
CHEMBL100006	COc1cc(C)c(OC)c(CC(C)N)c1
CHEMBL100010	COC(=O)C1=C(CC2CCC1C2)c3ccccc3

gzip: stdout: Broken pipe


In [12]:
!ls $expt_base/raw_data

all_chembl_smiles_mid_mwcutoff800.smi	    get_smiles.log
chembl20_MWmax800_fps.fp.gz		    inchi2smiles.csv.gz
chembl20_MWmax800_smiles2inchi2mid.csv.gz   mid2inchi.csv.gz
full_chembl20_cutoff800_dm_scrubbed.csv.gz  mid2inchi.pkl
get_inchi.log


In [13]:
# NOTE(review): `pwd` ignores the path argument, so this prints the notebook's
# working directory (as the captured output shows), not the raw_data directory.
# If the data path was intended, `!echo $expt_base/raw_data` would show it.
!pwd $expt_base/raw_data

/srv/home/ecaceres/labgits/lab-notebook-caceres/Projects/nnets/20180525_DM_scrubbing
