## Preprocessing

In [1]:
import os

# Step up to the repository root so that the `deepscreening` package is
# importable and the relative "data/..." paths used below resolve.
# The guard keeps this cell safe to re-run: an unconditional chdir("../")
# would climb one more directory on every execution.
if not os.path.isdir("deepscreening"):
    os.chdir("../")

In [2]:
# List the working directory to check that the chdir in the previous cell took effect.
! ls

Icon?                create_new_params.py image
LICENSE              data                 model
README.md            deepscreening        notebook
Untitled.ipynb       environment.yml      train.py


#### data for ChemVAE

```sh
$ tree data
data
├── 250k_rndm_zinc_drugs_clean_3.csv
├── idx2smiles.json
├── test_data
│   ├── x_test_smiles.npy
│   ├── y_test_chemvae_logit.npy
│   └── y_test_chemvae_reg.npy
└── train_data
    ├── x_train_smiles.npy
    ├── x_val_smiles.npy
    ├── y_train_chemvae_logit.npy
    ├── y_train_chemvae_reg.npy
    ├── y_val_chemvae_logit.npy
    └── y_val_chemvae_reg.npy
```

In [3]:
import json
import numpy as np
import pandas as pd
from deepscreening.utils import smiles2onehot
from kerasy.utils import train_test_split
from kerasy.utils import disp_val_shapes

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# --- Load the raw table ------------------------------------------------------
df = pd.read_csv("data/250k_rndm_zinc_drugs_clean_3.csv")
smiles  = df.smiles.values
y_reg   = df[["logP", "SAS"]].values   # two columns, saved below as the *_chemvae_reg arrays
y_logit = df[["qed"]].values           # one column, saved below as the *_chemvae_logit arrays

# --- Encode ------------------------------------------------------------------
# One-hot encode the SMILES strings. With return_meta=True the helper also
# returns `idx2smiles`, which is persisted as JSON next to the data.
# NOTE(review): presumably the index -> symbol lookup needed to decode the
# one-hot arrays later; confirm against deepscreening.utils.smiles2onehot.
x, idx2smiles = smiles2onehot(smiles, return_meta=True)

with open("data/idx2smiles.json", mode='w') as f:
    json.dump(idx2smiles, f)

# --- Split -------------------------------------------------------------------
# Stage 1: hold out 20% of all samples as the test set.
(x_obs,x_test),(y_reg_obs, y_reg_test),(y_logit_obs, y_logit_test) = train_test_split(x, y_reg, y_logit, random_state=0, test_size=0.2)

# Stage 2: hold out 20% of the remaining samples as the validation set.
(x_train,x_val),(y_reg_train, y_reg_val),(y_logit_train, y_logit_val) = train_test_split(x_obs, y_reg_obs, y_logit_obs, random_state=0, test_size=0.2)

# --- Save --------------------------------------------------------------------
# np.save does not create missing parent directories, so make sure the output
# folders exist; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("data/train_data", exist_ok=True)
os.makedirs("data/test_data", exist_ok=True)

# Train and validation arrays both go to train_data/; test arrays go to test_data/.
np.save("data/train_data/x_train_smiles.npy",        x_train)
np.save("data/train_data/y_train_chemvae_reg.npy",   y_reg_train)
np.save("data/train_data/y_train_chemvae_logit.npy", y_logit_train)
np.save("data/train_data/x_val_smiles.npy",          x_val)
np.save("data/train_data/y_val_chemvae_reg.npy",     y_reg_val)
np.save("data/train_data/y_val_chemvae_logit.npy",   y_logit_val)
np.save("data/test_data/x_test_smiles.npy",          x_test)
np.save("data/test_data/y_test_chemvae_reg.npy",     y_reg_test)
np.save("data/test_data/y_test_chemvae_logit.npy",   y_logit_test)

# --- Report ------------------------------------------------------------------
# Print the shape of every split as a sanity check on the split sizes.
disp_val_shapes(
    x_train, y_reg_train, y_logit_train,
    x_val,   y_reg_val,   y_logit_val,
    x_test,  y_reg_test,  y_logit_test,
    scope_=locals()
)

#=== ARRAY SHAPES ===
x_train.shape      : (159651, 109, 35)
y_reg_train.shape  : (159651, 2)
y_logit_train.shape: (159651, 1)
x_val.shape        : (39913, 109, 35)
y_reg_val.shape    : (39913, 2)
y_logit_val.shape  : (39913, 1)
x_test.shape       : (49891, 109, 35)
y_reg_test.shape   : (49891, 2)
y_logit_test.shape : (49891, 1)


#### data for DeepBind

```sh
$ tree data
data
├── 250k_rndm_zinc_drugs_clean_3.csv
├── idx2smiles.json
├── test_data
│   ├── test_chemvae_logit.npy
│   ├── test_chemvae_reg.npy
│   └── test_smiles.npy
└── train_data
    ├── train_chemvae_logit.npy
    ├── train_chemvae_reg.npy
    └── train_smiles.npy
```