## Preprocessing

In [1]:
import os

# The cells below use paths relative to the repository root ("data/...") and
# import the local `deepscreening` package, so the working directory must be
# the root rather than the notebook's own folder.
# Only step up when the package directory is not already visible from the
# current directory; an unconditional chdir would climb one more level every
# time this cell is re-run.
if not os.path.isdir("deepscreening"):
    os.chdir("../")

In [2]:
# Shell escape: list the contents of the current working directory.
! ls

Icon?           README.md       deepscreening   image           notebook
LICENSE         data            environment.yml model           train.py


#### data for ChemVAE

```sh
$ tree data
data
├── 250k_rndm_zinc_drugs_clean_3.csv
├── idx2smiles.json
├── test_data
│   ├── test_chemvae_logit.npy
│   ├── test_chemvae_reg.npy
│   └── test_smiles.npy
└── train_data
    ├── train_chemvae_logit.npy
    ├── train_chemvae_reg.npy
    └── train_smiles.npy
```

In [6]:
import json
import numpy as np
import pandas as pd
from deepscreening.utils import smiles2onehot
from kerasy.utils import train_test_split
from kerasy.utils import disp_val_shapes

In [7]:
# Load the source table and pull out the SMILES strings and target columns.
df = pd.read_csv("data/250k_rndm_zinc_drugs_clean_3.csv")
smiles  = df.smiles.values
y_reg   = df[["logP", "SAS"]].values   # two-column target saved as *_chemvae_reg.npy
y_logit = df[["qed"]].values           # one-column target saved as *_chemvae_logit.npy

# Encode the SMILES strings; `return_meta=True` also returns the metadata that
# is persisted below as idx2smiles.json.
X, idx2smiles = smiles2onehot(smiles, return_meta=True)

with open("data/idx2smiles.json", mode='w') as f:
    json.dump(idx2smiles, f)

# Split all three arrays with the same settings (fixed seed, 20% held out).
(x_train, x_test), (y_reg_train, y_reg_test), (y_logit_train, y_logit_test) = train_test_split(
    X, y_reg, y_logit, random_state=0, test_size=0.2
)

# np.save does not create missing parent directories, so make sure the output
# folders exist before writing (a fresh checkout may not have them).
os.makedirs("data/train_data", exist_ok=True)
os.makedirs("data/test_data", exist_ok=True)

np.save("data/train_data/train_smiles.npy",        x_train)
np.save("data/train_data/train_chemvae_reg.npy",   y_reg_train)
np.save("data/train_data/train_chemvae_logit.npy", y_logit_train)
np.save("data/test_data/test_smiles.npy",          x_test)
np.save("data/test_data/test_chemvae_reg.npy",     y_reg_test)
np.save("data/test_data/test_chemvae_logit.npy",   y_logit_test)

# Print the shape of every array that was just written.
disp_val_shapes(x_train, y_reg_train, y_logit_train, x_test, y_reg_test, y_logit_test, scope_=locals())

#=== ARRAY SHAPES ===
x_train.shape      : (199564, 109, 35)
y_reg_train.shape  : (199564, 2)
y_logit_train.shape: (199564, 1)
x_test.shape       : (49891, 109, 35)
y_reg_test.shape   : (49891, 2)
y_logit_test.shape : (49891, 1)


#### data for DeepBind

```sh
$ tree data
data
├── 250k_rndm_zinc_drugs_clean_3.csv
├── idx2smiles.json
├── test_data
│   ├── test_chemvae_logit.npy
│   ├── test_chemvae_reg.npy
│   └── test_smiles.npy
└── train_data
    ├── train_chemvae_logit.npy
    ├── train_chemvae_reg.npy
    └── train_smiles.npy
```