In [122]:
#import argparse
import multiprocessing
import os

import numpy as np
import pandas as pd
import deepchem as dc
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV
from deepchem.utils.data_utils import load_from_disk, save_to_disk
import umap
import matplotlib.pyplot as plt
import plotly.express as px

In [139]:
# Input CSV and the two columns read from it.
TRAIN = "../data/smiles_all_property.csv"
TARGET_COL = "logP"
SMILES_COL = "smiles"

# Directory-friendly spelling of the target name: the two targets listed
# here are remapped, every other target name is used unchanged.
PROPERTY = {
    "Decomposition Energy": "Decomposition_Energy",
    "HOMO/LUMO gap": "HOMOLUMO_gap",
}.get(TARGET_COL, TARGET_COL)

# Per-property output locations.
MODEL_DIR = f"../model/{PROPERTY}"
LOG_DIR = f"../log/{PROPERTY}"
DATA_DIR = f"../data/{PROPERTY}"

# True when DATA_DIR already contains a tasks.json file.
DATA_EXIST = os.path.exists(f"{DATA_DIR}/tasks.json")

In [140]:

def model_builder(**model_params):
    """Build a DeepChem-wrapped LightGBM regressor from keyword hyperparameters.

    Parameters
    ----------
    **model_params
        Hyperparameters applied to a fresh ``LGBMRegressor`` via
        ``set_params``. An optional ``model_dir`` entry is not treated as a
        LightGBM hyperparameter; it is forwarded to
        ``dc.models.SklearnModel`` instead.

    Returns
    -------
    dc.models.SklearnModel
        The wrapper around the configured estimator.
    """
    # NOTE(review): DeepChem's hyperparameter searchers appear to pass
    # ``model_dir`` alongside the hyperparameters — confirm against the
    # installed version. Without this pop it would be set on the estimator.
    model_dir = model_params.pop("model_dir", None)

    estimator = LGBMRegressor()
    estimator.set_params(**model_params)
    return dc.models.SklearnModel(estimator, model_dir=model_dir)

In [141]:
# RDKit descriptor featurizer; handed to the CSV loader together with the
# name of the SMILES column.
featurizer = dc.feat.RDKitDescriptors()

In [142]:
# Loader for the training data: TARGET_COL is the single task, SMILES_COL
# is the field given to the featurizer.
loader = dc.data.CSVLoader(
    tasks=[TARGET_COL],
    feature_field=SMILES_COL,
    featurizer=featurizer,
)

In [143]:
# Reuse the dataset already stored in DATA_DIR when its tasks.json was found;
# otherwise build it from the CSV, passing DATA_DIR as the dataset directory.
if DATA_EXIST:
    dataset = dc.data.DiskDataset(DATA_DIR)
else:
    dataset = loader.create_dataset(TRAIN, data_dir=DATA_DIR)

print("Data loaded.")

Data loaded.


In [144]:
# Splitter used for the train/valid/test partition.
# NOTE(review): IndexSplitter presumably splits by row order without
# shuffling — confirm that is appropriate for how the CSV is ordered.
splitter = dc.splits.IndexSplitter()

In [145]:
# 60 % / 20 % / 20 % partition of the full dataset.
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset,
    frac_train=0.6,
    frac_valid=0.2,
    frac_test=0.2,
)

In [146]:
# Transformers applied to the splits: a single y-normalizer constructed
# from the training split.
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset),
]

In [147]:
# Apply every transformer to the training and validation splits, rebinding
# each name to its transformed dataset.
# NOTE(review): because the names are rebound, re-running this cell applies
# the transformers again to already-transformed data — use Restart & Run All.
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    # NOTE(review): the test split is left untransformed (line below is
    # disabled) — confirm this matches how it is evaluated later.
    #test_dataset = transformer.transform(test_dataset)

In [148]:
# Feature-matrix shape of each split, in train / valid / test order.
for split in (train_dataset, valid_dataset, test_dataset):
    print(split.X.shape)

(52576, 208)
(17526, 208)
(17526, 208)


In [149]:
# Display the training feature matrix (bare expression, shown as cell output).
train_dataset.X

array([[ 8.65305556,  0.08796296,  8.65305556, ...,  0.        ,
         0.        ,  0.        ],
       [ 8.69802469,  0.24481481,  8.69802469, ...,  0.        ,
         0.        ,  0.        ],
       [10.19856481, -0.67361111, 10.19856481, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 4.9600463 ,  0.53303655,  4.9600463 , ...,  0.        ,
         0.        ,  0.        ],
       [12.91320122, -0.21767248, 12.91320122, ...,  0.        ,
         0.        ,  0.        ],
       [13.09113958, -3.95995607, 13.09113958, ...,  0.        ,
         0.        ,  0.        ]])

In [150]:
# Text summary of the full dataset, then of the training split.
for ds in (dataset, train_dataset):
    print(ds)

<DiskDataset X.shape: (87628, 208), y.shape: (87628, 1), w.shape: (87628, 1), task_names: ['logP']>
<DiskDataset X.shape: (52576, 208), y.shape: (52576, 1), w.shape: (52576, 1), task_names: ['logP']>


In [151]:
# Number of CPUs reported for this machine (module imported in the top cell).
multiprocessing.cpu_count()

8

In [158]:
# 3-component UMAP embedding of the full feature matrix, with a fixed seed.
umap_kwargs = dict(
    n_components=3,
    n_neighbors=15,
    metric='euclidean',
    random_state=42,
)
standard_embedding = umap.UMAP(**umap_kwargs).fit_transform(dataset.X)


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [159]:
# Embedding coordinates become columns 0, 1, 2; dataset.y is attached as
# the 'property' column.
df_umap = pd.DataFrame(standard_embedding).assign(property=dataset.y)
df_umap

Unnamed: 0,0,1,2,property
0,16.121729,9.098937,5.862377,1.09780
1,15.919868,8.811953,6.026783,2.04560
2,16.109858,9.114559,5.947010,0.23918
3,16.083319,9.136003,6.018345,1.09344
4,16.099524,9.130603,5.995924,1.09344
...,...,...,...,...
87623,7.456668,0.176043,7.393705,3.26300
87624,13.846591,0.190046,7.164008,2.42940
87625,7.349648,2.420997,6.600700,3.78532
87626,6.327453,3.135284,6.756073,3.62800


In [160]:
# 3-D scatter of the embedding columns, colored by 'property', drawn with
# size-1 markers.
fig_3d = px.scatter_3d(
    df_umap,
    x=0,
    y=1,
    z=2,
    color='property',
    width=800,
    height=800,
).update_traces(marker_size=1)
fig_3d.show()