# ESOL Dataset

In [1]:
# Stretch the notebook container to the full browser width so wide outputs
# are not clipped. `display` and `HTML` are imported from the public
# `IPython.display` module; `IPython.core.display` is an internal location
# and importing `display` from it is deprecated.
from IPython.display import HTML, display
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import numpy as np
import tensorflow as tf
import deepchem as dc

## Datasets

In [None]:
import pathlib

def dataset(filename):
    """Return the path of *filename* inside the local ``data`` directory.

    Args:
        filename: Name (or relative path) of a file stored under ``./data``.

    Returns:
        A ``pathlib.Path`` joining the ``data`` directory and *filename*.
        The path is only constructed; its existence is not checked.
    """
    data_root = pathlib.Path('.', 'data')
    return data_root.joinpath(pathlib.Path(filename))

In [None]:
# Registry of datasets keyed by short name. At this stage each entry is just
# the path of the raw CSV file; later cells replace it with loaded data.
datasets = {"esol": dataset("esol.csv")}

## Load and Featurize

In [None]:
def load(filepath):
    """Featurize the CSV file at *filepath* into a DeepChem dataset.

    The ``smiles`` column is featurized with circular fingerprints of size
    1024, and the ``measured log solubility in mols per litre`` column is
    used as the single task.

    Args:
        filepath: Path of the CSV file to load.

    Returns:
        The dataset produced by ``CSVLoader.featurize``.
    """
    featurizer = dc.feat.CircularFingerprint(size=1024)
    loader = dc.data.CSVLoader(
        tasks=["measured log solubility in mols per litre"],
        smiles_field="smiles",
        featurizer=featurizer,
    )
    featurized = loader.featurize(filepath)
    # DeepChem datasets report their dimensions through get_shape(); they do
    # not have a `.shape` attribute, so printing that would raise.
    print(featurized.get_shape())
    return featurized

In [None]:
# Replace the stored CSV path with the featurized dataset returned by load().
datasets["esol"] = load(datasets["esol"])

## Train / Validation / Test Split

In [None]:
def split(file):
    """Split a featurized dataset into train, validation and test subsets.

    Args:
        file: The DeepChem dataset to split.

    Returns:
        A dict with the keys ``"train"``, ``"valid"`` and ``"test"`` holding
        the three subsets produced by a ``ScaffoldSplitter``.
    """
    # The splitter is constructed without the dataset; the data to split is
    # supplied to train_valid_test_split() instead.
    splitter = dc.splits.ScaffoldSplitter()
    train, valid, test = splitter.train_valid_test_split(file)
    return dict(train=train, valid=valid, test=test)

In [None]:
# Replace the featurized dataset with its train/valid/test subsets.
# split() takes only the dataset; transformers are fitted and applied in the
# "Transform" section afterwards.
datasets["esol"] = split(datasets["esol"])

## Transform

In [None]:
def transformers(dataset):
    """Build the list of transformers fitted on *dataset*.

    Args:
        dataset: The dataset handed to each transformer's constructor
            (the notebook passes the training split).

    Returns:
        A one-element list containing a ``NormalizationTransformer``
        created with ``transform_y=True``.
    """
    normalizer = dc.trans.NormalizationTransformer(transform_y=True,
                                                   dataset=dataset)
    return [normalizer]

def transform(dataset, transformers):
    """Apply every transformer to the train, valid and test subsets.

    Args:
        dataset: Dict with the keys ``"train"``, ``"valid"`` and ``"test"``.
            It is updated in place.
        transformers: Iterable of objects exposing ``transform(subset)``;
            they are applied in order to each subset.

    Returns:
        The same *dataset* dict, so the call can be used in an assignment.
    """
    for key in ["train", "valid", "test"]:
        for transformer in transformers:
            dataset[key] = transformer.transform(dataset[key])
    # Returning the dict keeps `x = transform(x, ...)` from storing None.
    return dataset

In [None]:
# Fit the transformers on the training split and keep them in a registry
# keyed like `datasets`. The right-hand side is evaluated while
# `transformers` still names the factory function; only afterwards is the
# name rebound to the registry dict. Re-run the cell that defines
# `transformers()` before re-running this one.
transformers = {"esol": transformers(datasets["esol"]["train"])}

# transform() updates the split dict in place, so its result is not assigned.
transform(datasets["esol"], transformers["esol"])

## Optimize

In [None]:
def optimize(model, params, dataset, transformers, metric):
    """Run a DeepChem hyperparameter search and return its result.

    Args:
        model: Model builder handed to ``dc.hyper.HyperparamOpt``.
        params: Dict mapping hyperparameter names to candidate values.
        dataset: Dict providing the ``"train"`` and ``"valid"`` subsets.
        transformers: Transformers forwarded to the search.
        metric: Metric used to score each candidate.

    Returns:
        Whatever ``hyperparam_search`` returns; the notebook unpacks it as
        (best model, best hyperparameters, all results).
    """
    search = dc.hyper.HyperparamOpt(model)
    return search.hyperparam_search(
        params,
        dataset["train"],
        dataset["valid"],
        transformers,
        metric=metric,
    )

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from deepchem.utils.evaluate import Evaluator

def model(dataset, transformers, metric):
    """Fit a 100-tree random forest and score it on the validation subset.

    Args:
        dataset: Dict providing the ``"train"`` and ``"valid"`` subsets.
        transformers: Transformers handed to the ``Evaluator``.
        metric: Metric computed on the validation subset.

    Returns:
        A tuple ``(fitted model, evaluator, scores)`` where *scores* is the
        result of ``compute_model_performance([metric])``.
    """
    forest = dc.models.SklearnModel(RandomForestRegressor(n_estimators=100))
    forest.fit(dataset["train"])
    validator = Evaluator(forest, dataset["valid"], transformers)
    scores = validator.compute_model_performance([metric])
    return forest, validator, scores
    
def rf_model_builder(model_params, model_dir):
    """Build a DeepChem-wrapped random forest for the hyperparameter search.

    Args:
        model_params: Keyword arguments for ``RandomForestRegressor``.
        model_dir: Passed as the second positional argument to
            ``dc.models.SklearnModel``.

    Returns:
        The unfitted ``dc.models.SklearnModel``.
    """
    forest = RandomForestRegressor(**model_params)
    return dc.models.SklearnModel(forest, model_dir)

In [None]:
# Grid searched for the random forest. 'auto' is deliberately absent from
# max_features: for a regressor it selected all features, exactly like None,
# so it only duplicated a grid point, and current scikit-learn releases no
# longer accept it.
params = {
    "n_estimators": [10, 100],
    "max_features": ['sqrt', 'log2', None],
}

metric = dc.metrics.Metric(dc.metrics.r2_score)
# NOTE: this rebinds `model` from the builder function to the fitted model,
# so the cell defining model() must be re-run before re-running this one.
model, evaluator, r2score = model(datasets["esol"], transformers["esol"], metric)

optimized = optimize(rf_model_builder, params, datasets["esol"], transformers["esol"], metric)
best_rf, best_rf_hyperparams, all_rf_results = optimized

## Multitask Network _(tensorflow)_

In [None]:
def NNselector(model_params, model_dir):
    """Build a TensorFlow multitask regressor for the hyperparameter search.

    Args:
        model_params: Hyperparameters forwarded as keyword arguments to the
            model constructor.
        model_dir: Accepted to match the builder signature; not used here.

    Returns:
        An unfitted ``dc.models.TensorflowMultiTaskRegressor`` with one
        hidden layer of 1000 units, dropout 0.25 and batch size 50.
    """
    # The builder signature carries no dataset, so the feature count is read
    # from the ESOL training split held in the module-level `datasets` dict.
    n_features = datasets["esol"]["train"].get_data_shape()[0]
    model = dc.models.TensorflowMultiTaskRegressor(
        1, n_features, layer_sizes=[1000], dropouts=[.25], batch_size=50,
        **model_params)
    return model

In [None]:
# Search space for the TensorFlow network. The learning rate and decay are
# each a single value drawn at random on a log scale when the cell runs
# (10**U(-5, -3) and 10**U(-6, -4)); the epoch count is fixed at 20.
params = dict(
    learning_rate=np.power(10., np.random.uniform(-5, -3, size=1)),
    decay=np.power(10, np.random.uniform(-6, -4, size=1)),
    nb_epoch=[20],
)

optimized = optimize(NNselector, params, datasets["esol"], transformers["esol"], metric)
(best_dnn, best_dnn_hyperparams, all_dnn_results) = optimized

## Multitask Network

In [None]:
def NNselector(model_params, model_dir):
    """Build a multitask regressor for the hyperparameter search.

    Args:
        model_params: Hyperparameters forwarded as keyword arguments to the
            model constructor.
        model_dir: Accepted to match the builder signature; not used here.

    Returns:
        An unfitted ``dc.models.MultiTaskRegressor`` with one hidden layer
        of 1000 units, dropout 0.25 and batch size 50.
    """
    # The builder signature carries no dataset, so the feature count is read
    # from the ESOL training split held in the module-level `datasets` dict.
    n_features = datasets["esol"]["train"].get_data_shape()[0]
    model = dc.models.MultiTaskRegressor(
        1, n_features, layer_sizes=[1000], dropouts=[.25], batch_size=50,
        **model_params)
    return model

In [None]:
# Search space for the multitask network. The learning rate and decay are
# each a single value drawn at random on a log scale when the cell runs
# (10**U(-5, -3) and 10**U(-6, -4)); the epoch count is fixed at 20.
params = dict(
    learning_rate=np.power(10., np.random.uniform(-5, -3, size=1)),
    decay=np.power(10, np.random.uniform(-6, -4, size=1)),
    nb_epoch=[20],
)

optimized = optimize(NNselector, params, datasets["esol"], transformers["esol"], metric)
(best_dnn, best_dnn_hyperparams, all_dnn_results) = optimized

## Evaluation

In [None]:
# Score the best random forest found by the search on the held-out test
# subset, using the same metric as the search.
rf_test_evaluator = Evaluator(best_rf, datasets["esol"]["test"], transformers["esol"])
rf_test_r2score = rf_test_evaluator.compute_model_performance([metric])
# The score dict is indexed by the metric's name, "r2_score".
print("RF Test set R^2 %f" % (rf_test_r2score["r2_score"]))

# Name of the single regression task (the CSV column used as the target).
task = "measured log solubility in mols per litre"
# Test-set predictions of the best random forest.
# NOTE(review): `task` and `predicted_test` are not used in the visible cells;
# presumably consumed by later cells — confirm.
predicted_test = best_rf.predict(datasets["esol"]["test"])