# QM7 Dataset  
[reference examples](https://github.com/deepchem/deepchem/tree/master/examples/qm7)

In [None]:
# Inject CSS that sets the notebook `.container` width to 100%.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import numpy as np
import tensorflow as tf
import sklearn 
import deepchem as dc

In [None]:
import pathlib
# Directory handed to every model builder in this notebook as `model_dir`.
model_dir = pathlib.Path('..') / 'models'

**Random Forest**  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/)

In [None]:
# Load QM7 with the 'Raw' featurizer and a stratified split, then unpack the
# three dataset partitions returned by the loader.
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    split='stratified',
    featurizer='Raw',
)
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
from sklearn.ensemble import RandomForestRegressor

def rf_model_builder(model_params, model_dir):
    """Build a DeepChem-wrapped scikit-learn random forest regressor.

    Args:
        model_params: Keyword arguments forwarded to ``RandomForestRegressor``.
        model_dir: Directory passed to ``dc.models.SklearnModel``.

    Returns:
        A ``dc.models.SklearnModel`` wrapping the configured estimator.
    """
    # The estimator receives hyper-parameters only. `model_dir` belongs to the
    # DeepChem wrapper; passed positionally to the estimator it would be bound
    # to its first parameter (`n_estimators`).
    rf_model = RandomForestRegressor(**model_params)
    return dc.models.SklearnModel(rf_model, model_dir)

In [None]:
# Metrics reported for each split. The list must be bound to `metric`, the
# name the `evaluate` calls below read. Mean absolute error is used, as in
# the other sections of this notebook.
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")
]

# TODO: choose random-forest hyper-parameters; an empty dict means the
# estimator's defaults are used.
params = {}

# NOTE(review): the dataset above is loaded with the 'Raw' featurizer —
# confirm that it yields numeric features a scikit-learn estimator can fit.
model = rf_model_builder(params, model_dir)

print("training RF model")
model.fit(train_dataset)

print("evaluating RF model on {train,valid,test} sets")
train_scores = model.evaluate(train_dataset, metric, transformers)
print("train scores: [kcal/mol]")
print(train_scores)

valid_scores = model.evaluate(valid_dataset, metric, transformers)
print("valid scores: [kcal/mol]")
print(valid_scores)

test_scores = model.evaluate(test_dataset, metric, transformers)
print("test scores: [kcal/mol]")
print(test_scores)

**Multitask Network**  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/qm7)

In [None]:
# Reload QM7 ('Raw' featurizer, stratified split) for the multitask network
# section and unpack the train/valid/test partitions.
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    split='stratified',
    featurizer='Raw',
)
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
# `dc` is only an import alias, so `from dc.models import ...` cannot be
# resolved; the real package name has to be used.
# NOTE(review): newer DeepChem releases may spell this class
# `MultitaskRegressor` — confirm against the installed version.
from deepchem.models import MultiTaskRegressor

def mtr_model_builder(model_params, model_dir):
    """Build a multitask regression network.

    Args:
        model_params: Keyword arguments forwarded to ``MultiTaskRegressor``
            (for example ``n_tasks``).
        model_dir: Directory passed to the model as ``model_dir``.

    Returns:
        The constructed ``MultiTaskRegressor``.
    """
    # `model_dir` is passed by keyword: given positionally it would occupy the
    # first constructor parameter and collide with `n_tasks` in `model_params`.
    mtr_model = MultiTaskRegressor(model_dir=model_dir, **model_params)
    return mtr_model

In [None]:
# Metrics reported for each split. The list is bound to `metric` because that
# is the name the `evaluate` calls below read.
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]

# NOTE(review): only `n_tasks` is supplied; the regressor presumably also
# needs the input feature size — confirm against the installed DeepChem.
params = {
    "n_tasks": len(qm7_tasks)
}

model = mtr_model_builder(params, model_dir)

print("training MTR model")
model.fit(train_dataset, nb_epoch=50)

print("evaluating MTR model on {train,valid,test} sets")
train_scores = model.evaluate(train_dataset, metric, transformers)
print("train scores: [kcal/mol]")
print(train_scores)

valid_scores = model.evaluate(valid_dataset, metric, transformers)
print("valid scores: [kcal/mol]")
print(valid_scores)

test_scores = model.evaluate(test_dataset, metric, transformers)
print("test scores: [kcal/mol]")
print(test_scores)

**Kernel Ridge Regression**  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/qm7)

In [None]:
# Reload QM7 ('Raw' featurizer, stratified split) for the kernel ridge
# regression section and unpack the train/valid/test partitions.
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    split='stratified',
    featurizer='Raw',
)
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
from sklearn.kernel_ridge import KernelRidge

def krr_model_builder(model_params, model_dir):
    """Build a per-task kernel ridge regression model.

    Args:
        model_params: Keyword arguments forwarded to ``KernelRidge``.
        model_dir: Directory passed to ``dc.models.SingletaskToMultitask``.

    Returns:
        A ``dc.models.SingletaskToMultitask`` over the tasks in the
        notebook-level ``qm7_tasks``.
    """
    def sklearn_model_builder(task_model_dir):
        # A fresh estimator is created on every call so task models do not
        # share one estimator object. The estimator takes hyper-parameters
        # only: a positional `model_dir` would be bound to its first
        # parameter (`alpha`) and clash with `alpha` in `model_params`.
        return dc.models.SklearnModel(KernelRidge(**model_params),
                                      task_model_dir)

    return dc.models.SingletaskToMultitask(qm7_tasks,
                                           sklearn_model_builder,
                                           model_dir)

In [None]:
# Single metric for this section: mean absolute error.
metric = [dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")]

# Kernel ridge hyper-parameters forwarded to the builder.
params = dict(kernel="rbf", alpha=5e-4, gamma=0.008)

print("training KRR model")
model = krr_model_builder(params, model_dir)
model.fit(train_dataset)

# Each split gets its own Evaluator; the header line and the score dict are
# printed on separate lines.
print("evaluating KRR model on {train,valid,test} sets")
train_evaluator = dc.utils.evaluate.Evaluator(
    model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance(metric)
print("train scores: [kcal/mol]", train_scores, sep="\n")

valid_evaluator = dc.utils.evaluate.Evaluator(
    model, valid_dataset, transformers)
valid_scores = valid_evaluator.compute_model_performance(metric)
print("valid scores: [kcal/mol]", valid_scores, sep="\n")

test_evaluator = dc.utils.evaluate.Evaluator(
    model, test_dataset, transformers)
test_scores = test_evaluator.compute_model_performance(metric)
print("test scores: [kcal/mol]", test_scores, sep="\n")

**Graph Convolution**  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/qm7/qm7_tensorgraph_GraphConv.py)

In [None]:
# Load QM7 with the 'GraphConv' featurizer (no split argument is given, so
# the loader's default applies) and unpack the three partitions.
loader_options = {'featurizer': 'GraphConv'}
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    **loader_options)
del loader_options  # keep the notebook namespace unchanged
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
# `dc` is only an import alias, so `from dc.models... import ...` cannot be
# resolved; the real package name has to be used.
from deepchem.models.tensorgraph.models.graph_models import GraphConvTensorGraph

def gc_model_builder(model_params, model_dir):
    """Build a graph-convolution model in regression mode.

    Args:
        model_params: Keyword arguments forwarded to ``GraphConvTensorGraph``.
        model_dir: Directory passed to the model as ``model_dir``.

    Returns:
        The constructed ``GraphConvTensorGraph``.
    """
    # `tensorboard=True` is intentionally left disabled here.
    return GraphConvTensorGraph(mode="regression",
                                model_dir=model_dir,
                                **model_params)

In [None]:
# Metrics reported for each split: mean absolute error and Pearson R^2.
metric = [
    dc.metrics.Metric(fn, mode="regression")
    for fn in (dc.metrics.mean_absolute_error, dc.metrics.pearson_r2_score)
]

# Hyper-parameters forwarded to the graph-convolution builder.
params = dict(n_tasks=len(qm7_tasks), batch_size=64, learning_rate=0.001)

model = gc_model_builder(params, model_dir)

print("training GC model")
model.fit(train_dataset, nb_epoch=50)

# Score every split; header and score dict are printed on separate lines.
print("evaluating GC model on {train,valid,test} sets")
train_scores = model.evaluate(train_dataset, metric, transformers)
print("train scores: [kcal/mol]", train_scores, sep="\n")

valid_scores = model.evaluate(valid_dataset, metric, transformers)
print("valid scores: [kcal/mol]", valid_scores, sep="\n")

test_scores = model.evaluate(test_dataset, metric, transformers)
print("test scores: [kcal/mol]", test_scores, sep="\n")

In [None]:
# import subprocess
# subprocess.run(['tensorboard', f'--logdir={model.model_dir}'])

**Multitask Network** _(Coulomb Matrix Featurizer)_  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/)

In [None]:
# Load QM7 with the 'CoulombMatrix' featurizer and a stratified split, then
# unpack the train/valid/test partitions.
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    split='stratified',
    featurizer='CoulombMatrix',
)
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
# `dc` is only an import alias, so `from dc.models import ...` cannot be
# resolved; the real package name has to be used.
# NOTE(review): newer DeepChem releases may spell this class
# `MultitaskRegressor` — confirm against the installed version.
from deepchem.models import MultiTaskRegressor

def mtr_model_builder(model_params, model_dir):
    """Build a multitask regression network.

    Args:
        model_params: Keyword arguments forwarded to ``MultiTaskRegressor``
            (for example ``n_tasks``).
        model_dir: Directory passed to the model as ``model_dir``.

    Returns:
        The constructed ``MultiTaskRegressor``.
    """
    # `model_dir` is passed by keyword: given positionally it would occupy the
    # first constructor parameter and collide with `n_tasks` in `model_params`.
    mtr_model = MultiTaskRegressor(model_dir=model_dir, **model_params)
    return mtr_model

In [None]:
# Metrics reported for each split. The list is bound to `metric` because that
# is the name the `evaluate` calls below read.
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]

# NOTE(review): only `n_tasks` is supplied; the regressor presumably also
# needs the input feature size — confirm against the installed DeepChem.
params = {
    "n_tasks": len(qm7_tasks) # TODO
}

model = mtr_model_builder(params, model_dir)

print("training MTR model")
model.fit(train_dataset, nb_epoch=50)

print("evaluating MTR model on {train,valid,test} sets")
train_scores = model.evaluate(train_dataset, metric, transformers)
print("train scores: [kcal/mol]")
print(train_scores)

valid_scores = model.evaluate(valid_dataset, metric, transformers)
print("valid scores: [kcal/mol]")
print(valid_scores)

test_scores = model.evaluate(test_dataset, metric, transformers)
print("test scores: [kcal/mol]")
print(test_scores)

**Kernel Ridge Regression** _(Coulomb Matrix Featurizer)_  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/qm7/qm7_sklearn.py)

In [None]:
# Reload QM7 ('CoulombMatrix' featurizer, stratified split) for the kernel
# ridge regression section and unpack the train/valid/test partitions.
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    split='stratified',
    featurizer='CoulombMatrix',
)
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
from sklearn.kernel_ridge import KernelRidge

def krr_model_builder(model_params, model_dir):
    """Build a per-task kernel ridge regression model.

    Args:
        model_params: Keyword arguments forwarded to ``KernelRidge``.
        model_dir: Directory passed to ``dc.models.SingletaskToMultitask``.

    Returns:
        A ``dc.models.SingletaskToMultitask`` over the tasks in the
        notebook-level ``qm7_tasks``.
    """
    def sklearn_model_builder(task_model_dir):
        # A fresh estimator is created on every call so task models do not
        # share one estimator object. The estimator takes hyper-parameters
        # only: a positional `model_dir` would be bound to its first
        # parameter (`alpha`) and clash with `alpha` in `model_params`.
        return dc.models.SklearnModel(KernelRidge(**model_params),
                                      task_model_dir)

    return dc.models.SingletaskToMultitask(qm7_tasks,
                                           sklearn_model_builder,
                                           model_dir)

In [None]:
# Single metric for this section: mean absolute error.
metric = [dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")]

# Kernel ridge hyper-parameters forwarded to the builder.
params = dict(kernel="rbf", alpha=5e-4, gamma=0.008)

model = krr_model_builder(params, model_dir)

print("training KRR model")
model.fit(train_dataset)

# Each split gets its own Evaluator; the header line and the score dict are
# printed on separate lines.
print("evaluating KRR model on {train,valid,test} sets")
train_evaluator = dc.utils.evaluate.Evaluator(
    model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance(metric)
print("train scores: [kcal/mol]", train_scores, sep="\n")

valid_evaluator = dc.utils.evaluate.Evaluator(
    model, valid_dataset, transformers)
valid_scores = valid_evaluator.compute_model_performance(metric)
print("valid scores: [kcal/mol]", valid_scores, sep="\n")

test_evaluator = dc.utils.evaluate.Evaluator(
    model, test_dataset, transformers)
test_scores = test_evaluator.compute_model_performance(metric)
print("test scores: [kcal/mol]", test_scores, sep="\n")

**Deep Tensor Neural Network**  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/qm7/qm7_tensorgraph_DTNN.py)

In [None]:
# Reload QM7 ('Raw' featurizer, stratified split) for the DTNN section and
# unpack the train/valid/test partitions.
qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    split='stratified',
    featurizer='Raw',
)
(train_dataset, valid_dataset, test_dataset) = qm7_datasets

In [None]:
def dtnn_model_builder(model_params, model_dir):
    """Build a ``dc.models.DTNNModel`` in regression mode.

    Args:
        model_params: Extra keyword arguments forwarded to ``DTNNModel``.
        model_dir: Directory passed to the model as ``model_dir``.

    Returns:
        The constructed ``DTNNModel``.
    """
    # Options fixed by this notebook; caller-supplied parameters are added on
    # top, and a duplicated key raises TypeError just as with inline keywords.
    fixed_options = dict(
        output_activation=False,
        use_queue=False,
        mode="regression",
        model_dir=model_dir,
    )
    return dc.models.DTNNModel(**fixed_options, **model_params)

In [None]:
from deepchem.models.tensorgraph.optimizers import ExponentialDecay

# Metrics reported for each split. Both functions live in `dc.metrics`
# (`dc.metric` does not match the module name used everywhere else here).
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]

# Hyper-parameters forwarded to the DTNN builder; the learning rate is an
# `ExponentialDecay` schedule object rather than a constant.
params = {
    "n_tasks": len(qm7_tasks),
    "n_embedding": 30,
    "n_hidden": 15,
    "n_distance": 51,
    "distance_min": -1.0,
    "distance_max": 9.2,
    "batch_size": 50,
    "learning_rate": ExponentialDecay(0.0001, 0.97, 5000)
}

model = dtnn_model_builder(params, model_dir)

print("training dtnn model")
model.fit(train_dataset, nb_epoch=30)

print("evaluating dtnn model on {train,valid,test} sets")
train_scores = model.evaluate(train_dataset, metric, transformers)
print("train scores: [kcal/mol]\n" + str(train_scores))
valid_scores = model.evaluate(valid_dataset, metric, transformers)
print("valid scores [kcal/mol]\n" + str(valid_scores))
test_scores = model.evaluate(test_dataset, metric, transformers)
print("test scores [kcal/mol]\n" + str(test_scores))


**ANI-1**  
[reference example](https://github.com/deepchem/deepchem/blob/master/examples/qm7/qm7_ANI.py)

In [None]:
# Conversion factor between Hartree and kcal/mol.
HARTREE_TO_KCAL_PER_MOL = 627.509

qm7_tasks, qm7_datasets, transformers = dc.molnet.load_qm7_from_mat(
    featurizer='BPSymmetryFunction', split='index', reload=False)
# Unpack the splits so the ANI training cell uses this featurization rather
# than `train_dataset`/`valid_dataset`/`test_dataset` left over from an
# earlier section.
train_dataset, valid_dataset, test_dataset = qm7_datasets

# The further preprocessing done in 'qm7_ANI.py' (kept as commented-out code
# in the following cell) is not applied here.

In [None]:
# HARTREE_TO_KCAL_PER_MOL = 627.509

# tasks, datasets, transformers = dc.molnet.load_qm7_from_mat(
#     featurizer='BPSymmetryFunction', split='index', reload=False)
# all_dataset = dc.data.DiskDataset.merge(datasets)
# invalid_inds = []
# X = all_dataset.X
# for i in range(X.shape[0]):
#   # Exclude all molecules having S
#   if 16 in X[i, :, 0]:
#     invalid_inds.append(i)
# valid_inds = np.delete(np.arange(all_dataset.y.shape[0]), invalid_inds)
# dataset = all_dataset.select(valid_inds)

# splitter = dc.splits.RandomSplitter()
# train, valid, test = splitter.train_valid_test_split(dataset)

# y = dc.trans.undo_transforms(train.y, transformers) / HARTREE_TO_KCAL_PER_MOL
# train = dc.data.DiskDataset.from_numpy(
#     train.X, y, w=train.w, ids=train.ids, tasks=train.tasks)

# y = dc.trans.undo_transforms(valid.y, transformers) / HARTREE_TO_KCAL_PER_MOL
# valid = dc.data.DiskDataset.from_numpy(
#     valid.X, y, w=valid.w, ids=valid.ids, tasks=valid.tasks)

# y = dc.trans.undo_transforms(test.y, transformers) / HARTREE_TO_KCAL_PER_MOL
# test = dc.data.DiskDataset.from_numpy(
#     test.X, y, w=test.w, ids=test.ids, tasks=test.tasks)

In [None]:
def ani_model_builder(model_params, model_dir):
    """Build a ``dc.models.ANIRegression`` model in regression mode.

    Args:
        model_params: Extra keyword arguments forwarded to ``ANIRegression``.
        model_dir: Directory passed to the model as ``model_dir``.

    Returns:
        The constructed ``ANIRegression`` model.
    """
    # Options fixed by this notebook; caller-supplied parameters are added on
    # top, and a duplicated key raises TypeError just as with inline keywords.
    fixed_options = dict(
        exp_loss=False,
        use_queue=False,
        mode="regression",
        model_dir=model_dir,
    )
    return dc.models.ANIRegression(**fixed_options, **model_params)

In [None]:
# Metrics computed every epoch; only the MAE entry is printed below.
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]

# Hyper-parameters forwarded to the ANI builder.
params = {
    "n_tasks": len(qm7_tasks),
    "max_atoms": 23,
    "layer_structures": [64, 64, 32],
    "atom_number_cases": [1, 6, 7, 8],
    "batch_size": 128,
    "learning_rate": 1e-3
}

# Best validation MAE seen so far. Starting from infinity guarantees that the
# first epoch counts as an improvement whatever the scale of the scores.
valid_best = float("inf")

# learning_rates = [1e-3, 1e-4, 1e-5, 3e-6, 1e-6, 3e-7, 1e-7, 3e-8, 1e-8, 3e-9, 1e-9]
# for learning_rate in learning_rates:

model = ani_model_builder(params, model_dir)
model.fit(train_dataset, nb_epoch=10)

# Early stopping: `local_ct` counts the epochs since the validation MAE last
# improved. It is reset to 0 on every improvement, and training ends once
# 100 consecutive epochs pass without one.
local_ct = 0
while local_ct < 100:
    local_ct += 1
    model.fit(train_dataset, nb_epoch=1)

    # Evaluate with the loader's `transformers`, as every other section does,
    # so the printed MAE is not in transformed label units (presumably
    # kcal/mol, as the labels below state — confirm).
    train_scores = model.evaluate(train_dataset, metric, transformers)
    valid_scores = model.evaluate(valid_dataset, metric, transformers)

    print("train scores  [MAE(kcal/mol)]")
    print(train_scores['mean_absolute_error'])

    print("valid scores  [MAE(kcal/mol)]")
    print(valid_scores['mean_absolute_error'])

    if valid_scores['mean_absolute_error'] < valid_best:
        local_ct = 0
        valid_best = valid_scores['mean_absolute_error']
        # The test split is scored only when validation improves.
        test_scores = model.evaluate(test_dataset, metric, transformers)

        print("test scores [MAE(kcal/mol)]")
        print(test_scores['mean_absolute_error'])