# Distribution learning on MOSES dataset: calculating metrics

This notebook computes MOSES metrics from saved model checkpoints. Note that training takes ~30h per model on a Titan X (Pascal) GPU, and computing MOSES metrics for all checkpoints takes another ~40h per model.

To reproduce the models and statistics, run the following bash script:
```bash
for SEED in 1 2 3
do
    for PROPOSAL in gaussian triweight
    do
        python train.py --config configs/moses/VAE_$PROPOSAL\_seed$SEED.ini --device cuda:0
        python train.py --config configs/moses/DD-VAE_$PROPOSAL\_seed$SEED.ini --device cuda:0
    done
done
```

This script will save the models into the `models/moses` folder and the TensorBoard logs into the `logs/moses` folder.

The cells below will create one pickle file with all MOSES metrics for each checkpoint, stored as `metrics/<config>/<epoch>.pkl`. `moses_plots.ipynb` will then use the logs and these MOSES metrics to build the final plots.

In [1]:
import os
import glob
import pickle
import gc
from time import sleep

import rdkit
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import torch
from moses.metrics import get_all_metrics

from dd_vae.vae_rnn import VAE_RNN
from dd_vae.utils import prepare_seed

# Turn off RDKit logging for every channel matching 'rdApp.*'
# (presumably to keep parse warnings for invalid SMILES out of the output).
rdkit.rdBase.DisableLog('rdApp.*')



In [2]:
# Device string forwarded to `prepare_metrics` by the driver loop.
DEVICE = 'cuda:0'
# Forwarded as `n_jobs` to `prepare_metrics` by the driver loop.
N_JOBS = 32


def load_csv(path):
    """Read the first column of a header-less, gzip-compressed CSV file.

    Args:
        path: location of the ``.csv.gz`` file.

    Returns:
        A list with the values of the first column, all read as strings.
    """
    frame = pd.read_csv(path, header=None, dtype='str', compression='gzip')
    first_column = frame[0]
    return first_column.tolist()

In [3]:
def _load_stats(path):
    """Return the object stored under the ``'stats'`` key of an ``.npz`` file.

    The archive is opened in a ``with`` block so its file handle is closed
    as soon as the value has been read; a bare ``np.load(...)[key]`` leaves
    the underlying ``NpzFile`` open.

    Args:
        path: location of the ``.npz`` archive.

    Returns:
        The Python object held by the 0-d ``'stats'`` array (presumably a
        dict of precomputed reference statistics -- confirm against the
        script that produced the file).
    """
    # NOTE(review): allow_pickle=True unpickles the archive content, which can
    # execute arbitrary code. Only load statistics files from a trusted source.
    with np.load(path, allow_pickle=True) as archive:
        return archive['stats'].item()


# SMILES lists for the MOSES splits.
test = load_csv('data/moses/test.csv.gz')
test_scaffolds = load_csv('data/moses/test_scaffolds.csv.gz')
train = load_csv('data/moses/train.csv.gz')

# Precomputed statistics, later passed to `get_all_metrics` as `ptest` and
# `ptest_scaffolds`.
test_stats = _load_stats('data/moses/test_stats.npz')
test_scaffold_stats = _load_stats('data/moses/test_scaffolds_stats.npz')

In [4]:
def prepare_metrics(name, checkpoint_id, overwrite=False, device='cpu', n_jobs=1):
    """Sample from one checkpoint and pickle the MOSES metrics of the samples.

    Loads ``models/moses/{name}/checkpoint_{checkpoint_id}.pt``, draws
    300 batches of 100 samples after calling ``prepare_seed(1)``, evaluates
    them with ``get_all_metrics`` against the module-level ``test``,
    ``test_scaffolds``, ``train``, ``test_stats`` and ``test_scaffold_stats``,
    and writes the result to ``metrics/{name}/{checkpoint_id}.pkl``.

    Args:
        name: configuration (sub-folder) name under ``models/moses``.
        checkpoint_id: numeric id used in the checkpoint file name.
        overwrite: when false, refuse to replace an existing metrics file.
        device: torch device spec such as ``'cpu'``, ``'cuda'`` or ``'cuda:0'``.
        n_jobs: forwarded to ``get_all_metrics``.

    Raises:
        ValueError: if the metrics file already exists and ``overwrite`` is
            false.
    """
    path = f'models/moses/{name}/checkpoint_{checkpoint_id}.pt'
    output_path = f'metrics/{name}/{checkpoint_id}.pkl'
    if os.path.exists(output_path) and not overwrite:
        raise ValueError(f"Metrics file {output_path} already exists")
    os.makedirs(f'metrics/{name}/', exist_ok=True)

    model = VAE_RNN.load(path).to(device)
    prepare_seed(1)
    # Accumulate with extend(): sum(list_of_lists, []) copies the growing
    # result on every step and is quadratic in the number of batches.
    smiles = []
    with torch.no_grad():
        for _ in tqdm(range(300)):
            smiles.extend(model.sample(100))

    # Drop the model before the metric computation so its memory can be
    # reclaimed and the cached CUDA blocks released.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # torch.device understands 'cpu', 'cuda' and 'cuda:N'; a CUDA device given
    # without an explicit index is mapped to GPU 0.
    target = torch.device(device)
    if target.type == 'cpu':
        gpu = -1
    else:
        gpu = 0 if target.index is None else target.index

    metrics = get_all_metrics(
        test=test, gen=smiles,
        test_scaffolds=test_scaffolds, gpu=gpu, n_jobs=n_jobs,
        ptest=test_stats,
        ptest_scaffolds=test_scaffold_stats,
        train=train)

    # Write to a temporary file and move it into place, so an interrupted run
    # never leaves a truncated pickle that later runs would treat as finished.
    tmp_path = output_path + '.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(metrics, f)
    os.replace(tmp_path, output_path)

    gc.collect()
    torch.cuda.empty_cache()

In [5]:
def get_epoch_id(path):
    """Extract the integer epoch id from a checkpoint path.

    The id is taken from the text after the last underscore, with the final
    three characters (the ``.pt`` extension) removed.

    Returns:
        The epoch id as an ``int``, or ``None`` when that text is not a
        valid integer.
    """
    tail = path.rpartition('_')[2]
    digits = tail[:-3]
    try:
        epoch = int(digits)
    except ValueError:
        epoch = None
    return epoch

In [6]:
# Keep only checkpoints whose file name carries a numeric epoch id, and visit
# them grouped by configuration folder in increasing epoch order (glob itself
# returns paths in arbitrary order).
checkpoints = sorted(
    (x for x in glob.glob('models/moses/*/*.pt') if get_epoch_id(x) is not None),
    key=lambda x: (os.path.dirname(x), get_epoch_id(x)),
)
for checkpoint in checkpoints:
    epoch_id = get_epoch_id(checkpoint)
    # The configuration name is the checkpoint's parent folder; os.path is
    # used so that this also works with non-'/' path separators.
    config_id = os.path.basename(os.path.dirname(checkpoint))
    print(f"Processing {checkpoint}")
    try:
        prepare_metrics(config_id, epoch_id, device=DEVICE, n_jobs=N_JOBS)
    except ValueError as error:
        # prepare_metrics raises ValueError when the metrics file already
        # exists. Report the reason instead of discarding it, so any other
        # ValueError is visible too.
        print(f"Skipping {checkpoint}: {error}")