# Evaluation of CosmoFlow training

In [None]:
%pwd

In [None]:
import os
import sys
import yaml
import pickle
import glob

sys.path.append('..')

In [None]:
import sklearn
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

In [None]:
from models import get_model
from data.cosmo import construct_dataset
from utils.optimizers import get_optimizer

In [None]:
%matplotlib inline

In [None]:
# Use a larger default font for every figure created below.
plt.rcParams['font.size'] = 14

## Collected data

In [None]:
# Root directory under which `cosmoflow-benchmark/results` is looked up by the
# cells below.
# NOTE(review): hard-coded to one user's home directory - adjust for your setup.
%env MLPERF_HPC_ROOT=/home/lukasd/src/mlperf

# Sub-path (below the results directory) of the benchmark run to analyse.
# TODO: change weak_scaling -> submission_candidates
%env MLPERF_COSMO_BENCHMARK_TIMESTAMP=weak_scaling/2020-08-03_12-43-00_daint101

In [None]:
%%bash

# Print the directory tree of the `scaling-gpu-n64` result directory.
echo "## Data benchmark experiments ##"
# Trace the next command so the expanded path shows up in the cell output.
set -x
tree --filelimit 20  ${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_BENCHMARK_TIMESTAMP}/scaling-gpu-n64/ # alternatively -d
# Switch command tracing off again.
set +x

## Load the results

In [None]:
def load_config(result_dir):
    """Return the unpickled contents of ``config.pkl`` found in ``result_dir``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    result directories you trust.
    """
    with open(os.path.join(result_dir, 'config.pkl'), 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def load_history(result_dir):
    """Read ``history.csv`` from ``result_dir`` into a pandas DataFrame."""
    return pd.read_csv(os.path.join(result_dir, 'history.csv'))

In [None]:
ls ${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results

In [None]:
# Result directory of the 64-rank GPU scaling run; the environment variables
# set earlier in the notebook are substituted into the template.
_results_template = (
    '${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/'
    '${MLPERF_COSMO_BENCHMARK_TIMESTAMP}/scaling-gpu-n64/'
)
result_dir = os.path.expandvars(_results_template)

In [None]:
ls -l $result_dir

In [None]:
# Load the pickled configuration and the per-epoch history of the selected run.
config, train_history = load_config(result_dir), load_history(result_dir)

In [None]:
config

## View training history

In [None]:
train_history

In [None]:
# Left panel: loss per epoch; right panel: mean absolute error per epoch.
# Each panel shows the training curve and the validation curve.
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(14, 6))

panels = (
    (ax0, 'loss', 'val_loss', 'Loss'),
    (ax1, 'mean_absolute_error', 'val_mean_absolute_error', 'Mean absolute error'),
)
for panel, train_col, val_col, ylabel in panels:
    panel.plot(train_history.epoch, train_history[train_col], 'o-', label='train')
    panel.plot(train_history.epoch, train_history[val_col], 'o-', label='validation')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(ylabel)
    panel.legend(loc=0)
    panel.grid()

plt.tight_layout()

In [None]:
# Additional plots: learning rate (left) and time per epoch (right)
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12,4))
epochs = train_history.epoch

# Learning rate recorded for each epoch
ax0.plot(epochs, train_history.lr, 'o-', label='learning rate')
ax0.set(xlabel='Epoch', ylabel='Learning rate')
ax0.grid()

# The `time` column divided by 60 is what gets labelled as minutes
minutes_per_epoch = train_history.time/60
ax1.plot(epochs, minutes_per_epoch, 'o-', label='time [min]')
# Anchor the y axis at zero so the epoch times are not visually exaggerated
ax1.set_ylim(bottom=0)
ax1.set(xlabel='Epoch', ylabel='Epoch time [min]')
ax1.grid()

plt.tight_layout()

In [None]:
# Choose best epoch based on validation loss.
# `idxmin` returns the *index label* of the row with the smallest val_loss;
# the epoch number is then read from that row.
best_idx = train_history.val_loss.idxmin()
best_epoch = train_history.epoch.loc[best_idx]
print('Best epoch:', best_epoch)

# Select the row by its index label rather than by the epoch value: the two
# only coincide when epoch numbers happen to equal the DataFrame index.
train_history.loc[[best_idx]]

## Compare submission candidates (analogous to HPOAnalysis)

Cf. the HPOAnalysis notebook, which this section mirrors.

In [None]:
# Collect every `scaling-*` result directory of the selected benchmark run,
# in sorted order.
_scaling_pattern = os.path.expandvars(
    '${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_BENCHMARK_TIMESTAMP}/'
) + '/scaling-*'
result_dirs = glob.glob(_scaling_pattern)
result_dirs.sort()
result_dirs

In [None]:
def load_result(path):
    """Load the pickled config and the training history of one run.

    Parameters
    ----------
    path : str
        Result directory containing ``config.pkl`` and, optionally,
        ``history.csv``.

    Returns
    -------
    tuple
        ``(config, history)``. ``history`` is a ``pandas.DataFrame``, or
        ``None`` when ``history.csv`` could not be read; in that case a
        warning naming the file and the error is emitted.

    Raises
    ------
    Whatever ``open`` / ``pickle.load`` raise for ``config.pkl``; failures
    on the config are not caught.
    """
    import warnings

    # Load the config.
    # NOTE: unpickling can execute arbitrary code - only use on trusted results.
    with open(os.path.join(path, 'config.pkl'), 'rb') as f:
        config = pickle.load(f)
    # Load the history. This stays best-effort (a run without a readable
    # history is reported as None), but the failure is made visible instead
    # of being silently swallowed.
    history_file = os.path.join(path, 'history.csv')
    try:
        history = pd.read_csv(history_file)
    except Exception as exc:
        warnings.warn('Could not load %s: %r' % (history_file, exc))
        history = None
    return config, history

In [None]:
# Load every run lazily and keep only those whose history could be read.
loaded_runs = (load_result(run_dir) for run_dir in result_dirs)
results = [(cfg, hist) for cfg, hist in loaded_runs if hist is not None]
#results

In [None]:
def make_summary(config, history):
    """Condense one run into a flat record for the summary table.

    Parameters
    ----------
    config : mapping
        Run configuration; reads ``n_ranks``, ``data.batch_size``,
        ``optimizer.name`` and the ``model`` entries ``conv_size``,
        ``fc1_size``, ``fc2_size``, ``hidden_activation`` and ``dropout``.
    history : pandas.DataFrame
        Per-epoch history with the columns ``epoch``, ``loss``, ``val_loss``
        and ``val_mean_absolute_error``.

    Returns
    -------
    dict
        The selected config entries plus the training loss, validation loss,
        validation MAE and epoch of the row with the smallest ``val_loss``,
        and the largest epoch number seen (``last_epoch``).
    """
    # Index label of the row with the lowest validation loss
    best = history.val_loss.idxmin()

    # Fields that were commented out in an earlier version of this table:
    # config['output_dir'], config['data']['apply_log'], config['optimizer']['lr'].
    summary = {
        'n_ranks': config['n_ranks'],
        'batch_size': config['data']['batch_size'],
    }

    # (summary key, key inside config['model'])
    model_cfg = config['model']
    model_fields = (
        ('conv_size', 'conv_size'),
        ('fc1_size', 'fc1_size'),
        ('fc2_size', 'fc2_size'),
        ('act', 'hidden_activation'),
        ('dropout', 'dropout'),
    )
    for summary_key, model_key in model_fields:
        summary[summary_key] = model_cfg[model_key]

    summary['optimizer'] = config['optimizer']['name']

    # Metrics at the best row
    summary['train_loss'] = history.at[best, 'loss']
    summary['val_loss'] = history.at[best, 'val_loss']
    summary['val_mae'] = history.at[best, 'val_mean_absolute_error']
    summary['last_epoch'] = history.epoch.max()
    summary['best_epoch'] = history.at[best, 'epoch']
    return summary

# Make a summary table: one row per loaded run, displayed by ascending
# validation loss (the stored table itself keeps the load order).
summary_rows = [make_summary(cfg, hist) for cfg, hist in results]
summaries = pd.DataFrame(summary_rows)
summaries.sort_values(by='val_loss')

In [None]:
# Row label of the run with the lowest validation loss
best = summaries['val_loss'].idxmin()

In [None]:
# `summaries` has one row per entry of `results`, in the same order, so the
# row label is used directly as the list position (this assumes `summaries`
# still carries its default integer index).
selected_run = results[best]
best_config, best_history = selected_run

In [None]:
best_config

In [None]:
best_history

In [None]:
# Loss curves of the runs with the lowest validation loss, best first.
# The 2x2 grid combined with `zip` limits the figure to at most four runs.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(14, 10))

ranked = summaries.sort_values('val_loss').index
curves = (('loss', 'Training'), ('val_loss', 'Validation'))

for i, ax in zip(ranked, axs.flatten()):
    history = results[i][1]
    # The title is the run's row label in `summaries`
    ax.set_title(i)

    for column, curve_label in curves:
        ax.plot(history.epoch, history[column], 'o-', label=curve_label)
    ax.set(xlabel='Epoch', ylabel='Loss')
    ax.legend(loc=0)
    ax.grid()

plt.tight_layout()

## Runtime statistics

In [None]:
# Time per epoch (in minutes) for every loaded run, one line per run.
plt.figure(figsize=(9,6))

for r in results:
    history = r[1]
    plt.plot(history.epoch, history.time/60, 'o-')

# The axis labels do not depend on the run, so set them once after the loop;
# this also labels the figure when `results` is empty.
plt.xlabel('Epoch')
plt.ylabel('Epoch time [min]')

In [None]:
# Pool the per-epoch times of all runs: every epoch, only the first epoch
# of each run, and every epoch after the first.
histories = [run_history for _run_config, run_history in results]
epoch_times = pd.concat([h.time for h in histories])
first_epoch_times = pd.Series([h.iloc[0].time for h in histories])
later_epoch_times = pd.concat([h.iloc[1:].time for h in histories])

In [None]:
# Histograms of the epoch times: all epochs, first epochs only, later epochs.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(14, 10), sharey=True, sharex=True)

# NOTE: `range` restricts the bins to 0-100; epoch times outside that
# interval are not counted by `hist`.
hist_args = dict(bins=20, range=(0,100))

for (title, e_times, ax) in zip(['total', 'first epoch', 'later epochs'],
                                [epoch_times, first_epoch_times, later_epoch_times], axs.flatten()):
    ax.hist(e_times, **hist_args)
    ax.set_xlabel('Epoch time [s]')
    ax.set_ylabel('Counts')
    ax.set_title(title)