# CosmoFlow Benchmark Scaling Analysis

In [None]:
import os
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [None]:
# Use a larger default font for every figure in this notebook.
plt.rcParams['font.size'] = 14

In [None]:
def load_config(result_dir):
    """
    Reads the pickled configuration stored as 'config.pkl' in result_dir
    and returns the unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at
    # result directories you trust.
    with open(os.path.join(result_dir, 'config.pkl'), 'rb') as handle:
        config = pickle.load(handle)
    return config

def load_result(result_dir):
    """
    Reads 'history.csv' from result_dir and returns it as a dataframe.
    """
    return pd.read_csv(os.path.join(result_dir, 'history.csv'))

def compute_mean_time(r):
    """
    Returns the mean of the 'time' column over the rows of r whose
    'epoch' value is greater than zero.
    """
    after_first_epoch = r['epoch'] > 0
    return r.loc[after_first_epoch, 'time'].mean()

def get_num_samples(config, ranks):
    """
    Returns the number of samples (n_train + n_valid from config['data'])
    for a run. When the 'shard' setting is false the count is multiplied
    by ranks; otherwise it is returned unchanged.
    """
    data_cfg = config['data']
    total = data_cfg['n_train'] + data_cfg['n_valid']
    return total if data_cfg['shard'] else total * ranks

def get_scaling_results(path_pattern, ranks):
    """
    Loops over ranks with specified file path pattern and computes scaling metrics.

    Parameters:
        path_pattern: %-format string with one integer placeholder
            (e.g. '.../scaling-gpu-n%i'). It is formatted with each rank
            count to obtain that run's result directory.
        ranks: sequence or array of rank counts. The first entry is used
            as the baseline for the ideal-scaling curve.

    Returns results in a dataframe with one row per rank count and the
    columns ranks, samples, times (mean time of the rows with epoch > 0),
    throughputs (samples / times), ideal (baseline throughput scaled
    linearly with the rank count) and eff (throughputs / ideal).
    """
    # Accept plain lists/tuples as well as numpy arrays.
    ranks = np.asarray(ranks)
    result_dirs = [path_pattern % r for r in ranks]
    configs = [load_config(d) for d in result_dirs]
    histories = [load_result(d) for d in result_dirs]
    samples = np.array([get_num_samples(c, r) for (c, r) in zip(configs, ranks)])
    times = np.array([compute_mean_time(h) for h in histories])
    throughputs = samples / times
    # Linear scaling relative to the first run. Dividing by ranks[0] keeps
    # eff[0] == 1 even when the sweep does not start at a single rank.
    ideal = ranks / ranks[0] * throughputs[0]
    eff = throughputs / ideal
    return pd.DataFrame(dict(ranks=ranks, samples=samples,
                             times=times, throughputs=throughputs,
                             ideal=ideal, eff=eff))

In [None]:
# Root directory of the MLPerf HPC checkout; the cells below read results from
# ${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/<timestamp>/.
%env MLPERF_HPC_ROOT=/home/lukasd/src/mlperf

# Previously used GPU run (2020-08-03), left here disabled for reference.
#%env MLPERF_COSMO_GPU_TIMESTAMP=weak_scaling/2020-08-03_12-43-00_daint101

# New measurement: results sub-directory used for the GPU scaling section.
%env MLPERF_COSMO_GPU_TIMESTAMP=weak_scaling/2020-08-06_16-34-59_daint101


# Results sub-directory used for the CPU scaling section.
%env MLPERF_COSMO_CPU_TIMESTAMP=weak_scaling/2020-08-03_15-15-51_daint101

In [None]:
%%bash

# Sanity check: list the contents of the GPU and CPU result directories that
# the MLPERF_* environment variables point at.
echo "## GPU experiments ##"
ls -l ${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_GPU_TIMESTAMP}
echo ""
echo "## CPU experiments ##"
ls -l ${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_CPU_TIMESTAMP}


## Daint GPU scaling

In [None]:
# Load the GPU scaling runs (real and dummy input data) for 1 to 64 workers.
gpu_ranks = np.array([1, 2, 4, 8, 16, 32, 64])
gpu_run_dir = os.path.expandvars(
    '${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_GPU_TIMESTAMP}')

results_gpu = get_scaling_results(gpu_run_dir + '/scaling-gpu-n%i', ranks=gpu_ranks)
results_gpu_dummy = get_scaling_results(gpu_run_dir + '/scaling-gpu-dummy-n%i', ranks=gpu_ranks)

In [None]:
# Summary table: real-data metrics next to the dummy-data metrics
# (columns suffixed with "_dummy"), joined on the worker count.
pd.merge(results_gpu, results_gpu_dummy, on='ranks', suffixes=(None, '_dummy'))

In [None]:
# Two stacked panels sharing the x-axis: throughput on top (80% of the
# height) and scaling efficiency below, with no gap between them.
fig, (ax0, ax1) = plt.subplots(nrows=2, sharex=True, figsize=(8,8),
                               gridspec_kw=dict(height_ratios=[.8, .2], hspace=0))

# Throughput: measured (real and dummy data) against ideal linear scaling
ax0.set_title('Daint GPU scaling')
ax0.plot(results_gpu.ranks, results_gpu.throughputs, 'o-', ms=8, label='Real data')
ax0.plot(results_gpu_dummy.ranks, results_gpu_dummy.throughputs, '^-', ms=8, label='Dummy data')
ax0.plot(results_gpu.ranks, results_gpu.ideal, '--', label='Ideal')
ax0.set_ylabel('Training throughput [samples/s]')
ax0.set_yscale('log')
ax0.legend(loc=0)
ax0.grid()

# Scaling efficiency (real data only; uncomment the next line to overlay the dummy-data runs)
ax1.plot(results_gpu.ranks, results_gpu.eff, 'o-', ms=8)
#ax1.plot(results_gpu_dummy.ranks, results_gpu_dummy.eff, '^-', ms=8)
ax1.set_xlabel('Number of workers')
ax1.set_ylabel('Efficiency')
ax1.set_ylim(bottom=0.75)
ax1.yaxis.set_major_locator(plt.MultipleLocator(0.1))
ax1.set_xscale('log')
# One tick per measured worker count. This must use the GPU results: the
# plain `results` dataframe belongs to the CPU section further down and is
# not defined yet when the notebook is run from top to bottom.
ax1.set_xticks(results_gpu.ranks)
ax1.xaxis.set_major_formatter(plt.ScalarFormatter())
ax1.grid()

# Customize y-axis: throughput ticks at 30, 100, 300 and 1000 samples/s,
# printed as plain numbers rather than powers of ten
throughput_ticks = np.array([(1.*scale, 3.*scale) for scale in np.logspace(1,3,3)]).flatten()[1:-1]
ax0.set_yticks(throughput_ticks)
ax0.yaxis.set_major_formatter(plt.ScalarFormatter())

# Efficiency ticks every 0.05 from 0.75 to 1.0
ax1.set_yticks(np.linspace(0.75, 1., 6))
ax1.yaxis.set_major_formatter(plt.ScalarFormatter())

plt.tight_layout()



## Daint CPU scaling

In [None]:
# Load the CPU scaling runs (real and dummy input data) for 1 to 64 workers.
cpu_ranks = np.array([1, 2, 4, 8, 16, 32, 64])
cpu_run_dir = os.path.expandvars(
    '${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_CPU_TIMESTAMP}')

results = get_scaling_results(cpu_run_dir + '/scaling-cpu-n%i', ranks=cpu_ranks)
results_dummy = get_scaling_results(cpu_run_dir + '/scaling-cpu-dummy-n%i', ranks=cpu_ranks)

In [None]:
%%script false --no-raise-error

# DISABLED SCRATCH CELL: `%%script false` hands the cell body to the `false`
# command instead of running it as Python, and --no-raise-error keeps the
# resulting non-zero exit status from being reported as an error.
#
# The body is an inline copy of get_scaling_results() for the CPU dummy-data
# runs, limited to 1, 2 and 4 ranks.
# NOTE(review): if this cell is ever re-enabled it rebinds `results` to a
# list, which would break the `results.merge(...)` and plotting cells below.
path_pattern = os.path.expandvars('${MLPERF_HPC_ROOT}/cosmoflow-benchmark/results/${MLPERF_COSMO_CPU_TIMESTAMP}/scaling-cpu-dummy-n%i')
ranks=np.array([1, 2, 4]) #, 8, 16, 32, 64])


# Same definition as the helper near the top of the notebook; it is not
# called anywhere in this cell.
def compute_mean_time(r):
    return r[r.epoch>0].time.mean()

configs, results = [], []
for r in ranks:
    result_dir = path_pattern % r
    configs.append(load_config(result_dir))
    results.append(load_result(result_dir).assign(ranks=r))
samples = np.array([get_num_samples(c,r) for (c,r) in zip(configs, ranks)])    
# Unlike get_scaling_results(), this averages over ALL epochs (epoch 0 is
# not excluded).
times = np.array([r.time.mean() for r in results])
throughputs = samples / times
ideal = ranks * throughputs[0]
eff = throughputs / ideal
pd.DataFrame(dict(ranks=ranks, samples=samples,
                  times=times, throughputs=throughputs,
                  ideal=ideal, eff=eff))


In [None]:
%%script false --no-raise-error

# DISABLED SCRATCH CELL (not executed as Python, see `%%script false`).
# Inline copy of the get_num_samples() logic for the first loaded config.
# It relies on `configs` and `ranks` from the preceding disabled cell; there
# `ranks` is an array, so for unsharded data `n` would become an array with
# one sample count per rank setting.
dconf = configs[0]['data']
n = dconf['n_train'] + dconf['n_valid']
if not dconf['shard']:
    n *= ranks
n

In [None]:
# Summary table: real-data metrics next to the dummy-data metrics
# (columns suffixed with "_dummy"), joined on the worker count.
pd.merge(results, results_dummy, on='ranks', suffixes=(None, '_dummy'))

In [None]:
# Figure layout: throughput panel on top (80% of the height), efficiency
# panel below, sharing one x-axis with no gap between the two.
fig, (ax0, ax1) = plt.subplots(
    2, 1, sharex=True, figsize=(8, 8),
    gridspec_kw={'height_ratios': [0.8, 0.2], 'hspace': 0})

# Top panel: measured throughput (real and dummy data) vs. ideal scaling
ax0.set_title('Daint CPU scaling')
ax0.plot(results['ranks'], results['throughputs'], 'o-', markersize=8, label='Real data')
ax0.plot(results_dummy['ranks'], results_dummy['throughputs'], '^-', markersize=8, label='Dummy data')
ax0.plot(results['ranks'], results['ideal'], '--', label='Ideal')
ax0.set_ylabel('Training throughput [samples/s]')
ax0.set_yscale('log')
ax0.legend(loc='best')
ax0.grid()

# Bottom panel: scaling efficiency of the real-data runs
ax1.plot(results['ranks'], results['eff'], 'o-', markersize=8)
ax1.set_xlabel('Number of workers')
ax1.set_ylabel('Efficiency')
ax1.set_ylim(bottom=0.5)
ax1.yaxis.set_major_locator(plt.MultipleLocator(0.1))
ax1.set_xscale('log')
ax1.set_xticks(results['ranks'])
ax1.xaxis.set_major_formatter(plt.ScalarFormatter())
ax1.grid()

# Throughput ticks at 0.3, 1, 3, 10 and 30 samples/s, printed as plain
# numbers rather than powers of ten
throughput_ticks = np.outer(np.logspace(-1, 1, 3), [1., 3.]).ravel()[1:]
ax0.set_yticks(throughput_ticks)
ax0.yaxis.set_major_formatter(plt.ScalarFormatter())

# Efficiency ticks every 0.1 from 0.5 to 1.0
ax1.set_yticks(np.linspace(0.5, 1., 6))
ax1.yaxis.set_major_formatter(plt.ScalarFormatter())

plt.tight_layout()

## Discussion

TBD
