In [1]:
import itertools as it
import os

import biom
from matplotlib import rcParams
import matplotlib.colors as mplc
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sn
import statsmodels.api as sms
import statsmodels.formula.api as smf
import skbio

from qiime2 import Artifact, Metadata, MetadataColumn

In [2]:
rcParams['font.sans-serif'] = ['Helvetica', 'Arial']
rcParams['pdf.fonttype'] = 42
np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation

In [3]:
%matplotlib inline

In [4]:
optivag_benchmark_dir = 'data/output/benchmark/real/'
benchmark_files = os.listdir(optivag_benchmark_dir)

In [5]:
benchmark_files

['reconstruct_taxonomy.tsv',
 'denoise_v13.tsv',
 'filter_greengenes_reference.tsv',
 'trim_v34.tsv',
 'extract_sidle_regions_v34.tsv',
 'vaginal',
 'prepare_extracted_region_v13.tsv',
 'align_sidle_regions_v34.tsv',
 'extract_sidle_regions_v13.tsv',
 'prepare_extracted_region_v34.tsv',
 'align_sidle_regions_v13.tsv',
 'reconstruct_table_single.tsv',
 'trim_v13.tsv',
 'trim_data_v13.tsv',
 'denoise_v34.tsv']

In [6]:
timestamp = pd.DataFrame({
    fp_.split('.')[0]: pd.read_csv(os.path.join(optivag_benchmark_dir, fp_), 
                                   sep='\t', dtype=str).iloc[0]
    for fp_ in benchmark_files 
    if os.path.isfile(os.path.join(optivag_benchmark_dir, fp_))
}).T
timestamp.index.set_names('filename', inplace=True)
timestamp['command'] = pd.Series({id_: id_.split("_v")[0] for id_ in timestamp.index})
region_f = lambda id_: 'v{}'.format(id_.split("_v")[1]) if ('_v' in id_) else np.nan
timestamp['region'] = pd.Series({id_: region_f(id_) for id_ in timestamp.index})
timestamp['runtime'] = pd.to_timedelta(timestamp['s'].astype(float), unit='s')
timestamp['per_sample'] = timestamp['runtime'] / 24
timestamp = timestamp[['command', 'region', 'runtime', 'per_sample']]

In [16]:
timestamp['runtime'].round("s")

filename
reconstruct_taxonomy           0 days 00:00:06
denoise_v13                    0 days 00:05:20
filter_greengenes_reference    0 days 00:01:19
trim_v34                       0 days 00:00:06
extract_sidle_regions_v34      0 days 00:05:49
prepare_extracted_region_v13   0 days 00:00:46
align_sidle_regions_v34        0 days 00:06:25
extract_sidle_regions_v13      0 days 00:05:33
prepare_extracted_region_v34   0 days 00:01:21
align_sidle_regions_v13        0 days 00:05:33
reconstruct_table_single       0 days 00:00:17
trim_v13                       0 days 00:00:06
trim_data_v13                  0 days 00:01:05
denoise_v34                    0 days 00:10:49
Name: runtime, dtype: timedelta64[ns]

In [7]:
timestamp['command_type'] = timestamp['command'].replace({
    'denoise': 'denoise',
    'trim': 'denoise',
    'trim_data': 'denoise',
    'align_sidle_regions': 'align',
    'extract_sidle_regions': 'database',
    'prepare_extracted_region': 'database',
    'reconstruct_table_single': 'reconstruction',
    'reconstruct_taxonomy': 'reconstruction',
})

In [17]:
timestamp

Unnamed: 0_level_0,command,region,runtime,per_sample,command_type
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
reconstruct_taxonomy,reconstruct_taxonomy,,0 days 00:00:05.507499999,0 days 00:00:00.229479166,reconstruction
denoise_v13,denoise,v13,0 days 00:05:19.624100,0 days 00:00:13.317670833,denoise
filter_greengenes_reference,filter_greengenes_reference,,0 days 00:01:19.451000,0 days 00:00:03.310458333,filter_greengenes_reference
trim_v34,trim,v34,0 days 00:00:05.822600,0 days 00:00:00.242608333,denoise
extract_sidle_regions_v34,extract_sidle_regions,v34,0 days 00:05:48.673100,0 days 00:00:14.528045833,database
prepare_extracted_region_v13,prepare_extracted_region,v13,0 days 00:00:46.421600,0 days 00:00:01.934233333,database
align_sidle_regions_v34,align_sidle_regions,v34,0 days 00:06:25.251000,0 days 00:00:16.052125,align
extract_sidle_regions_v13,extract_sidle_regions,v13,0 days 00:05:32.979000,0 days 00:00:13.874125,database
prepare_extracted_region_v34,prepare_extracted_region,v34,0 days 00:01:20.629700,0 days 00:00:03.359570833,database
align_sidle_regions_v13,align_sidle_regions,v13,0 days 00:05:32.655300,0 days 00:00:13.860637500,align


In [18]:
timestamp.groupby('command_type')['runtime'].sum().round('s')

command_type
align                         0 days 00:11:58
database                      0 days 00:13:29
denoise                       0 days 00:17:26
filter_greengenes_reference   0 days 00:01:19
reconstruction                0 days 00:00:23
Name: runtime, dtype: timedelta64[ns]

In [20]:
timestamp.loc[timestamp['command_type'].isin(['database', 'filter_greengenes_reference']), 'runtime'].sum().round('s')

Timedelta('0 days 00:14:48')

In [21]:
timestamp.loc[['reconstruct_taxonomy', 'reconstruct_table_single'], 'runtime'].dt.round('s')

filename
reconstruct_taxonomy       0 days 00:00:06
reconstruct_table_single   0 days 00:00:17
Name: runtime, dtype: timedelta64[ns]

In [22]:
(timestamp.loc[timestamp['command_type'] != 'database', 'runtime'].sum() / 24).round('s')

Timedelta('0 days 00:01:18')

In [23]:
(timestamp.groupby('command_type')['runtime'].sum() / 24).dt.round('s')

command_type
align                         0 days 00:00:30
database                      0 days 00:00:34
denoise                       0 days 00:00:44
filter_greengenes_reference   0 days 00:00:03
reconstruction                0 days 00:00:01
Name: runtime, dtype: timedelta64[ns]

In [39]:
(timestamp.loc[~timestamp['command_type'].isin(['database', 'filter_greengenes_reference']), 'runtime'].sum() / 24).round('s')

Timedelta('0 days 00:01:14')

In [24]:
(timestamp.loc[['reconstruct_taxonomy', 'reconstruct_table_single'], 'runtime'] / 24).dt.round('s')

filename
reconstruct_taxonomy       0 days 00:00:00
reconstruct_table_single   0 days 00:00:01
Name: runtime, dtype: timedelta64[ns]

In [25]:
timestamp.groupby(['command_type', 'region'])['runtime'].sum().groupby('command_type').describe()['mean'].round('s')

command_type
align      0 days 00:05:59
database   0 days 00:06:44
denoise    0 days 00:08:43
Name: mean, dtype: timedelta64[ns]

In [26]:
timestamp.groupby(['command_type', 'region'])['runtime'].sum().groupby('command_type').describe()['std'].round('s')

command_type
align      0 days 00:00:37
database   0 days 00:00:35
denoise    0 days 00:03:06
Name: std, dtype: timedelta64[ns]

In [29]:
pd.Timedelta(5028.675 / 2, 's').round('s')

Timedelta('0 days 00:41:54')

In [33]:
(timestamp['runtime'].sum() / 24).round('s')

Timedelta('0 days 00:01:51')

In [42]:
(pd.Timedelta('00:21:25') / 24).round('s')

Timedelta('0 days 00:00:54')