In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import seaborn as sns
# Limit how many rows / columns pandas renders when a frame is displayed.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50
# Apply seaborn's "whitegrid" style to the figures drawn below.
sns.set(style="whitegrid")

In [None]:
# Implementation names, one per line of the list file. The files are read with
# plain Python rather than ``!cat`` so the cell does not depend on IPython
# shell magics or on a ``cat`` executable; blank lines are skipped so they can
# never be mistaken for a method name.
with open("../data/implementation_list.txt", encoding="utf-8") as fh:
    methods = [line.strip() for line in fh if line.strip()]

# Network names are the whitespace-separated tokens on the first line.
with open("../data/networks.txt", encoding="utf-8") as fh:
    networks = fh.readline().split()

# Tab-separated benchmark summary table.
results_df = pd.read_csv("../result/stat_summary.tsv", sep='\t')
# Columns holding the per-stage timings.
col_lst = ["Loading time", "Preprocessing time", "Walking time", "Training time"]

def convert_time(s):
    """Parse a numeric time field, mapping the '-' placeholder to ``None``.

    Returns ``None`` when ``s`` equals ``'-'``; otherwise returns ``float(s)``.
    """
    if s == '-':
        return None
    return float(s)

def convert_to_sec(t_str):
    """Convert a colon-separated clock string into a number of seconds.

    The right-most field is weighted by 60**0, the next one to the left by
    60**1, and so on. The '-' placeholder yields ``None``.
    """
    if t_str == '-':
        return None
    # Walk the fields from least to most significant.
    fields = t_str.split(':')[::-1]
    return sum(float(field) * 60 ** power for power, field in enumerate(fields))

def Mres_to_bytes(s):
    """Convert a size string ending in 'KB', 'MB' or 'GB' into bytes.

    Everything before the two-character unit is parsed as a float and scaled
    by the matching power of 1024. The '-' placeholder yields ``None``; any
    other unit suffix raises ``KeyError``.
    """
    if s == '-':
        return None
    # Parse the number first, then resolve the unit suffix.
    magnitude = float(s[:-2])
    exponent = {'KB': 1, 'MB': 2, 'GB': 3}[s[-2:]]
    return magnitude * 1024 ** exponent

def check_valid(df=None):
    """Blank out the summary fields of runs that have no training time.

    Every row whose 'Training time' is the '-' placeholder gets '-' written to
    its 'Total time', 'Total time in second' and 'Maximum resident size'
    fields. The frame is modified in place.

    The previous row-wise ``results_df.iloc[i][col] = '-'`` form assigned into
    a temporary row Series, so the frame itself was never updated; it also
    used ``iterrows()`` index labels as ``iloc`` positions. A boolean mask
    applied per column avoids both problems.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to update. Defaults to the notebook-level ``results_df``.

    Returns
    -------
    None
    """
    if df is None:
        df = results_df

    unfinished = df['Training time'] == '-'
    if not unfinished.any():
        return

    for column in ('Total time', 'Total time in second', 'Maximum resident size'):
        # Replace the whole column so that a numeric column can take the
        # string placeholder without an element-wise dtype-mismatch assignment.
        df[column] = df[column].mask(unfinished, '-')
            
# Blank out the summary fields of unfinished runs before any parsing happens.
check_valid()

# Per-stage timings are clock strings; turn each of them into seconds.
for col in col_lst:
    results_df[col] = results_df[col].apply(convert_to_sec)

# Peak memory as a byte count (new column), total run time as a float.
results_df['Max Res in bytes'] = results_df['Maximum resident size'].apply(Mres_to_bytes)
results_df['Total time in second'] = results_df['Total time in second'].apply(convert_time)

# Raise the row limit so the table below is rendered without truncation.
pd.set_option('display.max_rows', 999)
results_df

In [None]:
# Resource configuration whose results are plotted; compared against the
# "Setup" column of the results table.
mode = "Multi" # multi-core computation resource configuration
# mode = "Single" # single-core computation resource configuration

# Line colour used for each network.
network_color_dict = {
    'PPI': 'lightsalmon',
    'BlogCatalog': 'springgreen',
    'Wikipedia': 'orangered',
    'BioGRID': 'dodgerblue',
    'STRING': 'slategray',
    'SSN200': 'k',
    'GIANT-TN-c01': 'blue',
    'GIANT-TN': 'indigo'
}

# Tick label shown for each implementation identifier (two-line labels).
# The DenseOTF label previously read 'Pecanpy'; it now matches the
# capitalisation of the other PecanPy labels.
methods_dict = {
    'orig-py': 'Original\nPython',
    'orig-cpp': 'Original\nC++',
    'nodevectors': 'nodevectors',
    'pecanpy-PreComp': 'PecanPy\nPreComp',
    'pecanpy-SparseOTF': 'PecanPy\nSparseOTF',
    'pecanpy-DenseOTF': 'PecanPy\nDenseOTF'
}

# Panel letters, one per subplot.
plot_annot = ['A)', 'B)', 'C)', 'D)']

# One stacked panel per runtime stage, all sharing the implementation x axis.
fig, axes = plt.subplots(nrows=4, ncols=1, figsize=(6.3, 9), sharex=True)
plt.subplots_adjust(hspace=0.07)

# Keep only the runs of the selected setup, bucketed by implementation.
groups = results_df.loc[results_df["Setup"] == mode].groupby("Method")
for i, stage in enumerate(reversed(col_lst)):
    ax = axes[i]

    # One row per network, one column per implementation.
    tmp_df = pd.DataFrame({"Network": networks})
    for method in methods:
        group = groups.get_group(method)
        # Re-key by network name so that every column follows the order of
        # `networks`, whatever order the rows had in the results table.
        group.index = group['Network'].values
        group = group.reindex(networks)
        tmp_df[method] = group[stage].values

    # Zeros cannot be drawn on a log axis; show them as 0.1 instead.
    tmp_df = tmp_df.replace(0, 0.1)

    pd.plotting.parallel_coordinates(
        tmp_df,
        class_column='Network',
        cols=methods,
        ax=ax,
        color=[network_color_dict[net] for net in networks],
        axvlines=False,
        marker='o',
        lw=2.2,
    )

    ax.set_yscale("log")
    # Drop the per-panel legend; a single shared one is added after the loop.
    ax.get_legend().remove()
    ax.set_ylabel(stage)
    # Panel letter, placed left of the first implementation at the top of the y range.
    ax.text(-0.5, ax.get_ylim()[1], plot_annot[i], fontweight='bold')

    ax.grid()

# Label the x positions with the display names of the implementations.
plt.xticks(np.arange(len(methods)), [methods_dict.get(m) for m in methods], fontsize=11)
# Shared legend, anchored below the last panel drawn.
ax.legend(loc='lower center', ncol=4, bbox_to_anchor=(0.49, -0.69))
# plt.savefig("FigS4.png", dpi=600)
plt.tight_layout()
plt.show()