In [2]:
import numpy as np
import matplotlib as mpl
%matplotlib inline

import pandas as pd

import json
import glob
import os

In [3]:
data = {}

for path in glob.glob("../../data/results/all_real/*.json") + glob.glob("../../data/results/all_real_seq/*.json") + glob.glob("../../data/results/gossip_map/*.json") + glob.glob("../../data/results/all_real_cluster_cmp/*.json"):
  for typename, items in json.load(open(path)).items():
      if typename in data:
        for key, object_data in items.items():
          if key in data[typename]:
            data[typename][key].update(object_data)
          else:
            data[typename][key] = object_data
      else:
        data[typename] = items

frames = { typename: pd.DataFrame.from_dict(items, orient='index') for typename, items in data.items() }

In [4]:
dlslm_label = 'DSLM-Mod'
dlslm_me_label = 'DSLM-Map'
seq_postfix = ' w. Seq.'
no_contraction_postfix = ' w/o Contraction'
dlslm_ws_label = dlslm_label + seq_postfix
dlslm_nc_label = dlslm_label + no_contraction_postfix
seq_louvain_label = 'Seq. Louvain'
seq_infomap_label = 'Seq. InfoMap'
plm_label = 'PLM'
relax_map_label = 'RelaxMap'
gossip_map_label = 'GossipMap'

algo_name_mapping = {
    'synchronous local moving with map equation': dlslm_me_label,
    'synchronous local moving with modularity': dlslm_label,
    'sequential louvain': seq_louvain_label,
    'sequential infomap': seq_infomap_label,
    'relax map': relax_map_label,
    'gossip map': gossip_map_label
}

frames['algorithm_run'].replace({ 'algorithm': algo_name_mapping }, inplace=True)

frames['algorithm_run']['algorithm'] += frames['algorithm_run'].merge(frames['program_run'], left_on='program_run_id', right_index=True, how='left')['switch_to_seq'].map({ False: '', True: seq_postfix, np.NaN: '' })
frames['algorithm_run']['algorithm'] += frames['algorithm_run'].merge(frames['program_run'], left_on='program_run_id', right_index=True, how='left')['contraction'].map({ False: no_contraction_postfix, True: '', np.NaN: '' })

In [5]:
frames['algorithm_run']['runtime'].fillna((frames['algorithm_run']['done_ts'] - frames['algorithm_run']['start_ts']) / 1000000.0, inplace=True)

In [6]:
frames['program_run']['graph_path'] = frames['program_run']['graph']

graph_names = { 
    'data/graphs/uk-2002.metis-preprocessed-*.bin': 'uk-2002', 
    'data/graphs/uk-2007-05.metis-preprocessed-*.bin': 'uk-2007-05', 
    'data/graphs/in-2004.metis-preprocessed-*.bin': 'in-2004', 
    'data/graphs/com-friendster-preprocessed-*.bin': 'com-friendster', 
    'data/graphs/com-lj.ungraph-preprocessed-*.bin': 'com-lj', 
    'data/graphs/com-orkut.ungraph-preprocessed-*.bin': 'com-orkut', 
    'data/graphs/com-youtube.ungraph-preprocessed-*.bin': 'com-youtube', 
    'data/graphs/com-amazon.ungraph-preprocessed-*.bin': 'com-amazon',
    'data/graphs/europe.osm-preprocessed-*.bin': 'osm-europe',
}

frames['program_run'].replace({ 'graph': graph_names }, inplace=True)

In [7]:
all_data = frames['clustering'] \
    .merge(frames['algorithm_run'], left_on='algorithm_run_id', right_index=True) \
    .merge(frames['program_run'], left_on='program_run_id', right_index=True) \
    .groupby(['graph', 'algorithm'])['runtime'].mean().round(1).to_frame() \
    .unstack()["runtime"][[seq_louvain_label, plm_label, dlslm_label, dlslm_nc_label, seq_infomap_label, relax_map_label, gossip_map_label, dlslm_me_label]]

all_data = all_data.loc[frames['program_run'].sort_values('edge_count')['graph'].dropna().unique()]

graph_data = frames['program_run'].dropna(subset=['hosts', 'edge_count']).groupby('graph').agg({ 'node_count': 'first', 'edge_count': 'first', 'hosts': 'max' })
graph_data['hosts'] = graph_data['hosts'].astype(int)
graph_data.columns = ['n', 'm', 'hosts']
res = graph_data.sort_values('m').merge(all_data, left_index=True, right_index=True)

# with open("../../dist-thrill-cluster/plots/real_world_runtimes.tex", "w") as file:
print(res.to_latex())
res

\begin{tabular}{lrrrrrrrrrrr}
\toprule
{} &          n &             m &  hosts &  Seq. Louvain &     PLM &  DSLM-Mod &  DSLM-Mod w/o Contraction &  Seq. InfoMap &  RelaxMap &  GossipMap &  DSLM-Map \\
graph          &            &               &        &               &         &           &                           &               &           &            &           \\
\midrule
com-amazon     &     334863 &  9.258720e+05 &      2 &           NaN &     NaN &       5.8 &                       1.0 &           NaN &       NaN &       46.3 &       4.7 \\
com-youtube    &    1134890 &  2.987624e+06 &      2 &           NaN &     NaN &       9.7 &                       3.6 &           NaN &       NaN &       80.0 &      14.4 \\
in-2004        &    1382867 &  1.359147e+07 &      4 &           NaN &     NaN &      12.1 &                       4.0 &           NaN &       NaN &       70.0 &      11.1 \\
com-lj         &    3997962 &  3.468119e+07 &      8 &          99.4 &    25.7 &      30.

Unnamed: 0_level_0,n,m,hosts,Seq. Louvain,PLM,DSLM-Mod,DSLM-Mod w/o Contraction,Seq. InfoMap,RelaxMap,GossipMap,DSLM-Map
graph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
com-amazon,334863,925872.0,2,,,5.8,1.0,,,46.3,4.7
com-youtube,1134890,2987624.0,2,,,9.7,3.6,,,80.0,14.4
in-2004,1382867,13591470.0,4,,,12.1,4.0,,,70.0,11.1
com-lj,3997962,34681190.0,8,99.4,25.7,30.5,13.7,1273.4,252.0,372.5,49.3
osm-europe,50912018,54054660.0,8,,,448.7,47.8,,,1226.1,494.7
com-orkut,3072441,117185100.0,8,171.1,53.9,47.4,33.9,2322.0,721.7,700.0,84.3
uk-2002,18483186,261787300.0,8,567.5,142.3,46.1,21.5,6710.7,1044.7,681.8,52.1
com-friendster,65608366,1806067000.0,16,6365.3,1736.6,1047.2,742.4,,,13743.2,1161.2
uk-2007-05,105153952,3301877000.0,16,8038.4,2549.8,151.0,105.9,,,4210.6,214.3


In [27]:
modularity_algos = [dlslm_label, dlslm_nc_label, dlslm_ws_label, plm_label, seq_louvain_label]
mapeq_algos = [dlslm_me_label, relax_map_label, gossip_map_label, seq_infomap_label]

data = frames['clustering_comparison'] \
    .merge(frames['clustering'], left_on='base_clustering_id', right_index=True) \
    .merge(frames['algorithm_run'], left_on='algorithm_run_id', right_index=True) \
    .merge(frames['program_run'], left_on='program_run_id', right_index=True) \
    .merge(frames['clustering'], left_on='compare_clustering_id', right_index=True) \
    .merge(frames['algorithm_run'], left_on='algorithm_run_id_y', right_index=True) \
    .loc[lambda x: ((x.algorithm_x == seq_louvain_label) & x.algorithm_y.isin(modularity_algos)) | ((x.algorithm_y == seq_louvain_label) & x.algorithm_x.isin(modularity_algos)) | ((x.algorithm_x == seq_infomap_label) & x.algorithm_y.isin(mapeq_algos)) | ((x.algorithm_y == seq_infomap_label) & x.algorithm_x.isin(mapeq_algos))]

def other_algo(row):
    if row['algorithm_x'] == seq_louvain_label or row['algorithm_x'] == seq_infomap_label:
        return row['algorithm_y']
    return row['algorithm_x']
    
data['comparison_algo'] = data.apply(other_algo, axis=1)

comparison_data = data.groupby(['graph', 'comparison_algo'])['ARI'].mean().round(3).to_frame() \
    .unstack()["ARI"][[seq_louvain_label, plm_label, dlslm_label, dlslm_nc_label, seq_infomap_label, relax_map_label, gossip_map_label, dlslm_me_label]]

res = graph_data.sort_values('m').merge(comparison_data, left_index=True, right_index=True)

# with open("../../dist-thrill-cluster/plots/real_world_runtimes.tex", "w") as file:
print(res.to_latex())
res

\begin{tabular}{lrrrrrrrrrrr}
\toprule
{} &          n &             m &  hosts &  Seq. Louvain &    PLM &  DSLM-Mod &  DSLM-Mod w/o Contraction &  Seq. InfoMap &  RelaxMap &  GossipMap &  DSLM-Map \\
graph          &            &               &        &               &        &           &                           &               &           &            &           \\
\midrule
com-lj         &    3997962 &  3.468119e+07 &      8 &         0.581 &  0.576 &     0.544 &                     0.180 &         0.977 &     0.368 &      0.780 &     0.768 \\
com-orkut      &    3072441 &  1.171851e+08 &      8 &         0.661 &  0.627 &     0.627 &                     0.231 &         0.941 &     0.805 &      0.488 &     0.832 \\
uk-2002        &   18483186 &  2.617873e+08 &      8 &         0.731 &  0.728 &     0.675 &                     0.047 &         0.985 &     0.928 &      0.862 &     0.968 \\
com-friendster &   65608366 &  1.806067e+09 &     16 &         0.560 &  0.617 &     0.547 &   

Unnamed: 0_level_0,n,m,hosts,Seq. Louvain,PLM,DSLM-Mod,DSLM-Mod w/o Contraction,Seq. InfoMap,RelaxMap,GossipMap,DSLM-Map
graph,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
com-lj,3997962,34681190.0,8,0.581,0.576,0.544,0.18,0.977,0.368,0.78,0.768
com-orkut,3072441,117185100.0,8,0.661,0.627,0.627,0.231,0.941,0.805,0.488,0.832
uk-2002,18483186,261787300.0,8,0.731,0.728,0.675,0.047,0.985,0.928,0.862,0.968
com-friendster,65608366,1806067000.0,16,0.56,0.617,0.547,0.353,,,,
uk-2007-05,105153952,3301877000.0,16,0.865,0.869,0.815,0.278,,,,


In [7]:
all_data = frames['clustering'] \
    .merge(frames['algorithm_run'], left_on='algorithm_run_id', right_index=True) \
    .merge(frames['program_run'], left_on='program_run_id', right_index=True) \
    .groupby(['algorithm', 'graph']).agg({ 'hosts': 'first', 'runtime': 'mean', 'modularity': 'mean', 'map_equation': 'mean', 'cluster_count': 'mean' }) \
    .stack() \
    .unstack(['algorithm']).unstack()

all_data[('', 'hosts')] = all_data[(dlslm_me_label, 'hosts')].astype(int)

all_data = all_data \
    [[('', 'hosts'),
      (seq_louvain_label, 'runtime'),
      (seq_louvain_label, 'modularity'),
      (seq_louvain_label, 'map_equation'),
      (seq_louvain_label, 'cluster_count'),
      (dlslm_nc_label, 'runtime'),
      (dlslm_nc_label, 'modularity'),
      (dlslm_nc_label, 'map_equation'),
      (dlslm_nc_label, 'cluster_count'),
      (seq_infomap_label, 'runtime'),
      (seq_infomap_label, 'modularity'),
      (seq_infomap_label, 'map_equation'),
      (seq_infomap_label, 'cluster_count'),
      (dlslm_me_label, 'runtime'),
      (dlslm_me_label, 'modularity'),
      (dlslm_me_label, 'map_equation'),
      (dlslm_me_label, 'cluster_count')]]

all_data = all_data.loc[frames['program_run'].sort_values('edge_count')['graph'].dropna().unique()]

print(all_data.to_latex().replace('NaN', '   ').replace('runtime', 'runtime [s]'))

all_data

\begin{tabular}{lrrrrrrrrrrrrrrrrr}
\toprule
algorithm & \multicolumn{4}{l}{Seq. Louvain} & \multicolumn{4}{l}{DSLM-Mod w/o Contraction} & \multicolumn{4}{l}{Seq. Infomap} & \multicolumn{4}{l}{DSLM-Map} \\
{} & hosts &      runtime [s] & modularity & map\_equation & cluster\_count &                  runtime [s] & modularity & map\_equation & cluster\_count &      runtime [s] & modularity & map\_equation & cluster\_count &      runtime [s] & modularity & map\_equation & cluster\_count \\
graph          &       &              &            &              &               &                          &            &              &               &              &            &              &               &              &            &              &               \\
\midrule
com-amazon     &     2 &      1.18867 &   0.926283 &      8.26193 &         244.0 &                 1.018540 &   0.662349 &     6.423891 &       58321.0 &      23.8371 &   0.833251 &      5.24015 &       15450.0 &     4.62511

algorithm,Unnamed: 1_level_0,Seq. Louvain,Seq. Louvain,Seq. Louvain,Seq. Louvain,DSLM-Mod w/o Contraction,DSLM-Mod w/o Contraction,DSLM-Mod w/o Contraction,DSLM-Mod w/o Contraction,Seq. Infomap,Seq. Infomap,Seq. Infomap,Seq. Infomap,DSLM-Map,DSLM-Map,DSLM-Map,DSLM-Map
Unnamed: 0_level_1,hosts,runtime,modularity,map_equation,cluster_count,runtime,modularity,map_equation,cluster_count,runtime,modularity,map_equation,cluster_count,runtime,modularity,map_equation,cluster_count
graph,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
com-amazon,2,1.18867,0.926283,8.26193,244.0,1.01854,0.662349,6.423891,58321.0,23.8371,0.833251,5.24015,15450.0,4.625115,0.831016,5.308886,14195.0
com-youtube,2,6.93669,0.718357,10.2581,7153.0,3.884698,0.593793,9.459313,205037.0,113.097,0.581635,8.44782,59257.0,14.685556,0.575558,8.543954,52631.0
in-2004,4,16.4854,0.980143,7.2624,929.0,4.042109,0.878717,6.666957,104124.0,131.183,0.935296,6.28732,28653.0,11.697303,0.937861,6.298318,27122.0
com-lj,8,94.7533,0.752447,11.8418,2272.0,11.369681,0.571526,10.504783,334165.0,1093.8,0.642551,9.9004,84945.0,45.284255,0.632626,9.980078,76991.0
osm-europe,8,1607.29,0.998942,9.84196,3037.0,45.878178,0.485844,10.453295,22737159.0,,,,,164.115861,0.938606,4.349963,2188584.0
com-orkut,8,164.769,0.667288,12.9111,33.0,34.275958,0.537024,12.263079,27384.0,2478.87,0.558472,11.8249,14821.0,83.48925,0.540117,11.896035,15326.0
uk-2002,8,529.38,0.989801,8.23784,4952.0,20.346798,0.876513,7.068022,962802.0,5613.98,0.95854,6.45794,198601.0,52.091819,0.960123,6.468501,186116.0
com-friendster,16,5499.12,0.621828,15.6447,32442.0,1093.074086,0.575267,15.38197,1456400.0,,,,,1143.842553,0.472372,14.788328,585763.0
uk-2007-05,16,7260.0,0.996251,9.06702,20235.0,108.757773,0.907177,8.470246,2321666.0,,,,,220.287635,0.972456,8.056724,375587.0
