In [2]:
import glob
import pandas as pd
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numpy.testing.print_coercion_tables import print_new_cast_table
from scipy.stats import gmean
import scipy.stats as stats
import re
import os

from functools import reduce
from matplotlib.colors import to_rgb, to_hex
import colorsys

from mpl_toolkits.axes_grid1.inset_locator import inset_axes, mark_inset

In [3]:
def safe_gmean(x):
    """Geometric mean of the strictly positive entries of ``x``.

    Entries that are zero, negative, or NaN are ignored (they fail the
    ``> 0`` test). Returns ``np.nan`` when no positive entry remains.
    """
    values = np.array(x)
    positives = values[values > 0]
    if len(positives) == 0:
        return np.nan
    return gmean(positives)

# Helper to classify size
def size_category(size):
    """Bucket a numeric size into "Small", "Middle" or "Large".

    * below 8000            -> "Small"
    * 8000 up to 15000 incl. -> "Middle"
    * anything else          -> "Large"
    """
    if size < 8000:
        return "Small"
    if size <= 15000:
        return "Middle"
    return "Large"
    
# Named colors keyed by "<algorithm> - <relation>" series name. For each
# algorithm the dyn_vs_base and internal series share the base color and the
# stat_vs_base series uses the matching "dark..." shade.
# NOTE(review): presumably passed as a seaborn/matplotlib palette - the call
# site is not visible in this part of the notebook.
palette = {
    'MeDiH-MM - dyn_vs_base': 'blue',
    'MeDiH-MM - stat_vs_base': 'darkblue',
    'MeDiH-BL - dyn_vs_base': 'orange',
    'MeDiH-BL - stat_vs_base': 'darkorange',
    'MeDiH-BLC - dyn_vs_base': 'green',
    'MeDiH-BLC - stat_vs_base': 'darkgreen',
    'MeDiH-MM - internal': 'blue',  
    'MeDiH-BL - internal': 'orange',   
    'MeDiH-BLC - internal': 'green',
}

# Hex colors keyed by "<algorithm> - <relation>" for the dyn_vs_base and
# stat_vs_base series only (there are no "internal" entries here).
# NOTE(review): where this mapping is consumed is not visible in this part of
# the notebook - confirm it is still in use.
palette_params1 = {
    'MeDiH-BL - dyn_vs_base': '#8172B3',
    'MeDiH-BL - stat_vs_base': '#64B5CD',
    'MeDiH-BLC - dyn_vs_base': '#C44E52',
    'MeDiH-BLC - stat_vs_base': '#8C564B',
    'MeDiH-MM - dyn_vs_base': '#7F7F7F',
    'MeDiH-MM - stat_vs_base': '#CCB974'
}

# Hex colors keyed by "<relation>-<size category>", where the size category is
# one of "Small", "Middle", "Large". Each size category gets one hue family;
# per the inline color names, the stat_vs_base entry is the darker of the pair.
palette_params = {
    'dyn_vs_base-Small': '#C44E52',       # soft red
    'stat_vs_base-Small': '#8C2D31',      # deep red

    'dyn_vs_base-Middle': '#8172B3',      # lavender
    'stat_vs_base-Middle': '#5B4C94',     # dark violet

    'dyn_vs_base-Large': '#CCB974',       # golden beige
    'stat_vs_base-Large': '#9A853F'       # muted olive/brown
}


# Human-readable display labels for series keys. Two key schemes are covered:
#   "<algorithm> - <relation>"     -> algorithm name ("Dyn" prefix for
#                                     dyn_vs_base, "Internal " prefix for internal)
#   "<relation>-<size category>"   -> "Dynamic|Static, <size category>"
label_map = {
    "MeDiH-BL - stat_vs_base": "MeDiH-BL",
    "MeDiH-BL - dyn_vs_base": "DynMeDiH-BL",
    "MeDiH-BLC - stat_vs_base": "MeDiH-BLC",
    "MeDiH-BLC - dyn_vs_base": "DynMeDiH-BLC",
    "MeDiH-MM - stat_vs_base": "MeDiH-MM",
    "MeDiH-MM - dyn_vs_base": "DynMeDiH-MM" ,
    "MeDiH-BL - internal": "Internal MeDiH-BL",
    "MeDiH-BLC - internal": "Internal MeDiH-BLC",  
    "MeDiH-MM - internal": "Internal MeDiH-MM",
    "dyn_vs_base-Large": "Dynamic, Large",
    "dyn_vs_base-Middle": "Dynamic, Middle",
    "dyn_vs_base-Small": "Dynamic, Small",
    "stat_vs_base-Large": "Static, Large",
    "stat_vs_base-Middle": "Static, Middle",
    "stat_vs_base-Small": "Static, Small"
    
}
    

SyntaxError: EOL while scanning string literal (131340508.py, line 62)

In [7]:
def read_dfs(pathR, patternR, numGroups=2):
    """Load every file matching a glob into a dict of DataFrames.

    Parameters
    ----------
    pathR : str
        Glob expression selecting the input files.
    patternR : str
        Regular expression searched in each file's base name. Capture
        group 1 is the algorithm; capture group ``numGroups`` is the variant.
    numGroups : int, default 2
        Index of the capture group holding the variant.

    Returns
    -------
    dict
        ``{(algorithm, variant): DataFrame}``. Files are parsed as
        space-delimited with a header row; malformed lines are skipped.
        Files whose name does not match are reported on stdout and left out.
        A later file with the same key replaces an earlier one.
    """
    frames = {}

    for path in glob.glob(pathR):
        # The file is parsed before its name is checked, so an unreadable
        # file raises even when its name would not have matched.
        table = pd.read_csv(path, delimiter=' ', header=0, on_bad_lines="skip")
        file_name = os.path.basename(path)
        found = re.search(patternR, file_name)

        if not found:
            print("No match found. " + file_name)
            continue

        key = (found.group(1), found.group(numGroups))
        frames[key] = table

    return frames


In [7]:
def merge_correct_columns(dfsOurVar, lbs):
    """Join the ``ms_1``/``ms_2`` columns of several result frames side by side.

    Parameters
    ----------
    dfsOurVar : sequence of pandas.DataFrame
        Each frame must provide ``wf_name``, ``inp_size``, ``ms_1`` and ``ms_2``.
    lbs : sequence of str
        One label per frame; ``ms_1``/``ms_2`` become ``ms_1_<label>`` /
        ``ms_2_<label>``.

    Returns
    -------
    pandas.DataFrame
        Inner join of all frames on ``wf_name`` and ``inp_size``, plus an
        integer ``size`` column parsed from the digits between ``_`` and ``.``
        in ``wf_name`` (100 when the name carries no such number). Every
        ``ms_*`` column is converted to numeric; unparsable values become NaN.
    """
    keep = ['wf_name', 'inp_size', 'ms_1', 'ms_2']
    labelled = [
        frame[keep].copy().rename(columns={
            'ms_1': f'ms_1_{label}',
            'ms_2': f'ms_2_{label}',
        })
        for frame, label in zip(dfsOurVar, lbs)
    ]

    # Pairwise inner join on both keys, folding left to right.
    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on=['wf_name', 'inp_size']),
        labelled,
    )

    # Size is encoded in the workflow name as "_<digits>."; default to 100.
    parsed_size = merged_df['wf_name'].str.extract(r'_(\d+)\.')[0]
    merged_df['size'] = parsed_size.fillna("100").astype(int)

    ms_columns = [name for name in merged_df.columns if name.startswith("ms_")]
    for name in ms_columns:
        merged_df[name] = pd.to_numeric(merged_df[name], errors="coerce")

    return merged_df

In [16]:
# Column suffixes of the algorithms compared against BASE (BASE itself excluded)
algos = ['A1', 'A2', 'A3']
# Display name for each algorithm suffix
algo_aliases = {
        'A1': 'MeDiH-BL',
        'A2': 'MeDiH-BLC',
        'A3': 'MeDiH-MM'
    }

def buld_plot_df(merged_df):
    """Turn the wide merged frame into a long frame of ratios for plotting.

    For every suffix in ``algos`` three ratios are computed per row:

    * ``internal``     = ``ms_1_<algo> / ms_2_<algo>``
    * ``dyn_vs_base``  = ``ms_2_BASE / ms_1_<algo>``
    * ``stat_vs_base`` = ``ms_2_BASE / ms_2_<algo>``

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must contain ``size``, ``wf_name``, ``inp_size``, ``ms_2_BASE`` and
        ``ms_1_<algo>`` / ``ms_2_<algo>`` for each algorithm. Not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``size, wf_name, inp_size, relation, ratio, algorithm``; the
        ``algorithm`` values are translated through ``algo_aliases``.
    """
    id_columns = ['size', 'wf_name', 'inp_size']
    base_ms = merged_df['ms_2_BASE']
    long_frames = []

    for algo in algos:
        first_ms = merged_df[f'ms_1_{algo}']
        second_ms = merged_df[f'ms_2_{algo}']

        # Keyword order fixes the order of the relations in the melted output.
        wide = merged_df[id_columns].assign(
            internal=first_ms / second_ms,
            dyn_vs_base=base_ms / first_ms,
            stat_vs_base=base_ms / second_ms,
        )

        # Long format: one row per (input row, relation).
        long = wide.melt(id_vars=id_columns, var_name='relation', value_name='ratio')
        long['algorithm'] = algo
        long_frames.append(long)

    pl_df = pd.concat(long_frames, ignore_index=True)
    pl_df['algorithm'] = pl_df['algorithm'].replace(algo_aliases)
    return pl_df 

In [13]:
def pltParams():
    """Open a new 24x12 figure and set shared text styling at font size 18.

    Uses the pyplot state machine: the tick, axis-label and legend calls
    below act on whatever pyplot considers the current figure/axes after
    ``plt.figure``. Axis labels are fixed to 'size' (x) and 'ratio' (y).
    Returns nothing.
    """
    plt.figure(figsize=(24, 12))
    plt.yticks(fontsize=18)
    plt.xticks(fontsize=18)
    plt.xlabel('size', fontsize=18)
    plt.ylabel('ratio', fontsize=18)
    # NOTE(review): nothing has been plotted yet when this runs, so there are
    # presumably no labelled artists for the legend - confirm that this font
    # size actually carries over to the legend of the final plot.
    plt.legend(fontsize=18)  
    
def removeUnnecessaryColumns(dfsVar, dfsBs):
    """Return the variant frames without 'dur_alg1', 'ms_perc' and 'dur_alg2'.

    Parameters
    ----------
    dfsVar : iterable of pandas.DataFrame
        Frames to strip. Each must contain all three columns (a missing one
        raises ``KeyError``). The inputs themselves are not modified.
    dfsBs : iterable of pandas.DataFrame
        Accepted for call compatibility only and not used. Earlier revisions
        stripped these frames too but then discarded the result, which only
        cost time and could raise ``KeyError`` for base frames lacking the
        columns.
        NOTE(review): if callers need cleaned base frames, pass them as
        ``dfsVar`` in a separate call.

    Returns
    -------
    list of pandas.DataFrame
        One stripped copy per frame in ``dfsVar``, in the same order.
    """
    unwanted = ['dur_alg1', 'ms_perc', 'dur_alg2']
    return [df.drop(columns=unwanted) for df in dfsVar]

In [15]:
# Display text for penalty settings, keyed by the power of ten: key k maps to
# the label for 10**k ("1" for k = 0, "$10^k$"-style math text otherwise).
penalty_labels = {
    0: "1",
    1: "$10$",
    2: "$10^2$",
    3: "$10^3$",
    4: "$10^4$",
    5: "$10^5$",
}

# Display text keyed by integer index 0-4, describing a speed factor relative
# to "no change" (index 1).
# NOTE(review): "rw" presumably stands for read/write speed - confirm against
# the code that produces these indices.
rw_labels = {
    0: "10xfaster",
    1: "no change",
    2: "10xslower",
    3: "100xslower",
    4: "1000xslower"
}


