In [1]:
# import pandas as pd
# import numpy as np
# from typing import List

# def hierarchical_max_sort(df: pd.DataFrame, 
#                          value_col: str,
#                          index_levels: List[str] = None) -> pd.DataFrame:
#     """
#     Sort a DataFrame hierarchically by maximum values at each level of the index.
    
#     Args:
#         df: DataFrame with multi-index
#         value_col: Name of the column to sort by
#         index_levels: List of index level names. If None, uses all levels in order
    
#     Returns:
#         Sorted DataFrame
#     """
#     if index_levels is None:
#         index_levels = list(df.index.names)
    
#     df_temp = df.copy()
    
#     # Add maximum value columns for each level
#     for i in range(len(index_levels)):
#         level_name = f'level{i+1}_max'
#         group_levels = index_levels[:i+1]
#         df_temp[level_name] = df.groupby(group_levels)[value_col].transform('max')
    
#     # Sort by all max columns
#     max_cols = [f'level{i+1}_max' for i in range(len(index_levels))]
#     df_sorted = df_temp.sort_values(
#         max_cols,
#         ascending=[False] * len(max_cols)
#     )[value_col].to_frame()
    
#     return df_sorted

# # Example usage:
# if __name__ == "__main__":
#     # Sample data
#     data = {
#         'i1': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3],
#         'i2': ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a', 'a', 'b', 'b', 'b'],
#         'i3': ['x', 'y', 'z', 'x', 'x', 'y', 'z', 'x', 'y', 'x', 'y', 'z'],
#         'v': [10, 5, 20, 15, 3, 7, np.nan, 30, 25, 8, 12, 6]
#     }
    
#     # Create DataFrame and set multi-index
#     df = pd.DataFrame(data)
#     df = df.set_index(['i1', 'i2', 'i3'])
    
#     # Sort the DataFrame
#     df_sorted = hierarchical_max_sort(df, 'v')
    
#     print("Original DataFrame:")
#     print(df)
#     print("\nSorted DataFrame:")
#     print(df_sorted)

In [2]:
import pandas as pd
import numpy as np
from typing import List

def hierarchical_max_sort(df: pd.DataFrame, 
                         value_col: str,
                         index_levels: List[str] = None) -> pd.DataFrame:
    """
    Sort a DataFrame hierarchically by maximum values at each level of the index,
    preserving all columns.
    
    Args:
        df: DataFrame with multi-index
        value_col: Name of the column to sort by
        index_levels: List of index level names. If None, uses all levels in order
    
    Returns:
        Sorted DataFrame with all original columns
    """
    if index_levels is None:
        index_levels = list(df.index.names)
    
    df_temp = df.copy()
    
    # Add maximum value columns for each level
    for i in range(len(index_levels)):
        level_name = f'level{i+1}_max'
        group_levels = index_levels[:i+1]
        df_temp[level_name] = df.groupby(group_levels)[value_col].transform('max')
    
    # Sort by all max columns
    max_cols = [f'level{i+1}_max' for i in range(len(index_levels))]
    df_sorted = df_temp.sort_values(
        max_cols,
        ascending=[False] * len(max_cols)
    )
    
    # Drop temporary sorting columns and return original columns
    df_sorted = df_sorted.drop(columns=max_cols)
    
    return df_sorted

# Test the generalized function
if __name__ == "__main__":
    # Sample data with additional columns
    data = {
        'i1': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3],
        'i2': ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a', 'a', 'b', 'b', 'b'],
        'i3': ['x', 'y', 'z', 'x', 'x', 'y', 'z', 'x', 'y', 'x', 'y', 'z'],
        'i4': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
        'i5': ['p', 'q', 'p', 'q', 'p', 'q', 'p', 'q', 'p', 'q', 'p', 'q'],
        'v': [10, 5, 20, 15, 3, 7, np.nan, 30, 25, 8, 12, 6],
        'other_col1': range(12),
        'other_col2': ['A'] * 6 + ['B'] * 6
    }
    
    # Create DataFrame and set multi-index
    df = pd.DataFrame(data)
    df = df.set_index(['i1', 'i2', 'i3', 'i4', 'i5'])
    
    # Sort using only i1, i2, i3 but preserve all columns
    df_sorted = hierarchical_max_sort(df, 'v', index_levels=['i1', 'i2', 'i3'])
    
    print("Original DataFrame:")
    print(df)
    print("\nSorted DataFrame (using i1, i2, i3, preserving all columns):")
    print(df_sorted)

Original DataFrame:
                   v  other_col1 other_col2
i1 i2 i3 i4 i5                             
1  a  x  1  p   10.0           0          A
      y  2  q    5.0           1          A
   b  z  1  p   20.0           2          A
      x  2  q   15.0           3          A
2  a  x  1  p    3.0           4          A
      y  2  q    7.0           5          A
   b  z  1  p    NaN           6          B
3  a  x  2  q   30.0           7          B
      y  1  p   25.0           8          B
   b  x  2  q    8.0           9          B
      y  1  p   12.0          10          B
      z  2  q    6.0          11          B

Sorted DataFrame (using i1, i2, i3, preserving all columns):
                   v  other_col1 other_col2
i1 i2 i3 i4 i5                             
3  a  x  2  q   30.0           7          B
      y  1  p   25.0           8          B
   b  y  1  p   12.0          10          B
      x  2  q    8.0           9          B
      z  2  q    6.0          11      

In [3]:
import pandas as pd
import numpy as np

# Test with more complex data
data = {
    'i1': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3],
    'i2': ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'a', 'a', 'b', 'b', 'b'],
    'i3': ['x', 'y', 'z', 'x', 'x', 'y', 'z', 'x', 'y', 'x', 'y', 'z'],
    'i4': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
    'i5': ['p', 'q', 'p', 'q', 'p', 'q', 'p', 'q', 'p', 'q', 'p', 'q'],
    'v': [10, 5, 20, 15, 3, 7, np.nan, 30, 25, 8, 12, 6]
}

# Create DataFrame with all indices
df_full = pd.DataFrame(data)
df_full = df_full.set_index(['i1', 'i2', 'i3', 'i4', 'i5'])

# Test existing function with specific levels
df_sorted = hierarchical_max_sort(df_full, 'v', index_levels=['i1', 'i2', 'i3'])

print("Original DataFrame:")
print(df_full)
print("\nSorted DataFrame (using only i1, i2, i3):")
print(df_sorted)

# Also test without specifying levels (should use all 5 levels)
df_sorted_all = hierarchical_max_sort(df_full, 'v')

print("\nSorted DataFrame (using all levels):")
print(df_sorted_all)

Original DataFrame:
                   v
i1 i2 i3 i4 i5      
1  a  x  1  p   10.0
      y  2  q    5.0
   b  z  1  p   20.0
      x  2  q   15.0
2  a  x  1  p    3.0
      y  2  q    7.0
   b  z  1  p    NaN
3  a  x  2  q   30.0
      y  1  p   25.0
   b  x  2  q    8.0
      y  1  p   12.0
      z  2  q    6.0

Sorted DataFrame (using only i1, i2, i3):
                   v
i1 i2 i3 i4 i5      
3  a  x  2  q   30.0
      y  1  p   25.0
   b  y  1  p   12.0
      x  2  q    8.0
      z  2  q    6.0
1  b  z  1  p   20.0
      x  2  q   15.0
   a  x  1  p   10.0
      y  2  q    5.0
2  a  y  2  q    7.0
      x  1  p    3.0
   b  z  1  p    NaN

Sorted DataFrame (using all levels):
                   v
i1 i2 i3 i4 i5      
3  a  x  2  q   30.0
      y  1  p   25.0
   b  y  1  p   12.0
      x  2  q    8.0
      z  2  q    6.0
1  b  z  1  p   20.0
      x  2  q   15.0
   a  x  1  p   10.0
      y  2  q    5.0
2  a  y  2  q    7.0
      x  1  p    3.0
   b  z  1  p    NaN


In [4]:
import pandas as pd
import numpy as np

def create_test_data():
    """Create test data with various scenarios"""
    data = {
        'i1': [1,    1,    1,    2,    2,    3,    3],
        'i2': ['a',  'a',  'b',  'a',  'b',  'a',  'b'],
        'i3': ['x',  'y',  'z',  'x',  'z',  'x',  'z'],
        'i4': ['ours', 'theirs', 'ours', 'theirs', 'ours', 'ours', 'theirs'],
        'v':  [10,   20,    5,    30,     8,    15,   25]
    }
    
    df = pd.DataFrame(data)
    df = df.set_index(['i1', 'i2', 'i3', 'i4'])
    return df

# Test scenarios:
# i1=1: max is "theirs" (20) - should be demoted
# i1=2: max is "theirs" (30) - should be demoted
# i1=3: max is "theirs" (25) but has "ours" (15) - should use "ours" max

df = create_test_data()
print("Original DataFrame:")
print(df)

# Test existing function
df_sorted = hierarchical_max_sort(df, 'v', index_levels=['i1', 'i2', 'i3'])
print("\nCurrent sorting (incorrect for our new requirements):")
print(df_sorted)

Original DataFrame:
                  v
i1 i2 i3 i4        
1  a  x  ours    10
      y  theirs  20
   b  z  ours     5
2  a  x  theirs  30
   b  z  ours     8
3  a  x  ours    15
   b  z  theirs  25

Current sorting (incorrect for our new requirements):
                  v
i1 i2 i3 i4        
2  a  x  theirs  30
   b  z  ours     8
3  b  z  theirs  25
   a  x  ours    15
1  a  y  theirs  20
      x  ours    10
   b  z  ours     5


In [5]:
import pandas as pd
import numpy as np

# Sample data demonstrating the "ours" vs "theirs" priority
data = {
    'i1': [1,    1,    2,    2,    3,    3   ],  # group id
    'i2': ['a',  'a',  'b',  'b',  'c',  'c' ],  # subgroup
    'i4': ['ours','theirs','ours','theirs','ours','theirs'],
    'v':  [10,   20,    15,   10,    5,    3  ]
}

df = pd.DataFrame(data)
df = df.set_index(['i1', 'i2', 'i4'])

print("Original DataFrame:")
print(df)

# Expected sorting:
# Group 2 first (max=15 from "ours")
# Group 1 last (max=20 but from "theirs")
# Group 3 middle (max=5 from "ours")

print("\nDesired output:")
desired = df.reindex([
    (2, 'b', 'ours'),
    (2, 'b', 'theirs'),
    (3, 'c', 'ours'),
    (3, 'c', 'theirs'),
    (1, 'a', 'ours'),
    (1, 'a', 'theirs'),
])
print(desired)

Original DataFrame:
               v
i1 i2 i4        
1  a  ours    10
      theirs  20
2  b  ours    15
      theirs  10
3  c  ours     5
      theirs   3

Desired output:
               v
i1 i2 i4        
2  b  ours    15
      theirs  10
3  c  ours     5
      theirs   3
1  a  ours    10
      theirs  20


In [8]:
import pandas as pd
import numpy as np
from typing import List

def hierarchical_max_sort(df: pd.DataFrame, 
                         value_col: str,
                         index_levels: List[str] = None,
                         priority_level: str = None,
                         priority_tiers: List[List[str]] = None) -> pd.DataFrame:
    """
    Sort a DataFrame hierarchically by maximum values at each level of the index,
    with priority tiers for values in a given level.
    
    Args:
        df: DataFrame with multi-index
        value_col: Name of the column to sort by
        index_levels: List of index level names. If None, uses all levels in order
        priority_level: Name of the index level to check for priority values
        priority_tiers: List of lists of values in priority_level, from highest to lowest priority.
                       Values within the same inner list have equal priority.
    
    Returns:
        Sorted DataFrame with all original columns
    """
    if index_levels is None:
        index_levels = list(df.index.names)
    
    df_temp = df.copy()
    
    # Add maximum value columns for each level
    for i in range(len(index_levels)):
        level_name = f'level{i+1}_max'
        group_levels = index_levels[:i+1]
        df_temp[level_name] = df.groupby(group_levels)[value_col].transform('max')
    
    if priority_level and priority_tiers:
        # For each group, find the highest priority tier that achieves the max value
        group_levels = [level for level in index_levels if level != priority_level]
        df_temp['priority_rank'] = len(priority_tiers)  # default rank for values not in any tier
        
        for group_idx in df.groupby(group_levels).groups:
            group_data = df.loc[group_idx]
            max_val = group_data[value_col].max()
            max_rows = group_data[value_col] == max_val
            max_priorities = group_data[max_rows].index.get_level_values(priority_level)
            
            # Find the highest priority tier (lowest rank) that achieves max value
            best_rank = len(priority_tiers)
            for i, tier in enumerate(priority_tiers):
                if any(val in max_priorities for val in tier):
                    best_rank = i
                    break
            
            df_temp.loc[group_idx, 'priority_rank'] = best_rank
    
        # Sort by priority rank first, then by max values
        sort_cols = ['priority_rank'] + [f'level{i+1}_max' for i in range(len(index_levels))]
        ascending = [True] + [False] * (len(index_levels))  # True for rank (lower is better)
    else:
        # Original sorting behavior
        sort_cols = [f'level{i+1}_max' for i in range(len(index_levels))]
        ascending = [False] * len(sort_cols)
    
    # Sort and drop temporary columns
    df_sorted = df_temp.sort_values(sort_cols, ascending=ascending)
    df_sorted = df_sorted.drop(columns=sort_cols + (['priority_rank'] if priority_level else []))
    
    return df_sorted

# Test the function
if __name__ == "__main__":
    # Sample data with multiple priority tiers
    data = {
        'i1': [1,    1,    1,    2,    2,    2,    3,    3,    3   ],
        'i2': ['a',  'a',  'a',  'b',  'b',  'b',  'c',  'c',  'c' ],
        'i4': ['me', 'ours','theirs','someone_else','unknown','dont_mind','me','theirs','unknown'],
        'v':  [10,   20,    15,    12,         8,      5,        3,    2,     1    ]
    }
    
    df = pd.DataFrame(data)
    df = df.set_index(['i1', 'i2', 'i4'])
    
    # Sort with priority tiers
    df_sorted = hierarchical_max_sort(
        df, 
        value_col='v',
        index_levels=['i1', 'i2', 'i4'],
        priority_level='i4',
        priority_tiers=[
            ["me", "ours"],         # highest priority tier
            ["theirs"],
            ["someone_else"],
            ["unknown", "dont_mind"] # lowest priority tier
        ]
    )
    
    print("Original DataFrame:")
    print(df)
    print("\nSorted DataFrame (with priority tiers):")
    print(df_sorted)

Original DataFrame:
                     v
i1 i2 i4              
1  a  me            10
      ours          20
      theirs        15
2  b  someone_else  12
      unknown        8
      dont_mind      5
3  c  me             3
      theirs         2
      unknown        1

Sorted DataFrame (with priority tiers):
                     v
i1 i2 i4              
1  a  ours          20
      theirs        15
      me            10
3  c  me             3
      theirs         2
      unknown        1
2  b  someone_else  12
      unknown        8
      dont_mind      5


In [9]:
import pandas as pd
import numpy as np
from typing import List

# Create test data that mimics your specific case
data = {
    'Target': ['cnn_dailymail'] * 3,
    'Hardware': ['4 * H100 NVL'] * 3,
    'Method': ['AR', 'SLEM', 'SLEM'],
    'Drafter': ['No Drafter (Autoregressive)', 'Qwen2.5-0.5B-Instruct', 'vicuna-68m'],
    'Speedup': [1.0, 1.71, 2.1]
}

df = pd.DataFrame(data)
df = df.set_index(['Target', 'Hardware', 'Method', 'Drafter'])

# Test the sorting with priority tiers
df_sorted = hierarchical_max_sort(
    df, 
    value_col='Speedup',
    index_levels=['Target', 'Hardware'],
    priority_level='Method',
    priority_tiers=[
        ['SLEM'],  # highest priority
        ['SD'],    # medium priority
        ['AR']     # lowest priority
    ]
)

print("Original DataFrame:")
print(df)
print("\nSorted DataFrame:")
print(df_sorted)

Original DataFrame:
                                                               Speedup
Target        Hardware     Method Drafter                             
cnn_dailymail 4 * H100 NVL AR     No Drafter (Autoregressive)     1.00
                           SLEM   Qwen2.5-0.5B-Instruct           1.71
                                  vicuna-68m                      2.10

Sorted DataFrame:
                                                               Speedup
Target        Hardware     Method Drafter                             
cnn_dailymail 4 * H100 NVL AR     No Drafter (Autoregressive)     1.00
                           SLEM   Qwen2.5-0.5B-Instruct           1.71
                                  vicuna-68m                      2.10


In [11]:
import pandas as pd
import numpy as np

# Create sample data that mimics your structure with three target groups
data = {
    'Temperature': [0] * 9,
    'Target': [
        'gemma-2-9b-it', 'gemma-2-9b-it', 'gemma-2-9b-it',
        'DeepSeek-R1', 'DeepSeek-R1', 'DeepSeek-R1',
        'Mixtral-8x22B', 'Mixtral-8x22B', 'Mixtral-8x22B'
    ],
    'Dataset': [
        'dataset1', 'dataset1', 'dataset1',
        'dataset2', 'dataset2', 'dataset2',
        'dataset3', 'dataset3', 'dataset3'
    ],
    'Method': ['AR', 'SLEM', 'SD',
               'AR', 'SLEM', 'SD',
               'AR', 'SLEM', 'SD'],
    'Speedup': [
        1.0, 1.49, 1.23,      # gemma group
        1.0, 1.47, 1.42,      # DeepSeek group
        1.0, 2.1, 1.71        # Mixtral group (highest SLEM)
    ]
}

df = pd.DataFrame(data)
df = df.set_index(['Temperature', 'Target', 'Dataset', 'Method'])

print("Original DataFrame:")
print(df)

# Test sorting with priority tiers
df_sorted = hierarchical_max_sort(
    df, 
    value_col='Speedup',
    index_levels=['Temperature', 'Target', 'Dataset', 'Method'],
    priority_level='Method',
    priority_tiers=[['SLEM'], ['SD'], ['AR']]
)

print("\nSorted DataFrame:")
print(df_sorted)

Original DataFrame:
                                           Speedup
Temperature Target        Dataset  Method         
0           gemma-2-9b-it dataset1 AR         1.00
                                   SLEM       1.49
                                   SD         1.23
            DeepSeek-R1   dataset2 AR         1.00
                                   SLEM       1.47
                                   SD         1.42
            Mixtral-8x22B dataset3 AR         1.00
                                   SLEM       2.10
                                   SD         1.71

Sorted DataFrame:
                                           Speedup
Temperature Target        Dataset  Method         
0           Mixtral-8x22B dataset3 SLEM       2.10
                                   SD         1.71
                                   AR         1.00
            gemma-2-9b-it dataset1 SLEM       1.49
                                   SD         1.23
                                   AR      

  group_data = df.loc[group_idx]
  df_temp.loc[group_idx, 'priority_rank'] = best_rank
  group_data = df.loc[group_idx]
  df_temp.loc[group_idx, 'priority_rank'] = best_rank
  group_data = df.loc[group_idx]
  df_temp.loc[group_idx, 'priority_rank'] = best_rank
