In [1]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules before each cell executes so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
import os
import pandas as pd

# Root directory holding one sub-folder per quantization run. The path is
# machine-specific; point it at your own checkout before re-running.
directory_path = "/home/khasmamad/workspace/auto-round/examples/language-modeling/tmp_autoround/khas-thesis-main"

# Columns identifying where a row came from; always placed first in the output.
ID_COLUMNS = ['model_name', 'config']


def _parse_fields(folder_name):
    """Return the ``key::value`` pairs embedded in a run folder name as a dict.

    The name is split on '-' and every token containing '::' contributes one
    entry. Keys are matched exactly, so 'blcks' is never confused with
    'lkhd_blcks' regardless of the order in which they appear.
    """
    fields = {}
    for token in folder_name.split('-'):
        key, sep, value = token.partition('::')
        if sep:
            fields[key] = value
    return fields


def _configs_for(blcks, lkhd_blcks, blck_step_size):
    """Map a run's block settings to the config label(s) it is reported under."""
    if lkhd_blcks != 0:
        return [f'LA-{lkhd_blcks}']
    if blcks == 1:
        # The single-block run without lookahead is the shared baseline of
        # all three families, so it is listed once under each label.
        return ["LA-0", "MB-1", "MBwO-1"]
    return [f'MBwO-{blcks}' if blck_step_size == 1 else f'MB-{blcks}']


def collect_results(results_dir):
    """Gather every ``results.csv`` found one level below ``results_dir``.

    Parameters
    ----------
    results_dir : str
        Directory whose immediate sub-folders are individual runs.

    Returns
    -------
    missing : list of str
        Sub-folders that contain no 'results.csv'.
    unparsed : list of str
        Sub-folders that have a 'results.csv' but whose name lacks a valid
        integer 'blcks::' / 'lkhd_blcks::' (or 'blck_step_size::') field.
        They are skipped instead of aborting the whole scan.
    combined : pandas.DataFrame
        All CSV rows, tagged with 'model_name' and 'config' as the leading
        columns and sorted by them. Empty (with just those two columns) when
        nothing was found.
    """
    missing, unparsed, frames = [], [], []

    if not os.path.isdir(results_dir):
        print(f"Results directory not found: {results_dir}")
        return missing, unparsed, pd.DataFrame(columns=ID_COLUMNS)

    # os.listdir order is arbitrary; sort so repeated runs report identically.
    for folder_name in sorted(os.listdir(results_dir)):
        folder_path = os.path.join(results_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        file_path = os.path.join(folder_path, 'results.csv')
        if not os.path.isfile(file_path):
            missing.append(folder_name)
            continue

        # Everything before the quantization tag is the model name.
        model_name = folder_name.split('-w4g128')[0]

        fields = _parse_fields(folder_name)
        try:
            blcks = int(fields['blcks'])
            lkhd_blcks = int(fields['lkhd_blcks'])
            blck_step_size = int(fields['blck_step_size']) if 'blck_step_size' in fields else None
        except (KeyError, ValueError):
            unparsed.append(folder_name)
            continue

        # Read the file once, then tag a copy for each config label.
        results = pd.read_csv(file_path)
        other_columns = [col for col in results.columns if col not in ID_COLUMNS]
        for config in _configs_for(blcks, lkhd_blcks, blck_step_size):
            tagged = results.assign(model_name=model_name, config=config)
            frames.append(tagged[ID_COLUMNS + other_columns])

    if not frames:
        return missing, unparsed, pd.DataFrame(columns=ID_COLUMNS)

    # A single concat keeps the work linear in the number of runs.
    combined = pd.concat(frames, ignore_index=True)
    combined = combined.sort_values(by=ID_COLUMNS, ignore_index=True)
    return missing, unparsed, combined


missing_files, unparsed_folders, combined_df = collect_results(directory_path)

print("Folders missing 'results.csv':", missing_files)
if unparsed_folders:
    print("Folders skipped (unrecognised name):", unparsed_folders)
print("Combined DataFrame:")
print(combined_df)

Folders missing 'results.csv': ['llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::1-lkhd_blcks::3-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42', 'llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::1-lkhd_blcks::2-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42', 'llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::3-lkhd_blcks::0-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42']
Combined DataFrame:
       model_name  config   mmlu  lambada_openai  hellaswag  winogrande  \
0   llama-2-7b-hf    LA-0  39.23           71.26      56.13       68.35   
1   llama-2-7b-hf    LA-1  40.74           72.66      56.57       68.90   
2   llama-2-7b-hf    MB-1  39.23           71.26      56.13       68.35   
3   llama-2-7b-hf    MB-2  40.81           72.42      56.40       68.75   
4   llama-2-7b-hf    MB-4  41.62           73.04      56.73       68.43   
5   llama-2-7b-hf  MB

In [5]:
# Show each folder lacking a results.csv on its own line (nothing if none).
if missing_files:
    print("\n".join(missing_files))

llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::4-lkhd_blcks::0-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42
llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::1-lkhd_blcks::3-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42
llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::1-lkhd_blcks::2-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42
llama-2-7b-hf-w4g128-clean_lkhd::False-blcks::3-lkhd_blcks::0-lr::0.001-lr_scheduler::linear_decay-iters::1000-nsamples::512-optimizer::signed_sgd-seed::42
