In [None]:
import csv
import sys
import os

import numpy as np
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

MIN_TRANSFORMERS_VERSION = "4.25.1"


def _version_tuple(version):
    """Convert a dotted version string into a tuple of ints for comparison.

    Only the leading decimal digits of each dot-separated component are
    used, and parsing stops at the first component that has none, so
    ``"4.26.0.dev0"`` becomes ``(4, 26, 0)``.
    """
    numbers = []
    for component in version.split("."):
        digits = ""
        for char in component:
            if char not in "0123456789":
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    return tuple(numbers)


# check transformers version
# Compare numerically: a plain string comparison is lexicographic, so it would
# accept e.g. "4.9.0" as being newer than "4.25.1".
assert _version_tuple(transformers.__version__) >= _version_tuple(
    MIN_TRANSFORMERS_VERSION
), f"Please upgrade transformers to version {MIN_TRANSFORMERS_VERSION} or higher."

# Make the modules under the local checkout importable.
root_dir = "/Users/kenzaamara/GithubProjects/syntax-shap/shap2"
sys.path.append(root_dir)

In [None]:
import ast
import os
import re

import pandas as pd

def parse_filename(filename):
    """Extract the algorithm, dataset and model name encoded in a log filename.

    The name is expected to contain
    ``_algorithm=<A>_dataset=<D>_model_name=<M>.stdout``.

    Returns the tuple ``(A, D, M)``, or ``(None, None, None)`` when the
    filename does not follow that layout.
    """
    found = re.search(
        r"_algorithm=(.*?)_dataset=(.*?)_model_name=(.*?)\.stdout",
        filename,
    )
    if found is None:
        return None, None, None
    return found.groups()
    
def tail(filename, n=1):
    """Return the last ``n`` lines of ``filename`` as decoded strings.

    The file is read backwards in fixed-size chunks, so only its end is
    loaded. If the file holds fewer than ``n`` lines, all of them are
    returned; an empty file or a non-positive ``n`` yields an empty list.

    Raises OSError if the file cannot be opened or read, and
    UnicodeDecodeError if the selected lines are not valid UTF-8.
    """
    if n <= 0:
        return []

    chunk_size = 4096
    with open(filename, 'rb') as f:
        f.seek(0, os.SEEK_END)
        position = f.tell()
        buffer = b""
        # More than n newlines guarantees n complete lines even when the file
        # ends with a newline and the buffer starts in the middle of a line.
        while position > 0 and buffer.count(b"\n") <= n:
            step = min(chunk_size, position)
            position -= step
            f.seek(position)
            buffer = f.read(step) + buffer

    if position > 0:
        # The buffer starts mid-line (possibly mid-character): drop that
        # partial line so that decoding begins on a line boundary.
        buffer = buffer[buffer.index(b"\n") + 1:]

    return buffer.decode('utf-8').splitlines()[-n:]


def get_last_values(filename):
    """Read the two timing values reported at the end of a log file.

    Only the last four lines are inspected. The third of them must start
    with ``"Done!"``; the values are then parsed from the final
    whitespace-separated token of the first two of those lines.

    Returns a list of two parsed values, or ``[None, None]`` when the file
    cannot be read, has fewer than four lines, has no ``"Done!"`` marker in
    the expected place, or holds tokens that are not Python literals.
    """
    try:
        last_lines = tail(filename, n=4)
    except OSError:
        print(f"Error reading file: {filename}")
        return [None, None]

    # Fewer than four lines means there is no complete timing report.
    if len(last_lines) < 4 or not last_lines[-2].startswith("Done!"):
        return [None, None]

    try:
        # literal_eval only accepts Python literals, so text found in a log
        # file can never be executed as code (unlike eval).
        return [ast.literal_eval(line.split()[-1]) for line in last_lines[:2]]
    except (IndexError, ValueError, SyntaxError):
        print(f"Could not parse timing values in file: {filename}")
        return [None, None]

def process_folder(folder_path):
    """Collect timing results from every ``.stdout`` log in ``folder_path``.

    For each matching file, the run settings are taken from its name via
    ``parse_filename`` and the timing values from its contents via
    ``get_last_values``.

    Returns a pandas DataFrame built from one record per log file, with the
    keys ``algorithm``, ``dataset``, ``model``, ``time`` and ``avg_time``.
    """
    rows = []
    for name in os.listdir(folder_path):
        # Anything that is not a captured stdout log is ignored.
        if not name.endswith(".stdout"):
            continue
        timings = get_last_values(os.path.join(folder_path, name))
        algorithm, dataset, model = parse_filename(name)
        rows.append(
            dict(
                algorithm=algorithm,
                dataset=dataset,
                model=model,
                time=timings[0],
                avg_time=timings[1],
            )
        )
    return pd.DataFrame(rows)

# Directory that holds the ``*.stdout`` logs to summarise; edit as needed.
folder_path = (
    "/cluster/work/zhang/kamara/syntax-shap/checkpoints/computation_time/logs"
)
df = process_folder(folder_path)

# Average the remaining columns for each (model, dataset, algorithm) group.
# The result is not assigned; it is presumably shown as the cell's output.
df.groupby(by=["model", "dataset", "algorithm"]).mean()
