In [51]:
import os
import re
import pandas as pd

data_dir = '../../src/pnlp/ESM_TL/logs'

for model_folder in os.listdir(data_dir):
    model_folder_path = os.path.join(data_dir, model_folder)
    if os.path.isdir(model_folder_path):  # Check if the path is a directory

        for version_folder in os.listdir(model_folder_path):
            version_folder_path = os.path.join(model_folder_path, version_folder)
            if os.path.isdir(version_folder_path):

                for file in os.listdir(version_folder_path):
                    file_path = os.path.join(version_folder_path, file)
                    if file == 'metrics.csv':
                        
                        # Extract data from metrics file
                        df = pd.read_csv(file_path, sep=',', header=0)

                        # Merge rows for each epoch and keep 'epoch' as a column
                        merged_df = df.groupby("epoch").ffill().bfill().drop_duplicates().reset_index()

                        # Ensure 'epoch' is correctly named and exists
                        if "epoch" not in merged_df.columns:
                            merged_df.rename(columns={"index": "epoch"}, inplace=True)  # Rename if it was reset incorrectly

                        # Divide epoch by 2 to correct numbering
                        merged_df.loc[:, "epoch"] = (merged_df["epoch"] // 2).astype(int)

                        # Ensure sorting is correct
                        merged_df = merged_df.sort_values("epoch").reset_index(drop=True)

                        if model_folder.endswith("be"):
                            # Get best (min) rmse row
                            best_rmse_row = merged_df.loc[merged_df["val_be_rmse"].idxmin()]
                            epoch = best_rmse_row["epoch"]
                            rmse = best_rmse_row["val_be_rmse"]
                            b_rmse = best_rmse_row["val_binding_rmse"]
                            e_rmse = best_rmse_row["val_expression_rmse"]
                            print(f"{model_folder}-{version_folder.split('_')[1]}: \n\t(best be) epoch {int(epoch)}, rmse {rmse:.4f}; b_rmse {b_rmse:.4f}, e_rmse {e_rmse:.4f}")

                            best_binding_rmse_row = merged_df.loc[merged_df["val_binding_rmse"].idxmin()]
                            epoch = best_rmse_row["epoch"]
                            b_rmse = best_rmse_row["val_binding_rmse"]
                            print(f"\t(best binding) epoch {int(epoch)}, b_rmse {b_rmse:.4f}")

                            best_expression_rmse_row = merged_df.loc[merged_df["val_expression_rmse"].idxmin()]
                            epoch = best_rmse_row["epoch"]
                            e_rmse = best_rmse_row["val_expression_rmse"]
                            print(f"\t(best expression) epoch {int(epoch)}, e_rmse {e_rmse:.4f}")

                        else:
                            # Get best (min) rmse row
                            best_rmse_row = merged_df.loc[merged_df["val_rmse"].idxmin()]
                            epoch = best_rmse_row["epoch"]
                            rmse = best_rmse_row["val_rmse"]
                            print(f"{model_folder}-{version_folder.split('_')[1]}: \n\t(best) epoch {int(epoch)}, rmse {rmse:.4f}")
                        



esm_blstm-21774790: 
	(best) epoch 687, rmse 0.3497
esm_blstm-21774791: 
	(best) epoch 999, rmse 0.4444
esm_fcn-21774809: 
	(best) epoch 236, rmse 0.5097
esm_fcn-21774811: 
	(best) epoch 920, rmse 0.3277
esm_gcn-21774823: 
	(best) epoch 306, rmse 1.1718
esm_gcn-21774824: 
	(best) epoch 499, rmse 0.6067
esm_blstm_be-21774745: 
	(best be) epoch 801, rmse 0.6693; b_rmse 0.5647, e_rmse 0.3593
	(best binding) epoch 801, b_rmse 0.5647
	(best expression) epoch 801, e_rmse 0.3593
