# Gather split statistics

Problem: Split statistics are stored in individual txt files (one file per fold).
To make them easier to use, it is convenient to gather all of this information into a single CSV file.

In [None]:
import pathlib
import re
import sys
sys.path.append(str(pathlib.Path().absolute().parent))

import pandas as pd

from src.util.definitions import DATA_ROOT

In [None]:
# read all the statistics files
#
# Every sub-directory of DATA_ROOT / "splits" whose name matches one of the
# patterns below is scanned for "fold<N>_statistics.txt" files. Each such file
# is expected to contain "key: value" lines and becomes one row (a dict) in
# `rows`, together with the split name and the fold number (kept as a string).

# (directory-name pattern, suffix appended to the split name); "_syn" marks
# the synthetic data set. Order matters: the first matching pattern wins.
split_patterns = (
    (re.compile(r"synferm_dataset_2023-12-20_(\dD)_split_*(.*)"), ""),
    (re.compile(r"synferm_dataset_2024-01-31_synthetic_(\dD)_split_*(.*)"), "_syn"),
)
fold_pattern = re.compile(r"fold(\d+)_statistics\.txt")

rows = []
for split in (DATA_ROOT / "splits").iterdir():
    if not split.is_dir():
        continue
    for pattern, suffix in split_patterns:
        match_split = pattern.search(split.name)
        if match_split:
            break
    else:
        # directory name matches none of the known split patterns
        continue
    # join the captured groups and drop the dangling "_" left by an empty group
    split_name = "_".join(match_split.groups()).strip("_") + suffix
    for file in split.iterdir():
        match = fold_pattern.search(file.name)
        if not match:
            continue
        content = {"split_name": split_name, "fold": match.group(1)}
        with open(file, "r") as f:
            for line in f:
                if not line.strip():
                    # tolerate blank (e.g. trailing) lines
                    continue
                # split on the first separator only, so values may contain ": ";
                # a non-blank line without a separator still raises ValueError
                key, value = line.strip("\n").split(": ", 1)
                content[key] = value
        rows.append(content)

In [None]:
# create df to hold all info
# The fold numbers in `rows` are strings (regex groups). Sort them by their
# integer value so that e.g. fold "10" is placed after fold "2" instead of
# directly after fold "1"; the stored column values are left untouched.
df = pd.DataFrame(rows).sort_values(
    by=["split_name", "fold"],
    key=lambda col: col.astype(int) if col.name == "fold" else col,
    ignore_index=True,
)
df

In [None]:
# write the combined statistics table into the splits directory for later use
# (the DataFrame index is not written to the file)
csv_path = DATA_ROOT / "splits" / "split_statistics_2023-12-20.csv"
df.to_csv(csv_path, index=False)

In [None]:
# display only the rows whose split name is exactly "2D"
is_2d_split = df["split_name"] == "2D"
df.loc[is_2d_split]