# Gather split statistics

Problem: Split statistics are held in individual txt-files. 
To make use of them more easily, it would be convenient to gather all this information into a single CSV file.

In [1]:
import pathlib
import re
import sys
# NOTE: "__file__" is a quoted string here, not the module's __file__ variable, so the
# path is resolved against the current working directory and parents[1] is the parent
# of the working directory. Presumably that is the repository root, which makes the
# `src` package imported below importable -- TODO confirm.
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

import pandas as pd

from src.util.definitions import DATA_ROOT

In [4]:
# Read all the statistics files.
#
# Every directory below DATA_ROOT / "splits" whose name matches the split pattern
# is scanned for files named "fold<k>_statistics.txt". Each such file is expected
# to consist of "key: value" lines and is turned into one dict (one future table
# row) holding the split name, the fold number and all key/value pairs of the file.
split_dir_pattern = re.compile(r"synferm_dataset_2023-09-05_(\dD)_split_*(.*)")
statistics_file_pattern = re.compile(r"fold(\d+)_statistics\.txt")

rows = []
for split in (DATA_ROOT / "splits").iterdir():
    match_split = split_dir_pattern.search(split.name)
    if not (split.is_dir() and match_split):
        continue
    # Join the dimensionality and the optional suffix, e.g. "3D" + "80" -> "3D_80";
    # an empty suffix leaves a trailing underscore that is stripped again ("0D").
    split_name = "_".join(match_split.groups()).strip("_")
    for file in split.iterdir():
        match = statistics_file_pattern.search(file.name)
        if not match:
            continue
        # The fold is kept as the digit string found in the file name.
        content = {"split_name": split_name, "fold": match.group(1)}
        with open(file, "r") as f:
            for line in f:
                # Split on the first ": " only, so a value that itself contains
                # ": " stays intact instead of breaking the unpacking.
                key, separator, value = line.strip("\n").partition(": ")
                if not separator:
                    # Blank line or line without a "key: value" pair: nothing to record.
                    continue
                content[key] = value
        rows.append(content)

In [5]:
# create df to hold all info
# "fold" holds digit strings taken from the file names, so it is compared by its
# integer value while sorting; a plain string sort would place fold "10" before
# fold "2". Only the sort order is affected, the stored values stay untouched.
df = pd.DataFrame(rows).sort_values(
    by=["split_name", "fold"],
    key=lambda col: col.astype(int) if col.name == "fold" else col,
    ignore_index=True,
)
df

Unnamed: 0,split_name,fold,Train samples,Val samples,Test samples,Not used,Train samples binary_A has label 1,Train samples binary_B has label 1,Train samples binary_C has label 1,Val samples binary_A has label 1,...,Chance level average precision macro on test set,Train initiators,Val initiators,Test initiators,Train monomers,Val monomers,Test monomers,Train terminators,Val terminators,Test terminators
0,0D,0,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26042 (81.3%),18286 (57.1%),9105 (28.4%),3224 (80.6%),...,0.548,,,,,,,,,
1,0D,1,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26049 (81.4%),18293 (57.1%),9079 (28.4%),3262 (81.5%),...,0.545,,,,,,,,,
2,0D,2,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26007 (81.2%),18255 (57.0%),9060 (28.3%),3263 (81.5%),...,0.558,,,,,,,,,
3,0D,3,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26046 (81.4%),18292 (57.1%),9019 (28.2%),3242 (81.0%),...,0.551,,,,,,,,,
4,0D,4,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26031 (81.3%),18185 (56.8%),9066 (28.3%),3229 (80.7%),...,0.562,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,3D_80,4,19749 (49.4%),34 (0.1%),55 (0.1%),20180 (50.4%),15754 (79.8%),11536 (58.4%),5464 (27.7%),30 (88.2%),...,0.545,53,6,7,56,8,8,32,4,5
339,3D_80,5,19761 (49.4%),42 (0.1%),52 (0.1%),20163 (50.4%),15798 (79.9%),10907 (55.2%),5512 (27.9%),33 (78.6%),...,0.615,53,7,7,56,8,7,32,4,5
340,3D_80,6,21450 (53.6%),29 (0.1%),37 (0.1%),18502 (46.2%),17467 (81.4%),12645 (59.0%),6413 (29.9%),29 (100.0%),...,0.414,53,6,4,56,7,6,32,4,5
341,3D_80,7,20481 (51.2%),49 (0.1%),71 (0.2%),19417 (48.5%),16946 (82.7%),12236 (59.7%),5895 (28.8%),37 (75.5%),...,0.469,53,7,7,56,7,7,32,4,5


In [6]:
# write the combined table (without the index column) into the splits folder for later use
statistics_csv = DATA_ROOT / "splits" / "split_statistics.csv"
df.to_csv(statistics_csv, index=False)

In [8]:
# show only the rows that belong to the "0D" split
df.loc[df["split_name"].eq("0D")]

Unnamed: 0,split_name,fold,Train samples,Val samples,Test samples,Not used,Train samples binary_A has label 1,Train samples binary_B has label 1,Train samples binary_C has label 1,Val samples binary_A has label 1,...,Chance level average precision macro on test set,Train initiators,Val initiators,Test initiators,Train monomers,Val monomers,Test monomers,Train terminators,Val terminators,Test terminators
0,0D,0,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26042 (81.3%),18286 (57.1%),9105 (28.4%),3224 (80.6%),...,0.548,,,,,,,,,
1,0D,1,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26049 (81.4%),18293 (57.1%),9079 (28.4%),3262 (81.5%),...,0.545,,,,,,,,,
2,0D,2,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26007 (81.2%),18255 (57.0%),9060 (28.3%),3263 (81.5%),...,0.558,,,,,,,,,
3,0D,3,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26046 (81.4%),18292 (57.1%),9019 (28.2%),3242 (81.0%),...,0.551,,,,,,,,,
4,0D,4,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26031 (81.3%),18185 (56.8%),9066 (28.3%),3229 (80.7%),...,0.562,,,,,,,,,
5,0D,5,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26015 (81.3%),18231 (56.9%),9061 (28.3%),3255 (81.3%),...,0.558,,,,,,,,,
6,0D,6,32014 (80.0%),4002 (10.0%),4002 (10.0%),,25995 (81.2%),18269 (57.1%),9026 (28.2%),3265 (81.6%),...,0.562,,,,,,,,,
7,0D,7,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26053 (81.4%),18372 (57.4%),9047 (28.3%),3231 (80.7%),...,0.553,,,,,,,,,
8,0D,8,32014 (80.0%),4002 (10.0%),4002 (10.0%),,26020 (81.3%),18236 (57.0%),9137 (28.5%),3227 (80.6%),...,0.555,,,,,,,,,
