# Gather split statistics

Problem: Split statistics are held in individual txt-files. 
To make use of them more easily, it would be convenient to gather all this information into a single CSV file.

In [5]:
import pathlib
import re
import sys
sys.path.append(str(pathlib.Path("__file__").absolute().parents[1]))

import pandas as pd

from src.util.definitions import DATA_ROOT

In [6]:
# Read all the statistics files and collect one record (dict) per fold.
#
# Only sub-directories of DATA_ROOT / "splits" whose name matches
# "synferm_dataset_2023-12-20_<digit>D_split[_<suffix>]" are considered; the
# dimensionality tag and the optional suffix are joined into `split_name`
# (e.g. "0D_0.625", or just "1D" when there is no suffix).
# Inside each such directory, every file matching "fold<N>_statistics.txt" is
# parsed as "key: value" lines.
rows = []
for split in (DATA_ROOT / "splits").iterdir():
    match_split = re.search(r"synferm_dataset_2023-12-20_(\dD)_split_*(.*)", split.name)
    if not (split.is_dir() and match_split):
        continue
    # strip("_") drops the trailing underscore left by an empty suffix group
    split_name = "_".join(match_split.groups()).strip("_")
    for file in split.iterdir():
        match = re.search(r"fold(\d+)_statistics\.txt", file.name)
        if not match:
            continue
        # Store the fold index as an int so that sorting by "fold" is numeric
        # rather than lexicographic ("10" would otherwise sort before "2").
        content = {"split_name": split_name, "fold": int(match.group(1))}
        with open(file, "r") as f:
            for line in f:
                # partition() splits on the first ": " only, so values that
                # themselves contain ": " are preserved intact; lines without
                # a separator (e.g. blank lines) are skipped instead of
                # raising on tuple unpacking.
                key, sep, value = line.rstrip("\n").partition(": ")
                if sep:
                    content[key] = value
        rows.append(content)

In [7]:
# Build a single table from the collected records, then order it by split
# name and fold; ignore_index=True renumbers the rows 0..n-1 after sorting.
df = pd.DataFrame(rows)
df = df.sort_values(by=["split_name", "fold"], ignore_index=True)
df

Unnamed: 0,split_name,fold,Train samples,Val samples,Test samples,Train samples binary_A has label 1,Train samples binary_B has label 1,Train samples binary_C has label 1,Val samples binary_A has label 1,Val samples binary_B has label 1,...,Train initiators,Val initiators,Test initiators,Train monomers,Val monomers,Test monomers,Train terminators,Val terminators,Test terminators,Not used
0,0D_0.625,0,246 (0.6%),19743 (50.0%),19497 (49.4%),195 (79.3%),123 (50.0%),67 (27.2%),16267 (82.4%),11439 (57.9%),...,,,,,,,,,,
1,0D_0.625,1,246 (0.6%),19743 (50.0%),19497 (49.4%),197 (80.1%),145 (58.9%),77 (31.3%),16227 (82.2%),11458 (58.0%),...,,,,,,,,,,
2,0D_0.625,2,246 (0.6%),19743 (50.0%),19497 (49.4%),203 (82.5%),148 (60.2%),78 (31.7%),16201 (82.1%),11430 (57.9%),...,,,,,,,,,,
3,0D_0.625,3,246 (0.6%),19743 (50.0%),19497 (49.4%),197 (80.1%),141 (57.3%),65 (26.4%),16214 (82.1%),11456 (58.0%),...,,,,,,,,,,
4,0D_0.625,4,246 (0.6%),19743 (50.0%),19497 (49.4%),204 (82.9%),120 (48.8%),56 (22.8%),16223 (82.2%),11448 (58.0%),...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,3D_80,4,19544 (49.5%),43 (0.1%),26 (0.1%),16373 (83.8%),11539 (59.0%),5475 (28.0%),29 (67.4%),17 (39.5%),...,53,7,4,48,7,8,32,4,4,19873 (50.3%)
330,3D_80,5,19795 (50.1%),38 (0.1%),39 (0.1%),16226 (82.0%),10824 (54.7%),5663 (28.6%),35 (92.1%),16 (42.1%),...,53,7,7,48,7,8,32,4,5,19614 (49.7%)
331,3D_80,6,18709 (47.4%),48 (0.1%),68 (0.2%),15472 (82.7%),10854 (58.0%),4864 (26.0%),44 (91.7%),37 (77.1%),...,53,7,7,47,6,9,32,4,5,20661 (52.3%)
332,3D_80,7,19819 (50.2%),49 (0.1%),46 (0.1%),15976 (80.6%),11722 (59.1%),6187 (31.2%),47 (95.9%),29 (59.2%),...,53,6,7,48,7,7,32,4,5,19572 (49.6%)


In [9]:
# Write the combined statistics table next to the split directories for later
# use; the row index is not written to the file.
output_path = DATA_ROOT / "splits" / "split_statistics_2023-12-20.csv"
df.to_csv(output_path, index=False)

In [8]:
# Show only the rows that belong to the "1D" split
is_1d_split = df["split_name"] == "1D"
df.loc[is_1d_split]

Unnamed: 0,split_name,fold,Train samples,Val samples,Test samples,Train samples binary_A has label 1,Train samples binary_B has label 1,Train samples binary_C has label 1,Val samples binary_A has label 1,Val samples binary_B has label 1,...,Train initiators,Val initiators,Test initiators,Train monomers,Val monomers,Test monomers,Train terminators,Val terminators,Test terminators,Not used
73,1D,0,27196 (68.9%),5672 (14.4%),6618 (16.8%),22402 (82.4%),15429 (56.7%),7746 (28.5%),4277 (75.4%),3422 (60.3%),...,,,,,,,,,,
74,1D,1,25001 (63.3%),8543 (21.6%),5942 (15.0%),20347 (81.4%),14227 (56.9%),6881 (27.5%),7424 (86.9%),5302 (62.1%),...,,,,,,,,,,
75,1D,2,28990 (73.4%),5536 (14.0%),4960 (12.6%),24239 (83.6%),16853 (58.1%),8631 (29.8%),4214 (76.1%),3305 (59.7%),...,,,,,,,,,,
76,1D,3,25495 (64.6%),9037 (22.9%),4954 (12.5%),20427 (80.1%),14342 (56.3%),6415 (25.2%),8317 (92.0%),5985 (66.2%),...,,,,,,,,,,
77,1D,4,27626 (70.0%),6433 (16.3%),5427 (13.7%),22505 (81.5%),15898 (57.5%),7994 (28.9%),5115 (79.5%),3473 (54.0%),...,,,,,,,,,,
78,1D,5,26904 (68.1%),6701 (17.0%),5881 (14.9%),22187 (82.5%),15431 (57.4%),7698 (28.6%),5309 (79.2%),3951 (59.0%),...,,,,,,,,,,
79,1D,6,28292 (71.7%),5161 (13.1%),6033 (15.3%),22839 (80.7%),16777 (59.3%),8612 (30.4%),4477 (86.7%),2222 (43.1%),...,,,,,,,,,,
80,1D,7,27420 (69.4%),5605 (14.2%),6461 (16.4%),23560 (85.9%),18246 (66.5%),9601 (35.0%),3992 (71.2%),1341 (23.9%),...,,,,,,,,,,
81,1D,8,28020 (71.0%),5449 (13.8%),6017 (15.2%),22754 (81.2%),16454 (58.7%),7849 (28.0%),4982 (91.4%),3519 (64.6%),...,,,,,,,,,,
