In [None]:
import sys
import pathlib
import pandas as pd
import numpy as np

# Put the parent of the current working directory on the import path
# (presumably so the `src.*` imports below resolve -- depends on where the
# notebook is launched from; confirm against the repo layout).
src = pathlib.Path.cwd().parent
sys.path.append(str(src))

from src.data.classification import TumorClassificationDataset
from src.enums import DataSplit

### Classification Dataset Label Distribution

In [None]:
# Tally how many samples of each class every non-validation split contains,
# then print the raw counts and the per-split percentage distribution.
counts = []
for split in DataSplit:
    # The validation split is left out of the summary.
    if split == DataSplit.VALIDATION:
        continue
    data = TumorClassificationDataset(src / 'datasets', split=split)
    # Each sample is unpacked as a (file_name, label) pair; only the label is
    # needed here. Iterating the samples directly (instead of unpacking
    # zip(*samples) into two names) keeps an empty split from raising.
    labels = [data.idx_to_class[label] for _, label in data.samples]

    # get distribution of labels, ordered by class name
    label_counts = (
        pd.Series(labels, name='label', dtype=object)
        .value_counts()
        .sort_index()
    )
    counts.append((split, label_counts))

# create one table with one row per label and one column per split; the
# column names come from the splits actually collected above, so they cannot
# drift from the loop's filter. sort=True keeps classes ordered even when a
# class is missing from one of the splits.
df = pd.concat(
    [split_counts for _, split_counts in counts],
    axis=1,
    keys=[counted_split.name for counted_split, _ in counts],
    sort=True,
)
# a class absent from a split shows up as NaN after the outer join -> count 0
df = df.fillna(0)
df = df.astype(int)

# create separate table (same layout) holding each label's share of its split
df_dist = df.div(df.sum(axis=0), axis=1)
distribution = df_dist.map(lambda x: f'{x:.2%}')

print(df)
print(distribution)