In [None]:
import pandas as pd
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import sys
from torchvision.transforms import ToTensor

# Apply the 'seaborn-v0_8-darkgrid' matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8-darkgrid')


# The project root is taken to be the parent of the current working directory
# (i.e. the notebook is expected to be launched from a folder directly under it).
root = pathlib.Path.cwd().parent
DATASET_PATH = root.joinpath('datasets')

# Put the project root on the import path so the `src.*` imports that follow resolve.
sys.path.append(str(root))

from src.data.segmentation import LGGSegmentationDataset
from src.utils.visualize import plot_images_and_masks
from src.enums import DataSplit
from src.utils.transforms import DualInputTransform


### LGG Dataset Exploration

This section looks specifically at class imbalance between the `Tumor` and `No-Tumor` diagnoses in the train and test splits.

In [None]:
LGG_PATH = DATASET_PATH / 'lgg-mri-segmentation'

# Human-readable names for the integer codes stored in the Diagnosis column.
diagnosis_labels = {0: 'No-Tumor', 1: 'Tumor'}

# Per-split metadata tables.
train_info = pd.read_csv(LGG_PATH / 'train' / 'train.csv')
test_info = pd.read_csv(LGG_PATH / 'test' / 'test.csv')

# Replace the 0/1 codes with their labels in both tables
# (any value outside the mapping becomes NaN, as with a plain Series.map).
for split_info in (train_info, test_info):
    split_info['Diagnosis'] = split_info['Diagnosis'].map(diagnosis_labels)

In [None]:
# Preview the first rows of the training metadata (Diagnosis already mapped to string labels).
train_info.head()

In [None]:
# Preview the first rows of the test metadata (Diagnosis already mapped to string labels).
test_info.head()

In [None]:
# Number of rows per diagnosis label in each split.
train_counts = train_info['Diagnosis'].value_counts()
test_counts = test_info['Diagnosis'].value_counts()

splits = ['Train', 'Test']

# One [train, test] pair of counts per diagnosis label.
# .get(label, 0) keeps the cell working if a label is absent from a split.
stacks = {
    label: [train_counts.get(label, 0), test_counts.get(label, 0)]
    for label in ('No-Tumor', 'Tumor')
}

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
width = 0.5

# Stacked bars: 'No-Tumor' at the bottom, 'Tumor' stacked on top of it.
ax.bar(splits, stacks['No-Tumor'], color='orange', alpha=0.7, linewidth=2, label='No-Tumor', width=width)
ax.bar(splits, stacks['Tumor'], color='blue', alpha=0.7, linewidth=2, label='Tumor', bottom=stacks['No-Tumor'], width=width)
ax.set_title('Diagnosis Distribution')
ax.set_ylabel('Count')
# The x axis enumerates the dataset splits; the diagnosis classes are the
# stacked colours described by the legend.
ax.set_xlabel('Split')
ax.legend()
plt.show()

In [None]:
# Share of each diagnosis label within each split.
#
# The table is built from the labelled Series themselves so pandas aligns the
# values on the label. value_counts() orders its result by frequency, so going
# through a bare NumPy array with a hard-coded index would attach the wrong
# label whenever 'Tumor' is the majority class or the two splits are ordered
# differently.
class_order = ['No-Tumor', 'Tumor']
distribution = (
    pd.DataFrame({
        'Train': train_counts / train_counts.sum(),
        'Test': test_counts / test_counts.sum(),
    })
    .reindex(class_order)  # fixed, explicit row order
    .fillna(0.0)           # a label absent from a split has a 0% share
    .rename_axis(None)     # drop the index name carried over from value_counts()
)

# show df as percentage str
distribution = distribution.map(lambda x: f'{x:.2%}')
distribution

In [None]:
N_SAMPLES = 6
# Fixed seed so that Restart & Run All displays the same examples every time.
SEED = 42

# choose N_SAMPLES random images per class from the train set
# Only the row labels of the sampled rows are needed, so no column is selected first.
tumor_samples = train_info[train_info['Diagnosis'] == 'Tumor'].sample(N_SAMPLES, random_state=SEED).index.values
no_tumor_samples = train_info[train_info['Diagnosis'] == 'No-Tumor'].sample(N_SAMPLES, random_state=SEED).index.values

dataset = LGGSegmentationDataset(DATASET_PATH, split=DataSplit.TRAIN)

# NOTE(review): presumably applies ToTensor to both the image and the mask - confirm.
transformer = DualInputTransform(ToTensor())

# NOTE(review): the row labels of train_info are passed straight to the dataset as
# indices; this assumes train.csv row order matches the dataset's TRAIN ordering - confirm.
# `_get_image_mask` is a private helper of the dataset; its result is unpacked into the transformer.
pairs_tumor = [transformer(*dataset._get_image_mask(i)) for i in tumor_samples]
pairs_no_tumor = [transformer(*dataset._get_image_mask(i)) for i in no_tumor_samples]


### Positive Images and Masks

In [None]:
# Unzip the transformed tumor pairs into one tuple of images and one tuple of masks.
tumor_images, tumor_masks = tuple(zip(*pairs_tumor))

# Plot the tumor samples with the overlay option switched on.
plot_images_and_masks(tumor_images, tumor_masks, include_overlay=True)

### Negative Images and Masks

In [None]:
# Unzip the transformed no-tumor pairs into one tuple of images and one tuple of masks.
no_tumor_images, no_tumor_masks = tuple(zip(*pairs_no_tumor))

# Plot the no-tumor samples with the overlay option switched off.
plot_images_and_masks(no_tumor_images, no_tumor_masks, include_overlay=False)