# Dataset Statistics

## Check Setup

In [1]:
from helpers_functions.setup import *

# Root folder of the dataset archive. Every other path below is derived from
# it, so pointing the notebook at a different location means editing this
# single line.
path_archive = "archive"

# Train/validation and test split lists.
path_train_val_list = path_archive + "/train_val_list_NIH.txt"
path_test_list = path_archive + "/test_list_NIH.txt"

# Metadata CSV; later cells pass it to the label-mapping helpers.
path_all_data_csv = path_archive + "/Data_Entry_2017.csv"

# Folder of image files; later cells pass it to the image-statistics helpers.
path_folder_images = path_archive + "/images-224/images-224"

### Check structure

In [2]:
import json

# Print the archive layout as indented JSON; max_depth=1 keeps the listing
# shallow so nested folders are not expanded.
path = path_archive
archive_tree = list_tree(path, max_depth=1)
print(json.dumps(archive_tree, indent=2))


{
  "_files": [
    "BBox_List_2017_Official_NIH.csv",
    "test_list_NIH.txt",
    "pretrained_model.h5",
    "train_val_list_NIH.txt",
    "Data_Entry_2017.csv"
  ],
  "images-224": {
    "images-224": "..."
  }
}


## Data Preprocessing

### Create class-label mappings

In [3]:
from helpers_functions.multi_hot import *

# Build the label-name -> index lookup and its inverse from the metadata CSV.
label_mappings = create_class_mappings(path_all_data_csv)
class_label_str_to_idx, class_label_idx_to_str = label_mappings

# Show both directions, name -> index first.
for mapping in (class_label_str_to_idx, class_label_idx_to_str):
    print(mapping)

{'Atelectasis': 0, 'Cardiomegaly': 1, 'Consolidation': 2, 'Edema': 3, 'Effusion': 4, 'Emphysema': 5, 'Fibrosis': 6, 'Hernia': 7, 'Infiltration': 8, 'Mass': 9, 'No Finding': 10, 'Nodule': 11, 'Pleural_Thickening': 12, 'Pneumonia': 13, 'Pneumothorax': 14}
{0: 'Atelectasis', 1: 'Cardiomegaly', 2: 'Consolidation', 3: 'Edema', 4: 'Effusion', 5: 'Emphysema', 6: 'Fibrosis', 7: 'Hernia', 8: 'Infiltration', 9: 'Mass', 10: 'No Finding', 11: 'Nodule', 12: 'Pleural_Thickening', 13: 'Pneumonia', 14: 'Pneumothorax'}


### Multi-hot encoding

In [4]:
# Requires class_label_str_to_idx (label name -> index), created by the
# class-label mapping cell above.
# Builds a lookup from image file name to its multi-hot label vector.
image_to_multihot = create_image_multihot_mapping_from_dicts(path_all_data_csv, class_label_str_to_idx)

# Sanity-check the first entry. Read it straight from the key iterator rather
# than copying every key into a list just to take element 0.
first_image = next(iter(image_to_multihot.keys()))
print(first_image, image_to_multihot[first_image])


00000001_000.png [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


## General stats

In [5]:
from helpers_functions.data_stats import *

# Summary statistics over the whole image folder, printed as indented JSON.
stats_general = analyze_image_folder(path_folder_images)
stats_general_json = json.dumps(stats_general, indent=2)
print(stats_general_json)


{
  "num_images": 112120,
  "shape_counts": {
    "(224, 224, 3)": 112120
  },
  "pixel_stats": {
    "min": 0.0,
    "max": 255.0,
    "mean": 127.01726531982422,
    "std": 63.28774642944336
  }
}


## Class-wise stats

In [6]:
# Same summary statistics, computed per class using the image -> multi-hot
# lookup; printed as indented JSON.
stats_classwise = analyze_image_folder_classwise(path_folder_images, image_to_multihot)
stats_classwise_json = json.dumps(stats_classwise, indent=2)
print(stats_classwise_json)

{
  "0": {
    "num_images": 11535,
    "shape_counts": {
      "(224, 224, 3)": 11535
    },
    "pixel_stats": {
      "min": 0.0,
      "max": 255.0,
      "mean": 127.17862701416016,
      "std": 62.6441535949707
    }
  },
  "1": {
    "num_images": 2772,
    "shape_counts": {
      "(224, 224, 3)": 2772
    },
    "pixel_stats": {
      "min": 0.0,
      "max": 255.0,
      "mean": 123.82685852050781,
      "std": 64.670654296875
    }
  },
  "2": {
    "num_images": 4667,
    "shape_counts": {
      "(224, 224, 3)": 4667
    },
    "pixel_stats": {
      "min": 0.0,
      "max": 255.0,
      "mean": 121.89340209960938,
      "std": 61.55959701538086
    }
  },
  "3": {
    "num_images": 2303,
    "shape_counts": {
      "(224, 224, 3)": 2303
    },
    "pixel_stats": {
      "min": 0.0,
      "max": 255.0,
      "mean": 120.49010467529297,
      "std": 58.52485275268555
    }
  },
  "4": {
    "num_images": 13307,
    "shape_counts": {
      "(224, 224, 3)": 13307
    },
    "pi