# Create metadata_csv and test_csv

This notebook builds the metadata CSV and a balanced test-set CSV from the NIH chest X-ray data, for use in zero-shot classification with pretrained models from TorchXRayVision.

## Setup and Imports

In [1]:
# Colab-specific setup: mount Google Drive and switch the working directory
# to the project folder on the shared drive. These lines are active; comment
# them out when running outside Colab.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/Shareddrives/CS231N/chestxray-classification

Mounted at /content/drive
/content/drive/Shareddrives/CS231N/chestxray-classification


In [None]:
# Install dependencies (uncomment if needed)
!pip install torchxrayvision python-dotenv scikit-learn matplotlib tqdm --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.0/29.0 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Standard imports
import os
import sys
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm
import torch
import torchxrayvision as xrv
from PIL import Image


  from tqdm.autonotebook import tqdm


In [None]:
# Copy the project from the Drive folder to a working copy outside the Drive
# mount (skipping the ignored folders) and make that copy importable.
import shutil

src = "/content/drive/Shareddrives/CS231N/chestxray-classification"
dst = "/content/chestxray-classification"

shutil.copytree(
    src,
    dst,
    ignore=shutil.ignore_patterns(
        "assignment4", ".git", "__pycache__", "*.ipynb_checkpoints"
    ),
    dirs_exist_ok=True
)
# Reuse `dst` rather than repeating the literal, and skip the insert when the
# cell is re-run so sys.path does not accumulate duplicate entries.
if dst not in sys.path:
    sys.path.insert(0, dst)


In [None]:
# Import custom modules
from utils.config import Config
from models.xray_models_load import load_models

In [None]:
# Configure root logging for the notebook: INFO and above, timestamped, and
# written to stdout so records appear in the cell output. force=True replaces
# any handlers that were attached to the root logger earlier.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    force=True,
)

# Logger used by the cells of this notebook
logger = logging.getLogger("xray_evaluation")

## Configuration

In [None]:
# Echo the active configuration so wrong paths are spotted before any work starts
print(
    f"Data path: {Config.data_path}",
    f"Image path: {Config.image_path}",
    f"CSV path: {Config.csv_path}",
    f"Output path: {Config.output_path}",
    f"Using device: {Config.device}",
    f"Target labels: {Config.target_labels}",
    sep="\n",
)

Data path: /content/drive/Shareddrives/CS231N/assignment4/cs231n/datasets/nih-chestxray
Image path: /content/drive/Shareddrives/CS231N/assignment4/cs231n/datasets/nih-chestxray/images
CSV path: /content/drive/Shareddrives/CS231N/assignment4/cs231n/datasets/nih-chestxray/Data_Entry_2017_v2020.csv
Output path: results
Using device: cpu
Target labels: ['Cardiomegaly', 'Atelectasis', 'Effusion', 'Pneumothorax']


In [None]:
# Optional overrides: uncomment and edit these lines when the paths printed
# by the configuration check do not match where the data lives.
# Config.data_path = "/path/to/data"
# Config.image_path = os.path.join(Config.data_path, "images")
# Config.csv_path = os.path.join(Config.data_path, "Data_Entry_2017_v2020.csv")

# Create the output directory (no error if it already exists)
os.makedirs(Config.output_path, exist_ok=True)

## Load Models

In [None]:
# Print every key of xrv.models.model_urls, one per line
print("Available pretrained models:")
for model_name in xrv.models.model_urls:
    print(f"- {model_name}")

Available pretrained models:
- all
- densenet121-res224-all
- nih
- densenet121-res224-nih
- pc
- densenet121-res224-pc
- chex
- densenet121-res224-chex
- rsna
- densenet121-res224-rsna
- mimic_nb
- densenet121-res224-mimic_nb
- mimic_ch
- densenet121-res224-mimic_ch
- resnet50-res512-all


In [None]:
# Load the two models and the pathology list through the project helper
# load_models (imported from models.xray_models_load), on the configured device.
model_mimic, model_chex, pathologies = load_models(Config.device)

2025-05-21 18:26:13 - INFO - models.xray_models_load - Loading DenseNet121 pretrained on MIMIC-CXR...
Downloading weights...
If this fails you can run `wget https://github.com/mlmed/torchxrayvision/releases/download/v1/mimic_ch-densenet121-d121-tw-lr001-rot45-tr15-sc15-seed0-best.pt -O /root/.torchxrayvision/models_data/mimic_ch-densenet121-d121-tw-lr001-rot45-tr15-sc15-seed0-best.pt`
[██████████████████████████████████████████████████]
2025-05-21 18:26:15 - INFO - models.xray_models_load - Loading DenseNet121 pretrained on CheXpert...
Downloading weights...
If this fails you can run `wget https://github.com/mlmed/torchxrayvision/releases/download/v1/chex-densenet121-d121-tw-lr001-rot45-tr15-sc15-seed0-best.pt -O /root/.torchxrayvision/models_data/chex-densenet121-d121-tw-lr001-rot45-tr15-sc15-seed0-best.pt`
[██████████████████████████████████████████████████]
2025-05-21 18:26:16 - INFO - models.xray_models_load - Models loaded successfully with 18 disease classes


In [None]:
# Display pathologies: print each entry of `pathologies` with its list position.
# Some positions print an empty name in the recorded output.
print("\nSupported pathologies:")
for i, pathology in enumerate(pathologies):
    print(f"{i}: {pathology}")

# Highlight target pathologies: show where each configured target label sits
# in `pathologies` (list.index raises ValueError if a label is absent).
print("\nTarget pathologies:")
for label in Config.target_labels:
    idx = pathologies.index(label)
    print(f"{idx}: {label}")


Supported pathologies:
0: Atelectasis
1: Consolidation
2: 
3: Pneumothorax
4: Edema
5: 
6: 
7: Effusion
8: Pneumonia
9: 
10: Cardiomegaly
11: 
12: 
13: 
14: Lung Lesion
15: Fracture
16: Lung Opacity
17: Enlarged Cardiomediastinum

Target pathologies:
10: Cardiomegaly
0: Atelectasis
7: Effusion
3: Pneumothorax


## Load and Prepare Data

In [None]:
def encode_labels(row, all_labels):
    """
    Turn a row's pipe-separated findings into a 0/1 vector

    Parameters:
    row (pandas.Series): DataFrame row containing 'Finding Labels'
    all_labels (list): List of all possible labels

    Returns:
    list: One int per entry of all_labels; 1 when that label is among the
        row's findings, otherwise 0
    """
    present = set(row["Finding Labels"].split("|"))
    encoding = []
    for candidate in all_labels:
        encoding.append(1 if candidate in present else 0)
    return encoding

def load_and_prepare_data(config, pathologies):
    """
    Load the metadata CSV, keep rows whose image is on disk, and encode labels

    Parameters:
    config (Config): Configuration object; csv_path, image_path, output_path
        and target_labels are read
    pathologies (list): List of all pathologies supported by models

    Returns:
    tuple: (metadata_df, label_indices). metadata_df is the filtered metadata
        with the added columns 'encoded_labels', 'encoded_array' and
        'is_all_target_negative'; label_indices holds the position of each
        target label inside pathologies.

    Raises:
    Exception: any error raised while loading or preparing is logged and
        re-raised unchanged

    Side effects:
    Writes the prepared frame to <config.output_path>/nih_metadata.csv
    """
    try:
        logger.info(f"Loading metadata from {config.csv_path}")
        metadata_df = pd.read_csv(config.csv_path)
        logger.info(f"Raw metadata contains {len(metadata_df)} entries")

        # Get available images
        available_images = {f for f in os.listdir(config.image_path) if f.endswith(".png")}
        logger.info(f"Found {len(available_images)} PNG images in directory")

        # Filter to match available images. The explicit copy makes the column
        # assignments below write to an independent frame instead of a slice
        # of the raw metadata (avoids SettingWithCopyWarning).
        metadata_df = metadata_df[metadata_df["Image Index"].isin(available_images)].copy()
        logger.info(f"Filtered to {len(metadata_df)} entries with available images")

        # Encode labels based on pathologies list
        metadata_df["encoded_labels"] = metadata_df.apply(
            lambda row: encode_labels(row, pathologies), axis=1
        )
        metadata_df["encoded_array"] = metadata_df["encoded_labels"].apply(np.array)

        # Find indices for target labels
        label_indices = [pathologies.index(label) for label in config.target_labels]

        # Helper to check if all target diseases are negative
        def is_all_target_negative(row):
            return sum(row[label_indices]) == 0

        metadata_df["is_all_target_negative"] = metadata_df["encoded_array"].apply(
            is_all_target_negative
        )

        # Make sure the output directory exists before writing, so the
        # function does not depend on an earlier cell having created it.
        os.makedirs(config.output_path, exist_ok=True)
        metadata_df.to_csv(os.path.join(config.output_path, "nih_metadata.csv"), index=False)
        return metadata_df, label_indices

    except Exception as e:
        logger.error(f"Failed to prepare data: {e}")
        raise

In [None]:
# Load and prepare data: build the filtered, label-encoded metadata table and
# the positions of the target labels inside `pathologies`
metadata_df, label_indices = load_and_prepare_data(Config, pathologies)

2025-05-21 18:30:42 - INFO - xray_evaluation - Loading metadata from /content/drive/Shareddrives/CS231N/assignment4/cs231n/datasets/nih-chestxray/Data_Entry_2017_v2020.csv
2025-05-21 18:30:42 - INFO - xray_evaluation - Raw metadata contains 112120 entries
2025-05-21 18:31:30 - INFO - xray_evaluation - Found 54999 PNG images in directory
2025-05-21 18:31:30 - INFO - xray_evaluation - Filtered to 54999 entries with available images


In [2]:
!pip install python-dotenv
import os
from dotenv import load_dotenv

load_dotenv(".env")  # loads GH_TOKEN into os.environ

# Now safely retrieve the token
token = os.getenv("GH_TOKEN")
username = "havindh"
repo = "chestxray-classification"
push_url = f"https://{username}:{token}@github.com/{username}/{repo}.git"

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [3]:
# Check the state of the working tree before staging anything
!git status

Refresh index: 100% (21/21), done.
On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

It took 2.72 seconds to compute the branch ahead/behind values.
You can use '--no-ahead-behind' to avoid this.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   assignment4[m
	[31mmodified:   notebooks/create_metadata_csv.ipynb[m
	[31mmodified:   notebooks/xray_classification_notebook.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Stage the create_metadata_csv notebook and everything under results/
!git add notebooks/create_metadata_csv.ipynb
!git add results/


In [None]:
# Re-check the working tree, then stage the create_metadata_csv notebook
!git status
!git add notebooks/create_metadata_csv.ipynb

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   assignment4[m
	[31mmodified:   notebooks/create_metadata_csv.ipynb[m
	[31mmodified:   notebooks/xray_classification_notebook.ipynb[m

no changes added to commit (use "git add" and/or "git commit -a")


In [None]:
# Commit the staged changes
!git commit -m "Edited metadata csv notebook to get test csv"

[main c3f11cc] Edited metadata csv notebook to get test csv
 1 file changed, 1 insertion(+), 1 deletion(-)


In [None]:
# Push main through the token-bearing URL held in push_url.
# NOTE(review): the token is passed on the command line via push_url; confirm
# it is never echoed into saved cell output before sharing this notebook.
!git push {push_url} main

Enumerating objects: 7, done.
Counting objects:  14% (1/7)Counting objects:  28% (2/7)Counting objects:  42% (3/7)Counting objects:  57% (4/7)Counting objects:  71% (5/7)Counting objects:  85% (6/7)Counting objects: 100% (7/7)Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects:  25% (1/4)Compressing objects:  50% (2/4)Compressing objects:  75% (3/4)Compressing objects: 100% (4/4)Compressing objects: 100% (4/4), done.
Writing objects:  25% (1/4)Writing objects:  50% (2/4)Writing objects:  75% (3/4)Writing objects: 100% (4/4)Writing objects: 100% (4/4), 865 bytes | 108.00 KiB/s, done.
Total 4 (delta 3), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/havindh/chestxray-classification.git
   3f0598e..c3f11cc  main -> main


In [None]:
# Inspect the column names of the prepared metadata table
metadata_df.columns

In [None]:
# Preview the first rows of the prepared metadata table
metadata_df.head()

In [None]:
def get_test_set(metadata_df, label_indices, config):
    """Return the balanced test set; thin wrapper around create_balanced_test_set."""
    return create_balanced_test_set(metadata_df, label_indices, config)

def create_balanced_test_set(metadata_df, label_indices, config):
    """
    Create a balanced test set with same number of samples per disease

    For every target label, up to config.samples_per_disease positive rows are
    drawn, and up to the same number of rows that are negative for all target
    labels is added. A row drawn more than once (it can be positive for
    several labels) is kept only once, so the result can be smaller than the
    sum of the draws.

    Parameters:
    metadata_df (pandas.DataFrame): DataFrame with 'Image Index',
        'encoded_array' and 'is_all_target_negative' columns
    label_indices (list): Indices of target labels inside each encoded array
    config (Config): Configuration object; target_labels, samples_per_disease,
        random_seed and output_path are read

    Returns:
    pandas.DataFrame: Balanced test set with one extra column per target label

    Side effects:
    Writes the 'Image Index' column of the result to
    <config.output_path>/nih_test_set_balanced.csv
    """
    # Helper to check if a disease is positive
    def is_positive(label_index):
        return metadata_df["encoded_array"].apply(lambda x: x[label_index] == 1)

    # Sample positive examples for each disease. The sample size and the seed
    # come from config, exactly like the negative sampling below, instead of
    # the previously hard-coded 400 / 42.
    positive_samples = []
    for idx, _label in zip(label_indices, config.target_labels):
        positives = metadata_df[is_positive(idx)]
        # Handle case where there are fewer positives than requested
        n_samples = min(config.samples_per_disease, len(positives))
        positive_samples.append(
            positives.sample(n=n_samples, random_state=config.random_seed)
        )

    # Sample negative examples (zero for all target diseases)
    negatives = metadata_df[metadata_df["is_all_target_negative"]]
    logger.info(f"Found {len(negatives)} cases negative for all target diseases")
    if len(negatives) < config.samples_per_disease:
        logger.warning(
            f"Only {len(negatives)} negative samples available, "
            f"wanted {config.samples_per_disease}"
        )
        negative_df = negatives
    else:
        negative_df = negatives.sample(n=config.samples_per_disease, random_state=config.random_seed)

    # Combine, deduplicate and prepare test set. Concatenating everything in
    # one call keeps the row order (positives first, then negatives) and also
    # works when there are no target labels at all.
    test_df = pd.concat([*positive_samples, negative_df])
    test_df = test_df.drop_duplicates(subset="Image Index").reset_index(drop=True)
    logger.info(f"Created test set with {len(test_df)} unique samples after deduplication")

    # Extract individual columns for each target disease
    for i, label in zip(label_indices, config.target_labels):
        test_df[label] = test_df["encoded_array"].apply(lambda x: x[i])

    # Save test set for reproducibility
    os.makedirs(config.output_path, exist_ok=True)
    test_df[["Image Index"]].to_csv(os.path.join(config.output_path, "nih_test_set_balanced.csv"), index=False)

    return test_df

In [None]:
# Build the balanced test set from the prepared metadata
test_df = get_test_set(metadata_df,label_indices, Config)

In [None]:
# Bar chart of how many test rows are positive for each target label
fig, ax = plt.subplots(figsize=(10, 5))
positive_counts = test_df[Config.target_labels].sum()
positive_counts.plot(kind="bar", color="blue", ax=ax)
ax.set_title("Positive Case Count per Disease")
ax.set_ylabel("Count")
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## Visualize Sample Images

In [None]:
# Function to display sample X-ray images
def display_sample_images(test_df, image_path, num_samples=3):
    """Display sample X-ray images for each disease category

    One row of images is drawn per label in Config.target_labels, followed by
    a final row of images that are negative for every target label.

    Parameters:
    test_df (pandas.DataFrame): Test set with an 'Image Index' column and one
        0/1 column per target label
    image_path (str): Directory that contains the image files
    num_samples (int): Maximum number of images per row; fewer are shown when
        a category has fewer rows than requested

    Raises:
    ValueError: if num_samples is smaller than 1
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    target_labels = list(Config.target_labels)
    n_rows = len(target_labels) + 1
    # squeeze=False keeps `axes` two-dimensional even when num_samples == 1,
    # so axes[row] is always a sequence of Axes
    fig, axes = plt.subplots(
        n_rows, num_samples,
        figsize=(num_samples * 4, n_rows * 4),
        squeeze=False,
    )

    def draw_row(axes_row, candidates, caption):
        # Draw up to num_samples randomly chosen images from `candidates`
        for ax in axes_row:
            # Hide every cell first so slots without an image stay blank
            ax.axis('off')
        # Never ask for more rows than exist; .sample would raise otherwise
        chosen = candidates.sample(min(num_samples, len(candidates)))
        for ax, (_, row) in zip(axes_row, chosen.iterrows()):
            img_file = os.path.join(image_path, row["Image Index"])
            # Close the file handle as soon as the grayscale copy is made
            with Image.open(img_file) as handle:
                img = handle.convert('L')
            ax.imshow(img, cmap='gray')
            ax.set_title(f"{caption}\n{row['Image Index']}")

    # For each disease, show samples
    for i, disease in enumerate(target_labels):
        draw_row(axes[i], test_df[test_df[disease] == 1], disease)

    # Show negative samples (no target disease)
    negatives = test_df[(test_df[target_labels] == 0).all(axis=1)]
    draw_row(axes[-1], negatives, "No Target Disease")

    plt.tight_layout()
    plt.show()

In [None]:
# Display sample images: three per target label plus a row of all-negative cases
display_sample_images(test_df, Config.image_path, num_samples=3)

## Conclusion

Metadata:  ???

Test dataset balanced

Found 42511 cases negative for all target diseases

Created test set with 1969 unique samples after deduplication