# Chest X-ray EDA

This notebook summarizes the class distribution of the training data (counts and ratios) and displays a grid of sample images from each class.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Read the processed training manifest into a DataFrame
df = pd.read_csv('../data/processed/train.csv')

# Summarise the dataset: overall size, then per-class counts and proportions
label_column = df['label_str']
class_counts = label_column.value_counts()
class_ratios = label_column.value_counts(normalize=True)

print(f"Total Training Images: {len(df)}")
print("Class Balance:")
print(class_counts)
print("\nRatio:")
print(class_ratios)

In [None]:
# Visualization
# Show a 4x4 grid of randomly chosen training images: up to 8 NORMAL rows
# followed by up to 8 PNEUMONIA rows, each tile titled with its class label.
SAMPLES_PER_CLASS = 8

normal_rows = df[df['label_str'] == 'NORMAL']
pneumonia_rows = df[df['label_str'] == 'PNEUMONIA']

# DataFrame.sample raises ValueError when asked for more rows than exist.
# Cap the request at the class size instead of catching the error and then
# carrying on with undefined (fresh kernel) or stale (re-run) sample frames.
if min(len(normal_rows), len(pneumonia_rows)) < SAMPLES_PER_CLASS:
    print(f"Not enough data to sample {SAMPLES_PER_CLASS} images per class.")

normal_samples = normal_rows.sample(
    min(SAMPLES_PER_CLASS, len(normal_rows)), random_state=42
)
pneumonia_samples = pneumonia_rows.sample(
    min(SAMPLES_PER_CLASS, len(pneumonia_rows)), random_state=42
)

samples = pd.concat([normal_samples, pneumonia_samples]).reset_index(drop=True)

fig, axes = plt.subplots(4, 4, figsize=(12, 12))
axes = axes.flatten()

for i, ax in enumerate(axes):
    # Hide ticks and frame on every tile so unused or unreadable slots stay blank
    ax.axis('off')
    if i >= len(samples):
        continue

    row = samples.iloc[i]
    img_path = row['path']
    try:
        # The context manager closes the underlying file once the pixels have
        # been handed to matplotlib. No cmap is passed: the image is converted
        # to RGB, and imshow ignores cmap for RGB(A) input.
        with Image.open(img_path) as img:
            ax.imshow(img.convert('RGB'))
        ax.set_title(row['label_str'])
    except Exception as e:
        # Best-effort display: report the failure and leave this tile empty
        print(f"Error loading {img_path}: {e}")

plt.tight_layout()
plt.show()