# Deep Learning Prediction Model for Chest Disease using X-rays

## Questions for Investigation

## Data Processing

In [1]:
import pandas as pd     

chest_disease_data = pd.read_csv("sample_labels.csv")
chest_disease_data.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143,0.143
4,00000032_001.png,Cardiomegaly|Edema|Effusion,1,32,055Y,F,AP,2500,2048,0.168,0.168


In [9]:
# Number of rows
num_rows = chest_disease_data.shape[0]
print(f"Number of rows: {num_rows}")

# Number of variables (columns)
num_columns = chest_disease_data.shape[1]
print(f"Number of columns: {num_columns}")

# Unique labels in the 'Finding Labels' column
num_unique_labels = chest_disease_data['Finding Labels'].nunique()
print(f"Number of unique labels: {num_unique_labels}") 


# Optional: Count of each label
label_counts = chest_disease_data['Finding Labels'].value_counts()
print("\nLabel counts:")
print(label_counts)

Number of rows: 1582
Number of columns: 25
Number of unique labels: 14

Label counts:
Finding Labels
Infiltration          503
Effusion              203
Atelectasis           192
Nodule                144
Pneumothorax          114
Mass                   99
Consolidation          72
Pleural_Thickening     65
Cardiomegaly           50
Emphysema              42
Edema                  41
Fibrosis               38
Pneumonia              14
Hernia                  5
Name: count, dtype: int64


In [10]:
# Removing multi-label images
single_label_data = chest_disease_data[~chest_disease_data['Finding Labels'].str.contains('\|', regex=True)]

print(f"Number of rows after removing multi-labels: {single_label_data.shape[0]}")

chest_disease_data = single_label_data

num_unique_labels = chest_disease_data['Finding Labels'].nunique()
print(f"Number of unique labels: {num_unique_labels}") 

Number of rows after removing multi-labels: 1582
Number of unique labels: 14


  single_label_data = chest_disease_data[~chest_disease_data['Finding Labels'].str.contains('\|', regex=True)]


In [11]:
# We remove no_findings label since it significantly outnumbers other labels and we want to prevent potentialy bias toward predicting "nothing", predicting disease accurately
chest_disease_data = chest_disease_data[chest_disease_data['Finding Labels'] != "No Finding"]

# Number of rows
num_rows = chest_disease_data.shape[0]
print(f"Number of rows: {num_rows}")

Number of rows: 1582


In [12]:
missing_labels = chest_disease_data['Finding Labels'].isnull().sum()
print(f"Missing labels: {missing_labels}")

Missing labels: 0


In [6]:
# One-Hot Encoding for disease classes
one_hot_labels = pd.get_dummies(chest_disease_data['Finding Labels'])

# Merge one-hot labels with original dataframe (optional)
chest_disease_data = pd.concat([chest_disease_data, one_hot_labels], axis=1)
chest_disease_data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
3,00000030_001.png,Atelectasis,1,30,079Y,M,PA,2992,2991,0.143000,...,False,False,False,False,False,False,False,False,False,False
8,00000061_002.png,Effusion,2,61,077Y,M,PA,2992,2991,0.143000,...,True,False,False,False,False,False,False,False,False,False
11,00000079_000.png,Mass,0,79,063Y,M,PA,2500,2048,0.168000,...,False,False,False,False,False,True,False,False,False,False
14,00000084_000.png,Effusion,0,84,057Y,F,PA,2048,2500,0.171000,...,True,False,False,False,False,False,False,False,False,False
15,00000096_006.png,Effusion,6,96,067Y,F,PA,2242,2546,0.143000,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5589,00030634_000.png,Effusion,0,30634,060Y,F,PA,2544,3056,0.139000,...,True,False,False,False,False,False,False,False,False,False
5592,00030636_010.png,Pneumothorax,10,30636,046Y,M,PA,2021,2021,0.194311,...,False,False,False,False,False,False,False,False,False,True
5593,00030636_012.png,Consolidation,12,30636,046Y,M,PA,2020,2021,0.194311,...,False,False,False,False,False,False,False,False,False,False
5599,00030695_002.png,Atelectasis,2,30695,052Y,F,PA,2021,2021,0.194311,...,False,False,False,False,False,False,False,False,False,False


In [7]:
from torchvision import transforms

# Preprocessing steps for DenseNet
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # 224x224 standard size
    transforms.Grayscale(num_output_channels=3),  # If your X-rays are grayscale!
    transforms.ToTensor(),  # Convert to tensor and scale pixels to [0,1]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # ImageNet means
                         std=[0.229, 0.224, 0.225])   # ImageNet stds
])

In [8]:
%pip install torchvision




## Data Cleaning

## Exploratory Data Analysis

## Feature Engineering

## Model Selection