In [1]:
# working on CXR8 dataset
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score,recall_score,precision_score
from sklearn.model_selection import train_test_split


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms



In [2]:

data_dir = 'images'
images = os.listdir(data_dir)


test_images = pd.read_csv('CXR8/test_list.txt', header=None)


test_images

train_images = pd.read_csv('CXR8/train_val_list.txt', header=None)

train_images

labels = pd.read_csv('CXR8/Data_Entry_2017_v2020.csv')



all_labels = '|'.join(labels['Finding Labels'].unique())
all_labels = all_labels.split('|')
all_labels = list(set(all_labels))



for label in all_labels:
    labels[label] = labels['Finding Labels'].apply(lambda x: 1 if label in x else 0)


labels


tenso = torch.tensor(labels[all_labels].values).float()
data = pd.DataFrame()
data['Image Index'] = labels['Image Index']
data[all_labels] = tenso


data = data.drop(columns=['No Finding'])


all_labels

all_labels_without_no_finding = all_labels.copy()
all_labels_without_no_finding.remove('No Finding')
all_labels_without_no_finding


['Pneumonia',
 'Pneumothorax',
 'Pleural_Thickening',
 'Hernia',
 'Fibrosis',
 'Emphysema',
 'Cardiomegaly',
 'Mass',
 'Edema',
 'Infiltration',
 'Consolidation',
 'Effusion',
 'Nodule',
 'Atelectasis']

In [3]:
 # place the images in train_images in the train set and the images in test_images in the test set
    
train_val_data = data[data['Image Index'].isin(train_images[0].values)]
test_data = data[data['Image Index'].isin(test_images[0].values)]

patients_ids = pd.read_csv('CXR8/Data_Entry_2017_v2020.csv')
patients_ids = patients_ids[['Image Index', 'Patient ID']]

train_val_data = pd.merge(train_val_data, patients_ids, on='Image Index')



unique_patient_ids = train_val_data['Patient ID'].unique()

    # Split patient IDs into training and validation sets
train_patient_ids, val_patient_ids = train_test_split(
    unique_patient_ids, 
    test_size=0.1, 
    random_state=42
    )

    # Create train and validation data based on the patient ID split
train_data = train_val_data[train_val_data['Patient ID'].isin(train_patient_ids)]
val_data = train_val_data[train_val_data['Patient ID'].isin(val_patient_ids)]

train_data = train_data.drop(columns=['Patient ID'])
val_data = val_data.drop(columns=['Patient ID'])

In [4]:
train_data

Unnamed: 0,Image Index,Pneumonia,Pneumothorax,Pleural_Thickening,Hernia,Fibrosis,Emphysema,Cardiomegaly,Mass,Edema,Infiltration,Consolidation,Effusion,Nodule,Atelectasis
0,00000001_000.png,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,00000002_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000004_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86519,00030789_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
86520,00030793_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
86521,00030795_000.png,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86522,00030801_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
val_data

Unnamed: 0,Image Index,Pneumonia,Pneumothorax,Pleural_Thickening,Hernia,Fibrosis,Emphysema,Cardiomegaly,Mass,Edema,Infiltration,Consolidation,Effusion,Nodule,Atelectasis
15,00000008_000.png,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,00000008_001.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,00000008_002.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
66,00000038_000.png,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
67,00000038_001.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86451,00030701_001.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
86452,00030701_002.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86506,00030772_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86507,00030772_001.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [9]:
test_data

Unnamed: 0,Image Index,Pneumonia,Pneumothorax,Pleural_Thickening,Hernia,Fibrosis,Emphysema,Cardiomegaly,Mass,Edema,Infiltration,Consolidation,Effusion,Nodule,Atelectasis
4,00000003_001.png,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,00000003_002.png,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,00000003_003.png,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,00000003_004.png,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,00000003_005.png,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112113,00030800_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112116,00030802_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112117,00030803_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112118,00030804_000.png,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# sum of the labels for each class
train_data[all_labels_without_no_finding].info()

<class 'pandas.core.frame.DataFrame'>
Index: 77988 entries, 0 to 86523
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Pneumonia           77988 non-null  float32
 1   Pneumothorax        77988 non-null  float32
 2   Pleural_Thickening  77988 non-null  float32
 3   Hernia              77988 non-null  float32
 4   Fibrosis            77988 non-null  float32
 5   Emphysema           77988 non-null  float32
 6   Cardiomegaly        77988 non-null  float32
 7   Mass                77988 non-null  float32
 8   Edema               77988 non-null  float32
 9   Infiltration        77988 non-null  float32
 10  Consolidation       77988 non-null  float32
 11  Effusion            77988 non-null  float32
 12  Nodule              77988 non-null  float32
 13  Atelectasis         77988 non-null  float32
dtypes: float32(14)
memory usage: 4.8 MB


In [14]:
val_data[all_labels_without_no_finding].info()

<class 'pandas.core.frame.DataFrame'>
Index: 8536 entries, 15 to 86508
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Pneumonia           8536 non-null   float32
 1   Pneumothorax        8536 non-null   float32
 2   Pleural_Thickening  8536 non-null   float32
 3   Hernia              8536 non-null   float32
 4   Fibrosis            8536 non-null   float32
 5   Emphysema           8536 non-null   float32
 6   Cardiomegaly        8536 non-null   float32
 7   Mass                8536 non-null   float32
 8   Edema               8536 non-null   float32
 9   Infiltration        8536 non-null   float32
 10  Consolidation       8536 non-null   float32
 11  Effusion            8536 non-null   float32
 12  Nodule              8536 non-null   float32
 13  Atelectasis         8536 non-null   float32
dtypes: float32(14)
memory usage: 533.5 KB


In [15]:
test_data[all_labels_without_no_finding].info()

<class 'pandas.core.frame.DataFrame'>
Index: 25596 entries, 4 to 112119
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Pneumonia           25596 non-null  float32
 1   Pneumothorax        25596 non-null  float32
 2   Pleural_Thickening  25596 non-null  float32
 3   Hernia              25596 non-null  float32
 4   Fibrosis            25596 non-null  float32
 5   Emphysema           25596 non-null  float32
 6   Cardiomegaly        25596 non-null  float32
 7   Mass                25596 non-null  float32
 8   Edema               25596 non-null  float32
 9   Infiltration        25596 non-null  float32
 10  Consolidation       25596 non-null  float32
 11  Effusion            25596 non-null  float32
 12  Nodule              25596 non-null  float32
 13  Atelectasis         25596 non-null  float32
dtypes: float32(14)
memory usage: 1.6 MB


In [8]:
# load the data from CXR8/images/images_001.tar.gz
import tarfile
import cv2
import shutil


# extract the tar.gz file
tar = tarfile.open('dataset/images/images/images_001.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_002.tar.gz
tar = tarfile.open('dataset/images/images/images_002.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_003.tar.gz
tar = tarfile.open('dataset/images/images/images_003.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_004.tar.gz
tar = tarfile.open('dataset/images/images/images_004.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_005.tar.gz
tar = tarfile.open('dataset/images/images/images_005.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_006.tar.gz
tar = tarfile.open('dataset/images/images/images_006.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_007.tar.gz
tar = tarfile.open('dataset/images/images/images_007.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_008.tar.gz
tar = tarfile.open('dataset/images/images/images_008.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_009.tar.gz
tar = tarfile.open('dataset/images/images/images_009.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_010.tar.gz
tar = tarfile.open('dataset/images/images/images_010.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_011.tar.gz
tar = tarfile.open('dataset/images/images/images_011.tar.gz')
tar.extractall()
tar.close()

# load the data from CXR8/images/images_012.tar.gz
tar = tarfile.open('dataset/images/images/images_012.tar.gz')
tar.extractall()
tar.close()




  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
  tar.extractall()
