In [1]:
%matplotlib inline
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputClassifier

from src.data.dataset import ImageDataset

In [2]:
# --- Paths -------------------------------------------------------------------
# All locations are resolved relative to the parent of the current working
# directory, so the kernel must be started from a folder that sits directly
# under the project root (presumably `notebooks/` -- TODO confirm).
base_path = os.path.join(os.getcwd(), "..")
image_path = os.path.join(base_path, "data", "raw")
# Derived from image_path so the CSV location cannot drift from the image root.
train_csv_path = os.path.join(image_path, "CheXpert-v1.0-small", "train.csv")

# --- Labels ------------------------------------------------------------------
# Target columns requested from the dataset loader in the cells below.
return_labels = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Pleural Effusion',
]

# Per-label option strings. Not consumed by any cell visible in this part of
# the notebook; presumably a policy for handling uncertain labels that is
# passed to the loader elsewhere -- verify against ImageDataset before relying
# on it.
map_option = {
    'Atelectasis': 'U-one',
    'Cardiomegaly': 'Random',
    'Consolidation': 'Random',
    'Edema': 'Random',
    'Pleural Effusion': 'Random',
}

# --- Dataset handle ----------------------------------------------------------
train_dataset = ImageDataset(label_csv_path=train_csv_path, image_path_base=image_path)


In [9]:
# Ask the loader for a single table (return_X_y=False) with without_image=True,
# then discard the 'Path' column so only the remaining columns are analysed
# below (presumably 'Path' holds image file locations -- confirm).
df = (
    train_dataset
    .load(return_labels=return_labels, without_image=True, return_X_y=False)
    .drop(columns='Path')
)

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for every column
# left in df after 'Path' was dropped.
df.describe()

Unnamed: 0,Sex,Age,Frontal/Lateral,AP/PA,Atelectasis,Cardiomegaly,Consolidation,Edema,Pleural Effusion
count,223414.0,223414.0,223414.0,223414.0,223414.0,223414.0,223414.0,223414.0,223414.0
mean,0.593683,60.430653,0.855036,0.868316,-0.001625,0.084654,-0.058004,0.175737,0.333726
std,0.491146,17.820925,0.352065,0.338147,0.548092,0.387148,0.43241,0.510967,0.571356
min,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,0.0,49.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,62.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,74.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
max,1.0,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Pairwise correlation between all columns of df (pandas computes Pearson
# correlation by default).
# NOTE(review): the label columns take the value -1 as well as 0 and 1 (see the
# `min` row of the describe() output). If -1 encodes "uncertain" rather than a
# real ordinal level, treating it as a number here may distort these
# coefficients -- confirm whether the policy in `map_option` should be applied
# before correlating.
corr_matrix = df.corr()

In [8]:
# Show the full correlation matrix as the cell's rich output.
corr_matrix

Unnamed: 0,Sex,Age,Frontal/Lateral,AP/PA,Atelectasis,Cardiomegaly,Consolidation,Edema,Pleural Effusion
Sex,1.0,-0.037064,-0.032549,-0.049047,0.010667,0.015929,0.002688,-0.0081,-0.001362
Age,-0.037064,1.0,0.033023,0.074238,-0.027659,0.038231,-0.031331,0.075755,0.08448
Frontal/Lateral,-0.032549,0.033023,1.0,-0.160349,0.003883,0.01664,-0.009338,0.106656,0.079447
AP/PA,-0.049047,0.074238,-0.160349,1.0,2.9e-05,0.024636,-0.011831,0.111398,0.082157
Atelectasis,0.010667,-0.027659,0.003883,2.9e-05,1.0,-0.000217,0.320326,0.033944,0.027874
Cardiomegaly,0.015929,0.038231,0.01664,0.024636,-0.000217,1.0,-0.005694,0.104158,0.032219
Consolidation,0.002688,-0.031331,-0.009338,-0.011831,0.320326,-0.005694,1.0,0.013986,0.033059
Edema,-0.0081,0.075755,0.106656,0.111398,0.033944,0.104158,0.013986,1.0,0.134203
Pleural Effusion,-0.001362,0.08448,0.079447,0.082157,0.027874,0.032219,0.033059,0.134203,1.0


In [6]:
# For each target label, list every column's correlation with it, strongest
# positive first. The label's own entry (1.0) is included in the listing.
divider = '-' * 32
for label in return_labels:
    print(f'Label: {label}')
    ranked = corr_matrix.loc[:, label].sort_values(ascending=False)
    print(ranked)
    print(divider)

Label: Atelectasis
Atelectasis         1.000000
Consolidation       0.320326
Edema               0.033944
Pleural Effusion    0.027874
Sex                 0.010667
Frontal/Lateral     0.003883
AP/PA               0.000029
Cardiomegaly       -0.000217
Age                -0.027659
Name: Atelectasis, dtype: float64
--------------------------------
Label: Cardiomegaly
Cardiomegaly        1.000000
Edema               0.104158
Age                 0.038231
Pleural Effusion    0.032219
AP/PA               0.024636
Frontal/Lateral     0.016640
Sex                 0.015929
Atelectasis        -0.000217
Consolidation      -0.005694
Name: Cardiomegaly, dtype: float64
--------------------------------
Label: Consolidation
Consolidation       1.000000
Atelectasis         0.320326
Pleural Effusion    0.033059
Edema               0.013986
Sex                 0.002688
Cardiomegaly       -0.005694
Frontal/Lateral    -0.009338
AP/PA              -0.011831
Age                -0.031331
Name: Consolidation, d