In [1]:
import pandas as pd
import numpy as np

## 질병의 다중 분류를 위한 데이터 생성

#### 1. 흉부영상을 보고 특이점을 적어둔 FeatureSelection

In [4]:
# Input CSV locations, relative to the notebook's working directory.
finding_and_impression_data_path = './data/findings_and_impression.csv'
label_data_path = './data/mimic-cxr-2.0.0-chexpert.csv'

# Load the full report table, then keep only the study identifier and the
# findings text column.
finding_and_impression_data = pd.read_csv(finding_and_impression_data_path)
report_columns = ['study_id', 'Findings']
finding_data = finding_and_impression_data[report_columns]

# Preview the first five rows as the cell's output.
finding_data.head(n=5)

Unnamed: 0,study_id,Findings
0,50414267,"There is no focal consolidation, pleural effus..."
1,53189527,"The cardiac, mediastinal and hilar contours ar..."
2,53911762,Single frontal view of the chest provided.\r\n...
3,56699142,"The lungs are clear of focal consolidation, pl..."
4,57375967,PA and lateral views of the chest provided. ...


#### 2. 다중 분류를 위한 label

 환자 id를 제외한 14개 라벨 (축소 필요) — Support Devices는 아래 그룹에 포함하지 않음
 ---
 - 폐 관련 질환 (Lung Conditions)
    - Atelectasis (무기폐), Consolidation (폐경화), Lung Lesion (폐 병변), Lung Opacity (폐 불투명도), Pneumonia (폐렴), Pneumothorax (기흉)
 - 심장 및 순환기 관련 질환 (Cardiac and Circulatory Conditions)
    - Cardiomegaly (심장비대증), Edema (부종), Enlarged Cardiomediastinum (확대된 심장 종격동)
 - 흉막 관련 질환 (Pleural Conditions)
    - Pleural Effusion (흉막 삼출액), Pleural Other (흉막 기타)
 - 골격계 질환 (Skeletal Conditions)
    - Fracture (골절)
 - 정상 (Normal Conditions)
    - No Finding 

In [24]:
# Load the CheXpert label table; blank (NaN) cells are replaced with 0.
label_data = pd.read_csv(label_data_path).fillna(0)

# Drop the two columns that are excluded from the classification labels.
classification_label_data = label_data.drop(columns=['Support Devices', 'subject_id'])

# Print the relative frequency of each value for every label column
# (study_id is skipped).
label_columns = [name for name in classification_label_data if name != 'study_id']
for column_name in label_columns:
    value_share = classification_label_data[column_name].value_counts(normalize=True)
    print(f'{column_name} : {value_share}')

Atelectasis : Atelectasis
 0.0    0.753607
 1.0    0.201065
-1.0    0.045328
Name: proportion, dtype: float64
Cardiomegaly : Cardiomegaly
 0.0    0.776638
 1.0    0.196838
-1.0    0.026525
Name: proportion, dtype: float64
Consolidation : Consolidation
 0.0    0.933682
 1.0    0.047308
-1.0    0.019010
Name: proportion, dtype: float64
Edema : Edema
 0.0    0.823585
 1.0    0.118590
-1.0    0.057825
Name: proportion, dtype: float64
Enlarged Cardiomediastinum : Enlarged Cardiomediastinum
 0.0    0.927340
-1.0    0.041150
 1.0    0.031511
Name: proportion, dtype: float64
Fracture : Fracture
 0.0    0.978295
 1.0    0.019269
-1.0    0.002436
Name: proportion, dtype: float64
Lung Lesion : Lung Lesion
 0.0    0.967409
 1.0    0.027582
-1.0    0.005008
Name: proportion, dtype: float64
Lung Opacity : Lung Opacity
 0.0    0.757026
 1.0    0.226158
-1.0    0.016815
Name: proportion, dtype: float64
No Finding : No Finding
0.0    0.668806
1.0    0.331194
Name: proportion, dtype: float64
Pleural Eff

In [47]:
# Collapse the individual label columns into grouped condition columns.
# Each grouped value is the row-wise maximum of its member columns, so a 1 in
# any member column yields 1 for the whole group.
# NOTE(review): the label columns appear to hold -1 in addition to 0/1. With
# max(), a -1 only survives when every member of the group is -1 (always the
# case for the single-column 'Fracture' group) - confirm this is the intended
# handling before training on the output.
condition_groups = {
    'LungConditions': ['Atelectasis', 'Consolidation', 'Lung Lesion', 'Lung Opacity', 'Pneumonia', 'Pneumothorax'],
    'CardiacAndCirculatoryConditions': ['Cardiomegaly', 'Enlarged Cardiomediastinum', 'Edema'],
    'PleuralConditions': ['Pleural Effusion', 'Pleural Other'],
    'SkeletalConditions': ['Fracture'],
}

# Assemble the output columns in their final order: the identifier first,
# then the grouped maxima, then 'No Finding' copied through unchanged.
merged_columns = {'study_id': classification_label_data['study_id']}
for group_name, member_columns in condition_groups.items():
    merged_columns[group_name] = classification_label_data[member_columns].max(axis=1)
merged_columns['NormalConditions'] = classification_label_data['No Finding']

# Convert every column to int before saving.
new_categorical_data_frame = pd.DataFrame(merged_columns).astype(int)

new_categorical_data_frame.to_csv('./data/merged_label_conditions.csv', index=False)