## data analysis
---

In [1]:
import pandas as pd

In [2]:
# Load the metadata CSV into a DataFrame, report how many rows it has,
# and preview the first few rows.
df = pd.read_csv('../inputs/Data_Entry_2017.csv')
n_rows = df.shape[0]
print(n_rows)
df.head()

112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


### 病名（所見）について
---
|EN  |JP  |
|---|---|
|Cardiomegaly  |心拡大  |
|Emphysema  |肺気腫  |
|Effusion  |胸水  |
|Hernia  |ヘルニア  |
|Nodule  |結節  |
|Pneumothorax  |気胸  |
|Atelectasis  |無気肺  |
|Pleural_Thickening  |胸膜肥厚  |
|Mass  |腫瘤  |
|Edema  |浮腫  |
|Consolidation  |浸潤影  |
|Infiltration  |浸潤影  |
|Fibrosis  |線維症  |
|Pneumonia  |肺炎  |

In [3]:
# Keep only the image file name and its raw label string. The explicit
# .copy() makes `df` an independent frame, so the column assignments below
# do not write into a slice of the loaded table (avoids SettingWithCopyWarning).
df = df[['Image Index', 'Finding Labels']].copy()

# Create one 0/1 indicator column per disease label (plus 'No Finding').
pathology_list = ['Cardiomegaly','Emphysema','Effusion','Hernia',
                  'Nodule','Pneumothorax','Atelectasis','Pleural_Thickening',
                  'Mass','Edema','Consolidation','Infiltration','Fibrosis',
                  'Pneumonia','No Finding']

# 'Finding Labels' holds one or more labels joined by '|'. Split it once and
# test membership against the resulting list, so a label only matches as a
# whole token and never as a substring of some other label.
label_lists = df['Finding Labels'].str.split('|')

for pathology in pathology_list:
    # Bind the current label as a default argument so the lambda does not
    # depend on the loop variable's later value.
    df[pathology] = label_lists.apply(lambda labels, name=pathology: int(name in labels))

# 1 when the label string contains more than one '|'-separated entry, else 0.
df['Multi_Finding'] = label_lists.apply(lambda labels: int(len(labels) > 1))

df.head()

Unnamed: 0,Image Index,Finding Labels,Cardiomegaly,Emphysema,Effusion,Hernia,Nodule,Pneumothorax,Atelectasis,Pleural_Thickening,Mass,Edema,Consolidation,Infiltration,Fibrosis,Pneumonia,No Finding,Multi_Finding
0,00000001_000.png,Cardiomegaly,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,00000001_002.png,Cardiomegaly|Effusion,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,00000002_000.png,No Finding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,00000003_000.png,Hernia,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Reshape the indicator columns to long format: one row per
# (image, category) pair, with the 0/1 flag stored in 'Count'.
data1 = df.melt(
    id_vars=['Image Index'],
    value_vars=list(pathology_list),
    var_name='Category',
    value_name='Count',
)
# Keep only the pairs whose flag is set.
is_present = data1['Count'] > 0
data1 = data1[is_present]
data1.head()

Unnamed: 0,Image Index,Category,Count
0,00000001_000.png,Cardiomegaly,1
1,00000001_001.png,Cardiomegaly,1
2,00000001_002.png,Cardiomegaly,1
23,00000008_000.png,Cardiomegaly,1
63,00000013_025.png,Cardiomegaly,1


In [5]:
# Total the 'Count' values within each category.
by_category = data1.groupby('Category')
data1_grouped = by_category['Count'].sum()
data1_grouped

Category
Atelectasis           11559
Cardiomegaly           2776
Consolidation          4667
Edema                  2303
Effusion              13317
Emphysema              2516
Fibrosis               1686
Hernia                  227
Infiltration          19894
Mass                   5782
No Finding            60361
Nodule                 6331
Pleural_Thickening     3385
Pneumonia              1431
Pneumothorax           5302
Name: Count, dtype: int64

In [6]:
import matplotlib.pyplot as plt

# Bar chart of the per-category totals. Drawing on an explicit Axes lets the
# figure carry a title and axis labels, so it can be read on its own.
fig, ax = plt.subplots()
data1_grouped.plot(kind='bar', ax=ax)
ax.set_title('Number of images per finding')
ax.set_xlabel('Category')
ax.set_ylabel('Number of images')
plt.show()

<Figure size 640x480 with 1 Axes>

**Analyze Emphysema**

In [7]:
# Select the long-format rows whose category is Emphysema and report
# how many there are.
is_emphysema = data1['Category'] == 'Emphysema'
df_Emphysema = data1[is_emphysema]
n_emphysema = len(df_Emphysema)
print('a number of Image that diagnosed with Emphysema:', n_emphysema)

a number of Image that diagnosed with Emphysema: 2516


In [8]:
# Build each condition once and reuse it for both counts and the preview.
mentions_emphysema = df['Finding Labels'].str.contains('Emphysema')
is_single_label = df['Multi_Finding'] == 0
is_multi_label = df['Multi_Finding'] == 1

emphysema_only = df[mentions_emphysema & is_single_label]
emphysema_with_others = df[mentions_emphysema & is_multi_label]

# Images whose label string mentions Emphysema and has a single label.
print(len(emphysema_only))
# Images whose label string mentions Emphysema and has multiple labels.
print(len(emphysema_with_others))
emphysema_only.head()

892
1624


Unnamed: 0,Image Index,Finding Labels,Cardiomegaly,Emphysema,Effusion,Hernia,Nodule,Pneumothorax,Atelectasis,Pleural_Thickening,Mass,Edema,Consolidation,Infiltration,Fibrosis,Pneumonia,No Finding,Multi_Finding
26,00000009_000.png,Emphysema,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
79,00000013_041.png,Emphysema,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
80,00000013_042.png,Emphysema,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
108,00000027_000.png,Emphysema,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
176,00000034_000.png,Emphysema,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Target group: file names of the single-label Emphysema images,
# together with their Emphysema flag.
is_emphysema_only = df['Finding Labels'].str.contains('Emphysema') & (df['Multi_Finding'] == 0)
target_df = df.loc[is_emphysema_only, ['Image Index', 'Emphysema']]
target_df.head()

Unnamed: 0,Image Index,Emphysema
26,00000009_000.png,1
79,00000013_041.png,1
80,00000013_042.png,1
108,00000027_000.png,1
176,00000034_000.png,1


In [10]:
# NOTE(review): disabled alternative, kept for reference. It would sample 1000
# images whose labels do not mention Emphysema and collect their sorted file
# names. Nothing in the cells visible here uses `others_img` — confirm whether
# this can be removed.
# others_df = df[~df['Finding Labels'].str.contains('Emphysema')]
# others_df_sample = others_df.sample(n=1000, random_state=0)
# others_img = others_df_sample.sort_values(by=["Image Index"], ascending=True)['Image Index'].tolist()
# len(others_img)

In [11]:
# Control group: images whose label string contains 'No Finding'. A
# fixed-seed random sample of 1000 of them is drawn so the selection is
# repeatable, keeping only the file name and the Emphysema flag.
no_finding_mask = df['Finding Labels'].str.contains('No Finding')
control_df = df[no_finding_mask]
control_sample = control_df.sample(n=1000, random_state=0)
control_df_sample = control_sample[['Image Index', 'Emphysema']]
control_df_sample.head()

Unnamed: 0,Image Index,Emphysema
39025,00010216_003.png,0
6607,00001775_003.png,0
48824,00012369_003.png,0
13905,00003591_000.png,0
44144,00011379_013.png,0


In [12]:
# Stack the target and control rows with a fresh 0..n-1 index, then order
# the combined rows by image file name.
combined = pd.concat([target_df, control_df_sample], ignore_index=True)
data_df = combined.sort_values(by='Image Index')
data_df.head()

Unnamed: 0,Image Index,Emphysema
0,00000009_000.png,1
1,00000013_041.png,1
2,00000013_042.png,1
3,00000027_000.png,1
4,00000034_000.png,1


In [13]:
# Write the combined table to CSV without the index column.
output_path = '../outputs/target_idx.csv'
data_df.to_csv(output_path, index=False)