# Data preparation #2

## Imports

In [1]:
import os
import pandas as pd

## Constants

In [2]:
# Directory that holds every dataset artefact used by this notebook.
DATA_DIR = 'data/'

# Both CSVs live directly inside DATA_DIR: the label table first, then the
# bounding-box annotations.
CSV_FILE, BB_CSV_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('nih_chest_xray_single_9c.csv', 'BBox_List_2017.csv')
)

## Download the prepared CSV

In [3]:
os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.exists(CSV_FILE):
    ! gdown --id 1gJJ5ZRfRicuxIoBWLYAUnfUs4yhJ97sX -O $CSV_FILE
else:
    print('File exists: {}'.format(CSV_FILE))

if not os.path.exists(BB_CSV_FILE):
    ! gdown --id 12fA_VUjnt-tyo1d0M2htyQ5w3-nIAVH_ -O $BB_CSV_FILE
else:
    print('File exists: {}'.format(BB_CSV_FILE))

Downloading...
From: https://drive.google.com/uc?id=1gJJ5ZRfRicuxIoBWLYAUnfUs4yhJ97sX
To: /content/data/nih_chest_xray_single_9c.csv
2.68MB [00:00, 84.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=12fA_VUjnt-tyo1d0M2htyQ5w3-nIAVH_
To: /content/data/BBox_List_2017.csv
100% 92.4k/92.4k [00:00<00:00, 71.6MB/s]


## Explore the dataset

Tasks:

1. Show `df`'s and `df_bb`'s statistics

In [4]:
# Load the label table into `df` and the bounding-box annotations into `df_bb`,
# both with pandas' default CSV parsing options.
df, df_bb = (pd.read_csv(csv_path) for csv_path in (CSV_FILE, BB_CSV_FILE))

In [None]:
## Show `df`'s statistics
# A notebook cell renders only its last expression, so the describe() summary
# of the numeric columns is what appears below; the head() preview on the line
# before it is evaluated but not displayed.
df.head()
df.describe()

Unnamed: 0.1,Unnamed: 0,Follow-up #,Patient ID,Patient Age,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
count,36531.0,36531.0,36531.0,36531.0,36531.0,36531.0,36531.0,36531.0
mean,57175.990118,8.967096,14633.185897,46.894145,2656.422025,2496.899921,0.155192,0.155192
std,32468.713125,15.509073,8436.324884,16.521977,339.218395,404.171789,0.016254,0.016254
min,0.0,0.0,1.0,0.0,1244.0,966.0,0.115,0.115
25%,28889.5,0.0,7534.5,35.0,2500.0,2048.0,0.143,0.143
50%,57673.0,3.0,14294.0,49.0,2544.0,2544.0,0.143,0.143
75%,85507.5,10.0,21076.5,59.0,2992.0,2991.0,0.168,0.168
max,112114.0,173.0,30801.0,93.0,3827.0,3567.0,0.1988,0.1988


In [None]:
## Side note: describe() on non-numerical data
# For a Series of strings the summary switches to count / unique / top / freq
# instead of the mean / std / quantiles reported for numeric columns.
pd.Series(['a', 'a', 'b', 'c']).describe() # describe also works on non-numerical data

count     4
unique    3
top       a
freq      2
dtype: object

In [None]:
## Show `df_bb`'s statistics
# Only the last expression is rendered, so the describe() summary is what
# appears below. Columns whose count is 0 hold no values at all and are
# candidates for removal in the cleanup step.
df_bb.head()
df_bb.describe()

Unnamed: 0,Bbox [x,y,w,h],Unnamed: 6,Unnamed: 7,Unnamed: 8
count,984.0,984.0,984.0,984.0,0.0,0.0,0.0
mean,398.806111,405.425364,256.334708,252.302547,,,
std,222.700868,166.309995,167.62962,159.443635,,,
min,5.417989,12.837934,27.306667,21.617778,,,
25%,203.093333,293.869045,136.533333,115.674074,,,
50%,340.249735,412.850794,214.340942,216.949153,,,
75%,607.959365,521.641995,311.832381,367.90243,,,
max,905.887831,876.980783,901.12,873.379894,,,


## Cleanup

Tasks:

1. Delete unused columns ('Unnamed: 0', 'Follow-up #', 'Patient ID', 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]'). Resulting `df` should have 4 columns left.
1. Remove redundant columns from `df_bb`
1. Rename the `Bbox [x`, `y`, `w`, `h]` columns to `bb_x`, `bb_y`, `bb_w`, `bb_h` in `df_bb`

In [None]:
## Delete unused columns ('Unnamed: 0', 'Follow-up #', 'Patient ID',
## 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]').
## Resulting `df` should have 4 columns left.

# The column list is written once and the drop is performed once; the earlier
# version computed the same drop twice and threw the first result away.
unused_columns = [
    'Unnamed: 0', 'Follow-up #', 'Patient ID', 'View Position',
    'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]',
]

# Reassign rather than use inplace=True so the result can be previewed below.
df = df.drop(columns=unused_columns)

# Guard the invariant stated in the task: exactly four columns remain.
assert df.shape[1] == 4, 'expected 4 columns after cleanup, got {}'.format(list(df.columns))
df.head()

Unnamed: 0,Image Index,Finding Labels,Patient Age,Patient Gender
0,00000001_000.png,Cardiomegaly,57,M
1,00000005_003.png,No Finding,69,F
2,00000005_006.png,Infiltration,70,F
3,00000008_000.png,Cardiomegaly,68,F
4,00000008_002.png,Nodule,72,F


In [None]:
## Remove redundant columns from `df_bb`
# The trailing 'Unnamed: N' columns are empty (count == 0 in the describe()
# output above). They are selected by name instead of by position
# (`columns[-3:]`) so that re-running this cell cannot silently delete the
# real bounding-box columns once the empty ones are already gone.
unnamed_columns = [col for col in df_bb.columns if str(col).startswith('Unnamed')]

# 'Finding Label' is removed as well; errors='ignore' turns a second run of
# this cell into a no-op instead of a KeyError.
df_bb = df_bb.drop(columns=unnamed_columns + ['Finding Label'], errors='ignore')
df_bb.head()

Unnamed: 0,Image Index,Bbox [x,y,w,h]
0,00013118_008.png,225.084746,547.019217,86.779661,79.186441
1,00014716_007.png,686.101695,131.543498,185.491525,313.491525
2,00029817_009.png,221.830508,317.053115,155.118644,216.949153
3,00014687_001.png,726.237288,494.95142,141.016949,55.322034
4,00017877_001.png,660.067797,569.780787,200.677966,78.101695


In [None]:
## Rename the `Bbox [x`, `y`, `w`, `h]` columns to `bb_x`, `bb_y`, `bb_w`, `bb_h` in `df_bb`
# Pair each raw header with its replacement, in column order.
old_names = ["Bbox [x", "y", "w", "h]"]
new_names = ["bb_x", "bb_y", "bb_w", "bb_h"]
df_bb = df_bb.rename(columns=dict(zip(old_names, new_names)))
df_bb.head()

Unnamed: 0,Image Index,bb_x,bb_y,bb_w,bb_h
0,00013118_008.png,225.084746,547.019217,86.779661,79.186441
1,00014716_007.png,686.101695,131.543498,185.491525,313.491525
2,00029817_009.png,221.830508,317.053115,155.118644,216.949153
3,00014687_001.png,726.237288,494.95142,141.016949,55.322034
4,00017877_001.png,660.067797,569.780787,200.677966,78.101695


## Merge both data frames

Tasks:

1. Merge both dataframes

In [None]:
## Merge both dataframes
# A left merge keeps every row of `df`; images that have no entry in `df_bb`
# get NaN in the bb_* columns. The merge is computed once (it used to be
# evaluated twice, with the first result discarded).
# NOTE: if `df_bb` held several boxes for an image that is present in `df`,
# that image's row would be repeated once per box - compare the row count in
# the describe() output further down with the row count before the merge.
df = df.merge(df_bb, how='left', on='Image Index')
df

Unnamed: 0,Image Index,Finding Labels,Patient Age,Patient Gender,bb_x,bb_y,bb_w,bb_h
0,00000001_000.png,Cardiomegaly,57,M,,,,
1,00000005_003.png,No Finding,69,F,,,,
2,00000005_006.png,Infiltration,70,F,,,,
3,00000008_000.png,Cardiomegaly,68,F,,,,
4,00000008_002.png,Nodule,72,F,,,,
...,...,...,...,...,...,...,...,...
36526,00030789_000.png,Infiltration,51,F,,,,
36527,00030798_000.png,No Finding,29,M,,,,
36528,00030799_000.png,No Finding,32,M,,,,
36529,00030800_000.png,No Finding,33,F,,,,


In [None]:
## Sanity-check the merged bounding boxes
# The bb_* maxima still exceed 256, i.e. the coordinates are expressed in the
# 1024px space of the source images and have to be rescaled.
df.describe() # look at max values, it's not right - we rescaled the images from 1024px to 256px in part 1

Unnamed: 0,Patient Age,bb_x,bb_y,bb_w,bb_h
count,36531.0,256.0,256.0,256.0,256.0
mean,46.894145,406.302674,415.182941,257.889035,237.403888
std,16.521977,214.953882,171.716414,168.555209,157.24257
min,0.0,38.115556,47.678307,28.173545,29.582222
25%,35.0,232.89965,301.511111,116.037091,99.84
50%,49.0,343.469379,428.021164,213.062434,201.495026
75%,59.0,608.352169,538.819048,398.915254,358.129101
max,93.0,905.887831,876.980783,771.413333,873.379894


In [None]:
## Rescale bounding box coordinates
# The images were downscaled from 1024px to 256px in part 1, so every box
# coordinate and size shrinks by the same factor.
bbox_columns = ['bb_x', 'bb_y', 'bb_w', 'bb_h']
bbox_scale = 1024 // 256  # = 4

# Vectorised floor division over the four columns at once; this replaces a
# row-by-row `apply` that was also evaluated twice. Rows without a box stay NaN.
# NOTE: run this cell only once per session - every extra run divides again.
df[bbox_columns] = df[bbox_columns] // bbox_scale

In [None]:
# Re-check the statistics: after rescaling, the bb_* values should all fit
# inside the 256px image.
df.describe()

Unnamed: 0,Patient Age,bb_x,bb_y,bb_w,bb_h
count,36531.0,256.0,256.0,256.0,256.0
mean,46.894145,101.09375,103.292969,63.945312,58.839844
std,16.521977,53.745141,42.943603,42.175322,39.305163
min,0.0,9.0,11.0,7.0,7.0
25%,35.0,58.0,74.75,28.75,24.75
50%,49.0,85.0,107.0,52.5,50.0
75%,59.0,152.0,134.0,99.25,89.0
max,93.0,226.0,219.0,192.0,218.0


## Encode categories

Tasks:

1. Add one-hot encoded labels to `df`

In [5]:
## Add one-hot encoded labels to `df`
# One indicator column per distinct value of 'Finding Labels'.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; pandas >= 2.0 would otherwise produce True/False columns, which
# would also end up as True/False in the CSV written at the end.
onehot = pd.get_dummies(df['Finding Labels'], dtype='uint8')

onehot.drop_duplicates()  ## to show a single vector per encoded category

Unnamed: 0,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,No Finding,Nodule,Pneumonia,Pneumothorax
0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0
6,0,0,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0
10,0,0,0,0,1,0,0,0,0
11,0,0,0,0,0,0,0,0,1
83,0,0,0,0,0,0,0,1,0


In [None]:
## combine `onehot` with `df` data frame
# Any label columns left over from an earlier run of this cell are dropped
# first (errors='ignore' makes that a no-op on the first run), so re-running
# the cell does not append a second copy of every label column.
# Both frames share the same row index, so axis=1 places them side by side.
df = pd.concat([df.drop(columns=onehot.columns, errors='ignore'), onehot], axis=1)

In [None]:
## Preview `df` with the one-hot label columns appended on the right
df.head()

Unnamed: 0,Image Index,Finding Labels,Patient Age,Patient Gender,bb_x,bb_y,bb_w,bb_h,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,No Finding,Nodule,Pneumonia,Pneumothorax
0,00000001_000.png,Cardiomegaly,57,M,,,,,0,1,0,0,0,0,0,0,0
1,00000005_003.png,No Finding,69,F,,,,,0,0,0,0,0,1,0,0,0
2,00000005_006.png,Infiltration,70,F,,,,,0,0,0,1,0,0,0,0,0
3,00000008_000.png,Cardiomegaly,68,F,,,,,0,1,0,0,0,0,0,0,0
4,00000008_002.png,Nodule,72,F,,,,,0,0,0,0,0,0,1,0,0


## Add column with file paths

Tasks:

1. Add column with file path (data/images/IMAGE_INDEX)

In [None]:
## Add column with file path (data/images/IMAGE_INDEX)

# Format each image name straight from the 'Image Index' column. Series.map
# applies the same format call per value, without building a row object for
# every record as the row-wise `apply` did (which was also evaluated twice).
df['File Path'] = df['Image Index'].map('data/images/{}'.format)

In [None]:
## Preview `df` with the new 'File Path' column (last column on the right)

df.head()

Unnamed: 0,Image Index,Finding Labels,Patient Age,Patient Gender,bb_x,bb_y,bb_w,bb_h,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,No Finding,Nodule,Pneumonia,Pneumothorax,File Path
0,00000001_000.png,Cardiomegaly,57,M,,,,,0,1,0,0,0,0,0,0,0,data/images/00000001_000.png
1,00000005_003.png,No Finding,69,F,,,,,0,0,0,0,0,1,0,0,0,data/images/00000005_003.png
2,00000005_006.png,Infiltration,70,F,,,,,0,0,0,1,0,0,0,0,0,data/images/00000005_006.png
3,00000008_000.png,Cardiomegaly,68,F,,,,,0,1,0,0,0,0,0,0,0,data/images/00000008_000.png
4,00000008_002.png,Nodule,72,F,,,,,0,0,0,0,0,0,1,0,0,data/images/00000008_002.png


## Save results

In [None]:
# Write the final table into DATA_DIR; index=False keeps the row index out of
# the file.
output_csv = os.path.join(DATA_DIR, 'nih_chest_xray_single_9c_bb_onehot.csv')
df.to_csv(output_csv, index=False)