Commit 5f2c4c1

initial commit
Ian Pan committed Sep 9, 2019
Showing 53 changed files with 8,971 additions and 0 deletions.
28 changes: 28 additions & 0 deletions README.md
@@ -0,0 +1,28 @@
# Kaggle SIIM-ACR Pneumothorax Segmentation
## 8th Place

## Hardware
Ubuntu 16.04 LTS
64 GB RAM / 2 TB HDD
1x NVIDIA V100 32GB
1x Titan V 12GB

## Software
Python 3.7.4
CUDA 10.0
cuDNN 7.6
PyTorch 1.1

## Model checkpoints
Download from here:

Models should be unzipped into `./segment/checkpoints/` so that the code runs as is. There should be three folders:
```
./segment/checkpoints/TRAIN_V100/
./segment/checkpoints/TRAIN_SEGMENT/
./segment/checkpoints/TRAIN_DEEPLABXY/
```

See `entry_points.md` for the steps to reproduce the results. Relative filepaths and directories are used throughout, so the code should work as is.

Note that the `TRAIN_V100` and `TRAIN_DEEPLABXY` models require V100 32GB GPUs to train with the current configurations. If you wish to train these models on a lower-capacity GPU, I suggest the following flag options: `--grad-accum 8 --batch-size 2` or `--grad-accum 16 --batch-size 1`. Model performance is not guaranteed to be identical with these modifications.
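
For reference, gradient accumulation lowers per-step memory while keeping the effective batch size roughly constant. The sketch below is a generic PyTorch accumulation loop (not this repository's training code) illustrating what the flags above trade off:
```
import torch

def train_one_epoch(model, loader, optimizer, criterion, grad_accum=8):
    # Generic gradient-accumulation loop: e.g. --grad-accum 8 with --batch-size 2
    # approximates one optimizer step over an effective batch of 16.
    model.train()
    optimizer.zero_grad()
    for step, (images, masks) in enumerate(loader):
        loss = criterion(model(images), masks) / grad_accum  # scale so accumulated grads average
        loss.backward()
        if (step + 1) % grad_accum == 0:
            optimizer.step()
            optimizer.zero_grad()
```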
9 changes: 9 additions & 0 deletions directory_structure.txt
@@ -0,0 +1,9 @@
.
./submit
./etl
./segment
./segment/loss
./segment/utils
./segment/scripts
./segment/model
./segment/data
60 changes: 60 additions & 0 deletions entry_points.md
@@ -0,0 +1,60 @@
## Setup environment

```
conda create -n siim-ptx python=3.7 pip
conda activate siim-ptx
conda install pytorch=1.1 torchvision cudatoolkit=10.0 -c pytorch
# Install mmdetection
git clone https://github.com/open-mmlab/mmdetection/
cd mmdetection
pip install Cython
python setup.py develop
# pip install -v -e .
cd ..
conda install pandas scikit-learn scikit-image
pip install albumentations pretrainedmodels pydicom adabound
```
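
As a quick sanity check of the environment (a minimal sketch; the expected versions simply mirror the Software section of the README), confirm PyTorch can see CUDA before training:
```
import torch

# Expect PyTorch 1.1.x built against CUDA 10.0 per the README.
print(torch.__version__, torch.version.cuda)
assert torch.cuda.is_available(), 'No CUDA device visible to PyTorch'
print(torch.cuda.get_device_name(0))
```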

## Download data

Data should be downloaded into:
`./data/dicom-images-train/`
`./data/dicom-images-stage2/`

Scripts to help with downloading the data are available in `./etl/`; just make sure the data end up in the directories above. Note that we did not retrain models on the stage 2 train set. A list of image IDs to exclude from the stage 2 train data is available in `./stage1test.txt`.
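
If you do build a training set from the stage 2 data, a minimal sketch of the exclusion step is shown below; the `ImageId` column name, the one-ID-per-line format of `stage1test.txt`, and the CSV filenames are assumptions for illustration only.
```
import pandas as pd

# Hypothetical example: drop the excluded image IDs from a stage 2 train listing.
# Assumes stage1test.txt holds one image ID per line.
with open('./stage1test.txt') as f:
    exclude_ids = {line.strip() for line in f if line.strip()}

stage2 = pd.read_csv('./data/stage2_train_labels.csv')            # hypothetical input
stage2 = stage2[~stage2['ImageId'].isin(exclude_ids)]
stage2.to_csv('./data/stage2_train_labels_filtered.csv', index=False)
```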

## Process data

```
cd ./etl/
python 0_convert_data_to_png.py
python 1_get_png_masks_and_assign_labels.py
python 2_create_data_splits.py
```

## Train models

```
cd ./segment/scripts/
bash TRAIN_V100.sh
bash TRAIN_SEGMENT.sh
bash TRAIN_DEEPLABXY.sh
```

## Predict on stage 2 test data

```
cd ./segment/scripts/
bash STAGE2_PREDICT_V100.sh
bash STAGE2_PREDICT_SEGMENT.sh
bash STAGE2_PREDICT_DEEPLABXY.sh
```

## Create submission

```
cd ./submit/
python create_submission_partitioned.py
```

Submissions will be in `./submissions/` as `submission0.csv` (best) and `submission1.csv`.
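
Before uploading, a quick format check can catch obvious problems (a minimal sketch; the `ImageId`/`EncodedPixels` header follows the competition's submission convention and is an assumption about the generated files):
```
import pandas as pd

sub = pd.read_csv('./submissions/submission0.csv')
print(sub.shape)
print(sub.columns.tolist())  # expect ['ImageId', 'EncodedPixels']
# Fraction of images predicted as having no pneumothorax (encoded as -1)
print((sub['EncodedPixels'].astype(str).str.strip() == '-1').mean())
```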
Binary file added etl/.DS_Store
Binary file not shown.
89 changes: 89 additions & 0 deletions etl/0_convert_data_to_png.py
@@ -0,0 +1,89 @@
import pydicom
import cv2
import os
import re

import pandas as pd
import numpy as np

from tqdm import tqdm

def extract_meta(dicom):
    return {'view': dicom.ViewPosition,
            'sex': dicom.PatientSex,
            'age': dicom.PatientAge,
            'monochrome': dicom.PhotometricInterpretation,
            'sop': dicom.SOPInstanceUID,
            'series': dicom.SeriesInstanceUID,
            'study': dicom.StudyInstanceUID}

def listify(dct):
    for key in dct.keys():
        dct[key] = [dct[key]]
    return dct

def convert_and_extract(dicoms, image_save_dir, df_savefile):
    list_of_dicom_df = []
    for dcmfile in tqdm(dicoms, total=len(dicoms)):
        tmp_dcm = pydicom.read_file(dcmfile, force=True)
        tmp_meta = extract_meta(tmp_dcm)
        tmp_meta['filename'] = dcmfile
        tmp_meta['height'] = tmp_dcm.pixel_array.shape[0]
        tmp_meta['width'] = tmp_dcm.pixel_array.shape[1]
        tmp_meta_df = pd.DataFrame(listify(tmp_meta))
        list_of_dicom_df.append(tmp_meta_df)
        tmp_array = tmp_dcm.pixel_array
        assert tmp_array.dtype == 'uint8'
        if tmp_meta['monochrome'] == 'MONOCHROME1':
            print('Inverting image ...')
            tmp_array = np.invert(tmp_array)
        status = cv2.imwrite(os.path.join(image_save_dir, tmp_meta['sop'][0] + '.png'), tmp_array)
    #
    dicom_df = pd.concat(list_of_dicom_df)
    dicom_df.to_csv(df_savefile, index=False)

# Convert train
TRAIN_DICOM_DIR = '../data/dicom-images-train/'
TRAIN_IMAGE_DIR = '../data/pngs/train/'
TRAIN_DF_SAVEFILE = '../data/train_meta.csv'

if not os.path.exists(TRAIN_IMAGE_DIR): os.makedirs(TRAIN_IMAGE_DIR)

train_dicoms = []
for root, dirs, files in os.walk(TRAIN_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            train_dicoms.append(os.path.join(root, fi))

convert_and_extract(train_dicoms, TRAIN_IMAGE_DIR, TRAIN_DF_SAVEFILE)

# Convert test
TEST_DICOM_DIR = '../data/dicom-images-test/'
TEST_IMAGE_DIR = '../data/pngs/test/'
TEST_DF_SAVEFILE = '../data/test_meta.csv'

if not os.path.exists(TEST_IMAGE_DIR): os.makedirs(TEST_IMAGE_DIR)

test_dicoms = []
for root, dirs, files in os.walk(TEST_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            test_dicoms.append(os.path.join(root, fi))

convert_and_extract(test_dicoms, TEST_IMAGE_DIR, TEST_DF_SAVEFILE)

# Convert stage 2 test
TEST_DICOM_DIR = '../data/dicom-images-stage2/'
TEST_IMAGE_DIR = '../data/pngs/stage2/'
TEST_DF_SAVEFILE = '../data/stage2_meta.csv'

if not os.path.exists(TEST_IMAGE_DIR): os.makedirs(TEST_IMAGE_DIR)

test_dicoms = []
for root, dirs, files in os.walk(TEST_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            test_dicoms.append(os.path.join(root, fi))

convert_and_extract(test_dicoms, TEST_IMAGE_DIR, TEST_DF_SAVEFILE)

67 changes: 67 additions & 0 deletions etl/1_get_png_masks_and_assign_labels.py
@@ -0,0 +1,67 @@
import pandas as pd
import numpy as np
import cv2
import os

from tqdm import tqdm

def rle2mask(rle, width, height):
    mask = np.zeros(width * height)
    array = np.asarray([int(x) for x in rle.split()])
    starts = array[0::2]
    lengths = array[1::2]
    #
    current_position = 0
    for index, start in enumerate(starts):
        current_position += start
        mask[current_position:current_position+lengths[index]] = 1
        current_position += lengths[index]
    # Need to rotate clockwise 90 degrees and flip vertically
    return np.fliplr(np.rot90(mask.reshape(width, height), 3)).astype('uint8')

train_meta = pd.read_csv('../data/train_meta.csv')
train_rle = pd.read_csv('../data/train-rle.csv')

train = train_meta.merge(train_rle, left_on='sop', right_on='ImageId')

# Create binary labels for pneumothorax
train['ptx_binary'] = [0 if _ == ' -1' else 1 for _ in train[' EncodedPixels']]

TRAIN_MASKS_DIR = '../data/masks/train/'
TRAIN_MASKS_255_DIR = '../data/masks_255/train/'
if not os.path.exists(TRAIN_MASKS_DIR): os.makedirs(TRAIN_MASKS_DIR)


if not os.path.exists(TRAIN_MASKS_255_DIR): os.makedirs(TRAIN_MASKS_255_DIR)

# Generate masks from RLE and save to PNG files
# Include empty masks
mask_size_dict = {}
for pid, df in tqdm(train.groupby('ImageId'), total=len(np.unique(train['ImageId']))):
    if df[' EncodedPixels'].iloc[0] == ' -1':
        # If empty, image should only have 1 row
        # Create empty mask
        mask = np.zeros((df['width'].iloc[0], df['height'].iloc[0])).astype('uint8')
    else:
        mask = np.zeros((df['width'].iloc[0], df['height'].iloc[0])).astype('uint8')
        for rownum, row in df.iterrows():
            mask += rle2mask(row[' EncodedPixels'], df['width'].iloc[0], df['height'].iloc[0])
        mask[mask > 1] = 1
    mask_size_dict[pid] = np.sum(mask)
    status = cv2.imwrite(os.path.join(TRAIN_MASKS_DIR, df['sop'].iloc[0] + '.png'), mask)
    mask[mask == 1] = 255
    status = cv2.imwrite(os.path.join(TRAIN_MASKS_255_DIR, df['sop'].iloc[0] + '.png'), mask)
    # Mask files and image files should share same name in different folders

del train[' EncodedPixels']

train = train.drop_duplicates()
size_df = pd.DataFrame({'ImageId': list(mask_size_dict.keys()),
'mask_size': [mask_size_dict[pid] for pid in mask_size_dict.keys()]})
train = train.merge(size_df, on='ImageId')

train.to_csv('../data/train_labels.csv', index=False)




34 changes: 34 additions & 0 deletions etl/2_create_data_splits.py
@@ -0,0 +1,34 @@
# We will create 10 inner and 10 outer folds
# We will probably not use all of them
NSPLITS = 10

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, KFold

np.random.seed(88)

train_df = pd.read_csv('../data/train_labels.csv')

# Stratify based on mask size
train_df['strata'] = 0
train_df.loc[train_df['mask_size'] > 0, 'strata'] = pd.qcut(train_df['mask_size'][train_df['mask_size'] > 0], 10, labels=range(1, 11))

train_df['outer'] = 888
outer_skf = StratifiedKFold(n_splits=NSPLITS, shuffle=True, random_state=88)
outer_counter = 0
for outer_train, outer_test in outer_skf.split(train_df, train_df['strata']):
    train_df.loc[outer_test, 'outer'] = outer_counter
    inner_skf = StratifiedKFold(n_splits=NSPLITS, shuffle=True, random_state=88)
    inner_counter = 0
    train_df['inner{}'.format(outer_counter)] = 888
    inner_df = train_df[train_df['outer'] != outer_counter].reset_index(drop=True)
    # Determine which IDs should be assigned to inner train
    for inner_train, inner_valid in inner_skf.split(inner_df, inner_df['strata']):
        inner_train_ids = inner_df.loc[inner_valid, 'ImageId']
        train_df.loc[train_df['ImageId'].isin(inner_train_ids), 'inner{}'.format(outer_counter)] = inner_counter
        inner_counter += 1
    outer_counter += 1

train_df.to_csv('../data/train_labels_with_splits.csv', index=False)
90 changes: 90 additions & 0 deletions etl/download_data.py
@@ -0,0 +1,90 @@
"""Script to download all instances in a DICOM Store."""
import os
import posixpath
from concurrent import futures
from retrying import retry
import google.auth
from google.auth.transport.requests import AuthorizedSession

import time
# URL of CHC API
CHC_API_URL = 'https://healthcare.googleapis.com/v1beta1'
PROJECT_ID = 'kaggle-siim-healthcare'
REGION = 'us-central1'
DATASET_ID = 'siim-pneumothorax'
TRAIN_DICOM_STORE_ID = 'dicom-images-train'
TEST_DICOM_STORE_ID = 'dicom-images-test'

#SLEEP = None
SLEEP = 0.1

@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def download_instance(dicom_web_url, dicom_store_id, study_uid, series_uid,
                      instance_uid, credentials):
    """Downloads a DICOM instance and saves it under the current folder."""
    instance_url = posixpath.join(dicom_web_url, 'studies', study_uid, 'series',
                                  series_uid, 'instances', instance_uid)
    if SLEEP: time.sleep(SLEEP)
    authed_session = AuthorizedSession(credentials)
    if SLEEP: time.sleep(SLEEP)
    response = authed_session.get(
        instance_url, headers={'Accept': 'application/dicom; transfer-syntax=*'})
    file_path = posixpath.join(dicom_store_id, study_uid, series_uid,
                               instance_uid)
    filename = '%s.dcm' % file_path
    if not os.path.exists(filename):
        os.makedirs(os.path.dirname(filename))
    with open(filename, 'wb') as f:
        f.write(response.content)


def download_all_instances(dicom_store_id, credentials):
    """Downloads all DICOM instances in the specified DICOM store."""
    # Get a list of all instances.
    dicom_web_url = posixpath.join(CHC_API_URL, 'projects', PROJECT_ID,
                                   'locations', REGION, 'datasets', DATASET_ID,
                                   'dicomStores', dicom_store_id, 'dicomWeb')
    qido_url = posixpath.join(dicom_web_url, 'instances')
    authed_session = AuthorizedSession(credentials)
    response = authed_session.get(qido_url, params={'limit': '15000'})
    if response.status_code != 200:
        print(response.text)
        return
    content = response.json()
    # DICOM Tag numbers
    study_instance_uid_tag = '0020000D'
    series_instance_uid_tag = '0020000E'
    sop_instance_uid_tag = '00080018'
    value_key = 'Value'
    with futures.ThreadPoolExecutor() as executor:
        future_to_study_uid = {}
        for instance in content:
            study_uid = instance[study_instance_uid_tag][value_key][0]
            series_uid = instance[series_instance_uid_tag][value_key][0]
            instance_uid = instance[sop_instance_uid_tag][value_key][0]
            if SLEEP: time.sleep(SLEEP)
            future = executor.submit(download_instance, dicom_web_url, dicom_store_id,
                                     study_uid, series_uid, instance_uid, credentials)
            future_to_study_uid[future] = study_uid
        processed_count = 0
        for future in futures.as_completed(future_to_study_uid):
            try:
                future.result()
                processed_count += 1
                if not processed_count % 100 or processed_count == len(content):
                    print('Processed instance %d out of %d' %
                          (processed_count, len(content)))
            except Exception as e:
                print('Failed to download a study. UID: %s \n exception: %s' %
                      (future_to_study_uid[future], e))


def main(argv=None):
    credentials, _ = google.auth.default()
    print('Downloading all instances in %s DICOM store' % TRAIN_DICOM_STORE_ID)
    download_all_instances(TRAIN_DICOM_STORE_ID, credentials)
    print('Downloading all instances in %s DICOM store' % TEST_DICOM_STORE_ID)
    download_all_instances(TEST_DICOM_STORE_ID, credentials)


main()
9 changes: 9 additions & 0 deletions etl/get_annotations.sh
@@ -0,0 +1,9 @@
PROJECT_ID="kaggle-siim-healthcare"
REGION="us-central1"
DATASET_ID="siim-pneumothorax"
FHIR_STORE_ID="fhir-masks-train"
DOCUMENT_REFERENCE_ID="d70d8f3e-990a-4bc0-b11f-c87349f5d4eb"

curl -X GET \
-H "Authorization: Bearer "$(gcloud auth print-access-token) \
"https://healthcare.googleapis.com/v1beta1/projects/${PROJECT_ID}/locations/${REGION}/datasets/${DATASET_ID}/fhirStores/${FHIR_STORE_ID}/fhir/DocumentReference/${DOCUMENT_REFERENCE_ID}"
