Commit 5f2c4c1 — Ian Pan, committed Sep 9, 2019 (0 parents). 53 changed files with 8,971 additions and 0 deletions.
@@ -0,0 +1,28 @@
# Kaggle SIIM-ACR Pneumothorax Segmentation
## 8th Place

## Hardware
Ubuntu 16.04 LTS
64 GB RAM / 2 TB HDD
1x NVIDIA Titan V100 32GB
1x Titan V 12GB

## Software
Python 3.7.4
CUDA 10.0
cuDNN 7.6
PyTorch 1.1

## Model checkpoints
Download from here:

Models should be unzipped into `./segment/checkpoints/` in order to run the code as is. There should be 3 folders:
```
./segment/checkpoints/TRAIN_V100/
./segment/checkpoints/TRAIN_SEGMENT/
./segment/checkpoints/TRAIN_DEEPLABXY/
```

See `entry_points.md` for reproducing results. Relative filepaths and directories are used, so the code should work as is.

Note that the `TRAIN_V100` and `TRAIN_DEEPLABXY` models require V100 32GB GPUs to train with the current configurations. If you wish to train these models on a lower-capacity GPU, I suggest using the following flag options: `--grad-accum 8 --batch-size 2` or `--grad-accum 16 --batch-size 1`. Model performance is not guaranteed to be the same with these modifications.
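`--grad-accum N` presumably accumulates gradients over N batches before each optimizer step, so the effective batch size stays roughly `N x --batch-size`. A minimal sketch of the general technique (not the training loop used in `./segment/`), assuming a standard PyTorch model, loader, optimizer, and loss:

```
def train_one_epoch(model, loader, optimizer, criterion, grad_accum=8):
    # Sketch of gradient accumulation: effective batch size = batch_size * grad_accum.
    model.train()
    optimizer.zero_grad()
    for step, (images, masks) in enumerate(loader):
        outputs = model(images)
        loss = criterion(outputs, masks) / grad_accum  # average over accumulated steps
        loss.backward()                                # gradients accumulate in .grad
        if (step + 1) % grad_accum == 0:
            optimizer.step()
            optimizer.zero_grad()
    # Any leftover partial accumulation at the end of the epoch is discarded here.
```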
@@ -0,0 +1,9 @@
.
./submit
./etl
./segment
./segment/loss
./segment/utils
./segment/scripts
./segment/model
./segment/data
@@ -0,0 +1,60 @@
## Setup environment

```
conda create -n siim-ptx python=3.7 pip
conda activate siim-ptx
conda install pytorch=1.1 torchvision cudatoolkit=10.0 -c pytorch
# Install mmdetection
git clone https://github.com/open-mmlab/mmdetection/
cd mmdetection
pip install Cython
python setup.py develop
# pip install -v -e .
conda install pandas scikit-learn scikit-image
pip install albumentations pretrainedmodels pydicom adabound
```
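After the installs, a quick check (not part of the repository) that PyTorch sees the GPU and reports the intended versions:

```
import torch
# Expect PyTorch 1.1 built against CUDA 10.0 and at least one visible GPU.
print(torch.__version__)
print(torch.version.cuda)
print(torch.cuda.is_available(), torch.cuda.device_count())
```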

## Download data

Data should be downloaded into:
`./data/dicom-images-train/`
`./data/dicom-images-stage2/`

Scripts to help with data downloading are available in `./etl/`, but make sure that data are in the appropriate directories. Note that we did not retrain models on stage 2 train. A list of image IDs to exclude from the stage 2 train data is available in `./stage1test.txt`.
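As a rough illustration (a hypothetical helper, not code from the repository; it assumes `stage1test.txt` holds one image ID per line), excluding those IDs from a stage 2 file list could look like:

```
import os

# Assumes ./stage1test.txt lists one ImageId (SOPInstanceUID) per line.
with open('./stage1test.txt') as f:
    exclude_ids = {line.strip() for line in f if line.strip()}

stage2_dir = './data/pngs/stage2/'
stage2_images = [
    fname for fname in os.listdir(stage2_dir)
    if fname.endswith('.png') and os.path.splitext(fname)[0] not in exclude_ids
]
print('Kept {} of {} stage 2 images'.format(len(stage2_images), len(os.listdir(stage2_dir))))
```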

## Process data

```
cd ./etl/
python 0_convert_data_to_png.py
python 1_get_png_masks_and_assign_labels.py
python 2_create_data_splits.py
```

## Train models

```
cd ./segment/scripts/
bash TRAIN_V100.sh
bash TRAIN_SEGMENT.sh
bash TRAIN_DEEPLABXY.sh
```

## Predict on stage 2 test data

```
cd ./segment/scripts/
bash STAGE2_PREDICT_V100.sh
bash STAGE2_PREDICT_SEGMENT.sh
bash STAGE2_PREDICT_DEEPLABXY.sh
```

## Create submission

```
cd ./submit/
python create_submission_partitioned.py
```

Submissions will be in `./submissions/` as `submission0.csv` (best) and `submission1.csv`.
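Before uploading, a small, hypothetical sanity check on the generated file (it assumes the standard competition columns `ImageId` and `EncodedPixels`; adjust the path if running from a different directory):

```
import pandas as pd

# Hypothetical check, not part of the repository.
sub = pd.read_csv('./submissions/submission0.csv')
assert {'ImageId', 'EncodedPixels'}.issubset(sub.columns)
n_empty = (sub['EncodedPixels'].astype(str).str.strip() == '-1').sum()
print('{} rows, {} images, {} empty predictions'.format(
    len(sub), sub['ImageId'].nunique(), n_empty))
```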
Binary file not shown.
@@ -0,0 +1,89 @@
import pydicom
import cv2
import os
import re

import pandas as pd
import numpy as np

from tqdm import tqdm

def extract_meta(dicom):
    return {'view': dicom.ViewPosition,
            'sex': dicom.PatientSex,
            'age': dicom.PatientAge,
            'monochrome': dicom.PhotometricInterpretation,
            'sop': dicom.SOPInstanceUID,
            'series': dicom.SeriesInstanceUID,
            'study': dicom.StudyInstanceUID}

def listify(dct):
    # Wrap each value in a list so the dict can become a 1-row DataFrame
    for key in dct.keys():
        dct[key] = [dct[key]]
    return dct

def convert_and_extract(dicoms, image_save_dir, df_savefile):
    list_of_dicom_df = []
    for dcmfile in tqdm(dicoms, total=len(dicoms)):
        tmp_dcm = pydicom.read_file(dcmfile, force=True)
        tmp_meta = extract_meta(tmp_dcm)
        tmp_meta['filename'] = dcmfile
        tmp_meta['height'] = tmp_dcm.pixel_array.shape[0]
        tmp_meta['width'] = tmp_dcm.pixel_array.shape[1]
        tmp_meta_df = pd.DataFrame(listify(tmp_meta))  # NOTE: listify mutates tmp_meta in place
        list_of_dicom_df.append(tmp_meta_df)
        tmp_array = tmp_dcm.pixel_array
        assert tmp_array.dtype == 'uint8'
        # listify wrapped every value in a list, so index [0] to get the scalar back
        if tmp_meta['monochrome'][0] == 'MONOCHROME1':
            print('Inverting image ...')
            tmp_array = np.invert(tmp_array)
        status = cv2.imwrite(os.path.join(image_save_dir, tmp_meta['sop'][0] + '.png'), tmp_array)
    #
    dicom_df = pd.concat(list_of_dicom_df)
    dicom_df.to_csv(df_savefile, index=False)

# Convert train
TRAIN_DICOM_DIR = '../data/dicom-images-train/'
TRAIN_IMAGE_DIR = '../data/pngs/train/'
TRAIN_DF_SAVEFILE = '../data/train_meta.csv'

if not os.path.exists(TRAIN_IMAGE_DIR): os.makedirs(TRAIN_IMAGE_DIR)

train_dicoms = []
for root, dirs, files in os.walk(TRAIN_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            train_dicoms.append(os.path.join(root, fi))

convert_and_extract(train_dicoms, TRAIN_IMAGE_DIR, TRAIN_DF_SAVEFILE)

# Convert test
TEST_DICOM_DIR = '../data/dicom-images-test/'
TEST_IMAGE_DIR = '../data/pngs/test/'
TEST_DF_SAVEFILE = '../data/test_meta.csv'

if not os.path.exists(TEST_IMAGE_DIR): os.makedirs(TEST_IMAGE_DIR)

test_dicoms = []
for root, dirs, files in os.walk(TEST_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            test_dicoms.append(os.path.join(root, fi))

convert_and_extract(test_dicoms, TEST_IMAGE_DIR, TEST_DF_SAVEFILE)

# Convert stage 2 test
TEST_DICOM_DIR = '../data/dicom-images-stage2/'
TEST_IMAGE_DIR = '../data/pngs/stage2/'
TEST_DF_SAVEFILE = '../data/stage2_meta.csv'

if not os.path.exists(TEST_IMAGE_DIR): os.makedirs(TEST_IMAGE_DIR)

test_dicoms = []
for root, dirs, files in os.walk(TEST_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            test_dicoms.append(os.path.join(root, fi))

convert_and_extract(test_dicoms, TEST_IMAGE_DIR, TEST_DF_SAVEFILE)
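As an illustrative, hypothetical spot check (not part of this script) that a converted PNG round-trips against its source DICOM, including the `MONOCHROME1` inversion handled above:

```
import os
import cv2
import numpy as np
import pydicom

# Hypothetical example path; substitute any DICOM found by the os.walk above.
DCM_PATH = '../data/dicom-images-train/<study>/<series>/<instance>.dcm'
PNG_DIR = '../data/pngs/train/'

dcm = pydicom.read_file(DCM_PATH, force=True)
png = cv2.imread(os.path.join(PNG_DIR, dcm.SOPInstanceUID + '.png'), cv2.IMREAD_GRAYSCALE)

expected = dcm.pixel_array
if dcm.PhotometricInterpretation == 'MONOCHROME1':
    expected = np.invert(expected)  # for uint8 this equals 255 - pixel value

assert png is not None and np.array_equal(png, expected)
```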
@@ -0,0 +1,67 @@
import pandas as pd
import numpy as np
import cv2
import os

from tqdm import tqdm

def rle2mask(rle, width, height):
    mask = np.zeros(width * height)
    array = np.asarray([int(x) for x in rle.split()])
    starts = array[0::2]
    lengths = array[1::2]
    #
    current_position = 0
    for index, start in enumerate(starts):
        current_position += start
        mask[current_position:current_position+lengths[index]] = 1
        current_position += lengths[index]
    # Rotate 90 degrees clockwise and flip horizontally (a transpose for these square masks)
    return np.fliplr(np.rot90(mask.reshape(width, height), 3)).astype('uint8')

train_meta = pd.read_csv('../data/train_meta.csv')
train_rle = pd.read_csv('../data/train-rle.csv')

train = train_meta.merge(train_rle, left_on='sop', right_on='ImageId')

# Create binary labels for pneumothorax
train['ptx_binary'] = [0 if _ == ' -1' else 1 for _ in train[' EncodedPixels']]

TRAIN_MASKS_DIR = '../data/masks/train/'
TRAIN_MASKS_255_DIR = '../data/masks_255/train/'
if not os.path.exists(TRAIN_MASKS_DIR): os.makedirs(TRAIN_MASKS_DIR)

if not os.path.exists(TRAIN_MASKS_255_DIR): os.makedirs(TRAIN_MASKS_255_DIR)

# Generate masks from RLE and save to PNG files
# Include empty masks
mask_size_dict = {}
for pid, df in tqdm(train.groupby('ImageId'), total=len(np.unique(train['ImageId']))):
    if df[' EncodedPixels'].iloc[0] == ' -1':
        # If empty, image should only have 1 row
        # Create empty mask
        mask = np.zeros((df['width'].iloc[0], df['height'].iloc[0])).astype('uint8')
    else:
        mask = np.zeros((df['width'].iloc[0], df['height'].iloc[0])).astype('uint8')
        for rownum, row in df.iterrows():
            mask += rle2mask(row[' EncodedPixels'], df['width'].iloc[0], df['height'].iloc[0])
        mask[mask > 1] = 1
    mask_size_dict[pid] = np.sum(mask)
    status = cv2.imwrite(os.path.join(TRAIN_MASKS_DIR, df['sop'].iloc[0] + '.png'), mask)
    mask[mask == 1] = 255
    status = cv2.imwrite(os.path.join(TRAIN_MASKS_255_DIR, df['sop'].iloc[0] + '.png'), mask)
    # Mask files and image files should share same name in different folders

del train[' EncodedPixels']

train = train.drop_duplicates()
size_df = pd.DataFrame({'ImageId': list(mask_size_dict.keys()),
                        'mask_size': [mask_size_dict[pid] for pid in mask_size_dict.keys()]})
train = train.merge(size_df, on='ImageId')

train.to_csv('../data/train_labels.csv', index=False)
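For reference, the inverse transform (binary mask to run-length string) is what a submission needs. The sketch below is written to be the exact inverse of `rle2mask` above (relative run starts, column-major pixel order); it is an illustrative helper, not the repository's submission code:

```
import numpy as np

def mask2rle(mask):
    """Inverse of rle2mask above: binary mask -> relative run-length string."""
    pixels = mask.T.flatten()                 # column-major order, matching rle2mask
    if pixels.sum() == 0:
        return '-1'                           # competition convention for an empty mask
    padded = np.concatenate([[0], pixels, [0]])
    changes = np.where(padded[1:] != padded[:-1])[0]
    run_starts, run_ends = changes[0::2], changes[1::2]
    rle, previous_end = [], 0
    for start, end in zip(run_starts, run_ends):
        rle.extend([start - previous_end, end - start])  # (gap from last run end, run length)
        previous_end = end
    return ' '.join(str(x) for x in rle)

# Round-trip check against the decoder above (non-empty 1024 x 1024 mask):
# m = (np.random.rand(1024, 1024) > 0.99).astype('uint8')
# assert np.array_equal(rle2mask(mask2rle(m), 1024, 1024), m)
```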
@@ -0,0 +1,34 @@
# We will create 10 inner and 10 outer folds
# We will probably not use all of them
NSPLITS = 10

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, KFold

np.random.seed(88)

train_df = pd.read_csv('../data/train_labels.csv')

# Stratify based on mask size
train_df['strata'] = 0
train_df.loc[train_df['mask_size'] > 0, 'strata'] = pd.qcut(train_df['mask_size'][train_df['mask_size'] > 0], 10, labels=range(1, 11))

train_df['outer'] = 888
outer_skf = StratifiedKFold(n_splits=NSPLITS, shuffle=True, random_state=88)
outer_counter = 0
for outer_train, outer_test in outer_skf.split(train_df, train_df['strata']):
    train_df.loc[outer_test, 'outer'] = outer_counter
    inner_skf = StratifiedKFold(n_splits=NSPLITS, shuffle=True, random_state=88)
    inner_counter = 0
    train_df['inner{}'.format(outer_counter)] = 888
    inner_df = train_df[train_df['outer'] != outer_counter].reset_index(drop=True)
    # Assign each image the inner fold in which it appears as the validation split
    for inner_train, inner_valid in inner_skf.split(inner_df, inner_df['strata']):
        inner_train_ids = inner_df.loc[inner_valid, 'ImageId']
        train_df.loc[train_df['ImageId'].isin(inner_train_ids), 'inner{}'.format(outer_counter)] = inner_counter
        inner_counter += 1
    outer_counter += 1

train_df.to_csv('../data/train_labels_with_splits.csv', index=False)
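A hypothetical illustration (not code from the repository) of how the resulting split columns might be consumed downstream, holding out one outer fold and using one inner fold for validation:

```
import pandas as pd

df = pd.read_csv('../data/train_labels_with_splits.csv')

outer_fold, inner_fold = 0, 0
holdout = df[df['outer'] == outer_fold]                       # never seen during training
development = df[df['outer'] != outer_fold]
valid = development[development['inner{}'.format(outer_fold)] == inner_fold]
train = development[development['inner{}'.format(outer_fold)] != inner_fold]

print(len(train), len(valid), len(holdout))
```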
@@ -0,0 +1,90 @@
"""Script to download all instances in a DICOM Store.""" | ||
import os | ||
import posixpath | ||
from concurrent import futures | ||
from retrying import retry | ||
import google.auth | ||
from google.auth.transport.requests import AuthorizedSession | ||
|
||
import time | ||
# URL of CHC API | ||
CHC_API_URL = 'https://healthcare.googleapis.com/v1beta1' | ||
PROJECT_ID = 'kaggle-siim-healthcare' | ||
REGION = 'us-central1' | ||
DATASET_ID = 'siim-pneumothorax' | ||
TRAIN_DICOM_STORE_ID = 'dicom-images-train' | ||
TEST_DICOM_STORE_ID = 'dicom-images-test' | ||
|
||
#SLEEP = None | ||
SLEEP = 0.1 | ||
|
||
@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000) | ||
def download_instance(dicom_web_url, dicom_store_id, study_uid, series_uid, | ||
instance_uid, credentials): | ||
"""Downloads a DICOM instance and saves it under the current folder.""" | ||
instance_url = posixpath.join(dicom_web_url, 'studies', study_uid, 'series', | ||
series_uid, 'instances', instance_uid) | ||
if SLEEP: time.sleep(SLEEP) | ||
authed_session = AuthorizedSession(credentials) | ||
if SLEEP: time.sleep(SLEEP) | ||
response = authed_session.get( | ||
instance_url, headers={'Accept': 'application/dicom; transfer-syntax=*'}) | ||
file_path = posixpath.join(dicom_store_id, study_uid, series_uid, | ||
instance_uid) | ||
filename = '%s.dcm' % file_path | ||
if not os.path.exists(filename): | ||
os.makedirs(os.path.dirname(filename)) | ||
with open(filename, 'wb') as f: | ||
f.write(response.content) | ||
|
||
|
||
def download_all_instances(dicom_store_id, credentials): | ||
"""Downloads all DICOM instances in the specified DICOM store.""" | ||
# Get a list of all instances. | ||
dicom_web_url = posixpath.join(CHC_API_URL, 'projects', PROJECT_ID, | ||
'locations', REGION, 'datasets', DATASET_ID, | ||
'dicomStores', dicom_store_id, 'dicomWeb') | ||
qido_url = posixpath.join(dicom_web_url, 'instances') | ||
authed_session = AuthorizedSession(credentials) | ||
response = authed_session.get(qido_url, params={'limit': '15000'}) | ||
if response.status_code != 200: | ||
print(response.text) | ||
return | ||
content = response.json() | ||
# DICOM Tag numbers | ||
study_instance_uid_tag = '0020000D' | ||
series_instance_uid_tag = '0020000E' | ||
sop_instance_uid_tag = '00080018' | ||
value_key = 'Value' | ||
with futures.ThreadPoolExecutor() as executor: | ||
future_to_study_uid = {} | ||
for instance in content: | ||
study_uid = instance[study_instance_uid_tag][value_key][0] | ||
series_uid = instance[series_instance_uid_tag][value_key][0] | ||
instance_uid = instance[sop_instance_uid_tag][value_key][0] | ||
if SLEEP: time.sleep(SLEEP) | ||
future = executor.submit(download_instance, dicom_web_url, dicom_store_id, | ||
study_uid, series_uid, instance_uid, credentials) | ||
future_to_study_uid[future] = study_uid | ||
processed_count = 0 | ||
for future in futures.as_completed(future_to_study_uid): | ||
try: | ||
future.result() | ||
processed_count += 1 | ||
if not processed_count % 100 or processed_count == len(content): | ||
print('Processed instance %d out of %d' % | ||
(processed_count, len(content))) | ||
except Exception as e: | ||
print('Failed to download a study. UID: %s \n exception: %s' % | ||
(future_to_study_uid[future], e)) | ||
|
||
|
||
def main(argv=None): | ||
credentials, _ = google.auth.default() | ||
print('Downloading all instances in %s DICOM store' % TRAIN_DICOM_STORE_ID) | ||
download_all_instances(TRAIN_DICOM_STORE_ID, credentials) | ||
print('Downloading all instances in %s DICOM store' % TEST_DICOM_STORE_ID) | ||
download_all_instances(TEST_DICOM_STORE_ID, credentials) | ||
|
||
|
||
main() |
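`google.auth.default()` resolves Application Default Credentials, so the script assumes you have already authenticated with access to the competition's Cloud Healthcare project (for example via `gcloud auth application-default login`). A hypothetical pre-flight check, built from the same constants as the script above, that credentials resolve and the QIDO-RS endpoint answers:

```
import google.auth
from google.auth.transport.requests import AuthorizedSession

# Hypothetical check, not part of the repository.
credentials, project = google.auth.default(
    scopes=['https://www.googleapis.com/auth/cloud-platform'])
session = AuthorizedSession(credentials)
url = ('https://healthcare.googleapis.com/v1beta1/projects/kaggle-siim-healthcare/'
       'locations/us-central1/datasets/siim-pneumothorax/'
       'dicomStores/dicom-images-train/dicomWeb/instances')
response = session.get(url, params={'limit': '1'})
print(response.status_code)  # expect 200 if access has been granted
```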
@@ -0,0 +1,9 @@
PROJECT_ID="kaggle-siim-healthcare"
REGION="us-central1"
DATASET_ID="siim-pneumothorax"
FHIR_STORE_ID="fhir-masks-train"
DOCUMENT_REFERENCE_ID="d70d8f3e-990a-4bc0-b11f-c87349f5d4eb"

curl -X GET \
  -H "Authorization: Bearer "$(gcloud auth print-access-token) \
  "https://healthcare.googleapis.com/v1beta1/projects/${PROJECT_ID}/locations/${REGION}/datasets/${DATASET_ID}/fhirStores/${FHIR_STORE_ID}/fhir/DocumentReference/${DOCUMENT_REFERENCE_ID}"