Commit 5f2c4c1

initial commit
Ian Pan committed Sep 9, 2019
Showing 53 changed files with 8,971 additions and 0 deletions.
28 changes: 28 additions & 0 deletions README.md
@@ -0,0 +1,28 @@
# Kaggle SIIM-ACR Pneumothorax Segmentation
## 8th Place

## Hardware
Ubuntu 16.04 LTS
64 GB RAM / 2 TB HDD
1x NVIDIA V100 32GB
1x Titan V 12GB

## Software
Python 3.7.4
CUDA 10.0
cuDNN 7.6
PyTorch 1.1

## Model checkpoints
Download from here:

Models should be unzipped into `./segment/checkpoints/` so that the code runs as is. There should be three folders:
```
./segment/checkpoints/TRAIN_V100/
./segment/checkpoints/TRAIN_SEGMENT/
./segment/checkpoints/TRAIN_DEEPLABXY/
```

See `entry_points.md` for the steps to reproduce the results. Relative filepaths and directories are used throughout, so the code should work as is.

Note that the `TRAIN_V100` and `TRAIN_DEEPLABXY` models require V100 32GB GPUs to train with the current configurations. If you wish to train these models on a lower-capacity GPU, I suggest the following flag options: `--grad-accum 8 --batch-size 2` or `--grad-accum 16 --batch-size 1`. Model performance is not guaranteed to be identical with these modifications.
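
For reference, gradient accumulation lowers per-step memory while keeping the effective batch size roughly constant. The sketch below is a generic PyTorch accumulation loop (not this repository's training code) illustrating what the flags above trade off:
```
import torch

def train_one_epoch(model, loader, optimizer, criterion, grad_accum=8):
    # Generic gradient-accumulation loop: e.g. --grad-accum 8 with --batch-size 2
    # approximates one optimizer step over an effective batch of 16.
    model.train()
    optimizer.zero_grad()
    for step, (images, masks) in enumerate(loader):
        loss = criterion(model(images), masks) / grad_accum  # scale so accumulated grads average
        loss.backward()
        if (step + 1) % grad_accum == 0:
            optimizer.step()
            optimizer.zero_grad()
```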
9 changes: 9 additions & 0 deletions directory_structure.txt
@@ -0,0 +1,9 @@
.
./submit
./etl
./segment
./segment/loss
./segment/utils
./segment/scripts
./segment/model
./segment/data
60 changes: 60 additions & 0 deletions entry_points.md
@@ -0,0 +1,60 @@
## Setup environment

```
conda create -n siim-ptx python=3.7 pip
conda activate siim-ptx
conda install pytorch=1.1 torchvision cudatoolkit=10.0 -c pytorch
# Install mmdetection
git clone https://github.com/open-mmlab/mmdetection/
cd mmdetection
pip install Cython
python setup.py develop
# pip install -v -e .
cd ..
conda install pandas scikit-learn scikit-image
pip install albumentations pretrainedmodels pydicom adabound
```
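
As a quick sanity check of the environment (a minimal sketch; the expected versions simply mirror the Software section of the README), confirm PyTorch can see CUDA before training:
```
import torch

# Expect PyTorch 1.1.x built against CUDA 10.0 per the README.
print(torch.__version__, torch.version.cuda)
assert torch.cuda.is_available(), 'No CUDA device visible to PyTorch'
print(torch.cuda.get_device_name(0))
```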

## Download data

Data should be downloaded into:
`./data/dicom-images-train/`
`./data/dicom-images-stage2/`

Scripts to help with downloading the data are available in `./etl/`; just make sure the data end up in the directories above. Note that we did not retrain models on the stage 2 train set. A list of image IDs to exclude from the stage 2 train data is available in `./stage1test.txt`.
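
If you do build a training set from the stage 2 data, a minimal sketch of the exclusion step is shown below; the `ImageId` column name, the one-ID-per-line format of `stage1test.txt`, and the CSV filenames are assumptions for illustration only.
```
import pandas as pd

# Hypothetical example: drop the excluded image IDs from a stage 2 train listing.
# Assumes stage1test.txt holds one image ID per line.
with open('./stage1test.txt') as f:
    exclude_ids = {line.strip() for line in f if line.strip()}

stage2 = pd.read_csv('./data/stage2_train_labels.csv')            # hypothetical input
stage2 = stage2[~stage2['ImageId'].isin(exclude_ids)]
stage2.to_csv('./data/stage2_train_labels_filtered.csv', index=False)
```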

## Process data

```
cd ./etl/
python 0_convert_data_to_png.py
python 1_get_png_masks_and_assign_labels.py
python 2_create_data_splits.py
```

## Train models

```
cd ./segment/scripts/
bash TRAIN_V100.sh
bash TRAIN_SEGMENT.sh
bash TRAIN_DEEPLABXY.sh
```

## Predict on stage 2 test data

```
cd ./segment/scripts/
bash STAGE2_PREDICT_V100.sh
bash STAGE2_PREDICT_SEGMENT.sh
bash STAGE2_PREDICT_DEEPLABXY.sh
```

## Create submission

```
cd ./submit/
python create_submission_partitioned.py
```

Submissions will be in `./submissions/` as `submission0.csv` (best) and `submission1.csv`.
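
Before uploading, a quick format check can catch obvious problems (a minimal sketch; the `ImageId`/`EncodedPixels` header follows the competition's submission convention and is an assumption about the generated files):
```
import pandas as pd

sub = pd.read_csv('./submissions/submission0.csv')
print(sub.shape)
print(sub.columns.tolist())  # expect ['ImageId', 'EncodedPixels']
# Fraction of images predicted as having no pneumothorax (encoded as -1)
print((sub['EncodedPixels'].astype(str).str.strip() == '-1').mean())
```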
Binary file added etl/.DS_Store
Binary file not shown.
89 changes: 89 additions & 0 deletions etl/0_convert_data_to_png.py
@@ -0,0 +1,89 @@
import pydicom
import cv2
import os
import re

import pandas as pd
import numpy as np

from tqdm import tqdm

def extract_meta(dicom):
    return {'view': dicom.ViewPosition,
            'sex': dicom.PatientSex,
            'age': dicom.PatientAge,
            'monochrome': dicom.PhotometricInterpretation,
            'sop': dicom.SOPInstanceUID,
            'series': dicom.SeriesInstanceUID,
            'study': dicom.StudyInstanceUID}

def listify(dct):
    for key in dct.keys():
        dct[key] = [dct[key]]
    return dct

def convert_and_extract(dicoms, image_save_dir, df_savefile):
    list_of_dicom_df = []
    for dcmfile in tqdm(dicoms, total=len(dicoms)):
        tmp_dcm = pydicom.read_file(dcmfile, force=True)
        tmp_meta = extract_meta(tmp_dcm)
        tmp_meta['filename'] = dcmfile
        tmp_meta['height'] = tmp_dcm.pixel_array.shape[0]
        tmp_meta['width'] = tmp_dcm.pixel_array.shape[1]
        tmp_meta_df = pd.DataFrame(listify(tmp_meta))
        list_of_dicom_df.append(tmp_meta_df)
        tmp_array = tmp_dcm.pixel_array
        assert tmp_array.dtype == 'uint8'
        if tmp_meta['monochrome'] == 'MONOCHROME1':
            print('Inverting image ...')
            tmp_array = np.invert(tmp_array)
        status = cv2.imwrite(os.path.join(image_save_dir, tmp_meta['sop'][0] + '.png'), tmp_array)
    #
    dicom_df = pd.concat(list_of_dicom_df)
    dicom_df.to_csv(df_savefile, index=False)

# Convert train
TRAIN_DICOM_DIR = '../data/dicom-images-train/'
TRAIN_IMAGE_DIR = '../data/pngs/train/'
TRAIN_DF_SAVEFILE = '../data/train_meta.csv'

if not os.path.exists(TRAIN_IMAGE_DIR): os.makedirs(TRAIN_IMAGE_DIR)

train_dicoms = []
for root, dirs, files in os.walk(TRAIN_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            train_dicoms.append(os.path.join(root, fi))

convert_and_extract(train_dicoms, TRAIN_IMAGE_DIR, TRAIN_DF_SAVEFILE)

# Convert test
TEST_DICOM_DIR = '../data/dicom-images-test/'
TEST_IMAGE_DIR = '../data/pngs/test/'
TEST_DF_SAVEFILE = '../data/test_meta.csv'

if not os.path.exists(TEST_IMAGE_DIR): os.makedirs(TEST_IMAGE_DIR)

test_dicoms = []
for root, dirs, files in os.walk(TEST_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            test_dicoms.append(os.path.join(root, fi))

convert_and_extract(test_dicoms, TEST_IMAGE_DIR, TEST_DF_SAVEFILE)

# Convert stage 2 test
TEST_DICOM_DIR = '../data/dicom-images-stage2/'
TEST_IMAGE_DIR = '../data/pngs/stage2/'
TEST_DF_SAVEFILE = '../data/stage2_meta.csv'

if not os.path.exists(TEST_IMAGE_DIR): os.makedirs(TEST_IMAGE_DIR)

test_dicoms = []
for root, dirs, files in os.walk(TEST_DICOM_DIR):
    for fi in files:
        if re.search('dcm', fi):
            test_dicoms.append(os.path.join(root, fi))

convert_and_extract(test_dicoms, TEST_IMAGE_DIR, TEST_DF_SAVEFILE)

67 changes: 67 additions & 0 deletions etl/1_get_png_masks_and_assign_labels.py
@@ -0,0 +1,67 @@
import pandas as pd
import numpy as np
import cv2
import os

from tqdm import tqdm

def rle2mask(rle, width, height):
    mask = np.zeros(width * height)
    array = np.asarray([int(x) for x in rle.split()])
    starts = array[0::2]
    lengths = array[1::2]
    #
    current_position = 0
    for index, start in enumerate(starts):
        current_position += start
        mask[current_position:current_position+lengths[index]] = 1
        current_position += lengths[index]
    # Need to rotate clockwise 90 degrees and flip vertically
    return np.fliplr(np.rot90(mask.reshape(width, height), 3)).astype('uint8')

train_meta = pd.read_csv('../data/train_meta.csv')
train_rle = pd.read_csv('../data/train-rle.csv')

train = train_meta.merge(train_rle, left_on='sop', right_on='ImageId')

# Create binary labels for pneumothorax
train['ptx_binary'] = [0 if _ == ' -1' else 1 for _ in train[' EncodedPixels']]

TRAIN_MASKS_DIR = '../data/masks/train/'
TRAIN_MASKS_255_DIR = '../data/masks_255/train/'
if not os.path.exists(TRAIN_MASKS_DIR): os.makedirs(TRAIN_MASKS_DIR)


if not os.path.exists(TRAIN_MASKS_255_DIR): os.makedirs(TRAIN_MASKS_255_DIR)

# Generate masks from RLE and save to PNG files
# Include empty masks
mask_size_dict = {}
for pid, df in tqdm(train.groupby('ImageId'), total=len(np.unique(train['ImageId']))):
    if df[' EncodedPixels'].iloc[0] == ' -1':
        # If empty, image should only have 1 row
        # Create empty mask
        mask = np.zeros((df['width'].iloc[0], df['height'].iloc[0])).astype('uint8')
    else:
        mask = np.zeros((df['width'].iloc[0], df['height'].iloc[0])).astype('uint8')
        for rownum, row in df.iterrows():
            mask += rle2mask(row[' EncodedPixels'], df['width'].iloc[0], df['height'].iloc[0])
        mask[mask > 1] = 1
    mask_size_dict[pid] = np.sum(mask)
    status = cv2.imwrite(os.path.join(TRAIN_MASKS_DIR, df['sop'].iloc[0] + '.png'), mask)
    mask[mask == 1] = 255
    status = cv2.imwrite(os.path.join(TRAIN_MASKS_255_DIR, df['sop'].iloc[0] + '.png'), mask)
    # Mask files and image files should share same name in different folders

del train[' EncodedPixels']

train = train.drop_duplicates()
size_df = pd.DataFrame({'ImageId': list(mask_size_dict.keys()),
'mask_size': [mask_size_dict[pid] for pid in mask_size_dict.keys()]})
train = train.merge(size_df, on='ImageId')

train.to_csv('../data/train_labels.csv', index=False)




34 changes: 34 additions & 0 deletions etl/2_create_data_splits.py
@@ -0,0 +1,34 @@
# We will create 10 inner and 10 outer folds
# We will probably not use all of them
NSPLITS = 10

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, KFold

np.random.seed(88)

train_df = pd.read_csv('../data/train_labels.csv')

# Stratify based on mask size
train_df['strata'] = 0
train_df.loc[train_df['mask_size'] > 0, 'strata'] = pd.qcut(train_df['mask_size'][train_df['mask_size'] > 0], 10, labels=range(1, 11))

train_df['outer'] = 888
outer_skf = StratifiedKFold(n_splits=NSPLITS, shuffle=True, random_state=88)
outer_counter = 0
for outer_train, outer_test in outer_skf.split(train_df, train_df['strata']):
    train_df.loc[outer_test, 'outer'] = outer_counter
    inner_skf = StratifiedKFold(n_splits=NSPLITS, shuffle=True, random_state=88)
    inner_counter = 0
    train_df['inner{}'.format(outer_counter)] = 888
    inner_df = train_df[train_df['outer'] != outer_counter].reset_index(drop=True)
    # Determine which IDs should be assigned to inner train
    for inner_train, inner_valid in inner_skf.split(inner_df, inner_df['strata']):
        inner_train_ids = inner_df.loc[inner_valid, 'ImageId']
        train_df.loc[train_df['ImageId'].isin(inner_train_ids), 'inner{}'.format(outer_counter)] = inner_counter
        inner_counter += 1
    outer_counter += 1

train_df.to_csv('../data/train_labels_with_splits.csv', index=False)
90 changes: 90 additions & 0 deletions etl/download_data.py
@@ -0,0 +1,90 @@
"""Script to download all instances in a DICOM Store."""
import os
import posixpath
from concurrent import futures
from retrying import retry
import google.auth
from google.auth.transport.requests import AuthorizedSession

import time
# URL of CHC API
CHC_API_URL = 'https://healthcare.googleapis.com/v1beta1'
PROJECT_ID = 'kaggle-siim-healthcare'
REGION = 'us-central1'
DATASET_ID = 'siim-pneumothorax'
TRAIN_DICOM_STORE_ID = 'dicom-images-train'
TEST_DICOM_STORE_ID = 'dicom-images-test'

#SLEEP = None
SLEEP = 0.1

@retry(wait_exponential_multiplier=1000, wait_exponential_max=10000)
def download_instance(dicom_web_url, dicom_store_id, study_uid, series_uid,
                      instance_uid, credentials):
    """Downloads a DICOM instance and saves it under the current folder."""
    instance_url = posixpath.join(dicom_web_url, 'studies', study_uid, 'series',
                                  series_uid, 'instances', instance_uid)
    if SLEEP: time.sleep(SLEEP)
    authed_session = AuthorizedSession(credentials)
    if SLEEP: time.sleep(SLEEP)
    response = authed_session.get(
        instance_url, headers={'Accept': 'application/dicom; transfer-syntax=*'})
    file_path = posixpath.join(dicom_store_id, study_uid, series_uid,
                               instance_uid)
    filename = '%s.dcm' % file_path
    if not os.path.exists(filename):
        os.makedirs(os.path.dirname(filename))
    with open(filename, 'wb') as f:
        f.write(response.content)


def download_all_instances(dicom_store_id, credentials):
    """Downloads all DICOM instances in the specified DICOM store."""
    # Get a list of all instances.
    dicom_web_url = posixpath.join(CHC_API_URL, 'projects', PROJECT_ID,
                                   'locations', REGION, 'datasets', DATASET_ID,
                                   'dicomStores', dicom_store_id, 'dicomWeb')
    qido_url = posixpath.join(dicom_web_url, 'instances')
    authed_session = AuthorizedSession(credentials)
    response = authed_session.get(qido_url, params={'limit': '15000'})
    if response.status_code != 200:
        print(response.text)
        return
    content = response.json()
    # DICOM Tag numbers
    study_instance_uid_tag = '0020000D'
    series_instance_uid_tag = '0020000E'
    sop_instance_uid_tag = '00080018'
    value_key = 'Value'
    with futures.ThreadPoolExecutor() as executor:
        future_to_study_uid = {}
        for instance in content:
            study_uid = instance[study_instance_uid_tag][value_key][0]
            series_uid = instance[series_instance_uid_tag][value_key][0]
            instance_uid = instance[sop_instance_uid_tag][value_key][0]
            if SLEEP: time.sleep(SLEEP)
            future = executor.submit(download_instance, dicom_web_url, dicom_store_id,
                                     study_uid, series_uid, instance_uid, credentials)
            future_to_study_uid[future] = study_uid
        processed_count = 0
        for future in futures.as_completed(future_to_study_uid):
            try:
                future.result()
                processed_count += 1
                if not processed_count % 100 or processed_count == len(content):
                    print('Processed instance %d out of %d' %
                          (processed_count, len(content)))
            except Exception as e:
                print('Failed to download a study. UID: %s \n exception: %s' %
                      (future_to_study_uid[future], e))


def main(argv=None):
    credentials, _ = google.auth.default()
    print('Downloading all instances in %s DICOM store' % TRAIN_DICOM_STORE_ID)
    download_all_instances(TRAIN_DICOM_STORE_ID, credentials)
    print('Downloading all instances in %s DICOM store' % TEST_DICOM_STORE_ID)
    download_all_instances(TEST_DICOM_STORE_ID, credentials)


main()
9 changes: 9 additions & 0 deletions etl/get_annotations.sh
@@ -0,0 +1,9 @@
PROJECT_ID="kaggle-siim-healthcare"
REGION="us-central1"
DATASET_ID="siim-pneumothorax"
FHIR_STORE_ID="fhir-masks-train"
DOCUMENT_REFERENCE_ID="d70d8f3e-990a-4bc0-b11f-c87349f5d4eb"

curl -X GET \
-H "Authorization: Bearer "$(gcloud auth print-access-token) \
"https://healthcare.googleapis.com/v1beta1/projects/${PROJECT_ID}/locations/${REGION}/datasets/${DATASET_ID}/fhirStores/${FHIR_STORE_ID}/fhir/DocumentReference/${DOCUMENT_REFERENCE_ID}"
