add machine learning data and option to CBCT

jrkerns · May 18, 2016 · 8953c4e · 8953c4e
1 parent 93607ce
commit 8953c4e
Show file tree

Hide file tree

Showing 40 changed files with 327 additions and 27 deletions.
diff --git a/machinelearning/cbct/build_cbct_images.py b/machinelearning/cbct/build_cbct_images.py
@@ -0,0 +1,107 @@
+import os.path as osp
+import os
+import math
+
+import dicom
+import matplotlib.pyplot as plt
+import numpy as np
+from pylinac import image
+from pylinac.core.io import TemporaryZipDirectory
+from scipy.misc import imresize
+from sklearn import preprocessing
+
+
+def is_dicom(path):
+    """Return whether the file at ``path`` is a readable DICOM image via pydicom.
+
+    The file is read with ``force=True`` and its ``pixel_array`` is accessed;
+    the file only counts as DICOM when both steps succeed.
+
+    Parameters
+    ----------
+    path : str
+        Path of the file to test.
+
+    Returns
+    -------
+    bool
+        True if the file could be read and its pixel data accessed, else False.
+    """
+    try:
+        ds = dicom.read_file(path, force=True)
+        # Touching pixel_array makes pydicom actually decode the image data, so
+        # datasets without usable pixel data are rejected here.
+        ds.pixel_array
+    except Exception:
+        # Any read/decode failure means "not a usable DICOM image". Unlike a bare
+        # ``except:``, KeyboardInterrupt/SystemExit still propagate, so a long
+        # directory scan can be aborted by the user.
+        return False
+    return True
+
+
+def get_files(folder, func):
+    """Recursively collect the files under ``folder`` that ``func`` accepts.
+
+    ``func`` is called with each file's joined path; paths for which it returns
+    a truthy value are returned, in the order ``os.walk`` yields them.
+    """
+    candidates = (
+        osp.join(root, name)
+        for root, _subdirs, names in os.walk(folder)
+        for name in names
+    )
+    return [candidate for candidate in candidates if func(candidate)]
+
+
+def identify_images(zip_file):
+    """Interactively label the DICOM images of a ZIP archive and save training arrays.
+
+    The archive is extracted to a temporary directory and every file accepted by
+    ``is_dicom`` is displayed in 5x5 grids, each image titled with its index.
+    The user is then prompted for the indices of the HU slices, one per prompt,
+    and types ``done`` to finish. Entered indices get label 1, all others 0.
+
+    Two arrays are written next to ``zip_file`` with ``np.save``:
+    ``images_<archive name>`` holds one row per image (the output of
+    ``process_image``, min-max scaled per row) and ``labels_<archive name>``
+    holds the 0/1 labels.
+
+    Parameters
+    ----------
+    zip_file : str
+        Path to the ZIP archive containing the images.
+    """
+    with TemporaryZipDirectory(zip_file) as zfiles:
+        filepaths = get_files(zfiles, is_dicom)
+        length = len(filepaths)
+        feature_array = np.zeros((length, 10000), dtype=np.float32)
+        labels = np.zeros(length)
+
+        # Show the images in pages of 25 so each can be identified by its index.
+        split_val = 25
+        rounds = int(math.ceil(length / split_val))
+        for n in range(rounds):
+            offset = split_val * n
+            _, axes = plt.subplots(5, 5)
+            page = filepaths[offset:offset + split_val]
+            for axis, (idx, fp) in zip(axes.flatten(), enumerate(page, start=offset)):
+                img = image.load(fp)
+                plt.sca(axis)
+                plt.imshow(img.array, cmap=plt.cm.Greys)
+                plt.axis('off')
+                plt.title(idx)
+            plt.show()
+
+        # Collect the positive labels. Bad input is reported and skipped rather
+        # than raising, so one typo does not throw away the whole session.
+        while True:
+            label = input("Input the HU indices sequentially, one at a time. Type 'done' when finished:")
+            if label == 'done':
+                break
+            try:
+                index = int(label)
+            except ValueError:
+                print("'{}' is not an integer index; ignoring it".format(label))
+                continue
+            # Reject negatives explicitly: numpy would otherwise wrap them
+            # around and silently label an image counted from the end.
+            if not 0 <= index < length:
+                print("Index {} is outside the valid range of {} images; ignoring it".format(index, length))
+                continue
+            labels[index] = 1
+
+        for idx, fp in enumerate(filepaths):
+            feature_array[idx, :] = process_image(fp)
+        scaled_features = preprocessing.minmax_scale(feature_array, axis=1)
+
+        # Write the arrays beside the archive, named after it.
+        dir2write = osp.dirname(zip_file)
+        stem = osp.splitext(osp.basename(zip_file))[0]
+        np.save(osp.join(dir2write, 'images_' + stem), scaled_features)
+        np.save(osp.join(dir2write, 'labels_' + stem), labels)
+
+
+def process_image(path):
+    """Load the image at ``path``, resize it to 100x100 and return it flattened."""
+    loaded = image.load(path, dtype=np.float32)
+    thumbnail = imresize(loaded.array, size=(100, 100), mode='F')
+    return thumbnail.flatten()
+
+
+def load_images():
+    """Load the built image and label arrays for training.
+
+    Every ``images_<name>.npy`` file under the ``data`` folder next to this
+    module is loaded together with its ``labels_<name>.npy`` companion from the
+    same directory, so the rows of the two returned arrays stay aligned.
+
+    Returns
+    -------
+    tuple
+        ``(img_arr, labels_arr)``: the image arrays stacked with ``np.vstack``
+        and the label arrays joined with ``np.concatenate``, in the same
+        (sorted) file order.
+
+    Raises
+    ------
+    FileNotFoundError
+        If an image file has no matching label file.
+    """
+    path = osp.join(osp.dirname(osp.abspath(__file__)), 'data')
+
+    def is_image_array(filepath):
+        # Match on the file name only; testing the whole path would also match
+        # any parent directory that happens to contain 'images'.
+        name = osp.basename(filepath)
+        return name.startswith('images_') and name.endswith('.npy')
+
+    # Sorted for a reproducible order; directory listing order is arbitrary.
+    imgs = sorted(get_files(path, is_image_array))
+    # Derive each label path from its image path instead of collecting labels
+    # in a second walk, which could pair images with the wrong labels.
+    labels = [
+        osp.join(osp.dirname(f), 'labels_' + osp.basename(f)[len('images_'):])
+        for f in imgs
+    ]
+    img_arr = np.vstack([np.load(f) for f in imgs])
+    labels_arr = np.concatenate([np.load(f) for f in labels])
+    return img_arr, labels_arr
+
+
+if __name__ == '__main__':
+    # Only CBCT_3.zip is labelled per run. Other archive names that were listed
+    # but disabled: Elekta_7.zip, Elekta_8.zip, Elekta_11.zip, Elekta_12.zip,
+    # Standard head.zip (and thorax.zip); add them to ``zip_names`` to label them.
+    data_dir = osp.join(osp.dirname(osp.abspath(__file__)), 'data')
+    zip_names = ['CBCT_3.zip']
+    for zip_name in zip_names:
+        identify_images(osp.join(data_dir, zip_name))
diff --git a/machinelearning/cbct/cbct_classifier.pkl.gz b/machinelearning/cbct/cbct_classifier.pkl.gz
diff --git a/machinelearning/cbct/cbct_learner.py b/machinelearning/cbct/cbct_learner.py
@@ -0,0 +1,35 @@
+import gzip
+import pickle
+import time
+
+from sklearn import svm, metrics, cross_validation, grid_search
+
+from machinelearning.cbct.build_cbct_images import load_images
+
+# Training script: fits an SVM on the saved CBCT image/label arrays, prints the
+# grid-search results and a test-set report, then pickles the fitted search.
+# NOTE(review): `cross_validation` and `grid_search` are the pre-0.18
+# scikit-learn module names; confirm the installed version still provides them.
+data, labels = load_images()
+
+# Hold out 15% of the samples for the final classification report.
+data_train, data_test, y_train, y_test = cross_validation.train_test_split(data, labels, train_size=0.85)
+
+# Hyper-parameter grid: linear kernel only, three candidate values of C.
+parameters = {
+    'kernel': ['linear'],
+    'C': [1, 0.1, 0.05],
+}
+start = time.time()
+classifier = grid_search.GridSearchCV(svm.SVC(verbose=True), parameters)
+classifier.fit(data_train, y_train)
+print("Training took:", time.time() - start)
+
+# One line per parameter combination: mean score and twice the std of its scores.
+for params, mean_score, scores in classifier.grid_scores_:
+    print("%0.3f (+/-%0.03f) for %r"
+          % (mean_score, scores.std() * 2, params))
+print()
+print(classifier.best_estimator_)
+print("Best parameters found:")
+print(classifier.best_params_)
+print("With a training score of:")
+print(classifier.best_score_)
+print()
+# Evaluate on the held-out split that was not used for fitting.
+print("Classification report:")
+print(metrics.classification_report(y_test, classifier.predict(data_test)))
+# Persist the whole fitted GridSearchCV object, gzip-compressed. The relative
+# path means the file is written to the current working directory.
+with gzip.open('cbct_classifier.pkl.gz', mode='wb') as m:
+    pickle.dump(classifier, m)
diff --git a/machinelearning/cbct/data/images_CBCT_3.npy b/machinelearning/cbct/data/images_CBCT_3.npy
diff --git a/machinelearning/cbct/data/images_CBCT_4.npy b/machinelearning/cbct/data/images_CBCT_4.npy
diff --git a/machinelearning/cbct/data/images_Elekta_11.npy b/machinelearning/cbct/data/images_Elekta_11.npy
diff --git a/machinelearning/cbct/data/images_Elekta_12.npy b/machinelearning/cbct/data/images_Elekta_12.npy
diff --git a/machinelearning/cbct/data/images_Elekta_7.npy b/machinelearning/cbct/data/images_Elekta_7.npy
diff --git a/machinelearning/cbct/data/images_Elekta_8.npy b/machinelearning/cbct/data/images_Elekta_8.npy
diff --git a/machinelearning/cbct/data/images_High quality head.npy b/machinelearning/cbct/data/images_High quality head.npy
diff --git a/machinelearning/cbct/data/images_Low dose head.npy b/machinelearning/cbct/data/images_Low dose head.npy
diff --git a/machinelearning/cbct/data/images_Low dose thorax.npy b/machinelearning/cbct/data/images_Low dose thorax.npy
diff --git a/machinelearning/cbct/data/images_Pelvis spotlight.npy b/machinelearning/cbct/data/images_Pelvis spotlight.npy
diff --git a/machinelearning/cbct/data/images_Pelvis-Obese.npy b/machinelearning/cbct/data/images_Pelvis-Obese.npy
diff --git a/machinelearning/cbct/data/images_Pelvis.npy b/machinelearning/cbct/data/images_Pelvis.npy
diff --git a/machinelearning/cbct/data/images_Standard head.npy b/machinelearning/cbct/data/images_Standard head.npy
diff --git a/machinelearning/cbct/data/images_Toshiba.npy b/machinelearning/cbct/data/images_Toshiba.npy
diff --git a/machinelearning/cbct/data/images_UNC120kV.npy b/machinelearning/cbct/data/images_UNC120kV.npy
diff --git a/machinelearning/cbct/data/images_thorax.npy b/machinelearning/cbct/data/images_thorax.npy
diff --git a/machinelearning/cbct/data/labels_CBCT_3.npy b/machinelearning/cbct/data/labels_CBCT_3.npy
diff --git a/machinelearning/cbct/data/labels_CBCT_4.npy b/machinelearning/cbct/data/labels_CBCT_4.npy
diff --git a/machinelearning/cbct/data/labels_Elekta_11.npy b/machinelearning/cbct/data/labels_Elekta_11.npy
diff --git a/machinelearning/cbct/data/labels_Elekta_12.npy b/machinelearning/cbct/data/labels_Elekta_12.npy
diff --git a/machinelearning/cbct/data/labels_Elekta_7.npy b/machinelearning/cbct/data/labels_Elekta_7.npy
diff --git a/machinelearning/cbct/data/labels_Elekta_8.npy b/machinelearning/cbct/data/labels_Elekta_8.npy
diff --git a/machinelearning/cbct/data/labels_High quality head.npy b/machinelearning/cbct/data/labels_High quality head.npy
diff --git a/machinelearning/cbct/data/labels_Low dose head.npy b/machinelearning/cbct/data/labels_Low dose head.npy
diff --git a/machinelearning/cbct/data/labels_Low dose thorax.npy b/machinelearning/cbct/data/labels_Low dose thorax.npy
diff --git a/machinelearning/cbct/data/labels_Pelvis spotlight.npy b/machinelearning/cbct/data/labels_Pelvis spotlight.npy
diff --git a/machinelearning/cbct/data/labels_Pelvis-Obese.npy b/machinelearning/cbct/data/labels_Pelvis-Obese.npy
diff --git a/machinelearning/cbct/data/labels_Pelvis.npy b/machinelearning/cbct/data/labels_Pelvis.npy
diff --git a/machinelearning/cbct/data/labels_Standard head.npy b/machinelearning/cbct/data/labels_Standard head.npy
diff --git a/machinelearning/cbct/data/labels_Toshiba.npy b/machinelearning/cbct/data/labels_Toshiba.npy
diff --git a/machinelearning/cbct/data/labels_UNC120kV.npy b/machinelearning/cbct/data/labels_UNC120kV.npy
diff --git a/machinelearning/cbct/data/labels_thorax.npy b/machinelearning/cbct/data/labels_thorax.npy
diff --git a/machinelearning/pylinac_classifier.pkl b/machinelearning/pylinac_classifier.pkl
diff --git a/machinelearning/pylinac_images.py b/machinelearning/pylinac_images.py
@@ -0,0 +1,79 @@
+"""A script to generate thumbnails of pylinac images for machine learning"""
+import os.path as osp
+import os
+import concurrent.futures
+import time
+
+import matplotlib.pyplot as plt
+import numpy as np
+from pylinac import image
+from scipy.misc import imresize
+from sklearn import preprocessing
+
+
+def get_image_files(folder):
+    """Get a list of files that are valid images from the folder.
+
+    ``folder`` is walked recursively and every file is tested with
+    ``image.is_image`` on a pool of 32 threads. The elapsed time is printed.
+
+    Parameters
+    ----------
+    folder : str
+        Root directory to search.
+
+    Returns
+    -------
+    list of str
+        Paths of the files accepted by ``image.is_image``, in ``os.walk`` order.
+    """
+    start = time.time()
+    candidates = [
+        osp.join(pdir, file)
+        for pdir, _, files in os.walk(folder)
+        for file in files
+    ]
+    # Executor.map returns results in input order, so the output no longer
+    # depends on which thread happens to finish first.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
+        verdicts = list(executor.map(image.is_image, candidates))
+    filepaths = [path for path, ok in zip(candidates, verdicts) if ok]
+    print("Done with {} in {:.2f}s".format(osp.basename(folder), time.time() - start))
+    return filepaths
+
+
+def process_image(path):
+    """Read the image at ``path``, shrink it to 100x100 and return the flattened array."""
+    source = image.load(path, dtype=np.float32)
+    resized = imresize(source.array, size=(100, 100), mode='F')
+    return resized.flatten()
+
+
+def build_images(path_stub=r'D:\Users\James\Dropbox\Programming\Python\Projects\pylinac test files'):
+    """Completely load, resize, and save the images for training. Main function.
+
+    Image files are gathered from four sub-folders of ``path_stub`` and each is
+    converted by ``process_image`` into one row of the training array, which is
+    then min-max scaled per row to the range (0, 1). Two arrays are saved with
+    ``np.save`` in this module's directory: ``images`` (the scaled array) and
+    ``labels`` (one class id per row).
+
+    Class ids: 0 = 'Picket Fences', 1 = 'PipsPro', 2 = 'Leeds',
+    3 = 'Winston-Lutz'.
+
+    Parameters
+    ----------
+    path_stub : str, optional
+        Root folder holding the test-image sub-folders. Defaults to the
+        location that was previously hard-coded, so existing calls are unchanged.
+    """
+    # get image file paths for each image type
+    pf_files = get_image_files(osp.join(path_stub, 'Picket Fences'))
+    pipspro_files = get_image_files(osp.join(path_stub, '2D Image quality phantoms', 'PipsPro'))
+    leeds_files = get_image_files(osp.join(path_stub, '2D Image quality phantoms', 'Leeds'))
+    wl_files = get_image_files(osp.join(path_stub, 'Winston-Lutz'))
+    filepaths = pf_files + pipspro_files + leeds_files + wl_files
+    print("{} files found".format(len(filepaths)))
+
+    # preallocate
+    total_array = np.zeros((len(filepaths), 10000), dtype=np.float32)
+    print("Training array preallocated")
+
+    # resize each image and add to a training array; Executor.map yields the
+    # results in input order, so row i always belongs to filepaths[i]
+    start = time.time()
+    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
+        for row, features in enumerate(executor.map(process_image, filepaths)):
+            total_array[row, :] = features
+    print("Training array set in {:.2f}s".format(time.time() - start))
+
+    # feature scale the images
+    scaled_array = preprocessing.minmax_scale(total_array, feature_range=(0, 1), axis=1)
+    print("Training array scaled")
+
+    # save arrays to disk for future use; the labels follow the same group
+    # order used to build ``filepaths``
+    out_dir = osp.dirname(osp.abspath(__file__))
+    group_sizes = (len(pf_files), len(pipspro_files), len(leeds_files), len(wl_files))
+    labels = np.concatenate(
+        [np.repeat(class_id, size) for class_id, size in enumerate(group_sizes)])
+    np.save(osp.join(out_dir, 'images'), scaled_array)
+    np.save(osp.join(out_dir, 'labels'), labels)
+    print("Images build")
+
+
+def load_images():
+    """Load the built images for training.
+
+    Reads ``images.npy`` and ``labels.npy`` from this module's directory, which
+    is where ``build_images`` saves them, rather than from the current working
+    directory, so loading works no matter where the caller is started from.
+
+    Returns
+    -------
+    tuple
+        ``(images, labels)`` as loaded by ``np.load``.
+    """
+    folder = osp.dirname(osp.abspath(__file__))
+    return np.load(osp.join(folder, 'images.npy')), np.load(osp.join(folder, 'labels.npy'))
+
diff --git a/machinelearning/pylinac_learner.py b/machinelearning/pylinac_learner.py
@@ -0,0 +1,52 @@
+from sklearn import svm, metrics, cross_validation, grid_search
+import pickle
+import gzip
+
+from pylinac_ml.pylinac_images import load_images
+
+# build_images()
+# Training script: fits an SVM on the saved image/label arrays, prints the
+# grid-search results and a test-set report, then pickles the fitted search.
+# NOTE(review): `cross_validation` and `grid_search` are the pre-0.18
+# scikit-learn module names; confirm the installed version still provides them.
+data, labels = load_images()
+
+# Hold out 20% of the samples for the final classification report.
+data_train, data_test, y_train, y_test = cross_validation.train_test_split(data, labels, train_size=0.8)
+
+# Single-point "grid": RBF kernel with fixed C and gamma.
+parameters = {
+    'kernel': ['rbf'],
+    'C': [10],
+    'gamma': [0.001]
+}
+classifier = grid_search.GridSearchCV(svm.SVC(verbose=True), parameters)
+# Disabled alternative: grid search over a neural-network (MLP) classifier.
+# parameters = {
+#     'hidden_layer_sizes': [(2500,)],
+#     'activation': ['relu'],
+#     'alpha': [0.1, 0.01, 10],
+#     'algorithm': ['adam'],
+#     'tol': [0.01],
+#     'learning_rate': ['invscaling']
+# }
+# classifier = model_selection.GridSearchCV(neural_network.MLPClassifier(verbose=True), parameters)
+
+classifier.fit(data_train, y_train)
+
+# One line per parameter combination: mean score and twice the std of its scores.
+for params, mean_score, scores in classifier.grid_scores_:
+    print("%0.3f (+/-%0.03f) for %r"
+          % (mean_score, scores.std() * 2, params))
+print()
+print(classifier.best_estimator_)
+print("Best parameters found:")
+print(classifier.best_params_)
+print("With a training score of:")
+print(classifier.best_score_)
+print()
+# Evaluate on the held-out split that was not used for fitting.
+print("Classification report:")
+print(metrics.classification_report(y_test, classifier.predict(data_test)))
+# print(metrics.f1_score(y_train, classifier.predict(iris_train), average='binary'))
+# print("And test score of")
+# print(metrics.f1_score(y_test, classifier.predict(data_test), average='binary'))
+# Persist the whole fitted GridSearchCV object, gzip-compressed. The relative
+# path means the file is written to the current working directory.
+with gzip.open('pylinac_model.pkl.gz', mode='wb') as m:
+    pickle.dump(classifier, m)
+
+# with gzip.open('pylinac_model.pkl.gz', mode='rb') as m:
+#     unp_clf = pickle.load(m)
+
+# print("Classification report after pickling/unpickling:")
+# print(metrics.classification_report(y_test, unp_clf.predict(data_test)))