In [1]:
import os
from multiprocessing import Pool
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from skimage import color, exposure, io, img_as_ubyte
from skimage.transform import resize

from sklearn import preprocessing
from sklearn.externals import joblib

In [2]:
shape = 256
scaler_filename = "../models/images_StandardScaler.save"
out_dir = "../input/preprocessed/"

In [3]:
def process_image(image_dir):
    image = io.imread(image_dir)
    
    save_dir = out_dir + "/".join(image_dir.split("/")[-3:])
    image = resize(image, (shape, shape), mode='reflect', anti_aliasing=True)
    image = color.rgb2gray(image)
    image = exposure.equalize_hist(image)
    image = img_as_ubyte(image)
#    print("preprocessed: "+ image_dir)
#    print("saved in: "+ out_dir)
    io.imsave(save_dir,image)    
    return image

In [4]:
processes = 4

scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
# scaler = preprocessing.StandardScaler()

split_n = 100

test_normal_dir = "../input/test/NORMAL"
test_pneumonia_dir = "../input/test/PNEUMONIA"
train_normal_dir = "../input/train/NORMAL"
train_pneumonia_dir = "../input/train/PNEUMONIA"
val_normal_dir = "../input/val/NORMAL"
val_pneumonia_dir = "../input/val/PNEUMONIA"
full_url = np.vectorize(lambda url,prev_url: prev_url+"/"+url)
test_normal_data = pd.DataFrame(full_url(np.array(os.listdir(test_normal_dir)),test_normal_dir), columns=["image_dir"])
test_pneumonia_data = pd.DataFrame(full_url(np.array(os.listdir(test_pneumonia_dir)),test_pneumonia_dir), columns=["image_dir"])
train_normal_data = pd.DataFrame(full_url(np.array(os.listdir(train_normal_dir)),train_normal_dir), columns=["image_dir"])
train_pneumonia_data = pd.DataFrame(full_url(np.array(os.listdir(train_pneumonia_dir)),train_pneumonia_dir), columns=["image_dir"])
val_normal_data = pd.DataFrame(full_url(np.array(os.listdir(val_normal_dir)),val_normal_dir), columns=["image_dir"])
val_pneumonia_data = pd.DataFrame(full_url(np.array(os.listdir(val_pneumonia_dir)),val_pneumonia_dir), columns=["image_dir"])
test_data = test_normal_data.append(test_pneumonia_data)
train_data = train_normal_data.append(train_pneumonia_data)
val_data = val_normal_data.append(val_pneumonia_data)

os.makedirs(out_dir, exist_ok=True)    
os.makedirs(out_dir + "test", exist_ok=True)    
os.makedirs(out_dir + "train", exist_ok=True)    
os.makedirs(out_dir + "val", exist_ok=True)    
os.makedirs(out_dir + "test/NORMAL", exist_ok=True)    
os.makedirs(out_dir + "test/PNEUMONIA", exist_ok=True)
os.makedirs(out_dir + "train/NORMAL", exist_ok=True)
os.makedirs(out_dir + "train/PNEUMONIA", exist_ok=True)
os.makedirs(out_dir + "val/NORMAL", exist_ok=True)
os.makedirs(out_dir + "val/PNEUMONIA", exist_ok=True)
os.makedirs("../models/", exist_ok=True)    

pool = Pool(processes=processes)  # Num of CPUs

i = 0
for sub_dir_list in np.array_split(train_data["image_dir"].values, split_n):
    # crop, resize, rgb to grey and hist equalization.
    train_images = np.array(pool.map(process_image, sub_dir_list, chunksize = 8))

    # standarization or normalization
    train_images = np.reshape(train_images,(len(train_images),-1))
    scaler.partial_fit(train_images)
    print("{}%".format(i))
    i += 1 

i = 0
for sub_dir_list in np.array_split(test_data["image_dir"].values, split_n):
    # crop, resize, rgb to grey and hist equalization.
    test_images = np.array(pool.map(process_image, sub_dir_list, chunksize = 8))

    # standarization or normalization
    test_images = np.reshape(test_images,(len(test_images),-1))
    scaler.partial_fit(test_images)
    print("{}%".format(i))
    i += 1 

i = 0
for sub_dir_list in np.array_split(val_data["image_dir"].values, 10):
    # crop, resize, rgb to grey and hist equalization.
    val_images = np.array(pool.map(process_image, sub_dir_list, chunksize = 8))

    # standarization or normalization
    val_images = np.reshape(val_images,(len(val_images),-1))
    scaler.partial_fit(val_images)
    print("{}%".format(i))
    i += 1 

joblib.dump(scaler, scaler_filename)

pool.close()
pool.terminate()

  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))


0%
1%


Process ForkPoolWorker-2:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "<ipython-input-3-6eac877b7044>", line 5, in process_image
    image = resize(image, (shape, shape), mode='reflect', anti_aliasing=True)
  File "/usr/local/lib/python3.6/dist-packages/skimage/transform/_warps.py", line 169, in resize
    preserve_range=preserve_range)
  File "/usr/local/lib/python3.6/dist-packages/skimage/transform/_warps.py", line 798, in warp
    image = convert_to_float(image, preserve_range)
  File "/u

KeyboardInterrupt: 