In [1]:
import os
from multiprocessing import Pool
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from skimage import color, exposure, io, img_as_ubyte
from skimage.transform import resize

from sklearn import preprocessing
from sklearn.externals import joblib

In [2]:
# Base directory that receives the preprocessed copy of every image.
out_dir = "../input/preprocessed/"

# File the fitted scaler object is dumped to.
scaler_filename = "../models/images_StandardScaler.save"

# Edge length, in pixels, of the square images produced by the resize step.
shape = 256

In [3]:
def process_image(image_dir):
    """Preprocess a single image file and save the result below ``out_dir``.

    The image is read from disk, resized to ``shape`` x ``shape``, converted
    to grayscale, histogram-equalized and converted to unsigned 8-bit before
    being written out.

    Parameters
    ----------
    image_dir : str
        Path of the image file to read, written with "/" separators. Its
        last three components are appended to the module-level ``out_dir``
        to build the destination path.

    Returns
    -------
    numpy.ndarray
        The preprocessed image, as returned by ``img_as_ubyte``.
    """
    image = io.imread(image_dir)

    # Build the destination in a separate local name. Assigning to `out_dir`
    # here would turn it into a function-local variable, and reading it on
    # the right-hand side would then raise UnboundLocalError.
    out_path = out_dir + "/".join(image_dir.split("/")[-3:])

    image = resize(image, (shape, shape), mode='reflect', anti_aliasing=True)
    # Only multi-channel images need the colour conversion; images that are
    # already single-channel are passed through unchanged (newer
    # scikit-image releases reject 2-D input to rgb2gray).
    if image.ndim == 3:
        image = color.rgb2gray(image)
    image = exposure.equalize_hist(image)
    image = img_as_ubyte(image)

    io.imsave(out_path, image)
    return image

In [4]:
# Number of worker processes used for the parallel preprocessing.
processes = 4

scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
# scaler = preprocessing.StandardScaler()

# Number of chunks the train and test file lists are split into.
split_n = 100

test_normal_dir = "../input/test/NORMAL"
test_pneumonia_dir = "../input/test/PNEUMONIA"
train_normal_dir = "../input/train/NORMAL"
train_pneumonia_dir = "../input/train/PNEUMONIA"
val_normal_dir = "../input/val/NORMAL"
val_pneumonia_dir = "../input/val/PNEUMONIA"

# No longer used below; retained in case other cells still reference it.
full_url = np.vectorize(lambda url,prev_url: prev_url+"/"+url)


def _list_image_paths(directory):
    """Return a DataFrame with one "image_dir" column holding the path of every entry in `directory`."""
    # Sorted so the listing does not depend on the order os.listdir returns.
    # Paths are joined with "/" because process_image splits on "/".
    names = sorted(os.listdir(directory))
    return pd.DataFrame({"image_dir": [directory + "/" + name for name in names]})


def _preprocess_split(image_dirs, n_chunks, pool, scaler):
    """Preprocess `image_dirs` chunk by chunk in `pool` and update `scaler` with each chunk."""
    for chunk_index, sub_dir_list in enumerate(np.array_split(image_dirs, n_chunks)):
        # array_split yields empty chunks when there are fewer paths than
        # chunks; an empty batch cannot be reshaped with -1, so skip it.
        if len(sub_dir_list) > 0:
            # resize, rgb to grey and hist equalization.
            images = np.array(pool.map(process_image, sub_dir_list, chunksize=8))

            # standardization or normalization: one flattened row per image
            images = np.reshape(images, (len(images), -1))
            scaler.partial_fit(images)
        # Progress relative to the real number of chunks (0..99 for 100 chunks).
        print("{}%".format(100 * chunk_index // n_chunks))


test_normal_data = _list_image_paths(test_normal_dir)
test_pneumonia_data = _list_image_paths(test_pneumonia_dir)
train_normal_data = _list_image_paths(train_normal_dir)
train_pneumonia_data = _list_image_paths(train_pneumonia_dir)
val_normal_data = _list_image_paths(val_normal_dir)
val_pneumonia_data = _list_image_paths(val_pneumonia_dir)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test_data = pd.concat([test_normal_data, test_pneumonia_data])
train_data = pd.concat([train_normal_data, train_pneumonia_data])
val_data = pd.concat([val_normal_data, val_pneumonia_data])

# Create every output directory. The previous version created "test/..."
# under the "train/..." and "val/..." checks, so those were never created.
for split_name in ("test", "train", "val"):
    for class_name in ("NORMAL", "PNEUMONIA"):
        os.makedirs(out_dir + split_name + "/" + class_name, exist_ok=True)

os.makedirs("../models", exist_ok=True)

pool = Pool(processes=processes)  # Num of CPUs
try:
    _preprocess_split(train_data["image_dir"].values, split_n, pool, scaler)

    # NOTE(review): the scaler is also fitted on the test and validation
    # images, as in the original cell. That lets held-out data influence
    # the scaling parameters; confirm this is intended, or fit on the
    # training split only.
    _preprocess_split(test_data["image_dir"].values, split_n, pool, scaler)
    _preprocess_split(val_data["image_dir"].values, 10, pool, scaler)
finally:
    # Always release the workers, even when a batch fails.
    pool.close()
    pool.join()

joblib.dump(scaler, scaler_filename)

  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))
  .format(dtypeobj_in, dtypeobj_out))


0%
1%
2%
3%
4%
5%
6%
7%
8%
9%
10%
11%
12%
13%
14%
15%
16%
17%
18%
19%
20%
21%
22%
23%
24%
25%
26%
27%
28%
29%
30%
31%
32%
33%
34%
35%
36%
37%
38%
39%
40%
41%
42%
43%
44%
45%
46%
47%
48%
49%
50%
51%
52%
53%
54%
55%
56%
57%
58%
59%
60%
61%
62%
63%
64%
65%
66%
67%
68%
69%
70%
71%
72%
73%
74%
75%
76%
77%
78%
79%
80%
81%
82%
83%
84%
85%
86%
87%
88%
89%
90%
91%
92%
93%
94%
95%
96%
97%
98%
99%
0%
1%
2%
3%
4%
5%
6%
7%
8%
9%
10%
11%
12%
13%
14%
15%
16%
17%
18%
19%
20%
21%
22%
23%
24%
25%
26%
27%
28%
29%
30%
31%
32%
33%
34%
35%
36%
37%
38%
39%
40%
41%
42%
43%
44%
45%
46%
47%
48%
49%
50%
51%
52%
53%
54%
55%
56%
57%
58%
59%
60%
61%
62%
63%
64%
65%
66%
67%
68%
69%
70%
71%
72%
73%
74%
75%
76%
77%
78%
79%
80%
81%
82%
83%
84%
85%
86%
87%
88%
89%
90%
91%
92%
93%
94%
95%
96%
97%
98%
99%
0%
1%
2%
3%
4%
5%
6%
7%
8%
9%
