### env
multi_cpu

### purpose
이미지의 경우 DISK IO 혹은 HTTP connection을 통해 메모리에 로딩하는 시간이 너무 김
위 경우 multiprocessing을 통하여 시간을 줄일 수 있음. 단, 서버 혹은 DISK IO에 여유가 있는 상황이어야 함

keras model에는 predict_generator가 있고 이를 통해 multiprocessing으로 이용 가능
일반적인 경우, 즉 데이터가 class별로 한 디렉토리에 있는 구조에서는 keras의 ImageDataGenerator 클래스를 이용하면 됨

그러나 위의 구조로 이미지가 존재하지 않거나 url을 통해 받아오는 경우는 위 클래스 이용 불가
따라서 custom generator를 만들어야 함. 이때 keras.utils.Sequence를 상속받아야 multiprocessing을 사용하더라도 input ordering에 문제가 없음

### ref
https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly.html

In [1]:
import keras

Using TensorFlow backend.


In [2]:
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input

In [3]:
import pandas as pd
from PIL import Image
from io import BytesIO
import requests
import numpy as np

In [4]:
# Load the sample table of image ids and URLs (the CSV is a Kaggle example).
# NOTE(review): the "Unnamed: 0" column in the output is presumably an index
# that was saved into the CSV -- confirm before relying on it.
sample_df = pd.read_csv('./sample_url.csv')
sample_df

Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...
2,0001bbb682d45002,https://lh3.googleusercontent.com/-kloLenz1xZk...
3,0002362830cfe3a3,https://lh3.googleusercontent.com/-N6z79jNZYTg...
4,000270c9100de789,https://lh3.googleusercontent.com/-keriHaVOq1U...
5,0002b0fab5d3ccc4,https://lh3.googleusercontent.com/-ciWklpsrab8...
6,000396be3c24830a,https://lh3.googleusercontent.com/-6W9F179t59Q...
7,000506dc6ab3a40e,https://lh3.googleusercontent.com/-_XHsAXB2LZA...
8,0005292fc4b005a3,https://lh3.googleusercontent.com/-RBZ4F1ZKNc0...
9,0005456a82264bc8,https://lh3.googleusercontent.com/-MRK7_uiKO6A...


In [5]:
def image2feature(img_url, timeout=10):
    """Download one image and convert it into a ResNet50 input batch.

    The image is fetched over HTTP, resized to 224x224, converted to RGB,
    turned into a float32 array with a leading batch axis and passed through
    ``preprocess_input``.

    Args:
        img_url: URL of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block a worker forever.

    Returns:
        A nested list with shape (1, 224, 224, 3). If anything fails
        (download, HTTP error status, decoding, preprocessing) the failure is
        printed and an all-zero batch of the same shape is returned, so the
        caller's batch layout stays intact.
    """
    try:
        response = requests.get(img_url, timeout=timeout)
        # Report HTTP errors as such instead of as an undecodable image.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Image.ANTIALIAS was an alias of LANCZOS and is gone in Pillow 10.
        img = img.resize((224, 224), Image.LANCZOS)
        img = img.convert("RGB")
        img_data = image.img_to_array(img)
        # Expand the converted array (the original expanded the PIL image,
        # which threw away the img_to_array result).
        img_data = np.expand_dims(img_data, axis=0)
        # astype returns a fresh array, so preprocess_input cannot touch
        # anything shared with the PIL image.
        img_data = img_data.astype("float32")
        img_data = preprocess_input(img_data)
    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort the whole batch.
        print('bug: ',e,'url: ',img_url)
        img_data = np.zeros((1,224,224,3))

    return img_data.tolist()

In [6]:
# Smoke test: featurize a single URL and check the resulting batch shape.
sample = image2feature(sample_df.url.iloc[2])
np.array(sample).shape

(1, 224, 224, 3)

In [7]:
def get_image_batch(urls):
    """Featurize every URL in ``urls``.

    Each URL is passed to ``image2feature``; the nested list it returns is
    converted to an array and reshaped to (224, 224, 3), which drops the
    leading batch axis.

    Args:
        urls: Iterable of image URLs.

    Returns:
        A list with one (224, 224, 3) array per URL, in input order.
    """
    target_shape = (224, 224, 3)
    return [
        np.array(image2feature(link)).reshape(target_shape)
        for link in urls
    ]

In [8]:
# Smoke test: featurize the first ten URLs and check the stacked shape.
samples = get_image_batch(sample_df.url.iloc[:10].tolist())
np.array(samples).shape

(10, 224, 224, 3)

In [9]:
class DataGenerator(keras.utils.Sequence):
    """Batch generator that downloads and featurizes images from URLs.

    Batches are addressed by index through ``__getitem__``, so the i-th batch
    always covers the same slice of ``files``.

    Args:
        files: Sequence of image URLs.
        batch_size: Number of URLs per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """

    def __init__(self, files, batch_size):
        super().__init__()
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        self.files = files
        self.batch_size = batch_size
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        # Ceil division: flooring here would silently drop the last
        # len(files) % batch_size items, which must still be predicted.
        return (len(self.files) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the feature array for batch number ``index``."""
        # Slicing past the end simply yields a shorter final batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        files_temp = [self.files[k] for k in indexes]
        X = self.__data_generation(files_temp)
        # To use this generator for fit as well as predict, y must also be
        # returned here.
        return X

    def __data_generation(self, files_temp):
        """Download and featurize ``files_temp`` into one stacked array."""
        X = np.array(get_image_batch(files_temp))
        return X

    def on_epoch_end(self):
        """Reset the index order; called once from ``__init__`` as well."""
        self.indexes = np.arange(len(self.files))
        # you can add shuffling index

### Test: single vs multi

In [10]:
# Number of URLs to predict and how many go into each batch.
test_size = 100
batch_size = 10

# Batches needed to cover test_size: the full batches plus one more when a
# remainder is left over.
step = test_size // batch_size + (test_size % batch_size != 0)

print(test_size, batch_size, step)

100 10 10


In [11]:
# ResNet50 with ImageNet weights, keeping the top classification layers.
model = ResNet50(weights='imagenet', include_top=True)

In [12]:
# Generator over the first `test_size` URLs, served `batch_size` at a time.
pred_generator = DataGenerator(sample_df.url[:test_size].tolist(), batch_size)

In [13]:
%%time
# Baseline run: one worker and no multiprocessing.
single_proc = model.predict_generator(pred_generator, 
                                        steps=step, 
                                        workers=1, 
                                        max_queue_size=10, 
                                        use_multiprocessing=False,
                                        verbose=1)

 2/10 [=====>........................] - ETA: 59s 

  'to RGBA images')


CPU times: user 2min 29s, sys: 9.36 s, total: 2min 38s
Wall time: 1min 2s


In [14]:
%%time
# Same prediction with four workers and multiprocessing enabled, for timing
# against the single-worker run.
multi_proc = model.predict_generator(pred_generator, 
                                        steps=step, 
                                        workers=4, 
                                        max_queue_size=10, 
                                        use_multiprocessing=True,
                                        verbose=1)

  'to RGBA images')
  'to RGBA images')




  'to RGBA images')


CPU times: user 2min 8s, sys: 9.7 s, total: 2min 18s
Wall time: 31.3 s


  'to RGBA images')


In [15]:
# Both runs should produce prediction arrays of the same shape.
print(single_proc.shape, multi_proc.shape)

(100, 1000) (100, 1000)


### ordering check

In [16]:
# Predicted class index per image (position of the highest score in each
# row). np.argmax with axis=1 is vectorized; apply_along_axis would call the
# function once per row from Python.
single = np.argmax(single_proc, axis=1)
multi = np.argmax(multi_proc, axis=1)

In [17]:
# True only when every predicted class matches position by position, i.e.
# the multiprocessing run kept the same output order as the single run.
np.all(single == multi)

True

In [18]:
single

array([828, 706, 724, 959, 425, 839, 703, 580, 716, 858, 971, 609, 977,
       978, 979, 858, 675, 778, 823, 917, 557, 643, 728, 525, 831, 853,
       279, 974, 762, 598, 794, 916, 915, 975, 734, 706, 718, 716, 608,
       556, 799, 903, 610, 976, 830, 978, 610, 449, 868, 781, 610, 970,
       415, 716, 839, 762, 408, 803, 523, 799, 602, 672, 553, 468, 975,
       520, 428, 975, 743, 933, 329, 839, 979, 736, 538, 874, 580, 580,
       975, 820, 672, 708, 716, 762, 468, 509, 498, 762, 706, 246, 972,
       894, 831, 663, 449, 756, 917, 831, 532, 975])

In [19]:
multi

array([828, 706, 724, 959, 425, 839, 703, 580, 716, 858, 971, 609, 977,
       978, 979, 858, 675, 778, 823, 917, 557, 643, 728, 525, 831, 853,
       279, 974, 762, 598, 794, 916, 915, 975, 734, 706, 718, 716, 608,
       556, 799, 903, 610, 976, 830, 978, 610, 449, 868, 781, 610, 970,
       415, 716, 839, 762, 408, 803, 523, 799, 602, 672, 553, 468, 975,
       520, 428, 975, 743, 933, 329, 839, 979, 736, 538, 874, 580, 580,
       975, 820, 672, 708, 716, 762, 468, 509, 498, 762, 706, 246, 972,
       894, 831, 663, 449, 756, 917, 831, 532, 975])