In [7]:
import csv
import numpy as np
import h5py
import urllib
import cv2

### Convert an image referenced by a URL in a CSV file to a NumPy array
1. Download the image from the URL read from the CSV file
2. Resize the image to IMSIZE x IMSIZE
3. Flatten the image into a NumPy array with shape: (1, IMSIZE*IMSIZE*3)

In [12]:
# use OpenCV, NumPy, and urllib
def url_to_image(url):
    """Download the image at `url` and decode it into an OpenCV image.

    Args:
        url: Address of the encoded image file to fetch.

    Returns:
        The array produced by cv2.imdecode with cv2.IMREAD_COLOR.

    Raises:
        ValueError: If the downloaded bytes cannot be decoded as an image.
    """
    from contextlib import closing

    # closing() guarantees the response is closed even when read() raises,
    # whether or not the response object supports the with-statement itself.
    with closing(urllib.urlopen(url)) as resp:
        raw_bytes = resp.read()

    # imdecode takes the encoded file contents as a flat uint8 buffer.
    encoded = np.asarray(bytearray(raw_bytes), dtype="uint8")
    image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if image is None:
        # Fail here, naming the URL, instead of handing None to the caller
        # and crashing later inside cv2.resize with an unrelated message.
        raise ValueError("could not decode an image from %s" % url)
    return image

In [13]:
# Side length, in pixels, of the square that every image is resized to
# (e.g. 32, 64, 128 or 256).
IMSIZE = 128


def resize_image(image):
    """Return `image` rescaled to IMSIZE x IMSIZE pixels using cv2.resize."""
    target_size = (IMSIZE, IMSIZE)
    return cv2.resize(image, target_size)

In [14]:
def image_to_array(image):
    """Flatten `image` into a single row of shape (1, IMSIZE*IMSIZE*3).

    The reshape raises if `image` does not contain exactly IMSIZE*IMSIZE*3
    elements.
    """
    n_features = IMSIZE * IMSIZE * 3
    return np.asarray(image).reshape((1, n_features))

### Test run: convert the first 3 examples and concatenate the results

In [5]:
# This is for Test
# first 3 rows of data
with open('../outside_front_4000.csv', 'rU') as f:
    readCSV = csv.reader(f, delimiter=',')
    imageNet_array = []
    
    i =0
    for row in readCSV:
        # Try 20 samples
        if i<=2:
            label = row[0]
            imageURL = row[1]
            i = i+1
            print "downloading %s" % (imageURL)
            image = url_to_image(imageURL)
            
            print "resizing images"
            resized_image = resize_image(image)
            #cv2.imshow("resizedImage", resized_image)       

            img_array = image_to_array(resized_image)
            print(img_array.shape)
            print("image matrix: ", img_array)

            if len(imageNet_array) == 0:
                imageNet_array = img_array
                print("length is 0")
                print(imageNet_array)
            else:
                imageNet_array = np.concatenate([imageNet_array, img_array])

downloading https://tookan.s3.amazonaws.com/task_images/xaOq1502152810748-TOOKAN08082017104003.jpg
resizing images
(1, 49152)
('image matrix: ', array([[121, 108, 107, ...,  41,  31,  34]], dtype=uint8))
length is 0
[[121 108 107 ...,  41  31  34]]
downloading https://tookan.s3.amazonaws.com/task_images/jfVf1502174230389-photoName.jpeg
resizing images
(1, 49152)
('image matrix: ', array([[81, 89, 88, ..., 27, 24, 33]], dtype=uint8))
downloading https://tookan.s3.amazonaws.com/task_images/0T1r1502174379338-photoName.jpeg
resizing images
(1, 49152)
('image matrix: ', array([[209, 220, 234, ...,  40,  46,  53]], dtype=uint8))


In [6]:
# Show the stacked matrix followed by its shape, each under its own caption.
for caption, value in (("result:", imageNet_array),
                       ("shape:", imageNet_array.shape)):
    print(caption)
    print(value)

result:
[[121 108 107 ...,  41  31  34]
 [ 81  89  88 ...,  27  24  33]
 [209 220 234 ...,  40  46  53]]
shape:
(3, 49152)


In [7]:
# Round-trip check: write the stacked matrix to a binary .npy file, load it
# back and compare element by element.
# Rows are images; columns are the features of one image.
npy_path = 'testfile.npy'
np.save(npy_path, imageNet_array)
d = np.load(npy_path)
print(d.shape)
print(imageNet_array == d)

(3, 49152)
[[ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]]


### Convert 1000 front-side images to a stacked array
**imageNet_array: concatenated result of the 1000 images**

In [22]:
import time

# Convert every image listed in the CSV into one row of imageNet_array and
# report the total wall-clock time in seconds.
start = time.time()
print("Start processing")

with open('../outside_front_4000.csv', 'rU') as f:
    readCSV = csv.reader(f, delimiter=',')
    # Gather the per-image rows in a list and stack them once after the loop.
    # Calling np.concatenate on every iteration re-copies all rows collected
    # so far, which is quadratic in the number of images.
    image_rows = []

    print("start converting")
    i = 0
    for i, row in enumerate(readCSV, 1):
        # Column 1 holds the image URL.
        imageURL = row[1]
        image = url_to_image(imageURL)
        resized_image = resize_image(image)
        img_array = image_to_array(resized_image)

        # Progress report every 100 images.
        if i % 100 == 0:
            print("Processing the "+str(i)+"th image.")
            print("image matrix: " + str(img_array))

        # Echo the very first row, as a sanity check on the pipeline.
        if len(image_rows) == 0:
            print("length is 0")
            print(img_array)

        image_rows.append(img_array)

# An empty CSV leaves imageNet_array as an empty list; otherwise it is the
# (n_images, IMSIZE*IMSIZE*3) stack of all rows.
imageNet_array = np.concatenate(image_rows) if image_rows else []

end = time.time()
print("Complete processing:")
print(end - start)

Start processing
start converting
length is 0
[[121 108 107 ...,  41  31  34]]
Processing the 100th image.
image matrix: [[233 225 225 ...,  55  51  49]]
Processing the 200th image.
image matrix: [[197 198 194 ...,  97 107 114]]
Processing the 300th image.
image matrix: [[155 162 181 ..., 126 133 136]]
Processing the 400th image.
image matrix: [[255 255 255 ...,  62  52  45]]
Processing the 500th image.
image matrix: [[239 231 232 ...,   8   6  24]]
Processing the 600th image.
image matrix: [[176 162 150 ...,  17  12  13]]
Processing the 700th image.
image matrix: [[ 19 213 253 ..., 137 157 163]]
Processing the 800th image.
image matrix: [[164 136 135 ...,  15  29  35]]
Processing the 900th image.
image matrix: [[221 226 225 ...,  57  41  35]]
Processing the 1000th image.
image matrix: [[ 42  66  72 ...,  87  93 100]]
Complete processing:
2624.05435991


**Method 1: Save the result to a binary .npy file**

In [23]:
# Show the stacked matrix followed by its shape, each under its own caption.
for caption, value in (("result:", imageNet_array),
                       ("shape:", imageNet_array.shape)):
    print(caption)
    print(value)

result:
[[121 108 107 ...,  41  31  34]
 [ 81  89  88 ...,  27  24  33]
 [209 220 234 ...,  40  46  53]
 ..., 
 [ 19  15  40 ...,  27  40  56]
 [110 113 117 ...,  90  96 103]
 [ 42  66  72 ...,  87  93 100]]
shape:
(1000, 49152)


In [24]:
# Persist the stacked matrix as a binary .npy file, then load it back and
# compare element by element to confirm the round trip.
# Rows are images; columns are the features of one image.
npy_path = '1000_image.npy'
np.save(npy_path, imageNet_array)
d = np.load(npy_path)
print(d.shape)
print(imageNet_array == d)

(1000, 49152)
[[ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 ..., 
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]]


### Method 2: Save the result in HDF5 format (h5py)

In [37]:
from timeit import default_timer as timer

# NOTE(review): no statements run between the two timer() calls, so the value
# printed below is only the cost of calling the timer twice back to back.
start = timer()

end = timer()
print(end - start)     

3.09944152832e-05


In [43]:
import h5py

# Write the stacked image matrix to data.h5 as the dataset 'front_1000'.
# The with-block closes the file even if create_dataset raises.
with h5py.File('data.h5', 'w') as hf:
    # gzip at level 9 compresses the stored data; omit the two compression
    # arguments to store the dataset uncompressed.
    hf.create_dataset('front_1000', data=imageNet_array,
                      compression="gzip", compression_opts=9)

##### Reading HDF5 files

In [44]:
# Read the 'front_1000' dataset back from data.h5.
# The with-block closes the file even if the read raises.
with h5py.File('data.h5', 'r') as h5f:
    # [:] copies the whole dataset into an in-memory NumPy array, so `data`
    # stays usable after the file is closed.
    data = h5f['front_1000'][:]

In [45]:
# Exact round-trip check. np.array_equal requires identical shape and
# identical elements, whereas np.allclose compares within a floating-point
# tolerance and broadcasts mismatched shapes; the pixel data are uint8.
np.array_equal(imageNet_array, data)

True

In [46]:
# Display the array read back from the 'front_1000' dataset.
print(data)

[[121 108 107 ...,  41  31  34]
 [ 81  89  88 ...,  27  24  33]
 [209 220 234 ...,  40  46  53]
 ..., 
 [ 19  15  40 ...,  27  40  56]
 [110 113 117 ...,  90  96 103]
 [ 42  66  72 ...,  87  93 100]]
