# Convert image read from url in csv file to ImageNet
**This file is intended to process and transform the filtered input CSV file into a formal input for machine learning.**
1. Read each image URL from the CSV file and download the image
2. Resize image
3. Convert image to numpy array with shape: (1, IMSIZE, IMSIZE, 3)

### 1 - Packages
- [numpy](http://www.numpy.org) is the main package for scientific computing with Python.
- [matplotlib](http://matplotlib.org) is a library to plot graphs in Python.
- [h5py](http://www.h5py.org) is a common package to interact with a dataset that is stored on an H5 file.
- [cv2](http://opencv.org/) OpenCV is a library of programming functions mainly aimed at real-time computer vision. 

In [67]:
import csv
import numpy as np
import h5py
import urllib
import cv2

### 2 - Image URL to 4-D Numpy Matrix
#### 2.1 - URL to Image

In [72]:
# use OpenCV, NumPy, and urllib
def url_to_image(url):
    """Download the image at ``url`` and decode it into an OpenCV array.

    Parameters
    ----------
    url : str
        Address of an encoded image file (the CSV rows in this notebook
        point at JPEG files).

    Returns
    -------
    The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` applied to the
    downloaded bytes.

    NOTE(review): OpenCV documents that imdecode yields an empty result
    (``None`` in Python) when the payload is not a decodable image; this
    function passes that value through unchanged -- confirm callers can
    cope before running a long batch.
    """
    resp = urllib.urlopen(url)
    try:
        payload = resp.read()
    finally:
        # Always release the connection: this function is called once per
        # CSV row, so an unclosed response would leak one socket per image.
        resp.close()

    # Wrap the raw bytes in a 1-D uint8 array, the buffer form imdecode takes.
    encoded = np.asarray(bytearray(payload), dtype="uint8")
    return cv2.imdecode(encoded, cv2.IMREAD_COLOR)

#### 2.2 - Resize Image

In [73]:
# Side length, in pixels, of the square images produced below
# (options include 32, 64, 128 or 256).
IMSIZE = 128


def resize_image(image):
    """Scale ``image`` to IMSIZE x IMSIZE pixels using ``cv2.resize``.

    Both target dimensions are IMSIZE regardless of the input's
    proportions, so the aspect ratio is not preserved.
    """
    target_dims = (IMSIZE, IMSIZE)
    return cv2.resize(image, target_dims)

#### 2.3 - Convert image to 4 dimensional numpy matrix

In [74]:
def image_to_array(image):
    """Return ``image`` as a batch of one with shape (1, IMSIZE, IMSIZE, 3).

    The leading axis of length 1 is what lets the per-image results be
    stacked with ``np.concatenate``. ``reshape`` raises ``ValueError`` when
    ``image`` does not hold exactly IMSIZE * IMSIZE * 3 values.
    """
    batch_shape = (1, IMSIZE, IMSIZE, 3)
    pixels = np.asarray(image)
    return pixels.reshape(batch_shape)

### Test: process 3 examples and concatenate the results

In [5]:
# Smoke test: run the full pipeline (download -> resize -> array) on the
# first 3 rows of the CSV and stack the per-image arrays along axis 0.
with open('../outside_front_4000.csv', 'rU') as f:
    readCSV = csv.reader(f, delimiter=',')
    imageNet_array = []

    i = 0
    for row in readCSV:
        # Stop once 3 samples are done instead of iterating over (and
        # ignoring) every remaining row of the file.
        if i > 2:
            break

        label = row[0]      # read for reference only; not used below
        imageURL = row[1]
        i = i + 1
        print("downloading %s" % (imageURL))
        image = url_to_image(imageURL)

        print("resizing images")
        resized_image = resize_image(image)

        img_array = image_to_array(resized_image)
        print(img_array.shape)
        # Under Python 2 this prints the 2-tuple (caption, array).
        print("image matrix: ", img_array)

        if len(imageNet_array) == 0:
            # First sample: it becomes the initial batch.
            imageNet_array = img_array
            print("length is 0")
            print(imageNet_array)
        else:
            # Later samples are appended along axis 0.
            imageNet_array = np.concatenate([imageNet_array, img_array])

downloading https://tookan.s3.amazonaws.com/task_images/xaOq1502152810748-TOOKAN08082017104003.jpg
resizing images
(1, 49152)
('image matrix: ', array([[121, 108, 107, ...,  41,  31,  34]], dtype=uint8))
length is 0
[[121 108 107 ...,  41  31  34]]
downloading https://tookan.s3.amazonaws.com/task_images/jfVf1502174230389-photoName.jpeg
resizing images
(1, 49152)
('image matrix: ', array([[81, 89, 88, ..., 27, 24, 33]], dtype=uint8))
downloading https://tookan.s3.amazonaws.com/task_images/0T1r1502174379338-photoName.jpeg
resizing images
(1, 49152)
('image matrix: ', array([[209, 220, 234, ...,  40,  46,  53]], dtype=uint8))


In [6]:
# Show the stacked test batch, then its shape (axis 0 is the number of images).
print("result:") 
print(imageNet_array)
print("shape:")          
print(imageNet_array.shape)

result:
[[121 108 107 ...,  41  31  34]
 [ 81  89  88 ...,  27  24  33]
 [209 220 234 ...,  40  46  53]]
shape:
(3, 49152)


In [7]:
# Round-trip check: save the stacked array as a binary .npy file and load it back.
# Axis 0 indexes the images (the number of training examples);
# the remaining axes hold each image's pixel values.
np.save('testfile.npy', imageNet_array)
d = np.load('testfile.npy')
print(d.shape)
# NOTE(review): this prints the element-wise comparison, which NumPy abbreviates
# with "..." for large arrays; np.array_equal(imageNet_array, d) would give a
# single boolean covering every element.
print(imageNet_array == d)

(3, 49152)
[[ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]]


### Convert the 12329 front-side images to image arrays
**imageNet_array: concatenated result of the 12329 images**

In [79]:
import time
from itertools import islice

start = time.time()

# Read only the rows that will be processed, then close the file before the
# long download phase. islice stops after the first 12329 rows (indices
# 0..12328), replacing a per-row membership test against a freshly built list.
with open('../images/outside_front.csv', 'rU') as f:
    readCSV = csv.reader(f, delimiter=',')
    interestingrows = list(islice(readCSV, 12329))

print("Start Processing")

# Collect the per-image (1, IMSIZE, IMSIZE, 3) arrays and join them once at
# the end; concatenating inside the loop recopies the whole accumulated
# array for every image.
batches = []
for i, row in enumerate(interestingrows, 1):
    imageURL = row[1]   # column 1 holds the image URL

    image = url_to_image(imageURL)
    resized_image = resize_image(image)
    img_array = image_to_array(resized_image)

    # Progress report every 120 images.
    if i % 120 == 0:
        print("Processed the "+str(i)+"th image.")

    batches.append(img_array)

# With no rows the result stays an empty list, as before.
imageNet_array = np.concatenate(batches) if batches else []
del batches   # drop the per-image copies now that they are merged

end = time.time()
print("Complete processing:")
print(end - start)

Start Processing
Processed the 120th image.
Processed the 240th image.
Processed the 360th image.
Processed the 480th image.
Processed the 600th image.
Processed the 720th image.
Processed the 840th image.
Processed the 960th image.
Processed the 1080th image.
Processed the 1200th image.
Processed the 1320th image.
Processed the 1440th image.
Processed the 1560th image.
Processed the 1680th image.
Processed the 1800th image.
Processed the 1920th image.
Processed the 2040th image.
Processed the 2160th image.
Processed the 2280th image.
Processed the 2400th image.
Processed the 2520th image.
Processed the 2640th image.
Processed the 2760th image.
Processed the 2880th image.
Processed the 3000th image.
Processed the 3120th image.
Processed the 3240th image.
Processed the 3360th image.
Processed the 3480th image.
Processed the 3600th image.
Processed the 3720th image.
Processed the 3840th image.
Processed the 3960th image.
Processed the 4080th image.
Processed the 4200th image.
Processed t

**Method 1: Save result to a binary .npy file**

In [80]:
# Show the full stacked array, then its shape (axis 0 is the number of images).
print("result:") 
print(imageNet_array)
print("shape:")          
print(imageNet_array.shape)

result:
[[[[123 118 120]
   [127 146 154]
   [ 96 120 137]
   ..., 
   [118 116 125]
   [224 222 231]
   [228 225 227]]

  [[127 122 124]
   [131 151 158]
   [ 87 112 129]
   ..., 
   [127 132 140]
   [148 153 161]
   [129 130 137]]

  [[104  99 100]
   [121 141 148]
   [ 91 116 133]
   ..., 
   [ 53  64  71]
   [ 55  66  72]
   [ 92  97 109]]

  ..., 
  [[ 70  72  80]
   [ 84  86  94]
   [ 74  76  84]
   ..., 
   [ 65  71  78]
   [ 67  73  80]
   [ 68  70  78]]

  [[ 77  79  87]
   [ 68  70  78]
   [ 56  58  66]
   ..., 
   [110 116 123]
   [ 62  68  75]
   [ 62  64  72]]

  [[ 62  65  69]
   [ 70  73  77]
   [ 64  67  71]
   ..., 
   [130 134 145]
   [ 58  62  73]
   [ 67  69  77]]]


 [[[139 151 170]
   [117 124 137]
   [ 85  92 105]
   ..., 
   [157 150 147]
   [151 144 141]
   [ 75  78  82]]

  [[ 58  51  52]
   [ 49  50  63]
   [ 78  78  89]
   ..., 
   [155 148 145]
   [151 144 141]
   [ 60  63  67]]

  [[ 69  55  56]
   [100 110 144]
   [105 112 142]
   ..., 
   [153 146 143]
 

In [81]:
# Save the stacked array as a binary .npy file and load it back to verify.
# Axis 0 indexes the images (the number of training examples);
# the remaining axes hold each image's pixel values.
# NOTE(review): the file name says 12330, but the conversion loop reads
# 12329 rows -- confirm which count the name is meant to carry.

# np.save('1000_image.npy', imageNet_array)
# d = np.load('1000_image.npy')
np.save('12330_outside_front.npy', imageNet_array)
d = np.load('12330_outside_front.npy')
print(d.shape)
# NOTE(review): this prints the element-wise comparison, which NumPy abbreviates
# with "..." for large arrays; np.array_equal(imageNet_array, d) would give a
# single boolean covering every element.
print(imageNet_array == d)

(12329, 128, 128, 3)
[[[[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ..., 
   [ True  True  True]
   [ True  True  True]
   [ True  True  True]]

  [[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ..., 
   [ True  True  True]
   [ True  True  True]
   [ True  True  True]]

  [[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ..., 
   [ True  True  True]
   [ True  True  True]
   [ True  True  True]]

  ..., 
  [[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ..., 
   [ True  True  True]
   [ True  True  True]
   [ True  True  True]]

  [[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ..., 
   [ True  True  True]
   [ True  True  True]
   [ True  True  True]]

  [[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ..., 
   [ True  True  True]
   [ True  True  True]
   [ True  True  True]]]


 [[[ True  True  True]
   [ True  True  True]
   [ True  True  True]
   ...

### Method 2: Save result to HDF5 format (h5py)

In [82]:
import h5py

# Mode 'a' opens data.h5 read/write when it exists and creates it otherwise,
# so this cell also works on a fresh checkout (mode 'r+' requires the file to
# be there already). The with-block closes the file even when create_dataset
# raises.
# NOTE(review): h5py is expected to raise if a dataset with this name already
# exists in the file, e.g. when the cell is re-run -- confirm and delete the
# old dataset first if overwriting is intended.
with h5py.File('data.h5', 'a') as hf:
    # gzip at level 9 trades write speed for the smallest file.
    hf.create_dataset('wipehero_outside_front_12330', data=imageNet_array,
                      compression="gzip", compression_opts=9)

##### Reading HDF5 files

In [84]:
# Open read-only; the with-block closes the file even if the dataset lookup
# raises (for example when the name is missing).
with h5py.File('data.h5', 'r') as h5f:
    # [:] reads the whole dataset into memory, so data2 remains usable after
    # the file is closed.
    data2 = h5f['wipehero_outside_front_12330'][:]

In [85]:
# The pixel data is integer-valued (uint8), so the round trip must be exact:
# compare with array_equal (same shape, every element equal) rather than the
# floating-point tolerance test of allclose.
np.array_equal(imageNet_array, data2)

True

In [86]:
# Show the shape of the array read back from data.h5.
print(data2.shape)

(12329, 128, 128, 3)
