# Convert image read from url in csv file to ImageNet Version 2
- **This file is intended to:**
    1. **Process and transform the filtered input CSV file into a formal input for machine learning.**
    2. **Augment the outlier data by applying filters and by fetching additional images online.**
   
- Apply different methods to process images: **Scipy, Scikit-image, PIL**

### 1 - Packages
- [numpy](https://www.numpy.org) is the main package for scientific computing with Python.
- [matplotlib](https://matplotlib.org) is a library to plot graphs in Python.
- [h5py](https://www.h5py.org) is a common package to interact with a dataset that is stored on an H5 file.
- [cv2](https://opencv.org/) (OpenCV) is a library of programming functions mainly aimed at real-time computer vision.
- [scipy](https://www.scipy.org) is a Python-based ecosystem of open-source software for mathematics, science, and engineering.
- [PIL](https://pillow.readthedocs.io/en/4.3.x/) is the Python Imaging Library by Fredrik Lundh and Contributors.
- [skimage](https://scikit-image.org/) is a collection of algorithms for image processing.

In [1]:
import csv
import numpy as np
import h5py
#import urllib
#import cv2

In [2]:
# METHOD #2: scikit-image
import scipy
from PIL import Image
from skimage import io
from scipy import ndimage
import matplotlib.pyplot as plt

## 2. Convert Function
1. Get image from reading url from csv file
2. Resize image
3. Convert image to numpy array with shape: (1, IMSIZE, IMSIZE, 3)

In [3]:
import timeit

# Target size (in pixels) that downloaded images are resized to
img_width = 64
img_height = 64

def file_to_imageArray(filename, end_idx):
    """Fetch the images listed in a CSV file and stack them into one array.

    Only the first ``end_idx`` rows of ``../images/<filename>`` are used. In
    each row, column 1 is read as the image URL and column 2 as the class
    label (0 or 1). Every image is downloaded, converted to RGB and resized
    to ``(img_width, img_height)``.

    Args:
        filename: Name of the CSV file inside the ``../images/`` folder.
        end_idx: Number of leading CSV rows to process.

    Returns:
        A list ``[images, labels]``: ``images`` is a uint8 array of shape
        ``(n, img_width, img_height, 3)`` and ``labels`` an integer array of
        shape ``(n,)``, where ``n`` is the number of rows processed.
    """
    start_time = timeit.default_timer()

    # Mode 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open('../images/' + str(filename), 'r') as f:
        rows = []
        for idx, row in enumerate(csv.reader(f, delimiter=',')):
            if idx >= end_idx:  # enough rows collected, stop reading
                break
            rows.append(row)

    resized_images = []  # one (img_width, img_height, 3) array per image
    labels = []          # class label of each image, in the same order

    print("Start Processing")

    for count, row in enumerate(rows, 1):
        label = row[2]     # class label 0 or 1
        imageURL = row[1]  # image url
        image = io.imread(imageURL)  # read image from url

        # scipy.misc.imresize was removed in SciPy 1.3, so the same bilinear
        # resize is done with Pillow. PIL sizes are (width, height), hence the
        # swapped tuple to obtain an array of shape (img_width, img_height, 3).
        # convert('RGB') makes grayscale / RGBA inputs fit the 3-channel layout.
        # NOTE(review): assumes io.imread returns uint8 data -- confirm.
        resized = Image.fromarray(image).convert('RGB').resize(
            (img_height, img_width), Image.BILINEAR)
        resized_images.append(np.array(resized))
        labels.append(label)

        if count % 50 == 0:
            print("Processed the "+str(count)+"th image.")

    if resized_images:
        # Stack once at the end instead of re-copying the whole array per image.
        images_array = np.stack(resized_images)
    else:
        # Keep the return type an array even when no row was processed.
        images_array = np.empty((0, img_width, img_height, 3), dtype=np.uint8)

    elapsed = timeit.default_timer() - start_time
    print("Complete processing:")
    print(str(elapsed/(60*60)) + "hr")
    # Built-in int replaces the np.int alias, which was removed in NumPy 1.24.
    return [images_array, np.array(labels).astype(int)]

#### Convert 50 Outside Front images to imageNets
**imageNet_array: concatenated result of 50 images**
- 25 image URLs with label 0 (normal data)
- 25 image URLs with label 1 (outlier data)


In [4]:
# Build the image array and labels from the first 50 rows of outside_front_50.csv
outside_front_50 = file_to_imageArray('outside_front_50.csv', 50)

Start Processing
Processed the 50th image.
Complete processing:
0.036642682751hr


In [5]:
# Unpack the [images, labels] pair returned for the 50-image CSV and report shapes
outside_front_X, outside_front_Y = outside_front_50
for caption, array in (("X shape:", outside_front_X), ("Y shape:", outside_front_Y)):
    print(caption + str(array.shape))

X shape:(50, 64, 64, 3)
Y shape:(50,)


#### Convert 100 Outside Front images to imageNets
**imageNet_array: concatenated result of 100 images**
- 50 image URLs with label 0 (normal data)
- 50 image URLs with label 1 (outlier data)

In [6]:
# Build the image array and labels from the first 100 rows of outside_front_100.csv
outside_front_100 = file_to_imageArray('outside_front_100.csv', 100)

Start Processing
Processed the 50th image.
Processed the 100th image.
Complete processing:
0.0667634130187hr


In [7]:
# Unpack the [images, labels] pair returned for the 100-image CSV, then
# report both shapes and dump the label vector
outside_front_100_X, outside_front_100_Y = outside_front_100
for caption, array in (("X shape:", outside_front_100_X), ("Y shape:", outside_front_100_Y)):
    print(caption + str(array.shape))
print(outside_front_100_Y)

X shape:(100, 64, 64, 3)
Y shape:(100,)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


#### Save to h5py file under group "outside_front_50"

In [8]:
# Create (or overwrite) data.h5 and store the 50-image set under its own group.
# The context manager closes the file even if one of the writes raises.
with h5py.File('data.h5', 'w') as f:
    group = f.create_group('outside_front_50')
    # compression="gzip", compression_opts=9 could be added to compress
    group.create_dataset('X', data=outside_front_X)
    group.create_dataset('Y', data=outside_front_Y)

In [9]:
# Read the 50-image set back into memory; [:] copies each dataset into a
# NumPy array so the data stays usable after the file is closed.
# The context manager closes the file even if a lookup raises.
with h5py.File('data.h5', 'r') as f:
    group = f['outside_front_50']
    X = group['X'][:]
    Y = group['Y'][:]

In [10]:
# Show the shapes of X and Y, then the label vector itself
print(X.shape, Y.shape)
print(Y)

((50, 64, 64, 3), (50,))
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


#### Save to h5py file under group "outside_front_100"

In [None]:
#f = h5py.File('data.h5','r+')
#del f['outside_front_100']
#print(f['outside_front_100'])


In [11]:
# Append the 100-image set to the existing data.h5 under its own group.
# The context manager closes the file even if one of the writes raises.
with h5py.File('data.h5', 'r+') as f:
    group = f.create_group('outside_front_100')
    # compression="gzip", compression_opts=9 could be added to compress
    group.create_dataset('X', data=outside_front_100_X)
    group.create_dataset('Y', data=outside_front_100_Y)

In [12]:
# Read the 100-image set back into memory; [:] copies each dataset into a
# NumPy array so the data stays usable after the file is closed.
# The context manager closes the file even if a lookup raises.
with h5py.File('data.h5', 'r') as f:
    group = f['outside_front_100']
    x = group['X'][:]
    y = group['Y'][:]

In [13]:
# Show the shapes of x and y, then the shape of y on its own line
print(x.shape, y.shape)
print(y.shape)

((100, 64, 64, 3), (100,))
(100,)


#### Convert 12330 unlabelled Outside Front images (not used)

In [None]:
# Build the image array and labels from the first 12330 rows of outside_front.csv
outside_front_12330 = file_to_imageArray('outside_front.csv', 12330)

In [None]:
# Unpack the [images, labels] pair returned for the 12330-image CSV and report shapes
outside_front_12330_X, outside_front_12330_Y = outside_front_12330
for caption, array in (("X shape:", outside_front_12330_X), ("Y shape:", outside_front_12330_Y)):
    print(caption + str(array.shape))

In [None]:
# Append the 12330-image set to data.h5, gzip-compressed at the highest level.
# The context manager closes the file even if one of the writes raises.
with h5py.File('data.h5', 'r+') as f:
    group = f.create_group('outside_front_12330')
    group.create_dataset('X', data=outside_front_12330_X,
                         compression="gzip", compression_opts=9)
    group.create_dataset('Y', data=outside_front_12330_Y,
                         compression="gzip", compression_opts=9)

In [None]:
# Read the 12330-image set back into memory; [:] copies each dataset into a
# NumPy array so the data stays usable after the file is closed.
# The context manager closes the file even if a lookup raises.
with h5py.File('data.h5', 'r') as f:
    group = f['outside_front_12330']
    X = group['X'][:]
    Y = group['Y'][:]

In [None]:
# Show the shapes of X and Y
print(X.shape, Y.shape)

## 3. Outlier data augmentation by adding a blur filter and enhancing contrast and brightness

### 3.1. Save 50 original cars images and 50 outlier images to local file 

In [84]:
def download(filename, end_idx, num_outliers=50):
    """Download the images listed in a CSV file and save them as local JPEGs.

    Only the first ``end_idx`` rows of the CSV are used; column 1 of each row
    is read as the image URL. The first ``num_outliers`` images are written to
    ``images/unsatisfied_images/outliers_orig_<k>.jpg`` and the remaining ones
    to ``images/satisfied_images/cars_orig_<k>.jpg``, with ``k`` starting at 1
    in each folder.

    Args:
        filename: Path of the CSV file to read.
        end_idx: Number of leading CSV rows to process.
        num_outliers: How many of the leading rows are saved as outliers
            (default 50, the split that was previously hard-coded).

    Returns:
        Always 1.
    """
    start_time = timeit.default_timer()

    # Mode 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open(str(filename), 'r') as f:
        rows = []
        for idx, row in enumerate(csv.reader(f, delimiter=',')):
            if idx >= end_idx:  # enough rows collected, stop reading
                break
            rows.append(row)

    for i, row in enumerate(rows, 1):
        imageURL = row[1]            # image url
        image = io.imread(imageURL)  # read image from url
        if i <= num_outliers:
            io.imsave('images/unsatisfied_images/outliers_orig_%d.jpg' % (i), image)
        else:
            io.imsave('images/satisfied_images/cars_orig_%d.jpg' % (i-num_outliers), image)

    elapsed = timeit.default_timer() - start_time
    print("Complete processing:")
    print(str(elapsed/(60*60)) + "hr")
    return 1

In [None]:
# Save the first 100 images of outside_front_100.csv locally; x receives
# download()'s return value
x = download('../images/outside_front_100.csv', 100)

### 3.2. Create and Save Blurred version of cars and outliers images

In [61]:
from PIL import Image, ImageFilter, ImageEnhance
def blur_image(filepath):
    """Open the image at `filepath` and return a copy with PIL's BLUR filter applied."""
    return Image.open(filepath).filter(ImageFilter.BLUR)

In [63]:
# Blur images 1..50 of both classes and store the results under images/blur/.
blur_dir = 'images/blur/'
blur_jobs = (
    # (source path pattern, output file name pattern)
    ('images/satisfied_images/cars_orig_%d.jpg', 'cars_blur_%d.jpg'),
    ('images/unsatisfied_images/outliers_orig_%d.jpg', 'outliers_blur_%d.jpg'),
)
for number in range(1, 51):
    for source_pattern, target_pattern in blur_jobs:
        blurred = blur_image(source_pattern % number)
        blurred.save(blur_dir + target_pattern % number)

### 3.3. Create and save brightness-adjusted versions of the cars and outliers images
- overexposed image
    - saved under the '**images/expose**' folder
- darker image
    - saved under the '**images/dark**' folder

In [82]:
def adjust_brightness(filepath):
    """Return a brightened and a darkened copy of the image at `filepath`.

    The result is the tuple ``(bright_image, dark_image)``, produced with
    PIL's brightness enhancer using factors 7.0 and 0.4 respectively.
    """
    bright_factor = 7.0  # factor > 1: brighter
    dark_factor = 0.4    # factor in range 0.0-1.0: darker
    enhancer = ImageEnhance.Brightness(Image.open(filepath))
    return enhancer.enhance(bright_factor), enhancer.enhance(dark_factor)

In [83]:
# Write a brightened and a darkened variant of images 1..50 of both classes.
bright_dir = 'images/expose/'
dark_dir = 'images/dark/'
brightness_jobs = (
    # (source path pattern, bright file name pattern, dark file name pattern)
    ('images/satisfied_images/cars_orig_%d.jpg',
     'cars_exposure_%d.jpg', 'cars_dark_%d.jpg'),
    ('images/unsatisfied_images/outliers_orig_%d.jpg',
     'outliers_exposure_%d.jpg', 'outliers_dark_%d.jpg'),
)
for number in range(1, 51):
    for source_pattern, bright_pattern, dark_pattern in brightness_jobs:
        exposed, darker = adjust_brightness(source_pattern % number)
        exposed.save(bright_dir + bright_pattern % number)
        darker.save(dark_dir + dark_pattern % number)

## 4. Generate Outliers data for train and test (Class: 1)

### 4.1. Add more outliers by reading images from outside_back.csv, outside_right.csv, outside_left.csv

In [96]:
def download_otherside_car_images(filename, start_idx, end_idx, outputfilename):
    """Download a slice of the images listed in a CSV file and save them as JPEGs.

    Rows ``start_idx`` (inclusive) to ``end_idx`` (exclusive) of
    ``../images/<filename>`` are used; column 1 of each row is read as the
    image URL. Images are written to
    ``images/otherside_car/<outputfilename>/<k>.jpg`` with ``k`` starting at 1.

    Args:
        filename: Name of the CSV file inside the ``../images/`` folder.
        start_idx: Index of the first CSV row to use.
        end_idx: Index one past the last CSV row to use.
        outputfilename: Sub-folder of ``images/otherside_car/`` to save into.

    Returns:
        Always 1.
    """
    start_time = timeit.default_timer()

    # Mode 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open('../images/' + str(filename), 'r') as f:
        rows = []
        for idx, row in enumerate(csv.reader(f, delimiter=',')):
            if idx >= end_idx:  # past the requested slice, stop reading
                break
            if idx >= start_idx:
                rows.append(row)

    for i, row in enumerate(rows, 1):
        imageURL = row[1]            # image url
        image = io.imread(imageURL)  # read image from url
        io.imsave('images/otherside_car/%s/%d.jpg' % (outputfilename, i), image)

    elapsed = timeit.default_timer() - start_time
    print("Complete processing:" + str(elapsed/(60*60)) + "hr")
    return 1

In [97]:
# Fetch rows 0-49 of each side-view CSV; outputfilename selects the target sub-folder
download_otherside_car_images('outside_back.csv',0,50, outputfilename = 'backside')
download_otherside_car_images('outside_left.csv',0,50, outputfilename ='leftside')
download_otherside_car_images('outside_right.csv',0,50,outputfilename ='rightside')

Complete processing:0.0390927216742hr
Complete processing:0.037032312221hr
Complete processing:0.0372867591514hr


1

## 4.2. Convert 450 outlier images under 'images' folder to array
### 360 for train, 90 for test. Label: 1
- 50 black image
- 50 exposed image (high brightness)
- 50 blurry image
- 50 incomplete car image
- 50 street/road/house image
- 200 otherside image
    - 50 backside
    - 50 leftside
    - 50 rightside
    - 50 inside


In [227]:
def image_to_imageArray(filenames, num_images=50):
    """Load numbered JPEGs from several folders as 64x64 RGB arrays.

    For every entry ``fn`` of `filenames`, the files ``images/<fn>/1.jpg`` to
    ``images/<fn>/<num_images>.jpg`` are opened, converted to RGB and resized
    to 64x64 pixels.

    Args:
        filenames: Folder names relative to ``images/``.
        num_images: Number of images to read per folder (default 50, the
            count that was previously hard-coded).

    Returns:
        A dict mapping ``str(fn)`` to a list of uint8 arrays of shape
        ``(64, 64, 3)``, in file-number order.
    """
    outlier_images = {}

    for fn in filenames:
        folder = str(fn)
        print(fn)
        arrays = []
        for number in range(1, num_images + 1):
            image = Image.open('images/' + folder + '/%d.jpg' % number)
            # scipy.misc.imresize was removed in SciPy 1.3, so the same
            # bilinear resize is done with Pillow directly. convert('RGB')
            # guarantees three channels for grayscale / palette / RGBA files.
            resized = image.convert('RGB').resize((64, 64), Image.BILINEAR)
            arrays.append(np.array(resized))
        outlier_images[folder] = arrays

    return outlier_images

In [228]:
# Folders (relative to images/) that hold the outlier pictures
filenames = ['blur', 'dark', 'expose', 'incomplete_car', 'street']
filenames += ['otherside_car/' + side
              for side in ('backside', 'leftside', 'rightside', 'inside')]
outliers_images = image_to_imageArray(filenames)

blur
dark
expose
incomplete_car
street
otherside_car/backside
otherside_car/leftside
otherside_car/rightside
otherside_car/inside


In [229]:
# Split every outlier category into a train part (first 40 images) and a test
# part (the remaining images), then merge all categories.
# Concatenating once over all categories always yields NumPy arrays; the
# incremental version left plain lists when `filenames` had a single entry,
# which broke the `.shape` lookups below.
outliers_for_train = np.concatenate(
    [outliers_images[str(fn)][:40] for fn in filenames])
outliers_for_test = np.concatenate(
    [outliers_images[str(fn)][40:] for fn in filenames])

# Class: Label 1
outliers_train_label = np.ones((outliers_for_train.shape[0],), dtype=int)
outliers_test_label = np.ones((outliers_for_test.shape[0],), dtype=int)
print(outliers_for_train.shape, outliers_for_test.shape, outliers_train_label.shape, outliers_test_label.shape)

((360, 64, 64, 3), (90, 64, 64, 3), (360,), (90,))


## 4.3. Convert 550 outside_front car images from csv file to array (Class: 0)

In [235]:
def file_to_array(filename, end_idx, num_train = 550):
    """Collect up to `num_train` label-0 images from a CSV file as one array.

    The first ``end_idx`` rows of ``../images/<filename>`` are scanned. In
    each row, column 1 is read as the image URL and column 2 as the integer
    class label. Rows whose label is not 0 are skipped; scanning stops once
    `num_train` images have been collected.

    Args:
        filename: Name of the CSV file inside the ``../images/`` folder.
        end_idx: Number of leading CSV rows to scan.
        num_train: Maximum number of label-0 images to collect.

    Returns:
        A tuple ``(images, labels)``: ``images`` is a uint8 array of shape
        ``(n, img_width, img_height, 3)`` and ``labels`` an all-zero integer
        array of shape ``(n,)``, where ``n`` is the number of images actually
        collected (at most `num_train`).
    """
    # Mode 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open('../images/' + str(filename), 'r') as f:
        rows = []
        for idx, row in enumerate(csv.reader(f, delimiter=',')):
            if idx >= end_idx:  # enough rows collected, stop reading
                break
            rows.append(row)

    collected = []  # one (img_width, img_height, 3) array per accepted image
    for row in rows:
        if len(collected) >= num_train:
            break
        label = int(row[2])  # class label 0 or 1
        imageURL = row[1]    # image url
        if label != 0:       # only process image with label 0 (satisfied car image)
            continue
        image = io.imread(imageURL)  # read image from url
        # scipy.misc.imresize was removed in SciPy 1.3, so the same bilinear
        # resize is done with Pillow. PIL sizes are (width, height), hence the
        # swapped tuple to obtain an array of shape (img_width, img_height, 3).
        # convert('RGB') makes grayscale / RGBA inputs fit the 3-channel layout.
        # NOTE(review): assumes io.imread returns uint8 data -- confirm.
        resized = Image.fromarray(image).convert('RGB').resize(
            (img_height, img_width), Image.BILINEAR)
        collected.append(np.array(resized))

    if collected:
        # Stack once at the end instead of re-copying the whole array per image.
        images_array = np.stack(collected)
    else:
        # Keep the return type an array even when nothing was collected.
        images_array = np.empty((0, img_width, img_height, 3), dtype=np.uint8)

    # One label per collected image, so both arrays stay aligned even when
    # fewer than num_train label-0 rows were available.  Class: Label 0
    labels = np.zeros((len(collected),), dtype='int')
    return images_array, labels

In [236]:
# Collect up to 550 label-0 images from the first 900 rows of outside_front_0.csv
normal_x, normal_y = file_to_array('outside_front_0.csv', end_idx = 900, num_train = 550)

In [242]:
# Split dataset to train and test dataset
normal_train_x = normal_x[:440]
normal_train_y = normal_y[:440]
normal_test_x = normal_x[440:]
normal_test_y = normal_y[440:]
print(normal_train_x.shape, normal_train_y.shape, normal_test_x.shape, normal_test_y.shape )
#print(str(normal_test_y))

((440, 64, 64, 3), (440,), (110, 64, 64, 3), (110,))


### 4.4 Concatenate normal data and outliers to Train dataset
- Concatenate `normal_data_for_train` and `outliers_data_for_train` together
- Concatenate `normal_data_for_test` and `outliers_data_for_test` together

In [243]:
# Train set: normal samples followed by outliers (images and labels in the same order)
x_train = np.concatenate((normal_train_x, outliers_for_train), axis=0)
y_train = np.concatenate((normal_train_y, outliers_train_label), axis=0)
# Test set: built the same way
x_test = np.concatenate((normal_test_x, outliers_for_test), axis=0)
y_test = np.concatenate((normal_test_y, outliers_test_label), axis=0)
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape )
print(y_test)

((800, 64, 64, 3), (800,), (200, 64, 64, 3), (200,))
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


### 4.5. Save train and test data to h5py file

In [244]:
# Remove a previously saved 'outside_front_1000' group so it can be rewritten.
# The context manager closes the file again (it used to stay open while the
# next cell reopened it), and the membership test avoids a KeyError when the
# group does not exist yet.
with h5py.File('data.h5', 'r+') as f:
    if 'outside_front_1000' in f:
        del f['outside_front_1000']

In [245]:
# Write to h5py file
f = h5py.File('data.h5','r+')
group=f.create_group('outside_front_1000')
group.create_dataset('X_train', data = x_train)    # could add ‘compression="gzip", compression_opts=9’ to compress
group.create_dataset('Y_train', data = y_train)
group.create_dataset('X_test', data = x_test) 
group.create_dataset('Y_test', data = y_test) 
f.close()

In [246]:
# Read data
f = h5py.File('data.h5','r')
group = f['outside_front_1000']
X_train = group['X_train'][:]
Y_train = group['Y_train'][:]
X_test = group['X_test'][:]
Y_test = group['Y_test'][:]
f.close()

In [247]:
# Show the shapes of the four train / test arrays
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

((800, 64, 64, 3), (800,), (200, 64, 64, 3), (200,))


In [251]:
# Dump the train labels, then the test labels
for label_vector in (Y_train, Y_test):
    print(str(label_vector.T))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 