<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part One: Data Preparation </h2>	


<h3>  Import Statements </h3>	


In [1]:
#Import anything you need here
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import skimage
from skimage import data
from skimage import io
import os
import re
import time
import pickle
import ray
# Start Ray with the web UI disabled and 4 CPUs; the @ray.remote image-loading
# task defined in this notebook is scheduled on these resources.
ray.init(include_webui=False, num_cpus=4)

Process STDOUT and STDERR is being redirected to /tmp/raylogs/.
Waiting for redis server at 127.0.0.1:47834 to respond...
Waiting for redis server at 127.0.0.1:41222 to respond...
Starting the Plasma object store with 6.00 GB memory.
Starting local scheduler with the following resources: {'CPU': 4, 'GPU': 0}.


{'local_scheduler_socket_names': ['/tmp/scheduler2761657'],
 'node_ip_address': '10.142.159.205',
 'object_store_addresses': [ObjectStoreAddress(name='/tmp/plasma_store9863268', manager_name='/tmp/plasma_manager59669040', manager_port=40436)],
 'raylet_socket_names': [],
 'redis_address': '10.142.159.205:47834',
 'webui_url': ''}

<h4> Using the code for reference, take the given folder and create a dataframe with the picture object, and the encoding as listed below. </h4>	
0=Airplanes, 1=Bear, 2=Blimp, 3=Comet, 4=Crab, 5=Dog, 6=Dolphin, 7=Giraffe, 8=Goat, 9=Gorilla, 10=Kangaroo, 11=Killer-Whale, 12=Leopards, 13=Llama, 14=Penguin, 15=Porcupine, 16=Teddy-Bear, 17=Triceratops, 18=Unicorn, 19=Zebra

In [2]:
def get_transformed_image_from_path(file_path, outFile_size):
    """
    Read an image from 'file_path' and return its resized form and its
    original aspect ratio.

    input:  file_path = string path of the image file to read
            outFile_size = output shape handed to skimage.transform.resize,
                           eg. (128, 128)

    returns: tuple (transformed_image, aspect)
                transformed_image = the image resized to outFile_size; the
                                    resize is done with preserve_range=True,
                                    so pixel values are not rescaled
                aspect = image.shape[1] / image.shape[0] of the image before
                         resizing (columns over rows, i.e. width / height)
    """
    # Import the submodule explicitly: the notebook only imports `skimage`,
    # `skimage.data` and `skimage.io`, which does not guarantee that
    # `skimage.transform` is reachable as an attribute on every
    # scikit-image version.
    from skimage import transform

    image = io.imread(file_path)
    aspect = image.shape[1] / image.shape[0]

    transformed_image = transform.resize(image, outFile_size,
                                         preserve_range=True)
    return transformed_image, aspect

@ray.remote(num_return_vals=2)
def get_transformed_image_from_path_remote(file_path, outFile_size):
    """
    Ray remote task wrapping get_transformed_image_from_path().

    Declared with num_return_vals=2, so the resized image and the aspect
    ratio are exposed as two separate return values.
    """
    resized_image, aspect_ratio = get_transformed_image_from_path(
        file_path, outFile_size)
    return resized_image, aspect_ratio

def getCategoryEncodings():
    """
    Build the lookup from class name to integer label.

    The names are listed in the order given by the assignment
    (0=Airplanes ... 19=Zebra) and are lower-cased before being used as
    dictionary keys.

    returns: dict mapping each lower-case category name to its encoding
    """
    ordered_names = (
        "Airplanes", "Bear", "Blimp", "Comet", "Crab",
        "Dog", "Dolphin", "Giraffe", "Goat", "Gorilla",
        "Kangaroo", "Killer-Whale", "Leopards", "Llama", "Penguin",
        "Porcupine", "Teddy-Bear", "Triceratops", "Unicorn", "Zebra",
    )
    # The position in the tuple is the class label.
    return {name.lower(): label for label, name in enumerate(ordered_names)}

def read_organize_data(dir_path, outFile_size):
    """
    Given a root directory path and an output image size,
    return a DataFrame with all the images resized to outFile_size
    together with the encodings of their class labels.

    The root directory is expected to contain one sub-directory per class,
    named after the class (matched case-insensitively against
    getCategoryEncodings()). Hidden entries (names starting with '.') and
    regular files directly under the root are ignored. Directories and files
    are visited in sorted order so the row order is reproducible.

    input:  dir_path = string path of root directory (with or without a
                       trailing separator)
            outFile_size = tuple of final resized image eg. (128, 128)

    returns: DataFrame with columns
                'image' has input image resized to outFile_size
                'class' class label/encoding for the class
                'filename' the filename of the input image
                'aspect' aspect ratio of the images before resizing

    raises: KeyError if a class sub-directory has no known encoding
    """
    category_encodings = getCategoryEncodings()

    image_refs, encodings, aspect_refs, filenames = [], [], [], []

    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for category in sorted(os.listdir(dir_path)):
        category_path = os.path.join(dir_path, category)

        # Protect against temporary/hidden entries and stray files in the root.
        if category.startswith('.') or not os.path.isdir(category_path):
            continue

        # The encoding dictionary is keyed by lower-case class names.
        category_key = category.lower()
        if category_key not in category_encodings:
            raise KeyError("No class encoding defined for directory "
                           "'{}'".format(category_path))
        file_encode = category_encodings[category_key]

        for file_name in sorted(os.listdir(category_path)):
            if file_name.startswith('.'):
                continue

            file_path = os.path.join(category_path, file_name)
            # .remote() only schedules the work and hands back two
            # references; the actual values are fetched with ray.get() below.
            image_ref, aspect_ref = \
                get_transformed_image_from_path_remote.remote(file_path,
                                                              outFile_size)

            filenames.append(file_name)
            encodings.append(file_encode)
            image_refs.append(image_ref)
            aspect_refs.append(aspect_ref)

    image_df = pd.DataFrame(columns=['image', 'class'])
    image_df['filename'] = filenames
    image_df['class'] = encodings
    # Collect the results of all scheduled tasks.
    image_df['image'] = ray.get(image_refs)
    image_df['aspect'] = ray.get(aspect_refs)

    return image_df

In [3]:
# Root folder of the labelled training images and the common size every
# image is resized to.
data_dir = "data/20_categories_training/"
resize_to = (128, 128)

# Wall-clock timing of the complete training-set load.
start_time = time.time()
starting_data = read_organize_data(data_dir, resize_to)
end_time = time.time()

elapsed_seconds = end_time - start_time
print("Time to compute results: {} seconds".format(elapsed_seconds))

Time to compute results: 20.299235820770264 seconds


In [4]:
# Store this DataFrame as a pickle file so that future notebooks can read and
# process it without re-loading and resizing every image.
# NOTE(review): with a ".pkl" path and default arguments, pandas' to_pickle
# should write the file uncompressed; confirm if compression is wanted.
starting_data.to_pickle("./starting_data_part1.pkl")

In [7]:

val_data_dir = "data/20_Validation/"
resize_to = (128, 128)

# First run of digits in a file name; used to order the validation images.
reg = r"\d+"

# The image references are deliberately NOT stored in a global called `data`:
# that name is bound to the `skimage.data` module by the import cell and
# would be overwritten.
val_image_refs, fileNumber, aspects, filenames = [], [], [], []

for file_name in os.listdir(val_data_dir):
    if file_name.startswith('.'):  # skip hidden/temporary files
        continue

    number_match = re.search(reg, file_name)
    if number_match is None:
        # Without a number the image cannot be placed in the ordering below.
        raise ValueError("Validation file name '{}' contains no number to "
                         "order it by".format(file_name))

    file_path = os.path.join(val_data_dir, file_name)
    # .remote() only schedules the work and hands back two references;
    # the actual values are fetched with ray.get() below.
    image_ref, aspect_ref = \
        get_transformed_image_from_path_remote.remote(file_path, resize_to)

    filenames.append(file_name)
    fileNumber.append(int(number_match.group(0)))
    val_image_refs.append(image_ref)
    aspects.append(aspect_ref)

val_image_df = pd.DataFrame(columns=['image'])
val_image_df["filenumber"] = fileNumber
val_image_df['filename'] = filenames
val_image_df['image'] = ray.get(val_image_refs)
val_image_df['aspect'] = ray.get(aspects)

# Order the rows by the number embedded in each file name, renumber the index
# and keep only the resized image and its original aspect ratio.
test_df = (val_image_df.sort_values(by="filenumber")
                       .reset_index(drop=True)
                       .drop(columns=["filename", "filenumber"]))
test_df.head()

Unnamed: 0,image,aspect
0,"[[[55.09130859374934, 55.08886718749936, 63.08...",0.666667
1,"[[[9.960937500000142, 14.941406250000213, 18.9...",1.897638
2,"[[[250.016326904297, 249.016326904297, 254.632...",0.853868
3,"[[[95.7189331054702, 74.06268310546987, 76.031...",1.524194
4,"[[[24.203125, 51.203125, 61.82855224609375], [...",0.665714


In [8]:
# Write the validation images, ordered by file number (columns: image,
# aspect), to a pickle file.
test_df.to_pickle("./testing_data_part1.pkl")