In [16]:
import pandas as pd
import numpy as np
from azure.storage.blob import BlobServiceClient
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.image import load_img, img_to_array
import json
import os
from PIL import Image
import matplotlib.pyplot as plt
import io  # Used to convert bytes to a file-like object
from PIL import Image



# Parse Images to bring them into the right format for model training

### 1. Access to Azure data storage

In [17]:
#set up storage
# Read the connection string from the environment so the secret never lives in the
# notebook and the cell still runs on a fresh kernel (raises KeyError if it is unset).
connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
container_name = "publicdata"

#create client
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)

### 2. Encode labels in the right way

In [18]:
# Column order used when packing the per-class indicator columns into one label vector.
order_of_labels = ["Human", "Barrier", "Bike", "Motorcycle", "Truck", "Car", "Trafficcone", "NoDetec"]

# Load the labeled table and discard the index column that was saved along with it.
preprocessed = pd.read_csv('labeled_data.csv').drop(columns=['Unnamed: 0'])

# One list per row, holding that row's values of the label columns in the order above.
preprocessed['Labels'] = preprocessed[order_of_labels].values.tolist()

preprocessed

Unnamed: 0,filename,Human,Barrier,Bike,Motorcycle,Truck,Car,Trafficcone,NoDetec,Labels
0,samples/CAM_BACK/n003-2018-01-03-12-03-23+0800...,1,0,0,1,1,1,0,0,"[1, 0, 0, 1, 1, 1, 0, 0]"
1,samples/CAM_BACK/n003-2018-01-08-11-30-34+0800...,1,1,0,1,1,1,0,0,"[1, 1, 0, 1, 1, 1, 0, 0]"
2,samples/CAM_BACK/n003-2018-07-12-15-40-35+0800...,1,0,0,1,0,1,0,0,"[1, 0, 0, 1, 0, 1, 0, 0]"
3,samples/CAM_BACK/n005-2018-06-14-20-11-03+0800...,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 0]"
4,samples/CAM_BACK/n008-2018-06-04-16-30-00-0400...,0,0,0,0,1,1,0,0,"[0, 0, 0, 0, 1, 1, 0, 0]"
5,samples/CAM_BACK/n009-2018-05-08-15-52-41-0400...,0,0,1,0,1,1,0,0,"[0, 0, 1, 0, 1, 1, 0, 0]"
6,samples/CAM_BACK/n010-2018-08-27-12-00-23+0800...,1,1,0,0,1,1,0,0,"[1, 1, 0, 0, 1, 1, 0, 0]"
7,samples/CAM_BACK/n013-2018-08-20-14-38-24+0800...,0,0,1,0,1,1,0,0,"[0, 0, 1, 0, 1, 1, 0, 0]"
8,samples/CAM_BACK/n013-2018-08-21-11-46-25+0800...,1,0,0,0,0,1,0,0,"[1, 0, 0, 0, 0, 1, 0, 0]"
9,samples/CAM_BACK/n013-2018-08-28-16-04-27+0800...,1,0,0,0,0,1,0,0,"[1, 0, 0, 0, 0, 1, 0, 0]"


### 3. Define functions for loading, preprocessing and encoding an image based on its filename

In [36]:
import io
from PIL import Image
import numpy as np
import zlib
# Blob path of one sample image; passed to from_path_to_encoded below as a quick check.
# NOTE(review): despite its name, this variable holds a path string, not image data.
image = "resized_samples/CAM_BACK_LEFT/n013-2018-08-03-14-44-49+0800__CAM_BACK_LEFT__1533278795447155.jpg"

def load_image(path):
    """Download a blob and return its raw bytes as an in-memory, file-like object.

    Args:
        path: Name (path) of the blob inside the container behind the
            module-level ``container_client``.

    Returns:
        io.BytesIO: The complete blob content, positioned at the start. The
        bytes are not decoded here; the caller opens them (e.g. with
        ``PIL.Image.open``).
    """
    blob_client = container_client.get_blob_client(path)
    # readall() downloads the whole blob and returns it as a single bytes object.
    blob_bytes = blob_client.download_blob().readall()
    # Wrap the bytes so they can be consumed like a file. The previous version also
    # opened a PIL image here and threw it away; that dead step is removed.
    return io.BytesIO(blob_bytes)

def preprocess_ecode(image_data):
    """Decode an image, resize it to 256x256, convert it to grayscale and scale it by 1/255.

    Args:
        image_data: Anything ``PIL.Image.open`` accepts (here: the file-like
            object holding the downloaded image bytes).

    Returns:
        numpy.ndarray: The grayscale pixel values divided by 255.0.
    """
    # Resize first, then collapse to a single luminance channel ("L" mode).
    gray = Image.open(image_data).resize((256, 256)).convert("L")

    # Pixel values -> NumPy array, divided by 255.0.
    return np.array(gray) / 255.0


def from_path_to_encoded(path):
    """Fetch the image stored at ``path`` and return its preprocessed array.

    Thin pipeline: ``load_image`` downloads the data, ``preprocess_ecode``
    turns it into the encoded array, which is returned unchanged.
    """
    return preprocess_ecode(load_image(path))

#test: encode the sample image and confirm the result has the expected shape
x = from_path_to_encoded(image)
assert x.shape == (256, 256), x.shape
# Last expression of the cell, so the shape is what gets displayed.
x.shape

(256, 256)

### 4. Apply the encoding function to each row

In [35]:
# Run the download-and-encode pipeline once per filename (one blob download per row).
encoded_images = preprocessed["filename"].apply(from_path_to_encoded)
# Store the resulting arrays next to the labels.
preprocessed["image"] = encoded_images
preprocessed

[[0.28235294 0.30196078 0.30196078 ... 0.70588235 0.70980392 0.61176471]
 [0.26666667 0.29803922 0.29411765 ... 0.70196078 0.70980392 0.61176471]
 [0.23529412 0.29411765 0.29411765 ... 0.69803922 0.65882353 0.56078431]
 ...
 [0.50196078 0.50196078 0.50588235 ... 0.46666667 0.4745098  0.41176471]
 [0.51372549 0.51372549 0.51764706 ... 0.4745098  0.48235294 0.42352941]
 [0.38823529 0.38823529 0.38823529 ... 0.36078431 0.36470588 0.3254902 ]]
[[0.21960784 0.22745098 0.23921569 ... 0.36078431 0.34509804 0.30196078]
 [0.21960784 0.21960784 0.23137255 ... 0.34901961 0.34901961 0.29411765]
 [0.22745098 0.21960784 0.22352941 ... 0.34509804 0.34117647 0.28627451]
 ...
 [0.62352941 0.62745098 0.63529412 ... 0.67843137 0.67843137 0.59215686]
 [0.64313725 0.64313725 0.65490196 ... 0.67843137 0.68627451 0.6       ]
 [0.47843137 0.48235294 0.49019608 ... 0.50980392 0.51372549 0.45490196]]
[[0.46666667 0.4627451  0.4627451  ... 0.75686275 0.76470588 0.6745098 ]
 [0.4627451  0.46666667 0.47058824 ... 

Unnamed: 0,filename,Human,Barrier,Bike,Motorcycle,Truck,Car,Trafficcone,NoDetec,Labels,image
0,samples/CAM_BACK/n003-2018-01-03-12-03-23+0800...,1,0,0,1,1,1,0,0,"[1, 0, 0, 1, 1, 1, 0, 0]","[[0.2823529411764706, 0.30196078431372547, 0.3..."
1,samples/CAM_BACK/n003-2018-01-08-11-30-34+0800...,1,1,0,1,1,1,0,0,"[1, 1, 0, 1, 1, 1, 0, 0]","[[0.2196078431372549, 0.22745098039215686, 0.2..."
2,samples/CAM_BACK/n003-2018-07-12-15-40-35+0800...,1,0,0,1,0,1,0,0,"[1, 0, 0, 1, 0, 1, 0, 0]","[[0.4666666666666667, 0.4627450980392157, 0.46..."
3,samples/CAM_BACK/n005-2018-06-14-20-11-03+0800...,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 1, 0, 0]","[[0.06666666666666667, 0.047058823529411764, 0..."
4,samples/CAM_BACK/n008-2018-06-04-16-30-00-0400...,0,0,0,0,1,1,0,0,"[0, 0, 0, 0, 1, 1, 0, 0]","[[0.3803921568627451, 0.37254901960784315, 0.3..."
5,samples/CAM_BACK/n009-2018-05-08-15-52-41-0400...,0,0,1,0,1,1,0,0,"[0, 0, 1, 0, 1, 1, 0, 0]","[[0.33725490196078434, 0.3333333333333333, 0.3..."
6,samples/CAM_BACK/n010-2018-08-27-12-00-23+0800...,1,1,0,0,1,1,0,0,"[1, 1, 0, 0, 1, 1, 0, 0]","[[0.19215686274509805, 0.2235294117647059, 0.2..."
7,samples/CAM_BACK/n013-2018-08-20-14-38-24+0800...,0,0,1,0,1,1,0,0,"[0, 0, 1, 0, 1, 1, 0, 0]","[[0.2235294117647059, 0.2549019607843137, 0.22..."
8,samples/CAM_BACK/n013-2018-08-21-11-46-25+0800...,1,0,0,0,0,1,0,0,"[1, 0, 0, 0, 0, 1, 0, 0]","[[0.44313725490196076, 0.5019607843137255, 0.4..."
9,samples/CAM_BACK/n013-2018-08-28-16-04-27+0800...,1,0,0,0,0,1,0,0,"[1, 0, 0, 0, 0, 1, 0, 0]","[[0.1568627450980392, 0.1843137254901961, 0.17..."


### 5. Export the dataframe to CSV so that it can be read by the file for model building

In [37]:
# Keep the CSV export so the existing consumer of "preprocessed.csv" still finds it.
preprocessed.to_csv("preprocessed.csv")

# The "image" column holds NumPy arrays (256x256 after preprocessing). to_csv() writes
# each one as its string form, which NumPy abbreviates with "..." for large arrays, so
# the pixel values cannot be recovered from the CSV. Save them losslessly as well:
# entry i of the stacked array belongs to row i of the dataframe.
np.save("preprocessed_images.npy", np.stack(preprocessed["image"].tolist()))