# Car damage dataset preprocessor
This notebook will prepare the car damage dataset for the Peltarion platform.

Note: This notebook requires installation of Sidekick. For more information about this package, see:
https://github.com/Peltarion/sidekick

The raw dataset is available at:  https://storage.cloud.google.com/bucket-8732/car_damage/raw.zip

---


In [2]:
import os
import sidekick
import resource
import functools
import pandas as pd
from glob import glob
from PIL import Image
from sklearn.model_selection import train_test_split

## Set paths

In [17]:
# Path to the raw dataset (unzipped)
input_path = './raw'
# Path to the zip output
output_path = './preprocessed.zip'

### Get list of paths to all files

In [7]:
images_rel_path = glob(input_path + '/*/*.jpg') + glob(input_path + '/*/*.png')
print("Images found: ", len(images_rel_path))

Images found:  1538


## Create Dataframe
The class column values are derived from the names of the subfolders in the `input_path`.

The image column contains the relative path to the images in the subfolders.

### Create image and class columns

In [8]:
df = pd.DataFrame({'image': images_rel_path})
df['class'] = df['image'].apply(lambda path: os.path.basename(os.path.dirname(path)))
df.head()

Unnamed: 0,image,class
0,/Users/joakim/rep/car-damage/data/raw/door_scr...,door_scratch
1,/Users/joakim/rep/car-damage/data/raw/door_scr...,door_scratch
2,/Users/joakim/rep/car-damage/data/raw/door_scr...,door_scratch
3,/Users/joakim/rep/car-damage/data/raw/door_scr...,door_scratch
4,/Users/joakim/rep/car-damage/data/raw/door_scr...,door_scratch


## Filter images
Filter out non-RGB images

Create temporary ``image_mode`` column

In [9]:
def get_mode(path):
    im = Image.open(path)
    im.close()
    return im.mode

df['image_mode'] = df['image'].apply(lambda path: get_mode(path))
df['image_mode'].value_counts()

RGB     1512
RGBA      18
P          6
CMYK       2
Name: image_mode, dtype: int64

In [10]:
df = df[df.image_mode =='RGB']
df['image_mode'].value_counts()

RGB    1512
Name: image_mode, dtype: int64

Remove the temporary column

In [11]:
df = df.drop(['image_mode'], axis=1)

In [12]:
df['class'].value_counts()

unknown           549
door_dent         192
door_scratch      154
glass_shatter     137
tail_lamp         136
head_lamp         133
bumper_dent       129
bumper_scratch     82
Name: class, dtype: int64

## Create subsets for training and validation

In [13]:
def create_subsets(df, col='class', validation_size=0.20):
    train_data, validate_data = train_test_split(df, test_size=validation_size, random_state=42, stratify=df[[col]])
    train_data.insert(loc=2, column='subset', value='T')
    validate_data.insert(loc=2, column='subset', value='V')
    return train_data.append(validate_data, ignore_index=True)

df = create_subsets(df)
df['subset'].value_counts()

T    1209
V     303
Name: subset, dtype: int64

## Upsampling 
Upsampling (duplicating samples) can be used to prevent bias in an ubalanced dataset

In [14]:
def upsample_class_x2(df, class_name):
    df_sample_training = df[(df['subset'] == 'T') & (df['class'] == class_name)]
    df_sample_validation = df[(df['subset'] == 'V') & (df['class'] == class_name)]
    return pd.concat([df, df_sample_training, df_sample_validation], axis=0, sort=False)

df = upsample_class_x2(df, 'bumper_scratch')
df['class'].value_counts()

unknown           549
door_dent         192
bumper_scratch    164
door_scratch      154
glass_shatter     137
tail_lamp         136
head_lamp         133
bumper_dent       129
Name: class, dtype: int64

## Create dataset bundle

In [16]:
df.head()

Unnamed: 0,image,class,subset
0,/Users/joakim/rep/car-damage/data/raw/unknown/...,unknown,T
1,/Users/joakim/rep/car-damage/data/raw/head_lam...,head_lamp,T
2,/Users/joakim/rep/car-damage/data/raw/door_scr...,door_scratch,T
3,/Users/joakim/rep/car-damage/data/raw/head_lam...,head_lamp,T
4,/Users/joakim/rep/car-damage/data/raw/unknown/...,unknown,T


In [18]:
image_processor = functools.partial(sidekick.process_image, mode='crop_and_resize', size=(224, 224), file_format='jpeg')
sidekick.create_dataset(
    output_path,
    df,
    path_columns=['image'],
    preprocess={
        'image': image_processor
    }
)
# The duplicated images in the upsampled class will cause warnings

  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_w

  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_w