# Data cleaning jupyter notebook

In the following notebook, the process of cleaning and sorting the data from the udacity self driving car dataset 2 is shown. The dataset is composed of images from driving in Mountain View California and the corresponding csv file containing the information regarding the labelling.

In [5]:
import pandas as pd
import numpy as np
import cv2
from PIL import Image

In [6]:
df = pd.read_csv('labels.csv')

In [7]:
df.head()

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class
0,1478019952686311006.jpg,950,574,1004,620,car
1,1478019952686311006.jpg,1748,482,1818,744,pedestrian
2,1478019953180167674.jpg,872,586,926,632,car
3,1478019953689774621.jpg,686,566,728,618,truck
4,1478019953689774621.jpg,716,578,764,622,car


In [8]:
len(df)

93086

In [9]:
df['class'].value_counts()

car             60788
trafficLight    17253
pedestrian       9866
truck            3503
biker            1676
Name: class, dtype: int64

In [10]:
grouped = df.groupby('class')

In [11]:
biker_group = grouped.get_group('biker')

In [12]:
biker_group.head()

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class
159,1478019971686116476.jpg,584,568,638,606,biker
166,1478019971686116476.jpg,858,542,908,602,biker
169,1478019972180014279.jpg,498,570,552,608,biker
181,1478019972685986697.jpg,418,566,470,610,biker
194,1478019973185520968.jpg,332,564,388,612,biker


In [13]:
biker_group['filename'].nunique()

1076

In [14]:
len(biker_group), len(biker_group['filename'].unique().tolist())

(1676, 1076)

In [15]:
biker_filenames = biker_group['filename'].unique().tolist()

In [16]:
new_df = df.loc[df['filename'].isin(biker_filenames)]

In [17]:
new_df['filename'].nunique()

1076

In [18]:
new_df['class'].value_counts()

car             4440
trafficLight    2522
pedestrian      1811
biker           1676
truck            338
Name: class, dtype: int64

In [19]:
truck_group = grouped.get_group('truck')

In [20]:
truck_group.head()

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class
3,1478019953689774621.jpg,686,566,728,618,truck
8,1478019954186238236.jpg,662,562,710,616,truck
13,1478019954685370994.jpg,640,560,694,614,truck
17,1478019955185244088.jpg,618,558,678,614,truck
22,1478019955679801306.jpg,598,554,664,612,truck


In [21]:
len(truck_group)

3503

In [22]:
truck_group['filename'].nunique()

2439

In [23]:
truck_filenames = truck_group['filename'].unique().tolist()

In [24]:
truck_df = df.loc[df['filename'].isin(truck_filenames)]

In [25]:
truck_df['filename'].nunique()

2439

In [26]:
final_df = pd.concat([new_df, truck_df])

In [27]:
len(df)

93086

In [28]:
final_df.drop_duplicates()

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class
158,1478019971686116476.jpg,546,516,568,550,trafficLight
159,1478019971686116476.jpg,584,568,638,606,biker
160,1478019971686116476.jpg,664,562,750,630,car
161,1478019971686116476.jpg,740,466,758,502,trafficLight
162,1478019971686116476.jpg,746,556,800,600,car
163,1478019971686116476.jpg,752,562,798,592,car
164,1478019971686116476.jpg,816,460,836,494,trafficLight
165,1478019971686116476.jpg,818,456,842,494,trafficLight
166,1478019971686116476.jpg,858,542,908,602,biker
167,1478019971686116476.jpg,966,478,998,518,trafficLight


In [29]:
final_df['filename'].nunique()

3303

In [30]:
final_df['class'].value_counts()

car             17200
trafficLight     6358
truck            3841
pedestrian       3732
biker            1997
Name: class, dtype: int64

In [31]:
final_df['xdif'] = final_df['xmax'] - final_df['xmin']

In [32]:
final_df.head()

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class,xdif
158,1478019971686116476.jpg,546,516,568,550,trafficLight,22
159,1478019971686116476.jpg,584,568,638,606,biker,54
160,1478019971686116476.jpg,664,562,750,630,car,86
161,1478019971686116476.jpg,740,466,758,502,trafficLight,18
162,1478019971686116476.jpg,746,556,800,600,car,54


In [33]:
final_df = final_df.sort_values(by=['class', 'xdif'])

In [34]:
final_df = final_df.reset_index(drop=True)

In [35]:
final_df

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class,xdif
0,1478898931878839067.jpg,288,604,304,646,biker,16
1,1478900648604965875.jpg,0,580,16,666,biker,16
2,1478898666228903545.jpg,590,610,608,650,biker,18
3,1478898666800134931.jpg,560,586,578,626,biker,18
4,1478898667371299966.jpg,544,602,562,640,biker,18
5,1478897854426424817.jpg,148,604,168,686,biker,20
6,1478897854997589082.jpg,148,604,168,686,biker,20
7,1478897855568054263.jpg,148,604,168,686,biker,20
8,1478897856140609864.jpg,148,604,168,686,biker,20
9,1478897856711123471.jpg,148,604,168,686,biker,20


In [36]:
final_df['class'].ne('car').idxmin()

1997

In [37]:
final_df.iloc[[1997]]

Unnamed: 0,filename,xmin,ymin,xmax,ymax,class,xdif
1997,1478899047851282737.jpg,882,590,898,626,car,16


In [38]:
final_df = final_df.drop(final_df.index[1997:11997])

In [39]:
final_df['filename'].nunique()

3303

In [40]:
final_df['class'].value_counts()

car             7200
trafficLight    6358
truck           3841
pedestrian      3732
biker           1997
Name: class, dtype: int64

In [41]:
# def draw_boxes(image_name):
#     selected_value = df[df.filename == image_name]
#     img = cv2.imread('images/{}'.format(image_name))
#     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#     for index, row in selected_value.iterrows():
#         img = cv2.rectangle(img, (row['xmin'], row['ymin']), (row['xmax'], row['ymax']), (0, 255, 0), 3)
#     return img

In [42]:
# sample_images = final_df.sample(n=20).drop_duplicates(subset='filename')['filename'].tolist()

In [43]:
# Image.fromarray(draw_boxes(sample_images[0]))

In [44]:
filename_grouped = final_df.groupby('filename')

In [45]:
len(filename_grouped)

3303

In [46]:
grouped_list = [filename_grouped.get_group(x) for x in filename_grouped.groups]

In [47]:
train_index = np.random.choice(len(filename_grouped), size=2642, replace=False)
test_index = np.setdiff1d(list(range(3303)), train_index)

In [48]:
len(train_index), len(test_index)

(2642, 661)

In [49]:
train = pd.concat([grouped_list[i] for i in train_index])
test = pd.concat([grouped_list[i] for i in test_index])

In [50]:
len(train), len(test)

(18520, 4608)

In [52]:
len(final_df)

23128

In [None]:
train.to_csv('data/labels/train_labels.csv', index=None)
test.to_csv('data/labels/test_labels.csv', index=None)