# Combine and Clean Class Information

Load image filenames and class info.

Fix image paths to conform with folder structure.

In [1]:
# pandas for tabular data, os for expanding the '~' home directory in file paths
import pandas as pd
import os
# Package to load MATLAB .mat files
import mat4py
# Seed NumPy's global random number generator
import numpy as np
np.random.seed(290)

In [2]:
# Show the directory that '~' expands to on this machine
home_dir = os.path.expanduser('~')
print(home_dir)

C:\Users\ijyli


In [3]:
# Test-split annotations:
# '~/Box/INFO 290T Project/Raw Data/Stanford Car Dataset - Class Information/anno_test.csv'
anno_test_csv = os.path.expanduser('~') + '/Box/INFO 290T Project/Raw Data/Stanford Car Dataset - Class Information/anno_test.csv'
# The file is read with header=None (no row is parsed as a header), so the
# column names are assigned explicitly.
anno_test = (
    pd.read_csv(anno_test_csv, header=None)
    .set_axis(['filename', 'x1', 'y1', 'x2', 'y2', 'class'], axis=1)
    # Flag every row as belonging to the test dataset
    .assign(test=1)
)
anno_test

Unnamed: 0,filename,x1,y1,x2,y2,class,test
0,00001.jpg,30,52,246,147,181,1
1,00002.jpg,100,19,576,203,103,1
2,00003.jpg,51,105,968,659,145,1
3,00004.jpg,67,84,581,407,187,1
4,00005.jpg,140,151,593,339,185,1
...,...,...,...,...,...,...,...
8036,08037.jpg,49,57,1169,669,63,1
8037,08038.jpg,23,18,640,459,16,1
8038,08039.jpg,33,27,602,252,17,1
8039,08040.jpg,33,142,521,376,38,1


In [4]:
# Train-split annotations:
# '~/Box/INFO 290T Project/Raw Data/Stanford Car Dataset - Class Information/anno_train.csv'
anno_train_csv = os.path.expanduser('~') + '/Box/INFO 290T Project/Raw Data/Stanford Car Dataset - Class Information/anno_train.csv'
# The file is read with header=None (no row is parsed as a header), so the
# column names are assigned explicitly.
anno_train = (
    pd.read_csv(anno_train_csv, header=None)
    .set_axis(['filename', 'x1', 'y1', 'x2', 'y2', 'class'], axis=1)
    # Flag every row as belonging to the train dataset
    .assign(test=0)
)
anno_train

Unnamed: 0,filename,x1,y1,x2,y2,class,test
0,00001.jpg,39,116,569,375,14,0
1,00002.jpg,36,116,868,587,3,0
2,00003.jpg,85,109,601,381,91,0
3,00004.jpg,621,393,1484,1096,134,0
4,00005.jpg,14,36,133,99,106,0
...,...,...,...,...,...,...,...
8139,08140.jpg,3,44,423,336,78,0
8140,08141.jpg,138,150,706,523,196,0
8141,08142.jpg,26,246,660,449,163,0
8142,08143.jpg,78,526,1489,908,112,0


In [5]:
# Combine the test and train annotations row-wise into a single frame.
# Each input keeps its own index labels, so labels repeat across the two splits.
annotation_frames = [anno_test, anno_train]
df_annotations = pd.concat(annotation_frames, axis=0)
df_annotations

Unnamed: 0,filename,x1,y1,x2,y2,class,test
0,00001.jpg,30,52,246,147,181,1
1,00002.jpg,100,19,576,203,103,1
2,00003.jpg,51,105,968,659,145,1
3,00004.jpg,67,84,581,407,187,1
4,00005.jpg,140,151,593,339,185,1
...,...,...,...,...,...,...,...
8139,08140.jpg,3,44,423,336,78,0
8140,08141.jpg,138,150,706,523,196,0
8141,08142.jpg,26,246,660,449,163,0
8142,08143.jpg,78,526,1489,908,112,0


In [6]:
# The class names come from the original MATLAB annotation file:
# '~/Box/INFO 290T Project/Raw Data/Stanford Car Dataset/cars_annos.mat'
cars_annos_mat = os.path.expanduser('~') + '/Box/INFO 290T Project/Raw Data/Stanford Car Dataset/cars_annos.mat'
cars_annos = mat4py.loadmat(cars_annos_mat)
# NOTE(review): displaying the whole dict prints every annotation entry;
# consider previewing only its keys.
cars_annos

{'annotations': {'relative_im_path': ['car_ims/000001.jpg',
   'car_ims/000002.jpg',
   'car_ims/000003.jpg',
   'car_ims/000004.jpg',
   'car_ims/000005.jpg',
   'car_ims/000006.jpg',
   'car_ims/000007.jpg',
   'car_ims/000008.jpg',
   'car_ims/000009.jpg',
   'car_ims/000010.jpg',
   'car_ims/000011.jpg',
   'car_ims/000012.jpg',
   'car_ims/000013.jpg',
   'car_ims/000014.jpg',
   'car_ims/000015.jpg',
   'car_ims/000016.jpg',
   'car_ims/000017.jpg',
   'car_ims/000018.jpg',
   'car_ims/000019.jpg',
   'car_ims/000020.jpg',
   'car_ims/000021.jpg',
   'car_ims/000022.jpg',
   'car_ims/000023.jpg',
   'car_ims/000024.jpg',
   'car_ims/000025.jpg',
   'car_ims/000026.jpg',
   'car_ims/000027.jpg',
   'car_ims/000028.jpg',
   'car_ims/000029.jpg',
   'car_ims/000030.jpg',
   'car_ims/000031.jpg',
   'car_ims/000032.jpg',
   'car_ims/000033.jpg',
   'car_ims/000034.jpg',
   'car_ims/000035.jpg',
   'car_ims/000036.jpg',
   'car_ims/000037.jpg',
   'car_ims/000038.jpg',
   'car_ims/000

In [7]:
# Inspect the container: its type on the first line, its top-level keys on the second
print(type(cars_annos), cars_annos.keys(), sep='\n')
# Pull out the class names entry
class_names = cars_annos['class_names']

<class 'dict'>
dict_keys(['annotations', 'class_names'])


In [8]:
# Wrap the class names in a DataFrame (the names land in a column labelled 0)
df_class_names = pd.DataFrame(data=class_names)
df_class_names

Unnamed: 0,0
0,AM General Hummer SUV 2000
1,Acura RL Sedan 2012
2,Acura TL Sedan 2012
3,Acura TL Type-S 2008
4,Acura TSX Sedan 2012
...,...
191,Volkswagen Beetle Hatchback 2012
192,Volvo C30 Hatchback 2012
193,Volvo 240 Sedan 1993
194,Volvo XC90 SUV 2007


In [9]:
# Give the names column (labelled 0) a readable label
df_class_names = df_class_names.rename(columns={0: 'class_name'})
# Numeric class id is the row's index label plus one
df_class_names['class'] = df_class_names.index + 1

In [10]:
# Attach the class name to every annotation row by joining on the numeric class id.
# validate='many_to_one' makes pandas raise a MergeError if a class id appears more
# than once in df_class_names, which would otherwise silently duplicate annotation rows.
df = df_annotations.merge(df_class_names, on='class', how='inner', validate='many_to_one')
# An inner join drops annotation rows whose class id has no name; fail loudly
# here instead of losing data without notice.
assert len(df) == len(df_annotations), 'some annotations have a class id with no matching class name'
df

Unnamed: 0,filename,x1,y1,x2,y2,class,test,class_name
0,00001.jpg,30,52,246,147,181,1,Suzuki Aerio Sedan 2007
1,00055.jpg,84,169,561,443,181,1,Suzuki Aerio Sedan 2007
2,00323.jpg,12,31,489,226,181,1,Suzuki Aerio Sedan 2007
3,00540.jpg,7,121,635,357,181,1,Suzuki Aerio Sedan 2007
4,00541.jpg,9,42,282,207,181,1,Suzuki Aerio Sedan 2007
...,...,...,...,...,...,...,...,...
16180,06147.jpg,31,103,617,317,77,0,Chrysler Sebring Convertible 2010
16181,06490.jpg,55,93,608,447,77,0,Chrysler Sebring Convertible 2010
16182,07763.jpg,30,39,210,157,77,0,Chrysler Sebring Convertible 2010
16183,07842.jpg,128,89,928,575,77,0,Chrysler Sebring Convertible 2010


In [11]:
# Build each image's path (relative to the home directory) from its split and filename
base_path = '/Box/INFO 290T Project/Raw Data/Stanford Car Dataset/'
# Sub-folder depends on the split flag: 'cars_test/cars_test/' when test == 1,
# otherwise 'cars_train/cars_train/'
split_dir = df['test'].map(lambda flag: 'cars_test/cars_test/' if flag == 1 else 'cars_train/cars_train/')
# Full path = base_path + split sub-folder + filename
df['im_path'] = base_path + split_dir + df['filename']
df

Unnamed: 0,filename,x1,y1,x2,y2,class,test,class_name,im_path
0,00001.jpg,30,52,246,147,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...
1,00055.jpg,84,169,561,443,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...
2,00323.jpg,12,31,489,226,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...
3,00540.jpg,7,121,635,357,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...
4,00541.jpg,9,42,282,207,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...
...,...,...,...,...,...,...,...,...,...
16180,06147.jpg,31,103,617,317,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...
16181,06490.jpg,55,93,608,447,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...
16182,07763.jpg,30,39,210,157,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...
16183,07842.jpg,128,89,928,575,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...


### Add Image Resolution

In [12]:
from PIL import Image


def get_image_size(image_path):
    """Return the (width, height) in pixels of the image at ``image_path``.

    ``image_path`` is appended to the expanded home directory, so it is
    expected to start with a path separator (as the ``im_path`` column does).
    The file is opened once and closed when the ``with`` block exits.
    """
    with Image.open(os.path.expanduser('~') + image_path) as img:
        return img.size


def get_image_width(image_path):
    """Return the width in pixels of the image at ``image_path``."""
    return get_image_size(image_path)[0]


def get_image_height(image_path):
    """Return the height in pixels of the image at ``image_path``."""
    return get_image_size(image_path)[1]


# Open every image exactly once and read both dimensions from it, instead of
# opening each file twice (once for width and once for height).
image_sizes = [get_image_size(path) for path in df['im_path']]
df['width'] = [width for width, _ in image_sizes]
df['height'] = [height for _, height in image_sizes]
# num_pixels = width * height
df['num_pixels'] = df['width'] * df['height']
df

Unnamed: 0,filename,x1,y1,x2,y2,class,test,class_name,im_path,width,height,num_pixels
0,00001.jpg,30,52,246,147,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...,276,182,50232
1,00055.jpg,84,169,561,443,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...,640,480,307200
2,00323.jpg,12,31,489,226,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...,499,238,118762
3,00540.jpg,7,121,635,357,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...,640,480,307200
4,00541.jpg,9,42,282,207,181,1,Suzuki Aerio Sedan 2007,/Box/INFO 290T Project/Raw Data/Stanford Car D...,340,255,86700
...,...,...,...,...,...,...,...,...,...,...,...,...
16180,06147.jpg,31,103,617,317,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...,640,480,307200
16181,06490.jpg,55,93,608,447,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...,640,480,307200
16182,07763.jpg,30,39,210,157,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...,259,194,50246
16183,07842.jpg,128,89,928,575,77,0,Chrysler Sebring Convertible 2010,/Box/INFO 290T Project/Raw Data/Stanford Car D...,1024,768,786432


### Add proposed class

In [13]:
# Load the names and proposed classes dataset from the Excel file
# '~/Box/INFO 290T Project/Names and Proposed Classes.xlsx'.
# '~' is expanded explicitly with os.path.expanduser, matching how the other
# load/save cells build their paths, rather than relying on pandas to expand it.
names_and_proposed_classes = pd.read_excel(os.path.expanduser('~') + '/Box/INFO 290T Project/Names and Proposed Classes.xlsx')
names_and_proposed_classes

Unnamed: 0,Name,Class
0,AM General Hummer SUV 2000,SUV
1,Acura RL Sedan 2012,Sedan
2,Acura TL Sedan 2012,Sedan
3,Acura TL Type-S 2008,Sedan
4,Acura TSX Sedan 2012,Sedan
...,...,...
191,Volkswagen Beetle Hatchback 2012,Hatchback
192,Volvo C30 Hatchback 2012,Hatchback
193,Volvo 240 Sedan 1993,Sedan
194,Volvo XC90 SUV 2007,SUV


In [14]:
# Left-join the annotation rows onto the proposed-class table by class name, then
# rename class -> old_class and class_name -> old_class_name, and drop the Name
# column that was only needed as the join key.
df_with_proposed_classes = (
    names_and_proposed_classes
    .merge(df, left_on='Name', right_on='class_name', how='left')
    .rename(columns={'class': 'old_class', 'class_name': 'old_class_name'})
    .drop(columns=['Name'])
)
df_with_proposed_classes

Unnamed: 0,Class,filename,x1,y1,x2,y2,old_class,test,old_class_name,im_path,width,height,num_pixels
0,SUV,00076.jpg,11,13,84,60,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,96,64,6144
1,SUV,00457.jpg,31,20,226,119,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,250,144,36000
2,SUV,00684.jpg,111,54,365,190,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,373,216,80568
3,SUV,01117.jpg,45,39,729,414,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,800,600,480000
4,SUV,01167.jpg,14,16,268,169,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,278,182,50596
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16180,Convertible,07537.jpg,47,65,249,180,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,262,193,50566
16181,Convertible,07594.jpg,29,34,381,273,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,400,300,120000
16182,Convertible,07846.jpg,78,289,669,633,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,1024,683,699392
16183,Convertible,07895.jpg,31,6,494,272,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,500,272,136000


### New Proposed Train-Test Split

In [15]:
# Randomly assign each row to the new split: test_80_20 is 1 (test) with
# probability 0.2 and 0 (train) with probability 0.8. Rows are drawn
# independently, so the split is approximately, not exactly, 80/20.
# A generator seeded in this cell makes re-running the cell give the same
# assignment, instead of depending on how much of the global random stream
# has already been consumed.
# 290 is the same value the notebook passes to np.random.seed.
# NOTE(review): on a fresh Run All this is expected to reproduce the original draw; confirm.
split_rng = np.random.RandomState(290)
df_with_proposed_classes['test_80_20'] = split_rng.choice([0, 1], size=(len(df_with_proposed_classes),), p=[0.8, 0.2])

df_with_proposed_classes['test_80_20'].value_counts()

test_80_20
0    12930
1     3255
Name: count, dtype: int64

### Save final data

In [16]:
# Write the final table to an Excel workbook (the row index is not written)
output_xlsx = os.path.expanduser('~') + '/Box/INFO 290T Project/Intermediate Data/cars_annos.xlsx'
df_with_proposed_classes.to_excel(output_xlsx, index=False)