# Resolution Statistics

In [24]:
# Packages
import pandas as pd
import matplotlib.pyplot as plt
import dataframe_image as dfi # NOTE: YOU MUST HAVE GOOGLE CHROME INSTALLED FOR THIS TO WORK CORRECTLY
import os

## Get Observations actually in our data

In [25]:
# Get mapping to original location of images

# Function for loading parquet files and loading Class, harmonized_filename columns
def combine_directory_parquets_read_class_fn(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.

    Only the 'Class' and 'harmonized_filename' columns are read from each file.

    Parameters
    ----------
    directory_path : str
        Directory that holds the '.parquet' files. A trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every parquet file in the directory, taken in
        alphabetical file-name order. Each file keeps its own index, so index
        values may repeat in the result.

    Raises
    ------
    ValueError
        If the directory contains no '.parquet' files.
    '''
    # os.listdir returns entries in arbitrary order; sort so the row order of the
    # combined dataframe is the same on every machine and every run
    file_list = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))
    if not file_list:
        # pd.concat would raise a ValueError here anyway; raise one that names the directory
        raise ValueError(f"No .parquet files found in '{directory_path}'")
    # os.path.join works whether or not directory_path ends in a slash
    frames = [
        pd.read_parquet(os.path.join(directory_path, f), columns=['Class', 'harmonized_filename'])
        for f in file_list
    ]
    # Return combined dataframe
    return pd.concat(frames)

In [26]:
# Load the Class / harmonized_filename columns for both splits and stack them
train_data = combine_directory_parquets_read_class_fn('../../../Data/Features/All Features/train/')
test_data = combine_directory_parquets_read_class_fn('../../../Data/Features/All Features/test/')
all_data_class_fns = pd.concat([train_data, test_data])

# Rebuild the path of the original image from the harmonized filename.
# Example: 'Sedan_train_orig_test_01516_resized.jpg' -> 'cars_test/cars_test/01516.jpg'
all_data_class_fns['Original_Filename'] = (
    all_data_class_fns['harmonized_filename']
    .str.split('_orig_').str[1]        # keep what follows '_orig_'   -> 'test_01516_resized.jpg'
    .str.split('_resized').str[0]      # keep what precedes '_resized' -> 'test_01516'
    .str.replace('_', '/')             # split marker becomes a folder -> 'test/01516'
    + '.jpg'                           # restore the extension         -> 'test/01516.jpg'
)
# The raw dataset nests each split folder twice (cars_train/cars_train, cars_test/cars_test)
all_data_class_fns['Original_Filename'] = (
    all_data_class_fns['Original_Filename']
    .str.replace('train/', 'cars_train/cars_train/')
    .str.replace('test/', 'cars_test/cars_test/')
)
all_data_class_fns

Unnamed: 0,Class,harmonized_filename,Original_Filename
0,Sedan,Sedan_train_orig_test_01516_resized.jpg,cars_test/cars_test/01516.jpg
1,SUV,SUV_train_orig_train_00294_resized.jpg,cars_train/cars_train/00294.jpg
2,Convertible,Convertible_train_orig_train_04236_resized.jpg,cars_train/cars_train/04236.jpg
3,Pickup,Pickup_train_orig_train_03906_resized.jpg,cars_train/cars_train/03906.jpg
4,SUV,SUV_train_orig_test_01344_resized.jpg,cars_test/cars_test/01344.jpg
...,...,...,...
88,Sedan,Sedan_test_orig_test_03443_resized.jpg,cars_test/cars_test/03443.jpg
89,Pickup,Pickup_test_orig_train_04088_resized.jpg,cars_train/cars_train/04088.jpg
90,Sedan,Sedan_test_orig_train_03673_resized.jpg,cars_train/cars_train/03673.jpg
91,Sedan,Sedan_test_orig_train_06616_resized.jpg,cars_train/cars_train/06616.jpg


## Get cars_annos data

In [27]:
# Load the annotation sheet from '~/Box/INFO 290T Project/Intermediate Data/cars_annos.xlsx'
cars_annos = pd.read_excel('~/Box/INFO 290T Project/Intermediate Data/cars_annos.xlsx')
# Folder prefix of the raw image, derived from the 'test' flag:
# 0 -> train folder, any other value -> test folder
cars_annos['train_test'] = (
    cars_annos['test']
    .eq(0)
    .map({True: 'cars_train/cars_train', False: 'cars_test/cars_test'})
)
# 'Original_Filename' is the folder prefix joined to the image filename with '/'
cars_annos['Original_Filename'] = cars_annos['train_test'] + '/' + cars_annos['filename']
cars_annos

Unnamed: 0,Class,filename,x1,y1,x2,y2,old_class,test,old_class_name,im_path,width,height,num_pixels,test_80_20,train_test,Original_Filename
0,SUV,00076.jpg,11,13,84,60,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,96,64,6144,0,cars_test/cars_test,cars_test/cars_test/00076.jpg
1,SUV,00457.jpg,31,20,226,119,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,250,144,36000,0,cars_test/cars_test,cars_test/cars_test/00457.jpg
2,SUV,00684.jpg,111,54,365,190,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,373,216,80568,0,cars_test/cars_test,cars_test/cars_test/00684.jpg
3,SUV,01117.jpg,45,39,729,414,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,800,600,480000,0,cars_test/cars_test,cars_test/cars_test/01117.jpg
4,SUV,01167.jpg,14,16,268,169,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,278,182,50596,0,cars_test/cars_test,cars_test/cars_test/01167.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16180,Convertible,07537.jpg,47,65,249,180,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,262,193,50566,0,cars_train/cars_train,cars_train/cars_train/07537.jpg
16181,Convertible,07594.jpg,29,34,381,273,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,400,300,120000,0,cars_train/cars_train,cars_train/cars_train/07594.jpg
16182,Convertible,07846.jpg,78,289,669,633,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,1024,683,699392,0,cars_train/cars_train,cars_train/cars_train/07846.jpg
16183,Convertible,07895.jpg,31,6,494,272,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,500,272,136000,0,cars_train/cars_train,cars_train/cars_train/07895.jpg


## Inner Join cars_annos with all_data_class_fns on 'Original_Filename'

In [28]:
# Keep only the annotated images that also appear in our feature data (inner join).
# The annotation sheet's own 'Class' column is dropped before merging, so the
# 'Class' on each row is the one carried by the feature files.
cars_annos = (
    cars_annos
    .drop(columns=['Class'])
    .merge(all_data_class_fns, on='Original_Filename', how='inner')
)
cars_annos

Unnamed: 0,filename,x1,y1,x2,y2,old_class,test,old_class_name,im_path,width,height,num_pixels,test_80_20,train_test,Original_Filename,Class,harmonized_filename
0,01117.jpg,45,39,729,414,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,800,600,480000,0,cars_test/cars_test,cars_test/cars_test/01117.jpg,SUV,SUV_train_orig_test_01117_resized.jpg
1,01538.jpg,32,69,487,316,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,500,353,176500,0,cars_test/cars_test,cars_test/cars_test/01538.jpg,SUV,SUV_train_orig_test_01538_resized.jpg
2,01802.jpg,46,115,623,469,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,786,491,385926,0,cars_test/cars_test,cars_test/cars_test/01802.jpg,SUV,SUV_train_orig_test_01802_resized.jpg
3,01887.jpg,11,60,796,535,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,800,580,464000,0,cars_test/cars_test,cars_test/cars_test/01887.jpg,SUV,SUV_train_orig_test_01887_resized.jpg
4,02017.jpg,28,117,415,274,1,1,AM General Hummer SUV 2000,/Box/INFO 290T Project/Raw Data/Stanford Car D...,500,375,187500,0,cars_test/cars_test,cars_test/cars_test/02017.jpg,SUV,SUV_train_orig_test_02017_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7487,07500.jpg,20,27,613,413,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,630,420,264600,0,cars_train/cars_train,cars_train/cars_train/07500.jpg,Convertible,Convertible_train_orig_train_07500_resized.jpg
7488,07594.jpg,29,34,381,273,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,400,300,120000,0,cars_train/cars_train,cars_train/cars_train/07594.jpg,Convertible,Convertible_train_orig_train_07594_resized.jpg
7489,07846.jpg,78,289,669,633,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,1024,683,699392,0,cars_train/cars_train,cars_train/cars_train/07846.jpg,Convertible,Convertible_train_orig_train_07846_resized.jpg
7490,07895.jpg,31,6,494,272,196,0,smart fortwo Convertible 2012,/Box/INFO 290T Project/Raw Data/Stanford Car D...,500,272,136000,0,cars_train/cars_train,cars_train/cars_train/07895.jpg,Convertible,Convertible_train_orig_train_07895_resized.jpg


## Compute Statistics

In [29]:
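# Summarize resolution columns ('width', 'height', 'num_pixels') by class
# Percentile function factory, so percentiles can be passed to .agg alongside the built-in statistics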
# https://stackoverflow.com/questions/17578115/pass-percentiles-to-pandas-agg-function
def percentile(n):
    """Build an aggregation function that returns the n-th percentile of a Series.

    Parameters
    ----------
    n : int or float
        Percentile on the 0-100 scale (e.g. 25 for the first quartile).

    Returns
    -------
    callable
        Function taking a pandas Series and returning its n-th percentile,
        using 'nearest' interpolation so the result is an observed value.
        Its ``__name__`` is 'percentile_NN' (n zero-padded to two digits).
    """
    def percentile_(values):
        # quantile expects a fraction in [0, 1], hence the division by 100
        return values.quantile(q=n / 100, interpolation='nearest')

    # Distinct names let several percentiles be used in a single .agg call
    percentile_.__name__ = f'percentile_{n:02.0f}'
    return percentile_


In [30]:
# Distribution of the total pixel count per image, broken down by class
(
    cars_annos
    .groupby('Class')
    .agg({
        'num_pixels': [
            'count', 'mean', 'std', 'min',
            percentile(25), percentile(50), percentile(75),
            'max',
        ]
    })
    .reset_index()
    .sort_values(by='Class')
)

Unnamed: 0_level_0,Class,num_pixels,num_pixels,num_pixels,num_pixels,num_pixels,num_pixels,num_pixels,num_pixels
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,percentile_25,percentile_50,percentile_75,max
0,Convertible,1419,548757.403805,792697.2,95944,187500,286440,634880,12212224
1,Pickup,902,473148.279379,721102.1,104544,187500,307200,420938,13790208
2,SUV,2057,622720.773456,1389421.0,96302,237900,307200,639200,42120000
3,Sedan,3114,633245.745344,1092483.0,102800,230250,307200,706552,21026304


In [33]:
# Describe the resolution (width, height, pixel count) of all images in the selected classes
RESOLUTION_OUTPUT_DIR = '../../../Output/Resolution Statistics/Finalized'
# Create the export folder if it is missing, so the cell also runs on a fresh checkout
os.makedirs(RESOLUTION_OUTPUT_DIR, exist_ok=True)

selected_classes_description = (
    cars_annos[['width', 'height', 'num_pixels']]
    .rename(columns={'width': 'Width', 'height': 'Height', 'num_pixels': 'Number of Pixels'})
    .describe()
    .T  # one row per measure, one column per statistic
    .rename(columns={
        'count': 'Count',
        'mean': 'Mean',
        'std': 'Standard Deviation',
        'min': 'Minimum',
        '25%': '25th Percentile',
        '50%': '50th Percentile',
        '75%': '75th Percentile',
        'max': 'Maximum',
    })
    .style
    .format(precision=2, thousands=",", decimal=".")
    .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
    .set_properties(**{'text-align': 'center'})
)

# Export the styled table as an image
dfi.export(selected_classes_description, RESOLUTION_OUTPUT_DIR + '/selected_classes_description.png')

# Export to Excel
selected_classes_description.to_excel(RESOLUTION_OUTPUT_DIR + '/selected_classes_description.xlsx')

selected_classes_description

Unnamed: 0,Count,Mean,Standard Deviation,Minimum,25th Percentile,50th Percentile,75th Percentile,Maximum
Width,7492.0,812.1,447.7,358.0,584.0,640.0,1024.0,7800.0
Height,7492.0,564.48,314.89,256.0,375.0,480.0,680.0,5400.0
Number of Pixels,7492.0,595078.8,1100151.84,95944.0,219636.0,307200.0,691200.0,42120000.0
