# Combine All Features

Load parquet files of features, stack as needed, then merge them into a single dataframe.

In [1]:
# Packages
import os
import pandas as pd
import shutil

## Generalized Function to Load and Combine All DFs in a Directory

In [2]:
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.
    '''
    # If path does not end in a slash, add one
    if directory_path[-1] != '/':
        directory_path += '/'
    # list of files in directory
    file_list = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    # read in all parquet files
    combined_df = pd.concat([pd.read_parquet(directory_path + f) for f in file_list])
    # Return combined dataframe
    return combined_df

## Function to create harmonized Image file name

In [3]:
def create_harmonized_filename(file_path):
    '''
    Strip Blur vs. No Blur and other data from file path.
    '''
    # Split path by '/' and take last element
    harmonized_filename = file_path.split('/')[-1]
    # Remove _no_blur and _blurred from file name
    harmonized_filename = harmonized_filename.replace('_no_blur', '')
    harmonized_filename = harmonized_filename.replace('_blurred', '')
    # Return harmonized file name
    return harmonized_filename

In [4]:
def prep_dataset_for_merge(df, feature_name):
    '''
    Prepares dataset for merging.
    '''
    # Create new column for harmonized file name
    df['harmonized_filename'] = df['Image Path'].apply(create_harmonized_filename)
    # Rename Image Path to Image Path + feature name
    df.rename(columns={'Image Path': 'Image Path ' + feature_name}, inplace=True)
    # Return dataset
    return df

## Load Features

In [5]:
# Vision Transformer Embeddings
vit_embeddings = combine_directory_parquets('../../../Data/Features/Vision Transformer')
# Prep dataset for merging
vit_embeddings = prep_dataset_for_merge(vit_embeddings, 'Vision Transformer')
vit_embeddings

Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,ViT_Embedding_Element_1272,ViT_Embedding_Element_1273,ViT_Embedding_Element_1274,ViT_Embedding_Element_1275,ViT_Embedding_Element_1276,ViT_Embedding_Element_1277,ViT_Embedding_Element_1278,ViT_Embedding_Element_1279,test_80_20,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,-0.059224,0.173239,0.363462,0.457626,-0.077351,-0.236950,-0.031632,-0.261893,0,Sedan_train_orig_test_01516_resized.jpg
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.215697,0.023538,0.393746,0.455197,0.223018,-0.265846,-0.200683,-0.405006,0,SUV_train_orig_train_00294_resized.jpg
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.111003,0.158716,0.380261,0.493224,0.169883,-0.105756,0.124275,-0.446003,0,Convertible_train_orig_train_04236_resized.jpg
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.048522,-0.032885,0.400770,0.430547,0.214644,-0.323948,-0.276459,-0.414079,0,Pickup_train_orig_train_03906_resized.jpg
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,-0.089879,0.237541,0.321542,0.343445,0.047995,-0.305656,0.091199,-0.370617,0,SUV_train_orig_test_01344_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,0.023480,0.073889,0.333791,0.452840,0.038384,-0.209088,-0.219002,-0.133379,1,Sedan_test_orig_train_00376_resized.jpg
1028,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,-0.112296,0.220512,0.196491,0.279989,0.168716,-0.193009,-0.030066,-0.041473,1,SUV_test_orig_test_00579_resized.jpg
1029,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,0.052621,0.168497,0.294833,0.398839,-0.061976,-0.102406,-0.211565,-0.086189,1,Sedan_test_orig_test_00430_resized.jpg
1030,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,-0.054918,0.249961,0.387619,0.443451,0.075971,-0.162953,-0.078282,-0.177154,1,Sedan_test_orig_test_07328_resized.jpg


In [6]:
# HOG 16 pixels per cell Vectors
hog_16_ppc_vectors = combine_directory_parquets('../../../Data/Features/HOG_16_ppc')
# Prep dataset for merging
hog_16_ppc_vectors = prep_dataset_for_merge(hog_16_ppc_vectors, 'HOG 16 ppc')
hog_16_ppc_vectors

Unnamed: 0,Image Path HOG 16 ppc,test_80_20,HOG_16_ppc_0,HOG_16_ppc_1,HOG_16_ppc_2,HOG_16_ppc_3,HOG_16_ppc_4,HOG_16_ppc_5,HOG_16_ppc_6,HOG_16_ppc_7,...,HOG_16_ppc_7047,HOG_16_ppc_7048,HOG_16_ppc_7049,HOG_16_ppc_7050,HOG_16_ppc_7051,HOG_16_ppc_7052,HOG_16_ppc_7053,HOG_16_ppc_7054,HOG_16_ppc_7055,harmonized_filename
0,../../../Images/train/Blurred/Convertible_trai...,0,0.101255,0.092199,0.138589,0.094241,0.086408,0.177341,0.231480,0.095215,...,0.002598,0.002031,0.022873,0.266860,0.001467,0.002084,0.019091,0.309883,0.007916,Convertible_train_orig_train_00272_resized.jpg
1,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.193664,0.121777,0.061164,0.062579,0.215821,0.296258,0.054987,0.024952,...,0.178335,0.210089,0.126771,0.570253,0.094985,0.061559,0.063759,0.074800,0.020386,SUV_train_orig_test_07471_resized.jpg
2,../../../Images/train/Blurred/Convertible_trai...,0,0.216030,0.231329,0.231329,0.222042,0.145685,0.231329,0.231329,0.140553,...,0.163555,0.093627,0.149736,0.130635,0.056536,0.106760,0.223056,0.223691,0.173976,Convertible_train_orig_test_01313_resized.jpg
3,../../../Images/train/Blurred/Sedan_train_orig...,0,0.008284,0.001071,0.001698,0.000250,0.003443,0.000000,0.000618,0.000125,...,0.001784,0.011086,0.007476,0.015787,0.003075,0.008991,0.003826,0.016995,0.001115,Sedan_train_orig_test_05964_resized.jpg
4,../../../Images/train/Blurred/Sedan_train_orig...,0,0.014640,0.003920,0.019302,0.011404,0.031828,0.013773,0.019780,0.035038,...,0.135492,0.119477,0.163750,0.214886,0.130832,0.104585,0.118230,0.179855,0.072599,Sedan_train_orig_test_06611_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.174214,0.209595,0.213015,0.213015,0.205037,0.189975,0.166364,0.213015,...,0.051808,0.080350,0.293684,0.293684,0.069438,0.173916,0.239382,0.293684,0.080813,SUV_train_orig_test_04769_resized.jpg
271,../../../Images/train/Blurred/Sedan_train_orig...,0,0.243238,0.216069,0.195349,0.132494,0.215306,0.125493,0.102957,0.105917,...,0.049900,0.105218,0.082897,0.243796,0.102431,0.091609,0.139014,0.368997,0.091731,Sedan_train_orig_test_03561_resized.jpg
272,../../../Images/train/Blurred/Sedan_train_orig...,0,0.111608,0.120127,0.115701,0.138836,0.123670,0.137052,0.185639,0.129203,...,0.187670,0.165545,0.099942,0.093727,0.125647,0.092924,0.090182,0.169777,0.068531,Sedan_train_orig_test_05509_resized.jpg
273,../../../Images/train/Blurred/Sedan_train_orig...,0,0.003046,0.005498,0.033391,0.005163,0.001608,0.000944,0.012180,0.002938,...,0.024644,0.036449,0.268107,0.133966,0.013881,0.026867,0.145853,0.106891,0.001281,Sedan_train_orig_test_03423_resized.jpg


In [7]:
# HOG 24 pixels per cell Vectors
hog_24_ppc_vectors = combine_directory_parquets('../../../Data/Features/HOG_24_ppc')
# Prep dataset for merging
hog_24_ppc_vectors = prep_dataset_for_merge(hog_24_ppc_vectors, 'HOG 24 ppc')
hog_24_ppc_vectors

Unnamed: 0,Image Path HOG 24 ppc,test_80_20,HOG_24_ppc_0,HOG_24_ppc_1,HOG_24_ppc_2,HOG_24_ppc_3,HOG_24_ppc_4,HOG_24_ppc_5,HOG_24_ppc_6,HOG_24_ppc_7,...,HOG_24_ppc_2295,HOG_24_ppc_2296,HOG_24_ppc_2297,HOG_24_ppc_2298,HOG_24_ppc_2299,HOG_24_ppc_2300,HOG_24_ppc_2301,HOG_24_ppc_2302,HOG_24_ppc_2303,harmonized_filename
0,../../../Images/train/Blurred/Convertible_trai...,0,0.059938,0.097728,0.106369,0.048924,0.091199,0.097621,0.075841,0.066475,...,0.000662,0.005605,0.084604,0.339768,0.002502,0.114248,0.092849,0.311350,0.125613,Convertible_train_orig_train_00272_resized.jpg
1,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.256457,0.256457,0.077890,0.063679,0.126482,0.256457,0.147489,0.256457,...,0.026551,0.030365,0.038664,0.220049,0.012558,0.038802,0.037201,0.075855,0.026042,SUV_train_orig_test_07471_resized.jpg
2,../../../Images/train/Blurred/Convertible_trai...,0,0.202457,0.223452,0.223452,0.211585,0.175203,0.223452,0.223452,0.176913,...,0.096925,0.070365,0.138975,0.164921,0.077499,0.062890,0.140606,0.187274,0.122544,Convertible_train_orig_test_01313_resized.jpg
3,../../../Images/train/Blurred/Sedan_train_orig...,0,0.007001,0.000845,0.002915,0.000151,0.003460,0.000303,0.003501,0.000498,...,0.018255,0.198336,0.102858,0.149118,0.034768,0.003529,0.100560,0.235702,0.000821,Sedan_train_orig_test_05964_resized.jpg
4,../../../Images/train/Blurred/Sedan_train_orig...,0,0.027547,0.005092,0.020988,0.020537,0.009588,0.008990,0.024175,0.009435,...,0.122497,0.090929,0.138466,0.208792,0.100809,0.081907,0.108674,0.152249,0.106565,Sedan_train_orig_test_06611_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.119561,0.122641,0.064218,0.077519,0.111523,0.110687,0.101571,0.079767,...,0.168222,0.068426,0.043202,0.171373,0.186756,0.038155,0.036713,0.138185,0.117317,Sedan_test_orig_test_05332_resized.jpg
822,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.236619,0.227736,0.219241,0.236619,0.236619,0.150675,0.231033,0.236619,...,0.162245,0.050129,0.186667,0.273364,0.037968,0.005627,0.164959,0.264899,0.003458,Sedan_test_orig_train_02826_resized.jpg
823,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.107081,0.163420,0.159713,0.034679,0.243864,0.146184,0.215033,0.243864,...,0.126880,0.069564,0.084517,0.106073,0.078765,0.051485,0.082266,0.092818,0.061774,Sedan_test_orig_train_01495_resized.jpg
824,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.000000,0.000000,0.000482,0.000000,0.005666,0.000450,0.001927,0.000284,...,0.071698,0.074662,0.138145,0.142298,0.061959,0.104318,0.143668,0.169297,0.079651,Sedan_test_orig_train_04215_resized.jpg


In [8]:
# HSV Vectors
HSV_vectors = combine_directory_parquets('../../../Data/Features/HSV')
# Prep dataset for merging
HSV_vectors = prep_dataset_for_merge(HSV_vectors, 'HSV')
HSV_vectors

Unnamed: 0,Image Path HSV,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,HSV_7,...,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,0,279,76,110,63,40,59,52,127,...,79,110,118,105,182,332,2044,1522,5299,Sedan_train_orig_test_01516_resized.jpg
1,../../../Images/train/No Blur/SUV_train_orig_t...,0,683,50,44,51,29,27,34,380,...,240,239,196,174,182,186,133,180,1099,SUV_train_orig_train_00294_resized.jpg
2,../../../Images/train/No Blur/Convertible_trai...,0,8665,8,31,8,10,2,5,58,...,12,7,4,13,5,4,3,0,6,Convertible_train_orig_train_04236_resized.jpg
3,../../../Images/train/No Blur/Pickup_train_ori...,0,3865,0,0,2,0,1,0,5,...,27,25,28,21,25,13,18,21,94,Pickup_train_orig_train_03906_resized.jpg
4,../../../Images/train/No Blur/SUV_train_orig_t...,0,1266,49,115,301,71,43,276,500,...,224,222,225,190,202,150,156,127,840,SUV_train_orig_test_01344_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821,../../../Images/test/No Blur/Sedan_test_orig_t...,1,5305,2,18,21,10,6,20,41,...,6,4,7,5,3,4,3,4,36,Sedan_test_orig_train_00376_resized.jpg
822,../../../Images/test/No Blur/SUV_test_orig_tes...,1,571,8,21,105,15,7,25,140,...,92,112,86,92,84,112,89,100,1158,SUV_test_orig_test_00579_resized.jpg
823,../../../Images/test/No Blur/Sedan_test_orig_t...,1,3833,7,72,236,13,46,127,664,...,1,0,1,2,1,1,0,0,1,Sedan_test_orig_test_00430_resized.jpg
824,../../../Images/test/No Blur/Sedan_test_orig_t...,1,3032,13,11,38,8,3,35,44,...,138,137,57,24,15,12,11,7,41,Sedan_test_orig_test_07328_resized.jpg


## Merge DFs

In [9]:
# Merge all dataframes on harmonized file name
all_features = (vit_embeddings.merge(hog_16_ppc_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              .merge(hog_24_ppc_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              .merge(HSV_vectors, on=['harmonized_filename', 'test_80_20'], how='inner'))
all_features

Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,HSV_758,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,92,79,110,118,105,182,332,2044,1522,5299
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,275,240,239,196,174,182,186,133,180,1099
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,24,12,7,4,13,5,4,3,0,6
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,29,27,25,28,21,25,13,18,21,94
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,223,224,222,225,190,202,150,156,127,840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,8,6,4,7,5,3,4,3,4,36
8248,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,96,92,112,86,92,84,112,89,100,1158
8249,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,1,1,0,1,2,1,1,0,0,1
8250,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,106,138,137,57,24,15,12,11,7,41


In [10]:
# Check harmonized_filename column
all_features['harmonized_filename'].iloc[0]

'Sedan_train_orig_test_01516_resized.jpg'

## Add Class Labels and Use Corrections

In [11]:
# Create Class column by cutting at first _ in harmonized_filename
all_features['Class'] = all_features['harmonized_filename'].apply(lambda x: x.split('_')[0])

In [12]:
# Correct class for items in train dataset
# Load "relabeled_train_no_blur_old_and_new_labels.xlsx"
relabeled_train_no_blur_old_and_new_labels = pd.read_excel('../../../Data/Relabeled_Train_No_Blur/relabeled_train_no_blur_old_and_new_labels.xlsx')
relabeled_train_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_train_orig_test_00002_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00002_resized.jpg
1,Convertible_train_orig_test_00037_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00037_resized.jpg
2,Convertible_train_orig_test_00060_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00060_resized.jpg
3,Convertible_train_orig_test_00087_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00087_resized.jpg
4,Convertible_train_orig_test_00112_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00112_resized.jpg
...,...,...,...,...,...
6591,Sedan_train_orig_train_08136_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08136_resized.jpg
6592,Sedan_train_orig_train_08137_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08137_resized.jpg
6593,Sedan_train_orig_train_08138_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08138_resized.jpg
6594,Sedan_train_orig_train_08139_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_train_orig_train_08139_resized.jpg


In [13]:
# Correct class for items in test dataset
# Load "relabeled_test_no_blur_old_and_new_labels.xlsx"
relabeled_test_no_blur_old_and_new_labels = pd.read_excel('../../../Data/Relabeled_Test_No_Blur/relabeled_test_no_blur_old_and_new_labels.xlsx')
relabeled_test_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_test_orig_test_00023_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00023_resized.jpg
1,Convertible_test_orig_test_00096_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00096_resized.jpg
2,Convertible_test_orig_test_00107_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00107_resized.jpg
3,Convertible_test_orig_test_00135_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00135_resized.jpg
4,Convertible_test_orig_test_00147_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00147_resized.jpg
...,...,...,...,...,...
1651,Sedan_test_orig_train_08026_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_test_orig_train_08026_resized.jpg
1652,Sedan_test_orig_train_08043_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08043_resized.jpg
1653,Sedan_test_orig_train_08046_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08046_resized.jpg
1654,Sedan_test_orig_train_08098_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08098_resized.jpg


In [14]:
# Stack train and test relabeled dataframes
relabeled = pd.concat([relabeled_train_no_blur_old_and_new_labels, relabeled_test_no_blur_old_and_new_labels])
relabeled

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_train_orig_test_00002_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00002_resized.jpg
1,Convertible_train_orig_test_00037_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00037_resized.jpg
2,Convertible_train_orig_test_00060_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00060_resized.jpg
3,Convertible_train_orig_test_00087_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00087_resized.jpg
4,Convertible_train_orig_test_00112_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00112_resized.jpg
...,...,...,...,...,...
1651,Sedan_test_orig_train_08026_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_test_orig_train_08026_resized.jpg
1652,Sedan_test_orig_train_08043_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08043_resized.jpg
1653,Sedan_test_orig_train_08046_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08046_resized.jpg
1654,Sedan_test_orig_train_08098_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08098_resized.jpg


In [15]:
# Merge on 'filename'
all_features = all_features.merge(relabeled, on='harmonized_filename', how='left')
# Replace Class with New Class if not missing
all_features['Class With Ambiguous and None'] = all_features['New Class'].fillna(all_features['Class'])
# Create Corrected Old Class - Old Class, but replace with New Class if New Class is in 'SUV', 'Sedan', 'Pickup', 'Convertible'
all_features['Corrected Old Class'] = all_features['Old Class']
all_features.loc[all_features['New Class'].isin(['SUV', 'Sedan', 'Pickup', 'Convertible']), 'Corrected Old Class'] = all_features['New Class']
# Drop label, New Class, filename
all_features.drop(columns=['label', 'New Class', 'filename', 'Old Class'], inplace=True)
all_features

Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767,Class
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,79,110,118,105,182,332,2044,1522,5299,Sedan
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,240,239,196,174,182,186,133,180,1099,SUV
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,12,7,4,13,5,4,3,0,6,Convertible
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,27,25,28,21,25,13,18,21,94,Pickup
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,224,222,225,190,202,150,156,127,840,SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,6,4,7,5,3,4,3,4,36,Ambiguous
8248,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,92,112,86,92,84,112,89,100,1158,SUV
8249,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,1,0,1,2,1,1,0,0,1,Ambiguous
8250,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,138,137,57,24,15,12,11,7,41,Sedan


## Clean Up Columns

In [16]:
# Get image path columns
image_path_columns = [col for col in all_features.columns if 'Image Path' in col]
# Split off and delete Image Path columns
image_path_df = all_features[['harmonized_filename'] + image_path_columns]
image_path_df.to_excel('../../../Data/Features/All Features Image Paths/All_Features_Image_Paths.xlsx', index=False)
all_features.drop(columns=image_path_columns, inplace=True)
# Reorder columns
all_features = all_features[['Class', 'harmonized_filename', 'test_80_20'] + [col for col in all_features.columns if col not in ['Class', 'harmonized_filename', 'test_80_20']]]
all_features

Unnamed: 0,Class,harmonized_filename,test_80_20,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,...,HSV_758,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767
0,Sedan,Sedan_train_orig_test_01516_resized.jpg,0,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,...,92,79,110,118,105,182,332,2044,1522,5299
1,SUV,SUV_train_orig_train_00294_resized.jpg,0,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,...,275,240,239,196,174,182,186,133,180,1099
2,Convertible,Convertible_train_orig_train_04236_resized.jpg,0,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,...,24,12,7,4,13,5,4,3,0,6
3,Pickup,Pickup_train_orig_train_03906_resized.jpg,0,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,...,29,27,25,28,21,25,13,18,21,94
4,SUV,SUV_train_orig_test_01344_resized.jpg,0,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,...,223,224,222,225,190,202,150,156,127,840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,Ambiguous,Sedan_test_orig_train_00376_resized.jpg,1,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,...,8,6,4,7,5,3,4,3,4,36
8248,SUV,SUV_test_orig_test_00579_resized.jpg,1,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,...,96,92,112,86,92,84,112,89,100,1158
8249,Ambiguous,Sedan_test_orig_test_00430_resized.jpg,1,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,...,1,1,0,1,2,1,1,0,0,1
8250,Sedan,Sedan_test_orig_test_07328_resized.jpg,1,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,...,106,138,137,57,24,15,12,11,7,41


## Add Blur and No-Blur Paths

In [17]:
def construct_path(train_test, blur_no_blur, harmonized_filename):
    return '../../../Images/' + train_test + '/' + blur_no_blur + '/' + harmonized_filename

# Add image_path_blur column
all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
# Add image_path_no_blur column
all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)
# Reorder columns - Class, harmonized_filename, test_80_20, image_path_blur, image_path_no_blur, all other columns
all_features = all_features[['Class', 'harmonized_filename', 'test_80_20', 'image_path_blur', 'image_path_no_blur'] + [col for col in all_features.columns if col not in ['Class', 'harmonized_filename', 'test_80_20', 'image_path_blur', 'image_path_no_blur']]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)


Unnamed: 0,Class,harmonized_filename,test_80_20,image_path_blur,image_path_no_blur,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,...,HSV_758,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767
0,Sedan,Sedan_train_orig_test_01516_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,...,92,79,110,118,105,182,332,2044,1522,5299
1,SUV,SUV_train_orig_train_00294_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,...,275,240,239,196,174,182,186,133,180,1099
2,Convertible,Convertible_train_orig_train_04236_resized.jpg,0,../../../Images/train/Blurred/Convertible_trai...,../../../Images/train/No_Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,...,24,12,7,4,13,5,4,3,0,6
3,Pickup,Pickup_train_orig_train_03906_resized.jpg,0,../../../Images/train/Blurred/Pickup_train_ori...,../../../Images/train/No_Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,...,29,27,25,28,21,25,13,18,21,94
4,SUV,SUV_train_orig_test_01344_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,...,223,224,222,225,190,202,150,156,127,840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,Ambiguous,Sedan_test_orig_train_00376_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,...,8,6,4,7,5,3,4,3,4,36
8248,SUV,SUV_test_orig_test_00579_resized.jpg,1,../../../Images/test/Blurred/SUV_test_orig_tes...,../../../Images/test/No_Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,...,96,92,112,86,92,84,112,89,100,1158
8249,Ambiguous,Sedan_test_orig_test_00430_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,...,1,1,0,1,2,1,1,0,0,1
8250,Sedan,Sedan_test_orig_test_07328_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,...,106,138,137,57,24,15,12,11,7,41


In [18]:
# Print all columns in all_features
for col in all_features.columns:
    print(col)

Class
harmonized_filename
test_80_20
image_path_blur
image_path_no_blur
ViT_Embedding_Element_0
ViT_Embedding_Element_1
ViT_Embedding_Element_2
ViT_Embedding_Element_3
ViT_Embedding_Element_4
ViT_Embedding_Element_5
ViT_Embedding_Element_6
ViT_Embedding_Element_7
ViT_Embedding_Element_8
ViT_Embedding_Element_9
ViT_Embedding_Element_10
ViT_Embedding_Element_11
ViT_Embedding_Element_12
ViT_Embedding_Element_13
ViT_Embedding_Element_14
ViT_Embedding_Element_15
ViT_Embedding_Element_16
ViT_Embedding_Element_17
ViT_Embedding_Element_18
ViT_Embedding_Element_19
ViT_Embedding_Element_20
ViT_Embedding_Element_21
ViT_Embedding_Element_22
ViT_Embedding_Element_23
ViT_Embedding_Element_24
ViT_Embedding_Element_25
ViT_Embedding_Element_26
ViT_Embedding_Element_27
ViT_Embedding_Element_28
ViT_Embedding_Element_29
ViT_Embedding_Element_30
ViT_Embedding_Element_31
ViT_Embedding_Element_32
ViT_Embedding_Element_33
ViT_Embedding_Element_34
ViT_Embedding_Element_35
ViT_Embedding_Element_36
ViT_Embedding

## Output Training and Testing Dataframes

In [19]:
# Split df into train and test
all_features_train = all_features[all_features['test_80_20'] == 0]
all_features_test = all_features[all_features['test_80_20'] == 1]

# Drop test_80_20 column
all_features_train.drop(columns='test_80_20', inplace=True)
all_features_test.drop(columns='test_80_20', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_train.drop(columns='test_80_20', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_test.drop(columns='test_80_20', inplace=True)


In [20]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.
    '''
    # Tracking total length of pieces
    total_len_pieces = 0
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    for i in range(num_pieces):
        # start index for piece rows
        start_index = i * len(df) // num_pieces
        # end index for piece rows
        end_index = (i + 1) * len(df) // num_pieces
        # get piece
        piece = df[start_index:end_index]
        piece.to_parquet(out_folder + '/' + dataset_name + '_piece_' + str(i) + '.parquet', index=False)
        #print(len(piece))
        total_len_pieces += len(piece)
    
    # check total piece length and length of vit_embeddings_df
    print('length check passed')
    print(total_len_pieces == len(df))

In [21]:
# Run on all_features dataframes
split_df(all_features_train, 'all_features_train', '../../../Data/Features/All Features/train', 16)
split_df(all_features_test, 'all_features_test', '../../../Data/Features/All Features/test', 16)

length check passed
True
length check passed
True


## Output 100 row sample to Excel

In [22]:
all_features_train.sample(100).to_excel('../../../Data/Features/All Features Train Sample/all_features_train_sample.xlsx', index=False)