# Combine VGG, LBP, and HSV

Load parquet files of features, stack as needed, then merge them into a single dataframe.

In [1]:
# Packages
import os
import pandas as pd
import shutil

## Generalized Function to Load and Combine All DFs in a Directory

In [2]:
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.
    '''
    # If path does not end in a slash, add one
    if directory_path[-1] != '/':
        directory_path += '/'
    # list of files in directory
    file_list = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    # read in all parquet files
    combined_df = pd.concat([pd.read_parquet(directory_path + f) for f in file_list])
    # Return combined dataframe
    return combined_df

## Function to create harmonized Image file name

In [3]:
def create_harmonized_filename(file_path):
    '''
    Strip Blur vs. No Blur and other data from file path.
    '''
    # Split path by '/' and take last element
    harmonized_filename = file_path.split('/')[-1]
    # Remove _no_blur and _blurred from file name
    harmonized_filename = harmonized_filename.replace('_no_blur', '')
    harmonized_filename = harmonized_filename.replace('_blurred', '')
    # Return harmonized file name
    return harmonized_filename

In [4]:
def prep_dataset_for_merge(df, feature_name):
    '''
    Prepares dataset for merging.
    '''
    # Create new column for harmonized file name
    df['harmonized_filename'] = df['Image Path'].apply(create_harmonized_filename)
    # Rename Image Path to Image Path + feature name
    df.rename(columns={'Image Path': 'Image Path ' + feature_name}, inplace=True)
    # Return dataset
    return df

## Load Features

In [5]:
# HSV Vectors
HSV_vectors = combine_directory_parquets('../../Data/Features/HSV')
# Prep dataset for merging
HSV_vectors = prep_dataset_for_merge(HSV_vectors, 'HSV')
HSV_vectors

Unnamed: 0,Image Path HSV,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,HSV_7,...,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767,harmonized_filename
0,../../../Images/test/No Blur/Sedan_test_orig_t...,1,20360,0,3,6,4,2,6,10,...,40,41,42,35,46,49,44,41,422,Sedan_test_orig_train_05437_resized.jpg
1,../../../Images/test/No Blur/SUV_test_orig_tra...,1,21718,0,33,357,5,0,110,375,...,93,128,160,159,215,255,5898,386,1136,SUV_test_orig_train_06351_resized.jpg
2,../../../Images/test/No Blur/Pickup_test_orig_...,1,11613,0,12,13,2,1,20,70,...,1,0,1,0,0,0,0,0,1,Pickup_test_orig_train_03359_resized.jpg
3,../../../Images/test/No Blur/SUV_test_orig_tes...,1,1560,8,11,22,5,3,9,29,...,681,707,849,874,841,945,885,855,2753,SUV_test_orig_test_03954_resized.jpg
4,../../../Images/test/No Blur/SUV_test_orig_tes...,1,3389,14,114,191,34,18,138,1070,...,145,90,87,92,84,78,78,124,883,SUV_test_orig_test_06649_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,../../../Images/train/No Blur/SUV_train_orig_t...,0,10013,15,37,70,21,49,37,221,...,3,6,0,3,1,2,1,7,26,SUV_train_orig_test_00370_resized.jpg
821,../../../Images/train/No Blur/Pickup_train_ori...,0,5396,1,9,103,2,3,122,192,...,244,176,143,129,83,91,71,63,239,Pickup_train_orig_train_04622_resized.jpg
822,../../../Images/train/No Blur/Convertible_trai...,0,3157,950,510,252,134,146,88,227,...,342,357,356,293,347,289,246,218,1325,Convertible_train_orig_train_04430_resized.jpg
823,../../../Images/train/No Blur/SUV_train_orig_t...,0,13477,6,8,10,6,6,3,18,...,218,193,195,235,273,322,390,460,11035,SUV_train_orig_test_04527_resized.jpg


In [6]:
# LBP Vectors
LBP_vectors = combine_directory_parquets('../../Data/Features/LBP')
# Prep dataset for merging
LBP_vectors = prep_dataset_for_merge(LBP_vectors, 'LBP')
LBP_vectors

Unnamed: 0,Image Path LBP,test_80_20,LBP_0,LBP_1,LBP_2,LBP_3,LBP_4,LBP_5,LBP_6,LBP_7,...,LBP_16,LBP_17,LBP_18,LBP_19,LBP_20,LBP_21,LBP_22,LBP_23,LBP_24,harmonized_filename
0,../../../Images/train/Blurred/Convertible_trai...,0,0.026428,0.020401,0.016449,0.012634,0.010895,0.009186,0.008514,0.011658,...,0.012466,0.013702,0.008850,0.009171,0.009399,0.010910,0.014618,0.019577,0.342926,Convertible_train_orig_train_00272_resized.jpg
1,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.038757,0.023911,0.016830,0.014236,0.011322,0.009949,0.010422,0.011734,...,0.015686,0.014740,0.010971,0.011002,0.011429,0.016052,0.018845,0.022339,0.429779,SUV_train_orig_test_07471_resized.jpg
2,../../../Images/train/Blurred/Convertible_trai...,0,0.044312,0.026428,0.020065,0.014328,0.013092,0.011200,0.011566,0.013031,...,0.014877,0.016525,0.012482,0.011383,0.012878,0.016235,0.020645,0.023376,0.454559,Convertible_train_orig_test_01313_resized.jpg
3,../../../Images/train/Blurred/Sedan_train_orig...,0,0.022125,0.023621,0.014084,0.013245,0.006790,0.006348,0.006027,0.008469,...,0.015106,0.037003,0.016296,0.013870,0.013031,0.018585,0.020355,0.025879,0.450439,Sedan_train_orig_test_05964_resized.jpg
4,../../../Images/train/Blurred/Sedan_train_orig...,0,0.055222,0.034653,0.020630,0.012558,0.008011,0.006851,0.006927,0.006897,...,0.010681,0.015182,0.007812,0.007996,0.007980,0.012894,0.018341,0.028107,0.550018,Sedan_train_orig_test_06611_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,../../../Images/train/Blurred/Convertible_trai...,0,0.023788,0.015076,0.013199,0.010590,0.008301,0.007965,0.007965,0.009872,...,0.014816,0.032043,0.014099,0.012238,0.011551,0.013870,0.015594,0.015991,0.470932,Convertible_train_orig_train_07594_resized.jpg
821,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.044052,0.037994,0.021652,0.013428,0.009430,0.008530,0.007584,0.009171,...,0.008286,0.008514,0.006714,0.007523,0.008530,0.011536,0.019104,0.034271,0.550278,SUV_train_orig_train_03339_resized.jpg
822,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.042984,0.028244,0.020065,0.014587,0.009125,0.009338,0.008041,0.009613,...,0.011734,0.013550,0.009155,0.009644,0.010330,0.013397,0.020660,0.027008,0.488159,SUV_train_orig_test_04325_resized.jpg
823,../../../Images/train/Blurred/Sedan_train_orig...,0,0.021835,0.018417,0.009720,0.010651,0.005157,0.005051,0.004944,0.009384,...,0.014191,0.032394,0.013107,0.011108,0.010406,0.012939,0.016403,0.020767,0.460464,Sedan_train_orig_train_02626_resized.jpg


In [7]:
# VGG Vectors
vgg_vectors = combine_directory_parquets('../../Data/Features/VGG')
# Prep dataset for merging
vgg_vectors = prep_dataset_for_merge(vgg_vectors, 'VGG')
vgg_vectors

Unnamed: 0,Image Path VGG,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,VGG_Embedding_Element_5,VGG_Embedding_Element_6,VGG_Embedding_Element_7,VGG_Embedding_Element_8,...,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,test_80_20,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,0.057273,0.085857,0.014088,0.000000,0.000000,0.000000,1.265887,0.025079,0.020789,...,0.288115,0.000000,0.005827,0.039591,0.000000,0.063602,0.365371,0.000000,0,Sedan_train_orig_train_04172_resized.jpg
1,../../../Images/train/No Blur/Convertible_trai...,0.293713,0.714101,0.075956,0.144106,0.011360,0.000000,2.309738,0.094079,0.409755,...,0.154690,0.000000,0.000000,0.037017,0.350120,0.543494,0.100478,0.000000,0,Convertible_train_orig_test_01764_resized.jpg
2,../../../Images/train/No Blur/SUV_train_orig_t...,0.147546,0.793779,0.000000,0.014114,0.237972,0.191854,1.408566,0.000000,0.314723,...,0.115495,0.099091,0.000000,0.019142,0.110762,0.750666,0.198542,0.275273,0,SUV_train_orig_test_07060_resized.jpg
3,../../../Images/train/No Blur/Sedan_train_orig...,0.043017,1.089323,0.000000,0.004603,0.012446,0.000000,0.802531,0.000000,0.444981,...,0.072416,0.164781,0.000000,0.063925,0.189980,0.426336,0.031850,0.021951,0,Sedan_train_orig_test_07195_resized.jpg
4,../../../Images/train/No Blur/Sedan_train_orig...,0.027134,0.049786,0.030874,0.000000,0.003062,0.000000,1.098564,0.000000,0.005797,...,0.791576,0.000000,0.000000,0.017989,0.000000,0.532629,0.047045,0.000000,0,Sedan_train_orig_test_04947_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,../../../Images/train/No Blur/Sedan_train_orig...,0.000000,0.355245,0.014701,0.000000,0.063166,0.000000,0.168762,0.068016,0.209514,...,0.068890,0.000000,0.013373,0.039992,0.152950,0.097856,0.072973,0.033932,0,Sedan_train_orig_train_00112_resized.jpg
1028,../../../Images/train/No Blur/Pickup_train_ori...,0.114592,0.210919,0.000000,0.000000,0.031758,0.000000,0.007828,0.000000,0.209584,...,0.454710,0.000000,0.000000,0.519490,0.037494,0.018614,1.264154,0.019924,0,Pickup_train_orig_test_01147_resized.jpg
1029,../../../Images/train/No Blur/SUV_train_orig_t...,0.084685,1.241065,0.000000,0.000000,0.025655,0.000000,1.467685,0.000000,0.184841,...,0.032092,0.000000,0.000000,0.476425,0.031688,0.863446,0.086722,0.031863,0,SUV_train_orig_test_08000_resized.jpg
1030,../../../Images/train/No Blur/Sedan_train_orig...,0.532457,0.549263,0.000000,0.000000,0.000000,0.020113,0.588399,0.000000,0.198008,...,0.033320,0.000000,0.000000,0.099895,0.018707,0.320334,0.286392,0.000000,0,Sedan_train_orig_test_01132_resized.jpg


## Merge DFs

In [8]:
# Merge all dataframes on harmonized file name
all_features = (HSV_vectors.merge(LBP_vectors, on=['harmonized_filename', 'test_80_20'], how = 'inner')
                           .merge(vgg_vectors, on=['harmonized_filename', 'test_80_20'], how = 'inner'))
all_features

Unnamed: 0,Image Path HSV,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,HSV_7,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,../../../Images/test/No Blur/Sedan_test_orig_t...,1,20360,0,3,6,4,2,6,10,...,0.150243,0.264980,0.352080,0.030870,0.000000,0.005269,0.085076,0.184339,0.393665,0.048444
1,../../../Images/test/No Blur/SUV_test_orig_tra...,1,21718,0,33,357,5,0,110,375,...,0.300909,0.188039,0.081065,0.006685,0.000000,0.073381,0.063552,0.418783,0.056318,0.009986
2,../../../Images/test/No Blur/Pickup_test_orig_...,1,11613,0,12,13,2,1,20,70,...,0.010515,0.935640,0.440709,0.000000,0.397143,0.358335,0.026000,0.439225,0.561005,0.136741
3,../../../Images/test/No Blur/SUV_test_orig_tes...,1,1560,8,11,22,5,3,9,29,...,0.073244,0.000000,0.510363,0.016521,0.000000,0.146333,0.000060,1.518961,0.189747,0.162572
4,../../../Images/test/No Blur/SUV_test_orig_tes...,1,3389,14,114,191,34,18,138,1070,...,0.031909,0.000000,0.122666,0.000000,0.038126,0.107039,0.015900,0.521024,0.037165,0.036494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/train/No Blur/SUV_train_orig_t...,0,10013,15,37,70,21,49,37,221,...,0.254979,0.154362,0.105418,0.000000,0.000000,0.006182,0.400323,0.126782,0.109129,0.000000
8248,../../../Images/train/No Blur/Pickup_train_ori...,0,5396,1,9,103,2,3,122,192,...,0.019901,0.528632,0.027102,0.000000,0.000000,0.268571,0.142446,0.000000,0.111561,0.000000
8249,../../../Images/train/No Blur/Convertible_trai...,0,3157,950,510,252,134,146,88,227,...,0.003150,0.235740,0.275030,0.000000,0.000000,0.210836,0.758296,0.236081,1.110856,0.000000
8250,../../../Images/train/No Blur/SUV_train_orig_t...,0,13477,6,8,10,6,6,3,18,...,0.165793,0.320819,0.198651,0.000000,0.000000,0.146347,0.000000,0.173617,0.578854,0.002865


In [9]:
# Check harmonized_filename column
all_features['harmonized_filename'].iloc[0]

'Sedan_test_orig_train_05437_resized.jpg'

## Add Class Labels and Use Corrections

In [10]:
# Correct class for items in train dataset
# Load "relabeled_train_no_blur_old_and_new_labels.xlsx"
relabeled_train_no_blur_old_and_new_labels = pd.read_excel('../../Data/Relabeled_Train_No_Blur/relabeled_train_no_blur_old_and_new_labels.xlsx')
relabeled_train_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_train_orig_test_00002_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00002_resized.jpg
1,Convertible_train_orig_test_00037_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00037_resized.jpg
2,Convertible_train_orig_test_00060_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00060_resized.jpg
3,Convertible_train_orig_test_00087_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00087_resized.jpg
4,Convertible_train_orig_test_00112_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00112_resized.jpg
...,...,...,...,...,...
6591,Sedan_train_orig_train_08136_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08136_resized.jpg
6592,Sedan_train_orig_train_08137_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08137_resized.jpg
6593,Sedan_train_orig_train_08138_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08138_resized.jpg
6594,Sedan_train_orig_train_08139_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_train_orig_train_08139_resized.jpg


In [11]:
# Correct class for items in test dataset
# Load "relabeled_test_no_blur_old_and_new_labels.xlsx"
relabeled_test_no_blur_old_and_new_labels = pd.read_excel('../../Data/Relabeled_Test_No_Blur/relabeled_test_no_blur_old_and_new_labels.xlsx')
relabeled_test_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_test_orig_test_00023_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00023_resized.jpg
1,Convertible_test_orig_test_00096_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00096_resized.jpg
2,Convertible_test_orig_test_00107_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00107_resized.jpg
3,Convertible_test_orig_test_00135_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00135_resized.jpg
4,Convertible_test_orig_test_00147_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00147_resized.jpg
...,...,...,...,...,...
1651,Sedan_test_orig_train_08026_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_test_orig_train_08026_resized.jpg
1652,Sedan_test_orig_train_08043_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08043_resized.jpg
1653,Sedan_test_orig_train_08046_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08046_resized.jpg
1654,Sedan_test_orig_train_08098_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08098_resized.jpg


In [12]:
# Stack train and test relabeled dataframes
# Keep limited columns
relabeled = pd.concat([relabeled_train_no_blur_old_and_new_labels, relabeled_test_no_blur_old_and_new_labels])[['harmonized_filename', 'New Class', 'Old Class']]
relabeled

Unnamed: 0,harmonized_filename,New Class,Old Class
0,Convertible_train_orig_test_00002_resized.jpg,Convertible,Convertible
1,Convertible_train_orig_test_00037_resized.jpg,Convertible,Convertible
2,Convertible_train_orig_test_00060_resized.jpg,Convertible,Convertible
3,Convertible_train_orig_test_00087_resized.jpg,Convertible,Convertible
4,Convertible_train_orig_test_00112_resized.jpg,Convertible,Convertible
...,...,...,...
1651,Sedan_test_orig_train_08026_resized.jpg,Ambiguous,Sedan
1652,Sedan_test_orig_train_08043_resized.jpg,Sedan,Sedan
1653,Sedan_test_orig_train_08046_resized.jpg,Sedan,Sedan
1654,Sedan_test_orig_train_08098_resized.jpg,Sedan,Sedan


In [13]:
# Merge on 'filename'
all_features = all_features.merge(relabeled, on='harmonized_filename', how='left')
all_features

Unnamed: 0,Image Path HSV,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,HSV_7,...,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,New Class,Old Class
0,../../../Images/test/No Blur/Sedan_test_orig_t...,1,20360,0,3,6,4,2,6,10,...,0.352080,0.030870,0.000000,0.005269,0.085076,0.184339,0.393665,0.048444,Sedan,Sedan
1,../../../Images/test/No Blur/SUV_test_orig_tra...,1,21718,0,33,357,5,0,110,375,...,0.081065,0.006685,0.000000,0.073381,0.063552,0.418783,0.056318,0.009986,Ambiguous,SUV
2,../../../Images/test/No Blur/Pickup_test_orig_...,1,11613,0,12,13,2,1,20,70,...,0.440709,0.000000,0.397143,0.358335,0.026000,0.439225,0.561005,0.136741,Pickup,Pickup
3,../../../Images/test/No Blur/SUV_test_orig_tes...,1,1560,8,11,22,5,3,9,29,...,0.510363,0.016521,0.000000,0.146333,0.000060,1.518961,0.189747,0.162572,SUV,SUV
4,../../../Images/test/No Blur/SUV_test_orig_tes...,1,3389,14,114,191,34,18,138,1070,...,0.122666,0.000000,0.038126,0.107039,0.015900,0.521024,0.037165,0.036494,SUV,SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/train/No Blur/SUV_train_orig_t...,0,10013,15,37,70,21,49,37,221,...,0.105418,0.000000,0.000000,0.006182,0.400323,0.126782,0.109129,0.000000,SUV,SUV
8248,../../../Images/train/No Blur/Pickup_train_ori...,0,5396,1,9,103,2,3,122,192,...,0.027102,0.000000,0.000000,0.268571,0.142446,0.000000,0.111561,0.000000,Pickup,Pickup
8249,../../../Images/train/No Blur/Convertible_trai...,0,3157,950,510,252,134,146,88,227,...,0.275030,0.000000,0.000000,0.210836,0.758296,0.236081,1.110856,0.000000,Convertible,Convertible
8250,../../../Images/train/No Blur/SUV_train_orig_t...,0,13477,6,8,10,6,6,3,18,...,0.198651,0.000000,0.000000,0.146347,0.000000,0.173617,0.578854,0.002865,SUV,SUV


### Create Class

In [14]:
# Value counts of 'New Class'
all_features['New Class'].value_counts()

Sedan                    3114
SUV                      2057
Convertible              1419
Pickup                    902
Ambiguous                 754
Clearly None of the 4       6
Name: New Class, dtype: int64

In [15]:
# Compare 'New Class' and 'Old Class' columns
old_v_new_class = all_features[['Old Class', 'New Class']].value_counts().reset_index().sort_values(by=['Old Class', 'New Class'])
old_v_new_class

Unnamed: 0,Old Class,New Class,0
7,Convertible,Ambiguous,86
2,Convertible,Convertible,1417
9,Convertible,Sedan,11
8,Pickup,Ambiguous,78
3,Pickup,Pickup,799
5,SUV,Ambiguous,245
6,SUV,Pickup,101
1,SUV,SUV,2057
10,SUV,Sedan,6
4,Sedan,Ambiguous,345


In [16]:
# Set 'Class' to 'New Class'
all_features['Class'] = all_features['New Class']
# Keep cases where 'New Class' is in 'SUV', 'Sedan', 'Pickup', 'Convertible'
all_features = all_features[all_features['Class'].isin(['SUV', 'Sedan', 'Pickup', 'Convertible'])]
# Drop 'New Class' and 'Old Class' columns
all_features.drop(columns=['New Class', 'Old Class'], inplace=True)
all_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features.drop(columns=['New Class', 'Old Class'], inplace=True)


Unnamed: 0,Image Path HSV,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,HSV_7,...,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,Class
0,../../../Images/test/No Blur/Sedan_test_orig_t...,1,20360,0,3,6,4,2,6,10,...,0.264980,0.352080,0.030870,0.000000,0.005269,0.085076,0.184339,0.393665,0.048444,Sedan
2,../../../Images/test/No Blur/Pickup_test_orig_...,1,11613,0,12,13,2,1,20,70,...,0.935640,0.440709,0.000000,0.397143,0.358335,0.026000,0.439225,0.561005,0.136741,Pickup
3,../../../Images/test/No Blur/SUV_test_orig_tes...,1,1560,8,11,22,5,3,9,29,...,0.000000,0.510363,0.016521,0.000000,0.146333,0.000060,1.518961,0.189747,0.162572,SUV
4,../../../Images/test/No Blur/SUV_test_orig_tes...,1,3389,14,114,191,34,18,138,1070,...,0.000000,0.122666,0.000000,0.038126,0.107039,0.015900,0.521024,0.037165,0.036494,SUV
5,../../../Images/test/No Blur/Sedan_test_orig_t...,1,2010,18,22,2600,9,13,290,2541,...,0.000000,0.301258,0.000000,0.000000,0.027415,0.255073,0.399262,0.089734,0.010790,Sedan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/train/No Blur/SUV_train_orig_t...,0,10013,15,37,70,21,49,37,221,...,0.154362,0.105418,0.000000,0.000000,0.006182,0.400323,0.126782,0.109129,0.000000,SUV
8248,../../../Images/train/No Blur/Pickup_train_ori...,0,5396,1,9,103,2,3,122,192,...,0.528632,0.027102,0.000000,0.000000,0.268571,0.142446,0.000000,0.111561,0.000000,Pickup
8249,../../../Images/train/No Blur/Convertible_trai...,0,3157,950,510,252,134,146,88,227,...,0.235740,0.275030,0.000000,0.000000,0.210836,0.758296,0.236081,1.110856,0.000000,Convertible
8250,../../../Images/train/No Blur/SUV_train_orig_t...,0,13477,6,8,10,6,6,3,18,...,0.320819,0.198651,0.000000,0.000000,0.146347,0.000000,0.173617,0.578854,0.002865,SUV


In [17]:
# Value counts for 'Class'
all_features['Class'].value_counts()

Sedan          3114
SUV            2057
Convertible    1419
Pickup          902
Name: Class, dtype: int64

In [18]:
# Value counts of 'test_80_20'
all_features['test_80_20'].value_counts()

0    6003
1    1489
Name: test_80_20, dtype: int64

## Clean Up Columns

In [19]:
# Get image path columns
image_path_columns = [col for col in all_features.columns if 'Image Path' in col]
# Split off and delete Image Path columns
image_path_df = all_features[['harmonized_filename'] + image_path_columns]
image_path_df.to_excel('../../Data/Features/Top3_Features/Top3_Features_Image_Paths.xlsx', index=False)
all_features.drop(columns=image_path_columns, inplace=True)
# Reorder columns
front_cols = ['Class', 'harmonized_filename', 'test_80_20']
all_features = all_features[front_cols + [col for col in all_features.columns if col not in front_cols]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features.drop(columns=image_path_columns, inplace=True)


Unnamed: 0,Class,harmonized_filename,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,Sedan,Sedan_test_orig_train_05437_resized.jpg,1,20360,0,3,6,4,2,6,...,0.150243,0.264980,0.352080,0.030870,0.000000,0.005269,0.085076,0.184339,0.393665,0.048444
2,Pickup,Pickup_test_orig_train_03359_resized.jpg,1,11613,0,12,13,2,1,20,...,0.010515,0.935640,0.440709,0.000000,0.397143,0.358335,0.026000,0.439225,0.561005,0.136741
3,SUV,SUV_test_orig_test_03954_resized.jpg,1,1560,8,11,22,5,3,9,...,0.073244,0.000000,0.510363,0.016521,0.000000,0.146333,0.000060,1.518961,0.189747,0.162572
4,SUV,SUV_test_orig_test_06649_resized.jpg,1,3389,14,114,191,34,18,138,...,0.031909,0.000000,0.122666,0.000000,0.038126,0.107039,0.015900,0.521024,0.037165,0.036494
5,Sedan,Sedan_test_orig_test_06584_resized.jpg,1,2010,18,22,2600,9,13,290,...,0.105276,0.000000,0.301258,0.000000,0.000000,0.027415,0.255073,0.399262,0.089734,0.010790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,SUV,SUV_train_orig_test_00370_resized.jpg,0,10013,15,37,70,21,49,37,...,0.254979,0.154362,0.105418,0.000000,0.000000,0.006182,0.400323,0.126782,0.109129,0.000000
8248,Pickup,Pickup_train_orig_train_04622_resized.jpg,0,5396,1,9,103,2,3,122,...,0.019901,0.528632,0.027102,0.000000,0.000000,0.268571,0.142446,0.000000,0.111561,0.000000
8249,Convertible,Convertible_train_orig_train_04430_resized.jpg,0,3157,950,510,252,134,146,88,...,0.003150,0.235740,0.275030,0.000000,0.000000,0.210836,0.758296,0.236081,1.110856,0.000000
8250,SUV,SUV_train_orig_test_04527_resized.jpg,0,13477,6,8,10,6,6,3,...,0.165793,0.320819,0.198651,0.000000,0.000000,0.146347,0.000000,0.173617,0.578854,0.002865


## Add Blur and No-Blur Paths

In [20]:
def construct_path(train_test, blur_no_blur, harmonized_filename):
    return '../../../Images/' + train_test + '/' + blur_no_blur + '/' + harmonized_filename

# Add image_path_blur column
all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
# Add image_path_no_blur column
all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)
# Reorder columns - Class, harmonized_filename, test_80_20, image_path_blur, image_path_no_blur, all other columns
front_cols = front_cols + ['image_path_blur', 'image_path_no_blur']
all_features = all_features[front_cols + [col for col in all_features.columns if col not in front_cols]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)


Unnamed: 0,Class,harmonized_filename,test_80_20,image_path_blur,image_path_no_blur,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,Sedan,Sedan_test_orig_train_05437_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,20360,0,3,6,4,...,0.150243,0.264980,0.352080,0.030870,0.000000,0.005269,0.085076,0.184339,0.393665,0.048444
2,Pickup,Pickup_test_orig_train_03359_resized.jpg,1,../../../Images/test/Blurred/Pickup_test_orig_...,../../../Images/test/No_Blur/Pickup_test_orig_...,11613,0,12,13,2,...,0.010515,0.935640,0.440709,0.000000,0.397143,0.358335,0.026000,0.439225,0.561005,0.136741
3,SUV,SUV_test_orig_test_03954_resized.jpg,1,../../../Images/test/Blurred/SUV_test_orig_tes...,../../../Images/test/No_Blur/SUV_test_orig_tes...,1560,8,11,22,5,...,0.073244,0.000000,0.510363,0.016521,0.000000,0.146333,0.000060,1.518961,0.189747,0.162572
4,SUV,SUV_test_orig_test_06649_resized.jpg,1,../../../Images/test/Blurred/SUV_test_orig_tes...,../../../Images/test/No_Blur/SUV_test_orig_tes...,3389,14,114,191,34,...,0.031909,0.000000,0.122666,0.000000,0.038126,0.107039,0.015900,0.521024,0.037165,0.036494
5,Sedan,Sedan_test_orig_test_06584_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,2010,18,22,2600,9,...,0.105276,0.000000,0.301258,0.000000,0.000000,0.027415,0.255073,0.399262,0.089734,0.010790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,SUV,SUV_train_orig_test_00370_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,10013,15,37,70,21,...,0.254979,0.154362,0.105418,0.000000,0.000000,0.006182,0.400323,0.126782,0.109129,0.000000
8248,Pickup,Pickup_train_orig_train_04622_resized.jpg,0,../../../Images/train/Blurred/Pickup_train_ori...,../../../Images/train/No_Blur/Pickup_train_ori...,5396,1,9,103,2,...,0.019901,0.528632,0.027102,0.000000,0.000000,0.268571,0.142446,0.000000,0.111561,0.000000
8249,Convertible,Convertible_train_orig_train_04430_resized.jpg,0,../../../Images/train/Blurred/Convertible_trai...,../../../Images/train/No_Blur/Convertible_trai...,3157,950,510,252,134,...,0.003150,0.235740,0.275030,0.000000,0.000000,0.210836,0.758296,0.236081,1.110856,0.000000
8250,SUV,SUV_train_orig_test_04527_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,13477,6,8,10,6,...,0.165793,0.320819,0.198651,0.000000,0.000000,0.146347,0.000000,0.173617,0.578854,0.002865


In [21]:
# Print all columns in all_features
for col in all_features.columns:
    print(col)

Class
harmonized_filename
test_80_20
image_path_blur
image_path_no_blur
HSV_0
HSV_1
HSV_2
HSV_3
HSV_4
HSV_5
HSV_6
HSV_7
HSV_8
HSV_9
HSV_10
HSV_11
HSV_12
HSV_13
HSV_14
HSV_15
HSV_16
HSV_17
HSV_18
HSV_19
HSV_20
HSV_21
HSV_22
HSV_23
HSV_24
HSV_25
HSV_26
HSV_27
HSV_28
HSV_29
HSV_30
HSV_31
HSV_32
HSV_33
HSV_34
HSV_35
HSV_36
HSV_37
HSV_38
HSV_39
HSV_40
HSV_41
HSV_42
HSV_43
HSV_44
HSV_45
HSV_46
HSV_47
HSV_48
HSV_49
HSV_50
HSV_51
HSV_52
HSV_53
HSV_54
HSV_55
HSV_56
HSV_57
HSV_58
HSV_59
HSV_60
HSV_61
HSV_62
HSV_63
HSV_64
HSV_65
HSV_66
HSV_67
HSV_68
HSV_69
HSV_70
HSV_71
HSV_72
HSV_73
HSV_74
HSV_75
HSV_76
HSV_77
HSV_78
HSV_79
HSV_80
HSV_81
HSV_82
HSV_83
HSV_84
HSV_85
HSV_86
HSV_87
HSV_88
HSV_89
HSV_90
HSV_91
HSV_92
HSV_93
HSV_94
HSV_95
HSV_96
HSV_97
HSV_98
HSV_99
HSV_100
HSV_101
HSV_102
HSV_103
HSV_104
HSV_105
HSV_106
HSV_107
HSV_108
HSV_109
HSV_110
HSV_111
HSV_112
HSV_113
HSV_114
HSV_115
HSV_116
HSV_117
HSV_118
HSV_119
HSV_120
HSV_121
HSV_122
HSV_123
HSV_124
HSV_125
HSV_126
HSV_127
HSV_128
HSV_12

## Output Training and Testing Dataframes

In [22]:
# Split df into train and test
all_features_train = all_features[all_features['test_80_20'] == 0]
all_features_test = all_features[all_features['test_80_20'] == 1]

# Drop test_80_20 column
all_features_train.drop(columns='test_80_20', inplace=True)
all_features_test.drop(columns='test_80_20', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_train.drop(columns='test_80_20', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_test.drop(columns='test_80_20', inplace=True)


In [23]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.
    '''
    # Tracking total length of pieces
    total_len_pieces = 0
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    for i in range(num_pieces):
        # start index for piece rows
        start_index = i * len(df) // num_pieces
        # end index for piece rows
        end_index = (i + 1) * len(df) // num_pieces
        # get piece
        piece = df[start_index:end_index]
        piece.to_parquet(out_folder + '/' + dataset_name + '_piece_' + str(i) + '.parquet', index=False)
        #print(len(piece))
        total_len_pieces += len(piece)
    
    # check total piece length and length of vit_embeddings_df
    print('length check passed')
    print(total_len_pieces == len(df))

In [25]:
# Run on all_features dataframes
split_df(all_features_train, 'top3_features_train', '../../Data/Features/Top3_Features/train', 16)
split_df(all_features_test, 'top3_features_test', '../../Data/Features/Top3_Features/test', 16)

length check passed
True
length check passed
True
