# Combine All Features

Load parquet files of features, stack as needed, then merge them into a single dataframe.

In [1]:
# Packages
import os
import pandas as pd
import shutil

## Generalized Function to Load and Combine All DFs in a Directory

In [2]:
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.
    '''
    # If path does not end in a slash, add one
    if directory_path[-1] != '/':
        directory_path += '/'
    # list of files in directory
    file_list = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    # read in all parquet files
    combined_df = pd.concat([pd.read_parquet(directory_path + f) for f in file_list])
    # Return combined dataframe
    return combined_df

## Function to create harmonized Image file name

In [3]:
def create_harmonized_filename(file_path):
    '''
    Strip Blur vs. No Blur and other data from file path.
    '''
    # Split path by '/' and take last element
    harmonized_filename = file_path.split('/')[-1]
    # Remove _no_blur and _blurred from file name
    harmonized_filename = harmonized_filename.replace('_no_blur', '')
    harmonized_filename = harmonized_filename.replace('_blurred', '')
    # Return harmonized file name
    return harmonized_filename

In [4]:
def prep_dataset_for_merge(df, feature_name):
    '''
    Prepares dataset for merging.
    '''
    # Create new column for harmonized file name
    df['harmonized_filename'] = df['Image Path'].apply(create_harmonized_filename)
    # Rename Image Path to Image Path + feature name
    df.rename(columns={'Image Path': 'Image Path ' + feature_name}, inplace=True)
    # Return dataset
    return df

## Load Features

In [5]:
# Vision Transformer Embeddings
vit_embeddings = combine_directory_parquets('../../Data/Features/Vision Transformer')
# Prep dataset for merging
vit_embeddings = prep_dataset_for_merge(vit_embeddings, 'Vision Transformer')
vit_embeddings

Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,ViT_Embedding_Element_1272,ViT_Embedding_Element_1273,ViT_Embedding_Element_1274,ViT_Embedding_Element_1275,ViT_Embedding_Element_1276,ViT_Embedding_Element_1277,ViT_Embedding_Element_1278,ViT_Embedding_Element_1279,test_80_20,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,-0.059224,0.173239,0.363462,0.457626,-0.077351,-0.236950,-0.031632,-0.261893,0,Sedan_train_orig_test_01516_resized.jpg
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.215697,0.023538,0.393746,0.455197,0.223018,-0.265846,-0.200683,-0.405006,0,SUV_train_orig_train_00294_resized.jpg
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.111003,0.158716,0.380261,0.493224,0.169883,-0.105756,0.124275,-0.446003,0,Convertible_train_orig_train_04236_resized.jpg
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.048522,-0.032885,0.400770,0.430547,0.214644,-0.323948,-0.276459,-0.414079,0,Pickup_train_orig_train_03906_resized.jpg
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,-0.089879,0.237541,0.321542,0.343445,0.047995,-0.305656,0.091199,-0.370617,0,SUV_train_orig_test_01344_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,0.023480,0.073889,0.333791,0.452840,0.038384,-0.209088,-0.219002,-0.133379,1,Sedan_test_orig_train_00376_resized.jpg
1028,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,-0.112296,0.220512,0.196491,0.279989,0.168716,-0.193009,-0.030066,-0.041473,1,SUV_test_orig_test_00579_resized.jpg
1029,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,0.052621,0.168497,0.294833,0.398839,-0.061976,-0.102406,-0.211565,-0.086189,1,Sedan_test_orig_test_00430_resized.jpg
1030,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,-0.054918,0.249961,0.387619,0.443451,0.075971,-0.162953,-0.078282,-0.177154,1,Sedan_test_orig_test_07328_resized.jpg


In [6]:
# Eliminated for being too many items

# # HOG 16 pixels per cell Vectors
# hog_16_ppc_vectors = combine_directory_parquets('../../Data/Features/HOG_16_ppc')
# # Prep dataset for merging
# hog_16_ppc_vectors = prep_dataset_for_merge(hog_16_ppc_vectors, 'HOG 16 ppc')
# hog_16_ppc_vectors

In [7]:
# HOG 24 pixels per cell Vectors
hog_24_ppc_vectors = combine_directory_parquets('../../Data/Features/HOG_24_ppc')
# Prep dataset for merging
hog_24_ppc_vectors = prep_dataset_for_merge(hog_24_ppc_vectors, 'HOG 24 ppc')
hog_24_ppc_vectors

Unnamed: 0,Image Path HOG 24 ppc,test_80_20,HOG_24_ppc_0,HOG_24_ppc_1,HOG_24_ppc_2,HOG_24_ppc_3,HOG_24_ppc_4,HOG_24_ppc_5,HOG_24_ppc_6,HOG_24_ppc_7,...,HOG_24_ppc_2295,HOG_24_ppc_2296,HOG_24_ppc_2297,HOG_24_ppc_2298,HOG_24_ppc_2299,HOG_24_ppc_2300,HOG_24_ppc_2301,HOG_24_ppc_2302,HOG_24_ppc_2303,harmonized_filename
0,../../../Images/train/Blurred/Convertible_trai...,0,0.059938,0.097728,0.106369,0.048924,0.091199,0.097621,0.075841,0.066475,...,0.000662,0.005605,0.084604,0.339768,0.002502,0.114248,0.092849,0.311350,0.125613,Convertible_train_orig_train_00272_resized.jpg
1,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.256457,0.256457,0.077890,0.063679,0.126482,0.256457,0.147489,0.256457,...,0.026551,0.030365,0.038664,0.220049,0.012558,0.038802,0.037201,0.075855,0.026042,SUV_train_orig_test_07471_resized.jpg
2,../../../Images/train/Blurred/Convertible_trai...,0,0.202457,0.223452,0.223452,0.211585,0.175203,0.223452,0.223452,0.176913,...,0.096925,0.070365,0.138975,0.164921,0.077499,0.062890,0.140606,0.187274,0.122544,Convertible_train_orig_test_01313_resized.jpg
3,../../../Images/train/Blurred/Sedan_train_orig...,0,0.007001,0.000845,0.002915,0.000151,0.003460,0.000303,0.003501,0.000498,...,0.018255,0.198336,0.102858,0.149118,0.034768,0.003529,0.100560,0.235702,0.000821,Sedan_train_orig_test_05964_resized.jpg
4,../../../Images/train/Blurred/Sedan_train_orig...,0,0.027547,0.005092,0.020988,0.020537,0.009588,0.008990,0.024175,0.009435,...,0.122497,0.090929,0.138466,0.208792,0.100809,0.081907,0.108674,0.152249,0.106565,Sedan_train_orig_test_06611_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.119561,0.122641,0.064218,0.077519,0.111523,0.110687,0.101571,0.079767,...,0.168222,0.068426,0.043202,0.171373,0.186756,0.038155,0.036713,0.138185,0.117317,Sedan_test_orig_test_05332_resized.jpg
822,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.236619,0.227736,0.219241,0.236619,0.236619,0.150675,0.231033,0.236619,...,0.162245,0.050129,0.186667,0.273364,0.037968,0.005627,0.164959,0.264899,0.003458,Sedan_test_orig_train_02826_resized.jpg
823,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.107081,0.163420,0.159713,0.034679,0.243864,0.146184,0.215033,0.243864,...,0.126880,0.069564,0.084517,0.106073,0.078765,0.051485,0.082266,0.092818,0.061774,Sedan_test_orig_train_01495_resized.jpg
824,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.000000,0.000000,0.000482,0.000000,0.005666,0.000450,0.001927,0.000284,...,0.071698,0.074662,0.138145,0.142298,0.061959,0.104318,0.143668,0.169297,0.079651,Sedan_test_orig_train_04215_resized.jpg


In [8]:
# HSV Vectors
HSV_vectors = combine_directory_parquets('../../Data/Features/HSV')
# Prep dataset for merging
HSV_vectors = prep_dataset_for_merge(HSV_vectors, 'HSV')
HSV_vectors

Unnamed: 0,Image Path HSV,test_80_20,HSV_0,HSV_1,HSV_2,HSV_3,HSV_4,HSV_5,HSV_6,HSV_7,...,HSV_759,HSV_760,HSV_761,HSV_762,HSV_763,HSV_764,HSV_765,HSV_766,HSV_767,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,0,279,76,110,63,40,59,52,127,...,79,110,118,105,182,332,2044,1522,5299,Sedan_train_orig_test_01516_resized.jpg
1,../../../Images/train/No Blur/SUV_train_orig_t...,0,683,50,44,51,29,27,34,380,...,240,239,196,174,182,186,133,180,1099,SUV_train_orig_train_00294_resized.jpg
2,../../../Images/train/No Blur/Convertible_trai...,0,8665,8,31,8,10,2,5,58,...,12,7,4,13,5,4,3,0,6,Convertible_train_orig_train_04236_resized.jpg
3,../../../Images/train/No Blur/Pickup_train_ori...,0,3865,0,0,2,0,1,0,5,...,27,25,28,21,25,13,18,21,94,Pickup_train_orig_train_03906_resized.jpg
4,../../../Images/train/No Blur/SUV_train_orig_t...,0,1266,49,115,301,71,43,276,500,...,224,222,225,190,202,150,156,127,840,SUV_train_orig_test_01344_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821,../../../Images/test/No Blur/Sedan_test_orig_t...,1,5305,2,18,21,10,6,20,41,...,6,4,7,5,3,4,3,4,36,Sedan_test_orig_train_00376_resized.jpg
822,../../../Images/test/No Blur/SUV_test_orig_tes...,1,571,8,21,105,15,7,25,140,...,92,112,86,92,84,112,89,100,1158,SUV_test_orig_test_00579_resized.jpg
823,../../../Images/test/No Blur/Sedan_test_orig_t...,1,3833,7,72,236,13,46,127,664,...,1,0,1,2,1,1,0,0,1,Sedan_test_orig_test_00430_resized.jpg
824,../../../Images/test/No Blur/Sedan_test_orig_t...,1,3032,13,11,38,8,3,35,44,...,138,137,57,24,15,12,11,7,41,Sedan_test_orig_test_07328_resized.jpg


In [9]:
# LBP Vectors
LBP_vectors = combine_directory_parquets('../../Data/Features/LBP')
# Prep dataset for merging
LBP_vectors = prep_dataset_for_merge(LBP_vectors, 'LBP')
LBP_vectors

Unnamed: 0,Image Path LBP,test_80_20,LBP_0,LBP_1,LBP_2,LBP_3,LBP_4,LBP_5,LBP_6,LBP_7,...,LBP_16,LBP_17,LBP_18,LBP_19,LBP_20,LBP_21,LBP_22,LBP_23,LBP_24,harmonized_filename
0,../../../Images/train/Blurred/Convertible_trai...,0,0.026428,0.020401,0.016449,0.012634,0.010895,0.009186,0.008514,0.011658,...,0.012466,0.013702,0.008850,0.009171,0.009399,0.010910,0.014618,0.019577,0.342926,Convertible_train_orig_train_00272_resized.jpg
1,../../../Images/train/Blurred/SUV_train_orig_t...,0,0.038757,0.023911,0.016830,0.014236,0.011322,0.009949,0.010422,0.011734,...,0.015686,0.014740,0.010971,0.011002,0.011429,0.016052,0.018845,0.022339,0.429779,SUV_train_orig_test_07471_resized.jpg
2,../../../Images/train/Blurred/Convertible_trai...,0,0.044312,0.026428,0.020065,0.014328,0.013092,0.011200,0.011566,0.013031,...,0.014877,0.016525,0.012482,0.011383,0.012878,0.016235,0.020645,0.023376,0.454559,Convertible_train_orig_test_01313_resized.jpg
3,../../../Images/train/Blurred/Sedan_train_orig...,0,0.022125,0.023621,0.014084,0.013245,0.006790,0.006348,0.006027,0.008469,...,0.015106,0.037003,0.016296,0.013870,0.013031,0.018585,0.020355,0.025879,0.450439,Sedan_train_orig_test_05964_resized.jpg
4,../../../Images/train/Blurred/Sedan_train_orig...,0,0.055222,0.034653,0.020630,0.012558,0.008011,0.006851,0.006927,0.006897,...,0.010681,0.015182,0.007812,0.007996,0.007980,0.012894,0.018341,0.028107,0.550018,Sedan_train_orig_test_06611_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
821,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.065628,0.035370,0.022003,0.015320,0.011765,0.009583,0.007385,0.007874,...,0.007996,0.007828,0.007767,0.008789,0.010666,0.012680,0.021545,0.035065,0.602371,Sedan_test_orig_test_05332_resized.jpg
822,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.033752,0.023499,0.015930,0.012238,0.008774,0.007904,0.008087,0.009964,...,0.012955,0.012726,0.009247,0.009842,0.011093,0.013138,0.016403,0.022659,0.395554,Sedan_test_orig_train_02826_resized.jpg
823,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.037018,0.020233,0.017487,0.015518,0.011948,0.010483,0.009598,0.010544,...,0.015640,0.016098,0.010651,0.011902,0.012482,0.015839,0.017487,0.020828,0.400986,Sedan_test_orig_train_01495_resized.jpg
824,../../../Images/test/Blurred/Sedan_test_orig_t...,1,0.040054,0.023636,0.020844,0.016586,0.013351,0.010910,0.010620,0.013397,...,0.014053,0.017090,0.011292,0.011993,0.012878,0.016953,0.020187,0.022614,0.452133,Sedan_test_orig_train_04215_resized.jpg


In [10]:
# VGG Vectors
vgg_vectors = combine_directory_parquets('../../Data/Features/VGG')
# Prep dataset for merging
vgg_vectors = prep_dataset_for_merge(vgg_vectors, 'VGG')
vgg_vectors

Unnamed: 0,Image Path VGG,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,VGG_Embedding_Element_5,VGG_Embedding_Element_6,VGG_Embedding_Element_7,VGG_Embedding_Element_8,...,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,test_80_20,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,0.127800,0.165676,0.054183,0.054542,0.268908,0.000000,0.036441,0.000000,0.475737,...,0.040191,0.000000,0.000000,0.000000,0.098337,0.000000,0.550390,0.000368,0,Sedan_train_orig_test_01516_resized.jpg
1,../../../Images/train/No Blur/SUV_train_orig_t...,0.220538,0.117480,0.002096,0.033507,0.132664,0.005938,0.547738,0.010078,0.458705,...,0.050671,0.000000,0.000000,0.181632,0.077526,0.056239,0.071139,0.000000,0,SUV_train_orig_train_00294_resized.jpg
2,../../../Images/train/No Blur/Convertible_trai...,0.057756,0.232584,0.000000,0.038366,0.000000,0.000000,0.870056,0.000000,0.495067,...,0.356170,0.000000,0.004893,0.068126,0.000000,0.000000,0.496660,0.000000,0,Convertible_train_orig_train_04236_resized.jpg
3,../../../Images/train/No Blur/Pickup_train_ori...,0.172837,0.464956,0.027007,0.000000,0.016368,0.003086,0.213469,0.011158,0.318440,...,0.183065,0.000000,0.000000,0.289010,0.286223,0.002977,0.412420,0.000000,0,Pickup_train_orig_train_03906_resized.jpg
4,../../../Images/train/No Blur/SUV_train_orig_t...,1.563481,1.783680,0.211702,0.000000,0.113318,0.000000,0.089108,0.000000,1.170577,...,0.275861,0.083213,0.000000,0.418786,0.430101,0.094569,1.051647,0.781257,0,SUV_train_orig_test_01344_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,../../../Images/test/No Blur/Sedan_test_orig_t...,0.142197,0.802305,0.020482,0.000000,0.100756,0.531840,0.722337,0.000000,0.188415,...,0.009357,0.004506,0.000000,0.016683,0.104344,0.048166,0.127519,0.239549,1,Sedan_test_orig_train_00376_resized.jpg
1028,../../../Images/test/No Blur/SUV_test_orig_tes...,0.117131,0.609740,0.010078,0.000000,0.225453,0.105221,0.876188,0.024356,0.063433,...,0.152157,0.000000,0.013115,0.003288,0.586821,0.098766,0.528439,0.161134,1,SUV_test_orig_test_00579_resized.jpg
1029,../../../Images/test/No Blur/Sedan_test_orig_t...,0.023442,0.542435,0.033571,0.000000,0.376348,0.000000,0.338733,0.008471,0.452877,...,0.138978,0.026820,0.000000,0.110718,0.022224,0.280968,0.107825,0.088039,1,Sedan_test_orig_test_00430_resized.jpg
1030,../../../Images/test/No Blur/Sedan_test_orig_t...,0.126454,0.336609,0.230368,0.000000,0.000000,0.000000,1.666591,0.000000,0.182742,...,0.196631,0.101043,0.000000,0.081184,0.000000,0.752433,0.423842,0.000000,1,Sedan_test_orig_test_07328_resized.jpg


## Merge DFs

In [11]:
# Merge all dataframes on harmonized file name
all_features = (vit_embeddings#.merge(hog_16_ppc_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              .merge(hog_24_ppc_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              .merge(HSV_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              .merge(LBP_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              .merge(vgg_vectors, on=['harmonized_filename', 'test_80_20'], how='inner')
                              )
all_features

Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,0.019599,0.631864,0.040191,0.000000,0.000000,0.000000,0.098337,0.000000,0.550390,0.000368
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.009478,0.577905,0.050671,0.000000,0.000000,0.181632,0.077526,0.056239,0.071139,0.000000
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.142523,0.609230,0.356170,0.000000,0.004893,0.068126,0.000000,0.000000,0.496660,0.000000
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.030089,0.077514,0.183065,0.000000,0.000000,0.289010,0.286223,0.002977,0.412420,0.000000
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,0.249159,2.016973,0.275861,0.083213,0.000000,0.418786,0.430101,0.094569,1.051647,0.781257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,0.079488,0.139404,0.009357,0.004506,0.000000,0.016683,0.104344,0.048166,0.127519,0.239549
8248,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,0.215540,0.154493,0.152157,0.000000,0.013115,0.003288,0.586821,0.098766,0.528439,0.161134
8249,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,0.130971,0.262393,0.138978,0.026820,0.000000,0.110718,0.022224,0.280968,0.107825,0.088039
8250,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,0.097665,0.098011,0.196631,0.101043,0.000000,0.081184,0.000000,0.752433,0.423842,0.000000


In [12]:
# Check harmonized_filename column
all_features['harmonized_filename'].iloc[0]

'Sedan_train_orig_test_01516_resized.jpg'

## Add Class Labels and Use Corrections

In [13]:
# Correct class for items in train dataset
# Load "relabeled_train_no_blur_old_and_new_labels.xlsx"
relabeled_train_no_blur_old_and_new_labels = pd.read_excel('../../Data/Relabeled_Train_No_Blur/relabeled_train_no_blur_old_and_new_labels.xlsx')
relabeled_train_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_train_orig_test_00002_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00002_resized.jpg
1,Convertible_train_orig_test_00037_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00037_resized.jpg
2,Convertible_train_orig_test_00060_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00060_resized.jpg
3,Convertible_train_orig_test_00087_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00087_resized.jpg
4,Convertible_train_orig_test_00112_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00112_resized.jpg
...,...,...,...,...,...
6591,Sedan_train_orig_train_08136_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08136_resized.jpg
6592,Sedan_train_orig_train_08137_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08137_resized.jpg
6593,Sedan_train_orig_train_08138_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08138_resized.jpg
6594,Sedan_train_orig_train_08139_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_train_orig_train_08139_resized.jpg


In [14]:
# Correct class for items in test dataset
# Load "relabeled_test_no_blur_old_and_new_labels.xlsx"
relabeled_test_no_blur_old_and_new_labels = pd.read_excel('../../Data/Relabeled_Test_No_Blur/relabeled_test_no_blur_old_and_new_labels.xlsx')
relabeled_test_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_test_orig_test_00023_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00023_resized.jpg
1,Convertible_test_orig_test_00096_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00096_resized.jpg
2,Convertible_test_orig_test_00107_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00107_resized.jpg
3,Convertible_test_orig_test_00135_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00135_resized.jpg
4,Convertible_test_orig_test_00147_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00147_resized.jpg
...,...,...,...,...,...
1651,Sedan_test_orig_train_08026_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_test_orig_train_08026_resized.jpg
1652,Sedan_test_orig_train_08043_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08043_resized.jpg
1653,Sedan_test_orig_train_08046_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08046_resized.jpg
1654,Sedan_test_orig_train_08098_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08098_resized.jpg


In [15]:
# Stack train and test relabeled dataframes
# Keep limited columns
relabeled = pd.concat([relabeled_train_no_blur_old_and_new_labels, relabeled_test_no_blur_old_and_new_labels])[['harmonized_filename', 'New Class', 'Old Class']]
relabeled

Unnamed: 0,harmonized_filename,New Class,Old Class
0,Convertible_train_orig_test_00002_resized.jpg,Convertible,Convertible
1,Convertible_train_orig_test_00037_resized.jpg,Convertible,Convertible
2,Convertible_train_orig_test_00060_resized.jpg,Convertible,Convertible
3,Convertible_train_orig_test_00087_resized.jpg,Convertible,Convertible
4,Convertible_train_orig_test_00112_resized.jpg,Convertible,Convertible
...,...,...,...
1651,Sedan_test_orig_train_08026_resized.jpg,Ambiguous,Sedan
1652,Sedan_test_orig_train_08043_resized.jpg,Sedan,Sedan
1653,Sedan_test_orig_train_08046_resized.jpg,Sedan,Sedan
1654,Sedan_test_orig_train_08098_resized.jpg,Sedan,Sedan


In [16]:
# Merge on 'filename'
all_features = all_features.merge(relabeled, on='harmonized_filename', how='left')
all_features

Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,New Class,Old Class
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,0.040191,0.000000,0.000000,0.000000,0.098337,0.000000,0.550390,0.000368,Sedan,Sedan
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.050671,0.000000,0.000000,0.181632,0.077526,0.056239,0.071139,0.000000,SUV,SUV
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.356170,0.000000,0.004893,0.068126,0.000000,0.000000,0.496660,0.000000,Convertible,Convertible
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.183065,0.000000,0.000000,0.289010,0.286223,0.002977,0.412420,0.000000,Pickup,Pickup
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,0.275861,0.083213,0.000000,0.418786,0.430101,0.094569,1.051647,0.781257,SUV,SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,0.009357,0.004506,0.000000,0.016683,0.104344,0.048166,0.127519,0.239549,Ambiguous,Sedan
8248,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,0.152157,0.000000,0.013115,0.003288,0.586821,0.098766,0.528439,0.161134,SUV,SUV
8249,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,0.138978,0.026820,0.000000,0.110718,0.022224,0.280968,0.107825,0.088039,Ambiguous,Sedan
8250,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,0.196631,0.101043,0.000000,0.081184,0.000000,0.752433,0.423842,0.000000,Sedan,Sedan


### Create Class

In [17]:
# Value counts of 'New Class'
all_features['New Class'].value_counts()

New Class
Sedan                    3114
SUV                      2057
Convertible              1419
Pickup                    902
Ambiguous                 754
Clearly None of the 4       6
Name: count, dtype: int64

In [18]:
# Compare 'New Class' and 'Old Class' columns
old_v_new_class = all_features[['Old Class', 'New Class']].value_counts().reset_index().sort_values(by=['Old Class', 'New Class'])
old_v_new_class

Unnamed: 0,Old Class,New Class,count
7,Convertible,Ambiguous,86
2,Convertible,Convertible,1417
9,Convertible,Sedan,11
8,Pickup,Ambiguous,78
3,Pickup,Pickup,799
5,SUV,Ambiguous,245
6,SUV,Pickup,101
1,SUV,SUV,2057
10,SUV,Sedan,6
4,Sedan,Ambiguous,345


In [19]:
# Set 'Class' to 'New Class'
all_features['Class'] = all_features['New Class']
# Keep cases where 'New Class' is in 'SUV', 'Sedan', 'Pickup', 'Convertible'
all_features = all_features[all_features['Class'].isin(['SUV', 'Sedan', 'Pickup', 'Convertible'])]
# Drop 'New Class' and 'Old Class' columns
all_features.drop(columns=['New Class', 'Old Class'], inplace=True)
all_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features.drop(columns=['New Class', 'Old Class'], inplace=True)


Unnamed: 0,Image Path Vision Transformer,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,Class
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,0.631864,0.040191,0.000000,0.000000,0.000000,0.098337,0.000000,0.550390,0.000368,Sedan
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.577905,0.050671,0.000000,0.000000,0.181632,0.077526,0.056239,0.071139,0.000000,SUV
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.609230,0.356170,0.000000,0.004893,0.068126,0.000000,0.000000,0.496660,0.000000,Convertible
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.077514,0.183065,0.000000,0.000000,0.289010,0.286223,0.002977,0.412420,0.000000,Pickup
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,2.016973,0.275861,0.083213,0.000000,0.418786,0.430101,0.094569,1.051647,0.781257,SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8244,../../../Images/test/No Blur/Sedan_test_orig_t...,0.000908,0.291246,-0.122745,-0.338141,-0.133159,0.065774,0.116908,-0.228449,-0.484830,...,0.866147,0.638139,0.000000,0.000000,0.030869,0.038794,0.592077,1.137420,0.000000,Sedan
8245,../../../Images/test/No Blur/SUV_test_orig_tra...,-0.389516,0.184974,0.117843,-0.140552,-0.100692,0.376913,-0.010999,0.189513,-0.116804,...,0.087451,0.109797,0.000000,0.000000,0.186960,0.004254,0.219011,0.021491,0.000000,SUV
8248,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,0.154493,0.152157,0.000000,0.013115,0.003288,0.586821,0.098766,0.528439,0.161134,SUV
8250,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,0.098011,0.196631,0.101043,0.000000,0.081184,0.000000,0.752433,0.423842,0.000000,Sedan


In [20]:
# Value counts for 'Class'
all_features['Class'].value_counts()

Class
Sedan          3114
SUV            2057
Convertible    1419
Pickup          902
Name: count, dtype: int64

In [21]:
# Value counts of 'test_80_20'
all_features['test_80_20'].value_counts()

test_80_20
0    6003
1    1489
Name: count, dtype: int64

## Clean Up Columns

In [22]:
# Get image path columns
image_path_columns = [col for col in all_features.columns if 'Image Path' in col]
# Split off and delete Image Path columns
image_path_df = all_features[['harmonized_filename'] + image_path_columns]
image_path_df.to_excel('../../Data/Features/All Features Image Paths/All_Features_Image_Paths.xlsx', index=False)
all_features.drop(columns=image_path_columns, inplace=True)
# Reorder columns
front_cols = ['Class', 'harmonized_filename', 'test_80_20']
all_features = all_features[front_cols + [col for col in all_features.columns if col not in front_cols]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features.drop(columns=image_path_columns, inplace=True)


Unnamed: 0,Class,harmonized_filename,test_80_20,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,Sedan,Sedan_train_orig_test_01516_resized.jpg,0,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,...,0.019599,0.631864,0.040191,0.000000,0.000000,0.000000,0.098337,0.000000,0.550390,0.000368
1,SUV,SUV_train_orig_train_00294_resized.jpg,0,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,...,0.009478,0.577905,0.050671,0.000000,0.000000,0.181632,0.077526,0.056239,0.071139,0.000000
2,Convertible,Convertible_train_orig_train_04236_resized.jpg,0,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,...,0.142523,0.609230,0.356170,0.000000,0.004893,0.068126,0.000000,0.000000,0.496660,0.000000
3,Pickup,Pickup_train_orig_train_03906_resized.jpg,0,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,...,0.030089,0.077514,0.183065,0.000000,0.000000,0.289010,0.286223,0.002977,0.412420,0.000000
4,SUV,SUV_train_orig_test_01344_resized.jpg,0,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,...,0.249159,2.016973,0.275861,0.083213,0.000000,0.418786,0.430101,0.094569,1.051647,0.781257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8244,Sedan,Sedan_test_orig_train_03968_resized.jpg,1,0.000908,0.291246,-0.122745,-0.338141,-0.133159,0.065774,0.116908,...,0.064810,0.866147,0.638139,0.000000,0.000000,0.030869,0.038794,0.592077,1.137420,0.000000
8245,SUV,SUV_test_orig_train_00034_resized.jpg,1,-0.389516,0.184974,0.117843,-0.140552,-0.100692,0.376913,-0.010999,...,0.054488,0.087451,0.109797,0.000000,0.000000,0.186960,0.004254,0.219011,0.021491,0.000000
8248,SUV,SUV_test_orig_test_00579_resized.jpg,1,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,...,0.215540,0.154493,0.152157,0.000000,0.013115,0.003288,0.586821,0.098766,0.528439,0.161134
8250,Sedan,Sedan_test_orig_test_07328_resized.jpg,1,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,...,0.097665,0.098011,0.196631,0.101043,0.000000,0.081184,0.000000,0.752433,0.423842,0.000000


## Add Blur and No-Blur Paths

In [23]:
def construct_path(train_test, blur_no_blur, harmonized_filename):
    return '../../../Images/' + train_test + '/' + blur_no_blur + '/' + harmonized_filename

# Add image_path_blur column
all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
# Add image_path_no_blur column
all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)
# Reorder columns - Class, harmonized_filename, test_80_20, image_path_blur, image_path_no_blur, all other columns
front_cols = front_cols + ['image_path_blur', 'image_path_no_blur']
all_features = all_features[front_cols + [col for col in all_features.columns if col not in front_cols]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)


Unnamed: 0,Class,harmonized_filename,test_80_20,image_path_blur,image_path_no_blur,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,Sedan,Sedan_train_orig_test_01516_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,...,0.019599,0.631864,0.040191,0.000000,0.000000,0.000000,0.098337,0.000000,0.550390,0.000368
1,SUV,SUV_train_orig_train_00294_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,...,0.009478,0.577905,0.050671,0.000000,0.000000,0.181632,0.077526,0.056239,0.071139,0.000000
2,Convertible,Convertible_train_orig_train_04236_resized.jpg,0,../../../Images/train/Blurred/Convertible_trai...,../../../Images/train/No_Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,...,0.142523,0.609230,0.356170,0.000000,0.004893,0.068126,0.000000,0.000000,0.496660,0.000000
3,Pickup,Pickup_train_orig_train_03906_resized.jpg,0,../../../Images/train/Blurred/Pickup_train_ori...,../../../Images/train/No_Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,...,0.030089,0.077514,0.183065,0.000000,0.000000,0.289010,0.286223,0.002977,0.412420,0.000000
4,SUV,SUV_train_orig_test_01344_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,...,0.249159,2.016973,0.275861,0.083213,0.000000,0.418786,0.430101,0.094569,1.051647,0.781257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8244,Sedan,Sedan_test_orig_train_03968_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,0.000908,0.291246,-0.122745,-0.338141,-0.133159,...,0.064810,0.866147,0.638139,0.000000,0.000000,0.030869,0.038794,0.592077,1.137420,0.000000
8245,SUV,SUV_test_orig_train_00034_resized.jpg,1,../../../Images/test/Blurred/SUV_test_orig_tra...,../../../Images/test/No_Blur/SUV_test_orig_tra...,-0.389516,0.184974,0.117843,-0.140552,-0.100692,...,0.054488,0.087451,0.109797,0.000000,0.000000,0.186960,0.004254,0.219011,0.021491,0.000000
8248,SUV,SUV_test_orig_test_00579_resized.jpg,1,../../../Images/test/Blurred/SUV_test_orig_tes...,../../../Images/test/No_Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,...,0.215540,0.154493,0.152157,0.000000,0.013115,0.003288,0.586821,0.098766,0.528439,0.161134
8250,Sedan,Sedan_test_orig_test_07328_resized.jpg,1,../../../Images/test/Blurred/Sedan_test_orig_t...,../../../Images/test/No_Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,...,0.097665,0.098011,0.196631,0.101043,0.000000,0.081184,0.000000,0.752433,0.423842,0.000000


In [24]:
# Print all columns in all_features
for col in all_features.columns:
    print(col)

Class
harmonized_filename
test_80_20
image_path_blur
image_path_no_blur
ViT_Embedding_Element_0
ViT_Embedding_Element_1
ViT_Embedding_Element_2
ViT_Embedding_Element_3
ViT_Embedding_Element_4
ViT_Embedding_Element_5
ViT_Embedding_Element_6
ViT_Embedding_Element_7
ViT_Embedding_Element_8
ViT_Embedding_Element_9
ViT_Embedding_Element_10
ViT_Embedding_Element_11
ViT_Embedding_Element_12
ViT_Embedding_Element_13
ViT_Embedding_Element_14
ViT_Embedding_Element_15
ViT_Embedding_Element_16
ViT_Embedding_Element_17
ViT_Embedding_Element_18
ViT_Embedding_Element_19
ViT_Embedding_Element_20
ViT_Embedding_Element_21
ViT_Embedding_Element_22
ViT_Embedding_Element_23
ViT_Embedding_Element_24
ViT_Embedding_Element_25
ViT_Embedding_Element_26
ViT_Embedding_Element_27
ViT_Embedding_Element_28
ViT_Embedding_Element_29
ViT_Embedding_Element_30
ViT_Embedding_Element_31
ViT_Embedding_Element_32
ViT_Embedding_Element_33
ViT_Embedding_Element_34
ViT_Embedding_Element_35
ViT_Embedding_Element_36
ViT_Embedding

## Output Training and Testing Dataframes

In [25]:
# Split df into train and test
all_features_train = all_features[all_features['test_80_20'] == 0]
all_features_test = all_features[all_features['test_80_20'] == 1]

# Drop test_80_20 column
all_features_train.drop(columns='test_80_20', inplace=True)
all_features_test.drop(columns='test_80_20', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_train.drop(columns='test_80_20', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_test.drop(columns='test_80_20', inplace=True)


In [26]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.
    '''
    # Tracking total length of pieces
    total_len_pieces = 0
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    for i in range(num_pieces):
        # start index for piece rows
        start_index = i * len(df) // num_pieces
        # end index for piece rows
        end_index = (i + 1) * len(df) // num_pieces
        # get piece
        piece = df[start_index:end_index]
        piece.to_parquet(out_folder + '/' + dataset_name + '_piece_' + str(i) + '.parquet', index=False)
        #print(len(piece))
        total_len_pieces += len(piece)
    
    # check total piece length and length of vit_embeddings_df
    print('length check passed')
    print(total_len_pieces == len(df))

In [27]:
# Run on all_features dataframes
split_df(all_features_train, 'all_features_train', '../../Data/Features/All Features/train', 16)
split_df(all_features_test, 'all_features_test', '../../Data/Features/All Features/test', 16)

length check passed
True
length check passed
True


## Output 100 row sample to Excel

In [28]:
all_features_train.sample(100).to_excel('../../Data/Features/All Features Train Sample/all_features_train_sample.xlsx', index=False)