# Combine All Features

Load parquet files of features, stack as needed, then merge them into a single dataframe.

In [22]:
# Packages
import os
import re
import shutil

import pandas as pd

## Generalized Function to Load and Combine All DFs in a Directory

In [23]:
def _natural_sort_key(filename):
    '''
    Sort key that compares runs of digits numerically, so that
    'x_piece_2.parquet' is ordered before 'x_piece_10.parquet'.
    '''
    # re.split with a capturing group always alternates text, digits, text, ...
    # so odd positions are digit runs; keys therefore compare str-to-str and int-to-int.
    parts = re.split(r'([0-9]+)', filename)
    return [int(part) if position % 2 else part for position, part in enumerate(parts)]


def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.

    Parameters
    ----------
    directory_path : str
        Directory to scan (non-recursively) for files whose names end in '.parquet'.
        A trailing slash is optional.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every parquet file, taken in natural filename order.
        Each file keeps its own index, so index labels may repeat across files.

    Raises
    ------
    ValueError
        If the directory contains no '.parquet' files.
    '''
    # os.listdir returns names in arbitrary order; sort so the row order is reproducible
    file_list = sorted(
        (f for f in os.listdir(directory_path) if f.endswith('.parquet')),
        key=_natural_sort_key,
    )
    # Fail with a message that names the directory instead of pandas' generic
    # "No objects to concatenate"
    if not file_list:
        raise ValueError('No parquet files found in ' + repr(directory_path))
    # read in all parquet files and stack them
    return pd.concat([pd.read_parquet(os.path.join(directory_path, f)) for f in file_list])

## Load Features

In [24]:
# Vision Transformer Embeddings: stack every parquet piece in the feature folder
vit_features_dir = '../../../Data/Features/Vision Transformer'
vit_embeddings = combine_directory_parquets(vit_features_dir)
vit_embeddings

Unnamed: 0,Image Path,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,ViT_Embedding_Element_1271,ViT_Embedding_Element_1272,ViT_Embedding_Element_1273,ViT_Embedding_Element_1274,ViT_Embedding_Element_1275,ViT_Embedding_Element_1276,ViT_Embedding_Element_1277,ViT_Embedding_Element_1278,ViT_Embedding_Element_1279,test_80_20
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,0.231262,-0.059224,0.173239,0.363462,0.457626,-0.077351,-0.236950,-0.031632,-0.261893,0
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.261749,0.215697,0.023538,0.393746,0.455197,0.223018,-0.265846,-0.200683,-0.405006,0
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.010302,0.111003,0.158716,0.380261,0.493224,0.169883,-0.105756,0.124275,-0.446003,0
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.256229,0.048522,-0.032885,0.400770,0.430547,0.214644,-0.323948,-0.276459,-0.414079,0
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,0.034098,-0.089879,0.237541,0.321542,0.343445,0.047995,-0.305656,0.091199,-0.370617,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,0.027900,0.023480,0.073889,0.333791,0.452840,0.038384,-0.209088,-0.219002,-0.133379,1
1028,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,0.013969,-0.112296,0.220512,0.196491,0.279989,0.168716,-0.193009,-0.030066,-0.041473,1
1029,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,0.072253,0.052621,0.168497,0.294833,0.398839,-0.061976,-0.102406,-0.211565,-0.086189,1
1030,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,0.009759,-0.054918,0.249961,0.387619,0.443451,0.075971,-0.162953,-0.078282,-0.177154,1


## Merge DFs

In [25]:
# Only one feature set exists so far, so there is nothing to merge yet.
# Copy rather than alias: later cells add columns to all_features, and a plain
# assignment would silently add them to vit_embeddings as well.
all_features = vit_embeddings.copy()

## Add Class Labels and Use Corrections

In [26]:
# Derive the file name and class label from 'Image Path'
# File name: everything after the final '/'
all_features['filename'] = all_features['Image Path'].map(lambda path: path.rsplit('/', 1)[-1])
# Class: everything before the first '_' of the file name
all_features['Class'] = all_features['filename'].map(lambda name: name.partition('_')[0])

In [27]:
# TODO: Correct class for items in train dataset (not implemented yet; train rows keep the class parsed from the filename)

In [28]:
# Correct class for items in test dataset
# Read the spreadsheet that holds both the old and the corrected labels
relabeled_labels_path = '../../../Data/Relabeled_Test_No_Blur/relabeled_test_no_blur_old_and_new_labels.xlsx'
relabeled_test_no_blur_old_and_new_labels = pd.read_excel(relabeled_labels_path)
relabeled_test_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class
0,Convertible_test_orig_test_00023_resized_no_bl...,2,Convertible,Convertible
1,Convertible_test_orig_test_00096_resized_no_bl...,2,Convertible,Convertible
2,Convertible_test_orig_test_00107_resized_no_bl...,2,Convertible,Convertible
3,Convertible_test_orig_test_00135_resized_no_bl...,2,Convertible,Convertible
4,Convertible_test_orig_test_00147_resized_no_bl...,2,Convertible,Convertible
...,...,...,...,...
1651,Sedan_test_orig_train_08026_resized_no_blur.jpg,5,Ambiguous,Sedan
1652,Sedan_test_orig_train_08043_resized_no_blur.jpg,3,Sedan,Sedan
1653,Sedan_test_orig_train_08046_resized_no_blur.jpg,3,Sedan,Sedan
1654,Sedan_test_orig_train_08098_resized_no_blur.jpg,3,Sedan,Sedan


In [29]:
# Apply the relabeling in one chain:
#   1. left-join the corrected labels on 'filename' (rows without a match get NaN)
#   2. take 'New Class' where present, otherwise keep the filename-derived Class
#   3. drop the helper columns that were only needed for the join
all_features = (
    all_features
    .merge(relabeled_test_no_blur_old_and_new_labels, how='left', on='filename')
    .assign(Class=lambda merged: merged['New Class'].fillna(merged['Class']))
    .drop(columns=['label', 'New Class', 'filename', 'Old Class'])
)
all_features

Unnamed: 0,Image Path,ViT_Embedding_Element_0,ViT_Embedding_Element_1,ViT_Embedding_Element_2,ViT_Embedding_Element_3,ViT_Embedding_Element_4,ViT_Embedding_Element_5,ViT_Embedding_Element_6,ViT_Embedding_Element_7,ViT_Embedding_Element_8,...,ViT_Embedding_Element_1272,ViT_Embedding_Element_1273,ViT_Embedding_Element_1274,ViT_Embedding_Element_1275,ViT_Embedding_Element_1276,ViT_Embedding_Element_1277,ViT_Embedding_Element_1278,ViT_Embedding_Element_1279,test_80_20,Class
0,../../../Images/train/No Blur/Sedan_train_orig...,-0.244975,0.085045,-0.117774,-0.399902,0.034330,0.196966,-0.172954,-0.030475,-0.433943,...,-0.059224,0.173239,0.363462,0.457626,-0.077351,-0.236950,-0.031632,-0.261893,0,Sedan
1,../../../Images/train/No Blur/SUV_train_orig_t...,-0.311410,0.080667,0.144856,-0.376064,-0.551460,0.375702,-0.282547,0.083350,-0.309041,...,0.215697,0.023538,0.393746,0.455197,0.223018,-0.265846,-0.200683,-0.405006,0,SUV
2,../../../Images/train/No Blur/Convertible_trai...,-0.048161,0.171966,-0.329507,-0.446328,-0.108216,0.354181,0.223273,-0.240402,-0.386665,...,0.111003,0.158716,0.380261,0.493224,0.169883,-0.105756,0.124275,-0.446003,0,Convertible
3,../../../Images/train/No Blur/Pickup_train_ori...,-0.291613,0.124919,0.101113,-0.145411,-0.197138,0.369833,-0.114621,0.203537,-0.129010,...,0.048522,-0.032885,0.400770,0.430547,0.214644,-0.323948,-0.276459,-0.414079,0,Pickup
4,../../../Images/train/No Blur/SUV_train_orig_t...,-0.288975,0.369587,-0.056602,-0.335338,0.238191,0.122485,-0.092619,-0.128675,-0.206104,...,-0.089879,0.237541,0.321542,0.343445,0.047995,-0.305656,0.091199,-0.370617,0,SUV
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/test/No Blur/Sedan_test_orig_t...,0.002720,0.064492,-0.028643,-0.436636,-0.030246,0.498974,-0.132503,0.068092,-0.512507,...,0.023480,0.073889,0.333791,0.452840,0.038384,-0.209088,-0.219002,-0.133379,1,Ambiguous
8248,../../../Images/test/No Blur/SUV_test_orig_tes...,-0.179484,0.125944,-0.084191,-0.555147,-0.085796,0.493499,-0.062326,-0.075578,-0.409936,...,-0.112296,0.220512,0.196491,0.279989,0.168716,-0.193009,-0.030066,-0.041473,1,SUV
8249,../../../Images/test/No Blur/Sedan_test_orig_t...,0.057125,0.207224,-0.054070,-0.465893,0.019294,0.266582,-0.084520,0.011075,-0.483120,...,0.052621,0.168497,0.294833,0.398839,-0.061976,-0.102406,-0.211565,-0.086189,1,Ambiguous
8250,../../../Images/test/No Blur/Sedan_test_orig_t...,-0.037063,0.001804,-0.052823,-0.423519,-0.096491,0.231060,0.051790,-0.220513,-0.374592,...,-0.054918,0.249961,0.387619,0.443451,0.075971,-0.162953,-0.078282,-0.177154,1,Sedan


## Output Training and Testing Dataframes

In [30]:
# Partition rows on the 'test_80_20' flag (0 goes to train, 1 goes to test)
all_features_train = all_features.loc[all_features['test_80_20'].eq(0)]
all_features_test = all_features.loc[all_features['test_80_20'].eq(1)]

In [31]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.

    WARNING: everything already inside out_folder (files, links and
    sub-directories) is deleted before the new pieces are written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Rows are divided by position into contiguous pieces.
    dataset_name : str
        Prefix of the output files: '<dataset_name>_piece_<i>.parquet'.
    out_folder : str
        Destination folder. Created if it does not exist.
    num_pieces : int
        Number of pieces to write; must be at least 1.

    Raises
    ------
    ValueError
        If num_pieces is less than 1. Raised before anything is deleted.
    '''
    # Validate first so a bad argument cannot wipe the folder and leave it empty
    if num_pieces < 1:
        raise ValueError('num_pieces must be at least 1, got %r' % (num_pieces,))
    # First run on a fresh checkout: the folder may not exist yet
    os.makedirs(out_folder, exist_ok=True)
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            # Best effort: report the failure and keep going with the remaining entries
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    total_rows = len(df)
    # Tracking total length of pieces
    total_len_pieces = 0
    # Save pieces
    for i in range(num_pieces):
        # Integer boundaries i*N//k .. (i+1)*N//k tile the rows with no gaps or overlap
        start_index = i * total_rows // num_pieces
        end_index = (i + 1) * total_rows // num_pieces
        # Positional slice, independent of the index labels
        piece = df.iloc[start_index:end_index]
        piece_path = os.path.join(out_folder, dataset_name + '_piece_' + str(i) + '.parquet')
        # index=False: the original index is not stored in the piece
        piece.to_parquet(piece_path, index=False)
        total_len_pieces += len(piece)

    # Check that the pieces together hold exactly as many rows as df,
    # and only report success when that is actually true
    lengths_match = total_len_pieces == total_rows
    print('length check passed' if lengths_match else 'length check FAILED')
    print(lengths_match)

In [32]:
# Write the train and test frames as 16 parquet pieces each
split_df(
    df=all_features_train,
    dataset_name='all_features_train',
    out_folder='../../../Data/Features/All Features/train',
    num_pieces=16,
)
split_df(
    df=all_features_test,
    dataset_name='all_features_test',
    out_folder='../../../Data/Features/All Features/test',
    num_pieces=16,
)

length check passed
True
length check passed
True
