# Combine All Features

Load parquet files of features, stack as needed, then merge them into a single dataframe.

In [1]:
# Packages
import os
import pandas as pd
import shutil

## Generalized Function to Load and Combine All DFs in a Directory

In [2]:
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.
    '''
    # If path does not end in a slash, add one
    if directory_path[-1] != '/':
        directory_path += '/'
    # list of files in directory
    file_list = [f for f in os.listdir(directory_path) if f.endswith('.parquet')]
    # read in all parquet files
    combined_df = pd.concat([pd.read_parquet(directory_path + f) for f in file_list])
    # Return combined dataframe
    return combined_df

## Function to create harmonized Image file name

In [3]:
def create_harmonized_filename(file_path):
    '''
    Strip Blur vs. No Blur and other data from file path.
    '''
    # Split path by '/' and take last element
    harmonized_filename = file_path.split('/')[-1]
    # Remove _no_blur and _blurred from file name
    harmonized_filename = harmonized_filename.replace('_no_blur', '')
    harmonized_filename = harmonized_filename.replace('_blurred', '')
    # Return harmonized file name
    return harmonized_filename

In [4]:
def prep_dataset_for_merge(df, feature_name):
    '''
    Prepares dataset for merging.
    '''
    # Create new column for harmonized file name
    df['harmonized_filename'] = df['Image Path'].apply(create_harmonized_filename)
    # Rename Image Path to Image Path + feature name
    df.rename(columns={'Image Path': 'Image Path ' + feature_name}, inplace=True)
    # Return dataset
    return df

## Load Features

In [5]:
# VGG Vectors
vgg_vectors = combine_directory_parquets('../../Data/Features/VGG')
# Prep dataset for merging
vgg_vectors = prep_dataset_for_merge(vgg_vectors, 'VGG')
vgg_vectors

Unnamed: 0,Image Path VGG,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,VGG_Embedding_Element_5,VGG_Embedding_Element_6,VGG_Embedding_Element_7,VGG_Embedding_Element_8,...,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,test_80_20,harmonized_filename
0,../../../Images/train/No Blur/Sedan_train_orig...,0.057273,0.085857,0.014088,0.000000,0.000000,0.000000,1.265887,0.025079,0.020789,...,0.288115,0.000000,0.005827,0.039591,0.000000,0.063602,0.365371,0.000000,0,Sedan_train_orig_train_04172_resized.jpg
1,../../../Images/train/No Blur/Convertible_trai...,0.293713,0.714101,0.075956,0.144106,0.011360,0.000000,2.309738,0.094079,0.409755,...,0.154690,0.000000,0.000000,0.037017,0.350120,0.543494,0.100478,0.000000,0,Convertible_train_orig_test_01764_resized.jpg
2,../../../Images/train/No Blur/SUV_train_orig_t...,0.147546,0.793779,0.000000,0.014114,0.237972,0.191854,1.408566,0.000000,0.314723,...,0.115495,0.099091,0.000000,0.019142,0.110762,0.750666,0.198542,0.275273,0,SUV_train_orig_test_07060_resized.jpg
3,../../../Images/train/No Blur/Sedan_train_orig...,0.043017,1.089323,0.000000,0.004603,0.012446,0.000000,0.802531,0.000000,0.444981,...,0.072416,0.164781,0.000000,0.063925,0.189980,0.426336,0.031850,0.021951,0,Sedan_train_orig_test_07195_resized.jpg
4,../../../Images/train/No Blur/Sedan_train_orig...,0.027134,0.049786,0.030874,0.000000,0.003062,0.000000,1.098564,0.000000,0.005797,...,0.791576,0.000000,0.000000,0.017989,0.000000,0.532629,0.047045,0.000000,0,Sedan_train_orig_test_04947_resized.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,../../../Images/train/No Blur/Sedan_train_orig...,0.000000,0.355245,0.014701,0.000000,0.063166,0.000000,0.168762,0.068016,0.209514,...,0.068890,0.000000,0.013373,0.039992,0.152950,0.097856,0.072973,0.033932,0,Sedan_train_orig_train_00112_resized.jpg
1028,../../../Images/train/No Blur/Pickup_train_ori...,0.114592,0.210919,0.000000,0.000000,0.031758,0.000000,0.007828,0.000000,0.209584,...,0.454710,0.000000,0.000000,0.519490,0.037494,0.018614,1.264154,0.019924,0,Pickup_train_orig_test_01147_resized.jpg
1029,../../../Images/train/No Blur/SUV_train_orig_t...,0.084685,1.241065,0.000000,0.000000,0.025655,0.000000,1.467685,0.000000,0.184841,...,0.032092,0.000000,0.000000,0.476425,0.031688,0.863446,0.086722,0.031863,0,SUV_train_orig_test_08000_resized.jpg
1030,../../../Images/train/No Blur/Sedan_train_orig...,0.532457,0.549263,0.000000,0.000000,0.000000,0.020113,0.588399,0.000000,0.198008,...,0.033320,0.000000,0.000000,0.099895,0.018707,0.320334,0.286392,0.000000,0,Sedan_train_orig_test_01132_resized.jpg


In [6]:
# Check harmonized_filename column
vgg_vectors['harmonized_filename'].iloc[0]

'Sedan_train_orig_train_04172_resized.jpg'

## Add Class Labels and Use Corrections

In [7]:
# Correct class for items in train dataset
# Load "relabeled_train_no_blur_old_and_new_labels.xlsx"
relabeled_train_no_blur_old_and_new_labels = pd.read_excel('../../Data/Relabeled_Train_No_Blur/relabeled_train_no_blur_old_and_new_labels.xlsx')
relabeled_train_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_train_orig_test_00002_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00002_resized.jpg
1,Convertible_train_orig_test_00037_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00037_resized.jpg
2,Convertible_train_orig_test_00060_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00060_resized.jpg
3,Convertible_train_orig_test_00087_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00087_resized.jpg
4,Convertible_train_orig_test_00112_resized_no_b...,2,Convertible,Convertible,Convertible_train_orig_test_00112_resized.jpg
...,...,...,...,...,...
6591,Sedan_train_orig_train_08136_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08136_resized.jpg
6592,Sedan_train_orig_train_08137_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08137_resized.jpg
6593,Sedan_train_orig_train_08138_resized_no_blur.jpg,1,Sedan,Sedan,Sedan_train_orig_train_08138_resized.jpg
6594,Sedan_train_orig_train_08139_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_train_orig_train_08139_resized.jpg


In [8]:
# Correct class for items in test dataset
# Load "relabeled_test_no_blur_old_and_new_labels.xlsx"
relabeled_test_no_blur_old_and_new_labels = pd.read_excel('../../Data/Relabeled_Test_No_Blur/relabeled_test_no_blur_old_and_new_labels.xlsx')
relabeled_test_no_blur_old_and_new_labels

Unnamed: 0,filename,label,New Class,Old Class,harmonized_filename
0,Convertible_test_orig_test_00023_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00023_resized.jpg
1,Convertible_test_orig_test_00096_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00096_resized.jpg
2,Convertible_test_orig_test_00107_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00107_resized.jpg
3,Convertible_test_orig_test_00135_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00135_resized.jpg
4,Convertible_test_orig_test_00147_resized_no_bl...,2,Convertible,Convertible,Convertible_test_orig_test_00147_resized.jpg
...,...,...,...,...,...
1651,Sedan_test_orig_train_08026_resized_no_blur.jpg,5,Ambiguous,Sedan,Sedan_test_orig_train_08026_resized.jpg
1652,Sedan_test_orig_train_08043_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08043_resized.jpg
1653,Sedan_test_orig_train_08046_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08046_resized.jpg
1654,Sedan_test_orig_train_08098_resized_no_blur.jpg,3,Sedan,Sedan,Sedan_test_orig_train_08098_resized.jpg


In [9]:
# Stack train and test relabeled dataframes
# Keep limited columns
relabeled = pd.concat([relabeled_train_no_blur_old_and_new_labels, relabeled_test_no_blur_old_and_new_labels])[['harmonized_filename', 'New Class', 'Old Class']]
relabeled

Unnamed: 0,harmonized_filename,New Class,Old Class
0,Convertible_train_orig_test_00002_resized.jpg,Convertible,Convertible
1,Convertible_train_orig_test_00037_resized.jpg,Convertible,Convertible
2,Convertible_train_orig_test_00060_resized.jpg,Convertible,Convertible
3,Convertible_train_orig_test_00087_resized.jpg,Convertible,Convertible
4,Convertible_train_orig_test_00112_resized.jpg,Convertible,Convertible
...,...,...,...
1651,Sedan_test_orig_train_08026_resized.jpg,Ambiguous,Sedan
1652,Sedan_test_orig_train_08043_resized.jpg,Sedan,Sedan
1653,Sedan_test_orig_train_08046_resized.jpg,Sedan,Sedan
1654,Sedan_test_orig_train_08098_resized.jpg,Sedan,Sedan


In [22]:
# Merge on 'filename'
all_features = vgg_vectors.copy()
all_features = all_features.merge(relabeled, on='harmonized_filename', how='left')
all_features

Unnamed: 0,Image Path VGG,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,VGG_Embedding_Element_5,VGG_Embedding_Element_6,VGG_Embedding_Element_7,VGG_Embedding_Element_8,...,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,test_80_20,harmonized_filename,New Class,Old Class
0,../../../Images/train/No Blur/Sedan_train_orig...,0.057273,0.085857,0.014088,0.000000,0.000000,0.000000,1.265887,0.025079,0.020789,...,0.005827,0.039591,0.000000,0.063602,0.365371,0.000000,0,Sedan_train_orig_train_04172_resized.jpg,Sedan,Sedan
1,../../../Images/train/No Blur/Convertible_trai...,0.293713,0.714101,0.075956,0.144106,0.011360,0.000000,2.309738,0.094079,0.409755,...,0.000000,0.037017,0.350120,0.543494,0.100478,0.000000,0,Convertible_train_orig_test_01764_resized.jpg,Convertible,Convertible
2,../../../Images/train/No Blur/SUV_train_orig_t...,0.147546,0.793779,0.000000,0.014114,0.237972,0.191854,1.408566,0.000000,0.314723,...,0.000000,0.019142,0.110762,0.750666,0.198542,0.275273,0,SUV_train_orig_test_07060_resized.jpg,Ambiguous,SUV
3,../../../Images/train/No Blur/Sedan_train_orig...,0.043017,1.089323,0.000000,0.004603,0.012446,0.000000,0.802531,0.000000,0.444981,...,0.000000,0.063925,0.189980,0.426336,0.031850,0.021951,0,Sedan_train_orig_test_07195_resized.jpg,Sedan,Sedan
4,../../../Images/train/No Blur/Sedan_train_orig...,0.027134,0.049786,0.030874,0.000000,0.003062,0.000000,1.098564,0.000000,0.005797,...,0.000000,0.017989,0.000000,0.532629,0.047045,0.000000,0,Sedan_train_orig_test_04947_resized.jpg,Sedan,Sedan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/train/No Blur/Sedan_train_orig...,0.000000,0.355245,0.014701,0.000000,0.063166,0.000000,0.168762,0.068016,0.209514,...,0.013373,0.039992,0.152950,0.097856,0.072973,0.033932,0,Sedan_train_orig_train_00112_resized.jpg,Sedan,Sedan
8248,../../../Images/train/No Blur/Pickup_train_ori...,0.114592,0.210919,0.000000,0.000000,0.031758,0.000000,0.007828,0.000000,0.209584,...,0.000000,0.519490,0.037494,0.018614,1.264154,0.019924,0,Pickup_train_orig_test_01147_resized.jpg,Pickup,Pickup
8249,../../../Images/train/No Blur/SUV_train_orig_t...,0.084685,1.241065,0.000000,0.000000,0.025655,0.000000,1.467685,0.000000,0.184841,...,0.000000,0.476425,0.031688,0.863446,0.086722,0.031863,0,SUV_train_orig_test_08000_resized.jpg,SUV,SUV
8250,../../../Images/train/No Blur/Sedan_train_orig...,0.532457,0.549263,0.000000,0.000000,0.000000,0.020113,0.588399,0.000000,0.198008,...,0.000000,0.099895,0.018707,0.320334,0.286392,0.000000,0,Sedan_train_orig_test_01132_resized.jpg,Sedan,Sedan


### Create Class

In [23]:
# Value counts of 'New Class'
all_features['New Class'].value_counts()

Sedan                    3114
SUV                      2057
Convertible              1419
Pickup                    902
Ambiguous                 754
Clearly None of the 4       6
Name: New Class, dtype: int64

In [24]:
# Compare 'New Class' and 'Old Class' columns
old_v_new_class = all_features[['Old Class', 'New Class']].value_counts().reset_index().sort_values(by=['Old Class', 'New Class'])
old_v_new_class

Unnamed: 0,Old Class,New Class,0
7,Convertible,Ambiguous,86
2,Convertible,Convertible,1417
9,Convertible,Sedan,11
8,Pickup,Ambiguous,78
3,Pickup,Pickup,799
5,SUV,Ambiguous,245
6,SUV,Pickup,101
1,SUV,SUV,2057
10,SUV,Sedan,6
4,Sedan,Ambiguous,345


In [25]:
# Set 'Class' to 'New Class'
all_features['Class'] = all_features['New Class']
# Keep cases where 'New Class' is in 'SUV', 'Sedan', 'Pickup', 'Convertible'
all_features = all_features[all_features['Class'].isin(['SUV', 'Sedan', 'Pickup', 'Convertible'])]
# Drop 'New Class' and 'Old Class' columns
all_features.drop(columns=['New Class', 'Old Class'], inplace=True)
all_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features.drop(columns=['New Class', 'Old Class'], inplace=True)


Unnamed: 0,Image Path VGG,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,VGG_Embedding_Element_5,VGG_Embedding_Element_6,VGG_Embedding_Element_7,VGG_Embedding_Element_8,...,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511,test_80_20,harmonized_filename,Class
0,../../../Images/train/No Blur/Sedan_train_orig...,0.057273,0.085857,0.014088,0.000000,0.000000,0.000000,1.265887,0.025079,0.020789,...,0.000000,0.005827,0.039591,0.000000,0.063602,0.365371,0.000000,0,Sedan_train_orig_train_04172_resized.jpg,Sedan
1,../../../Images/train/No Blur/Convertible_trai...,0.293713,0.714101,0.075956,0.144106,0.011360,0.000000,2.309738,0.094079,0.409755,...,0.000000,0.000000,0.037017,0.350120,0.543494,0.100478,0.000000,0,Convertible_train_orig_test_01764_resized.jpg,Convertible
3,../../../Images/train/No Blur/Sedan_train_orig...,0.043017,1.089323,0.000000,0.004603,0.012446,0.000000,0.802531,0.000000,0.444981,...,0.164781,0.000000,0.063925,0.189980,0.426336,0.031850,0.021951,0,Sedan_train_orig_test_07195_resized.jpg,Sedan
4,../../../Images/train/No Blur/Sedan_train_orig...,0.027134,0.049786,0.030874,0.000000,0.003062,0.000000,1.098564,0.000000,0.005797,...,0.000000,0.000000,0.017989,0.000000,0.532629,0.047045,0.000000,0,Sedan_train_orig_test_04947_resized.jpg,Sedan
5,../../../Images/train/No Blur/Sedan_train_orig...,0.023348,0.939732,0.000000,0.000000,0.000000,0.007038,1.081222,0.000000,0.370489,...,0.000000,0.000000,0.009107,0.030275,0.246710,0.672856,0.009486,0,Sedan_train_orig_train_00667_resized.jpg,Sedan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,../../../Images/train/No Blur/Sedan_train_orig...,0.000000,0.355245,0.014701,0.000000,0.063166,0.000000,0.168762,0.068016,0.209514,...,0.000000,0.013373,0.039992,0.152950,0.097856,0.072973,0.033932,0,Sedan_train_orig_train_00112_resized.jpg,Sedan
8248,../../../Images/train/No Blur/Pickup_train_ori...,0.114592,0.210919,0.000000,0.000000,0.031758,0.000000,0.007828,0.000000,0.209584,...,0.000000,0.000000,0.519490,0.037494,0.018614,1.264154,0.019924,0,Pickup_train_orig_test_01147_resized.jpg,Pickup
8249,../../../Images/train/No Blur/SUV_train_orig_t...,0.084685,1.241065,0.000000,0.000000,0.025655,0.000000,1.467685,0.000000,0.184841,...,0.000000,0.000000,0.476425,0.031688,0.863446,0.086722,0.031863,0,SUV_train_orig_test_08000_resized.jpg,SUV
8250,../../../Images/train/No Blur/Sedan_train_orig...,0.532457,0.549263,0.000000,0.000000,0.000000,0.020113,0.588399,0.000000,0.198008,...,0.000000,0.000000,0.099895,0.018707,0.320334,0.286392,0.000000,0,Sedan_train_orig_test_01132_resized.jpg,Sedan


In [26]:
# Value counts for 'Class'
all_features['Class'].value_counts()

Sedan          3114
SUV            2057
Convertible    1419
Pickup          902
Name: Class, dtype: int64

In [27]:
# Value counts of 'test_80_20'
all_features['test_80_20'].value_counts()

0    6003
1    1489
Name: test_80_20, dtype: int64

## Clean Up Columns

In [28]:
# Get image path columns
image_path_columns = [col for col in all_features.columns if 'Image Path' in col]
# Split off and delete Image Path columns
image_path_df = all_features[['harmonized_filename'] + image_path_columns]
#image_path_df.to_excel('../../Data/Features/All Features Image Paths/All_Features_Image_Paths.xlsx', index=False)
all_features.drop(columns=image_path_columns, inplace=True)
# Reorder columns
front_cols = ['Class', 'harmonized_filename', 'test_80_20']
all_features = all_features[front_cols + [col for col in all_features.columns if col not in front_cols]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features.drop(columns=image_path_columns, inplace=True)


Unnamed: 0,Class,harmonized_filename,test_80_20,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,VGG_Embedding_Element_5,VGG_Embedding_Element_6,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,Sedan,Sedan_train_orig_train_04172_resized.jpg,0,0.057273,0.085857,0.014088,0.000000,0.000000,0.000000,1.265887,...,0.052797,0.479325,0.288115,0.000000,0.005827,0.039591,0.000000,0.063602,0.365371,0.000000
1,Convertible,Convertible_train_orig_test_01764_resized.jpg,0,0.293713,0.714101,0.075956,0.144106,0.011360,0.000000,2.309738,...,0.066524,0.156991,0.154690,0.000000,0.000000,0.037017,0.350120,0.543494,0.100478,0.000000
3,Sedan,Sedan_train_orig_test_07195_resized.jpg,0,0.043017,1.089323,0.000000,0.004603,0.012446,0.000000,0.802531,...,0.252442,0.081028,0.072416,0.164781,0.000000,0.063925,0.189980,0.426336,0.031850,0.021951
4,Sedan,Sedan_train_orig_test_04947_resized.jpg,0,0.027134,0.049786,0.030874,0.000000,0.003062,0.000000,1.098564,...,0.119230,1.161802,0.791576,0.000000,0.000000,0.017989,0.000000,0.532629,0.047045,0.000000
5,Sedan,Sedan_train_orig_train_00667_resized.jpg,0,0.023348,0.939732,0.000000,0.000000,0.000000,0.007038,1.081222,...,0.055250,0.061042,0.148557,0.000000,0.000000,0.009107,0.030275,0.246710,0.672856,0.009486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,Sedan,Sedan_train_orig_train_00112_resized.jpg,0,0.000000,0.355245,0.014701,0.000000,0.063166,0.000000,0.168762,...,0.076708,0.052927,0.068890,0.000000,0.013373,0.039992,0.152950,0.097856,0.072973,0.033932
8248,Pickup,Pickup_train_orig_test_01147_resized.jpg,0,0.114592,0.210919,0.000000,0.000000,0.031758,0.000000,0.007828,...,0.092542,0.043503,0.454710,0.000000,0.000000,0.519490,0.037494,0.018614,1.264154,0.019924
8249,SUV,SUV_train_orig_test_08000_resized.jpg,0,0.084685,1.241065,0.000000,0.000000,0.025655,0.000000,1.467685,...,0.343811,0.004149,0.032092,0.000000,0.000000,0.476425,0.031688,0.863446,0.086722,0.031863
8250,Sedan,Sedan_train_orig_test_01132_resized.jpg,0,0.532457,0.549263,0.000000,0.000000,0.000000,0.020113,0.588399,...,0.000000,0.239137,0.033320,0.000000,0.000000,0.099895,0.018707,0.320334,0.286392,0.000000


## Add Blur and No-Blur Paths

In [29]:
def construct_path(train_test, blur_no_blur, harmonized_filename):
    return '../../../Images/' + train_test + '/' + blur_no_blur + '/' + harmonized_filename

# Add image_path_blur column
all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
# Add image_path_no_blur column
all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)
# Reorder columns - Class, harmonized_filename, test_80_20, image_path_blur, image_path_no_blur, all other columns
front_cols = front_cols + ['image_path_blur', 'image_path_no_blur']
all_features = all_features[front_cols + [col for col in all_features.columns if col not in front_cols]]
all_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'Blurred', x['harmonized_filename']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features['image_path_no_blur'] = all_features.apply(lambda x: construct_path('train' if x['test_80_20'] == 0 else 'test', 'No_Blur', x['harmonized_filename']), axis=1)


Unnamed: 0,Class,harmonized_filename,test_80_20,image_path_blur,image_path_no_blur,VGG_Embedding_Element_0,VGG_Embedding_Element_1,VGG_Embedding_Element_2,VGG_Embedding_Element_3,VGG_Embedding_Element_4,...,VGG_Embedding_Element_502,VGG_Embedding_Element_503,VGG_Embedding_Element_504,VGG_Embedding_Element_505,VGG_Embedding_Element_506,VGG_Embedding_Element_507,VGG_Embedding_Element_508,VGG_Embedding_Element_509,VGG_Embedding_Element_510,VGG_Embedding_Element_511
0,Sedan,Sedan_train_orig_train_04172_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,0.057273,0.085857,0.014088,0.000000,0.000000,...,0.052797,0.479325,0.288115,0.000000,0.005827,0.039591,0.000000,0.063602,0.365371,0.000000
1,Convertible,Convertible_train_orig_test_01764_resized.jpg,0,../../../Images/train/Blurred/Convertible_trai...,../../../Images/train/No_Blur/Convertible_trai...,0.293713,0.714101,0.075956,0.144106,0.011360,...,0.066524,0.156991,0.154690,0.000000,0.000000,0.037017,0.350120,0.543494,0.100478,0.000000
3,Sedan,Sedan_train_orig_test_07195_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,0.043017,1.089323,0.000000,0.004603,0.012446,...,0.252442,0.081028,0.072416,0.164781,0.000000,0.063925,0.189980,0.426336,0.031850,0.021951
4,Sedan,Sedan_train_orig_test_04947_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,0.027134,0.049786,0.030874,0.000000,0.003062,...,0.119230,1.161802,0.791576,0.000000,0.000000,0.017989,0.000000,0.532629,0.047045,0.000000
5,Sedan,Sedan_train_orig_train_00667_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,0.023348,0.939732,0.000000,0.000000,0.000000,...,0.055250,0.061042,0.148557,0.000000,0.000000,0.009107,0.030275,0.246710,0.672856,0.009486
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8247,Sedan,Sedan_train_orig_train_00112_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,0.000000,0.355245,0.014701,0.000000,0.063166,...,0.076708,0.052927,0.068890,0.000000,0.013373,0.039992,0.152950,0.097856,0.072973,0.033932
8248,Pickup,Pickup_train_orig_test_01147_resized.jpg,0,../../../Images/train/Blurred/Pickup_train_ori...,../../../Images/train/No_Blur/Pickup_train_ori...,0.114592,0.210919,0.000000,0.000000,0.031758,...,0.092542,0.043503,0.454710,0.000000,0.000000,0.519490,0.037494,0.018614,1.264154,0.019924
8249,SUV,SUV_train_orig_test_08000_resized.jpg,0,../../../Images/train/Blurred/SUV_train_orig_t...,../../../Images/train/No_Blur/SUV_train_orig_t...,0.084685,1.241065,0.000000,0.000000,0.025655,...,0.343811,0.004149,0.032092,0.000000,0.000000,0.476425,0.031688,0.863446,0.086722,0.031863
8250,Sedan,Sedan_train_orig_test_01132_resized.jpg,0,../../../Images/train/Blurred/Sedan_train_orig...,../../../Images/train/No_Blur/Sedan_train_orig...,0.532457,0.549263,0.000000,0.000000,0.000000,...,0.000000,0.239137,0.033320,0.000000,0.000000,0.099895,0.018707,0.320334,0.286392,0.000000


In [30]:
# Print all columns in all_features
for col in all_features.columns:
    print(col)

Class
harmonized_filename
test_80_20
image_path_blur
image_path_no_blur
VGG_Embedding_Element_0
VGG_Embedding_Element_1
VGG_Embedding_Element_2
VGG_Embedding_Element_3
VGG_Embedding_Element_4
VGG_Embedding_Element_5
VGG_Embedding_Element_6
VGG_Embedding_Element_7
VGG_Embedding_Element_8
VGG_Embedding_Element_9
VGG_Embedding_Element_10
VGG_Embedding_Element_11
VGG_Embedding_Element_12
VGG_Embedding_Element_13
VGG_Embedding_Element_14
VGG_Embedding_Element_15
VGG_Embedding_Element_16
VGG_Embedding_Element_17
VGG_Embedding_Element_18
VGG_Embedding_Element_19
VGG_Embedding_Element_20
VGG_Embedding_Element_21
VGG_Embedding_Element_22
VGG_Embedding_Element_23
VGG_Embedding_Element_24
VGG_Embedding_Element_25
VGG_Embedding_Element_26
VGG_Embedding_Element_27
VGG_Embedding_Element_28
VGG_Embedding_Element_29
VGG_Embedding_Element_30
VGG_Embedding_Element_31
VGG_Embedding_Element_32
VGG_Embedding_Element_33
VGG_Embedding_Element_34
VGG_Embedding_Element_35
VGG_Embedding_Element_36
VGG_Embedding

## Output Training and Testing Dataframes

In [31]:
# Split df into train and test
all_features_train = all_features[all_features['test_80_20'] == 0]
all_features_test = all_features[all_features['test_80_20'] == 1]

# Drop test_80_20 column
all_features_train.drop(columns='test_80_20', inplace=True)
all_features_test.drop(columns='test_80_20', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_train.drop(columns='test_80_20', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_features_test.drop(columns='test_80_20', inplace=True)


In [32]:
def split_df(df, dataset_name, out_folder, num_pieces):
    '''
    Splits dataframes into num_pieces and saves them as parquet files in out_folder. Reduces file size to comply with GitHub limits.
    '''
    # Tracking total length of pieces
    total_len_pieces = 0
    # Delete previous pieces, all contents of out_folder
    for filename in os.listdir(out_folder):
        file_path = os.path.join(out_folder, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    # Save pieces
    for i in range(num_pieces):
        # start index for piece rows
        start_index = i * len(df) // num_pieces
        # end index for piece rows
        end_index = (i + 1) * len(df) // num_pieces
        # get piece
        piece = df[start_index:end_index]
        piece.to_parquet(out_folder + '/' + dataset_name + '_piece_' + str(i) + '.parquet', index=False)
        #print(len(piece))
        total_len_pieces += len(piece)
    
    # check total piece length and length of vit_embeddings_df
    print('length check passed')
    print(total_len_pieces == len(df))

In [34]:
# Run on all_features dataframes
split_df(all_features_train, 'VGG_train', '../../Data/Features/VGG/train', 16)
split_df(all_features_test, 'VGG_test', '../../Data/Features/VGG/test', 16)

length check passed
True
length check passed
True


## Output 100 row sample to Excel

In [28]:
#all_features_train.sample(100).to_excel('../../Data/Features/All Features Train Sample/all_features_train_sample.xlsx', index=False)