## XGBoost Classifier Predictions

In [1]:
# Packages
import joblib
import pandas as pd
import os
import sklearn

In [3]:
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.

    Parameters
    ----------
    directory_path : str
        Directory to scan (non-recursively). A trailing '/' is optional.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every file whose name ends in '.parquet',
        read in sorted filename order. Each file keeps its own index, so
        index labels may repeat across files.

    Raises
    ------
    ValueError
        If the directory contains no '.parquet' files.
    '''
    # os.listdir returns entries in arbitrary order; sort so the row order of
    # the combined frame is reproducible across runs and machines.
    file_list = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))
    if not file_list:
        # pd.concat([]) would raise a generic "No objects to concatenate";
        # keep the same exception type but name the offending directory.
        raise ValueError(f"No .parquet files found in '{directory_path}'")
    # os.path.join handles the presence or absence of a trailing separator.
    return pd.concat([pd.read_parquet(os.path.join(directory_path, f)) for f in file_list])

In [4]:
# Read every parquet file under the test-features directory into one frame,
# then print a labelled text dump of it.
test_data = combine_directory_parquets('../../../Data/Features/All Features/test')
print('all test data', test_data, sep='\n')

all test data
          Class                            harmonized_filename  \
0         Sedan        Sedan_test_orig_train_00139_resized.jpg   
1         Sedan        Sedan_test_orig_train_07621_resized.jpg   
2   Convertible  Convertible_test_orig_train_03174_resized.jpg   
3         Sedan        Sedan_test_orig_train_03693_resized.jpg   
4         Sedan        Sedan_test_orig_train_02582_resized.jpg   
..          ...                                            ...   
88        Sedan         Sedan_test_orig_test_05891_resized.jpg   
89          SUV          SUV_test_orig_train_07587_resized.jpg   
90  Convertible   Convertible_test_orig_test_01119_resized.jpg   
91          SUV          SUV_test_orig_train_00986_resized.jpg   
92  Convertible   Convertible_test_orig_test_03679_resized.jpg   

                                      image_path_blur  \
0   ../../../Images/test/Blurred/Sedan_test_orig_t...   
1   ../../../Images/test/Blurred/Sedan_test_orig_t...   
2   ../../../Images/te

In [None]:
# Load the persisted estimator under the name the rest of the notebook uses.
# It was previously bound to `model`, which left `best_model` undefined on a
# fresh kernel (NameError in the print below and in the prediction cell).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
best_model = joblib.load('Best_XGBoost_Model.joblib')
print('Best XGBoost model:')
print(best_model)

In [None]:
# Feature matrix: only the int/float-typed columns of test_data go to the model.
# NOTE(review): presumably this matches the column selection used when the
# model was trained; confirm against the training notebook.
X_test = test_data.select_dtypes(include=[int, float])

# Make predictions with the XGBoost model
predictions = best_model.predict(X_test)
print('Predictions head:')
print(predictions[:5])

# Build the export frame from the object-dtype (string) columns of test_data,
# e.g. the class label, filename and image-path columns, and attach the
# predictions AFTER that filter. Adding the column first and filtering
# afterwards would silently drop it whenever predict() returns numeric class
# ids. test_data itself is left untouched so this cell can be re-run safely.
predictions_df = test_data.select_dtypes(include=[object]).assign(
    XGBoost_Classification=predictions
)

# Save to Excel (create the output folder first so a fresh checkout works)
output_path = '../../../Data/Predictions/XGBoost/XGBoost_Classifier_Predictions.xlsx'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
predictions_df.to_excel(output_path, index=False)