In [1]:
# !pip3 install xgboost

In [2]:
import pandas as pd
import os
import joblib
import time
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [3]:
# Start the wall-clock timer. The last cell of the notebook subtracts this
# value to report total runtime, so it must be set before any work is done.
start_time = time.time()

# Load Training Data
def combine_directory_parquets(directory_path):
    '''
    Combines all parquet files in a directory into a single dataframe.

    Files are read in sorted filename order so the row order of the result is
    the same on every run (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    directory_path : str
        Directory to scan; a trailing slash is optional. Only entries whose
        name ends with ``.parquet`` are read; the scan is not recursive.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of every parquet file found. Each file keeps
        its own index, so index labels may repeat across files.

    Raises
    ------
    ValueError
        If the directory contains no ``.parquet`` entries.
    '''
    # Sort so that downstream steps that depend on row order are reproducible.
    file_list = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))
    if not file_list:
        # pd.concat([]) would raise a ValueError too, but without naming the directory.
        raise ValueError(f"No .parquet files found in '{directory_path}'")
    frames = [pd.read_parquet(os.path.join(directory_path, f)) for f in file_list]
    return pd.concat(frames)

In [4]:
# Read every parquet file under the training-features directory into one frame
# and echo it as a quick sanity check.
training_data = combine_directory_parquets(
    '../../../Data/Features/All Features/train'
)
print('All training data:', training_data, sep='\n')

All training data:
           Class                             harmonized_filename  \
0         Pickup       Pickup_train_orig_train_02014_resized.jpg   
1          Sedan         Sedan_train_orig_test_03371_resized.jpg   
2          Sedan        Sedan_train_orig_train_01087_resized.jpg   
3          Sedan         Sedan_train_orig_test_05800_resized.jpg   
4          Sedan        Sedan_train_orig_train_01720_resized.jpg   
..           ...                                             ...   
370          SUV           SUV_train_orig_test_06741_resized.jpg   
371  Convertible  Convertible_train_orig_train_03194_resized.jpg   
372        Sedan         Sedan_train_orig_test_07086_resized.jpg   
373       Pickup        Pickup_train_orig_test_04744_resized.jpg   
374  Convertible  Convertible_train_orig_train_05719_resized.jpg   

                                       image_path_blur  \
0    ../../../Images/train/Blurred/Pickup_train_ori...   
1    ../../../Images/train/Blurred/Sedan_train_o

In [5]:
# Keep only the rows whose 'Class' label is one of the four listed classes.
is_target_class = training_data['Class'].isin(['SUV', 'Pickup', 'Sedan', 'Convertible'])
training_data = training_data[is_target_class]
print(training_data)

           Class                             harmonized_filename  \
0         Pickup       Pickup_train_orig_train_02014_resized.jpg   
1          Sedan         Sedan_train_orig_test_03371_resized.jpg   
2          Sedan        Sedan_train_orig_train_01087_resized.jpg   
3          Sedan         Sedan_train_orig_test_05800_resized.jpg   
4          Sedan        Sedan_train_orig_train_01720_resized.jpg   
..           ...                                             ...   
370          SUV           SUV_train_orig_test_06741_resized.jpg   
371  Convertible  Convertible_train_orig_train_03194_resized.jpg   
372        Sedan         Sedan_train_orig_test_07086_resized.jpg   
373       Pickup        Pickup_train_orig_test_04744_resized.jpg   
374  Convertible  Convertible_train_orig_train_05719_resized.jpg   

                                       image_path_blur  \
0    ../../../Images/train/Blurred/Pickup_train_ori...   
1    ../../../Images/train/Blurred/Sedan_train_orig...   
2    ../.

In [6]:
# Set to True for a quick smoke-test run on a 1% random subset of the rows.
sample_run = False
if sample_run:
    # Fixed seed so repeated sample runs draw the same rows and give comparable results.
    training_data = training_data.sample(frac=0.01, random_state=42)

In [7]:
# XGBoost search space: a list holding one grid, whose lists of candidate
# values are combined exhaustively by the grid search.
hyperparameter_settings = [
    dict(
        learning_rate=[0.01, 0.1, 0.3],
        n_estimators=[100, 200],
        max_depth=[3, 6],
        subsample=[0.8, 1],
        colsample_bytree=[0.8, 1],
    ),
]

In [8]:
# Feature matrix (X): every float64/int64 column of the training frame.
# Target vector (y): the 'Class' column.
# NOTE(review): columns stored as float32/int32 are not selected here and
# would be dropped silently - confirm that this is intended.
numeric_frame = training_data.select_dtypes(include=['float64', 'int64'])
num_cols = numeric_frame.columns
X, y = training_data[num_cols], training_data['Class']

# Turn the class names into integer codes; the encoder keeps the mapping.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

In [9]:
# Base XGBoost classifier; multiclass log-loss is its evaluation metric.
# NOTE(review): `use_label_encoder` is version-dependent in xgboost - confirm
# it is still accepted by the installed release.
xgb_init_kwargs = {'use_label_encoder': False, 'eval_metric': 'mlogloss'}
xgb_clf = XGBClassifier(**xgb_init_kwargs)

In [None]:
# Exhaustive search over hyperparameter_settings, scored by accuracy with
# 5-fold cross-validation, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=hyperparameter_settings,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
gs = grid_search.fit(X, y_encoded)

In [None]:
# Report the winning parameter combination, its cross-validated accuracy,
# and the corresponding fitted estimator.
search_report = (
    ("Tuned hyperparameters:", gs.best_params_),
    ("Accuracy:", gs.best_score_),
    ("Best model:", gs.best_estimator_),
)
for label, value in search_report:
    print(label, value)

In [None]:
# Persist the best estimator found by the grid search to the working directory.
# NOTE(review): label_encoder is not saved alongside the model - confirm that
# the class-name mapping is recoverable wherever the model is loaded.
best_model = gs.best_estimator_
joblib.dump(best_model, 'Best_XGBoost_Model.joblib')

In [None]:
# Stop the wall-clock timer here; `end_time` was never assigned anywhere else,
# so reading it used to raise NameError.
end_time = time.time()

if 'start_time' in globals():
    print('Runtime in minutes:', (end_time - start_time) / 60)
    if len(training_data) > 0:
        # Guarded so an empty (fully filtered-out) training frame cannot divide by zero.
        print('Runtime per image in minutes:', ((end_time - start_time) / len(training_data)) / 60)
else:
    # Without a recorded start there is nothing to subtract; say so instead of crashing.
    print('Runtime unavailable: start_time is not defined; '
          'set start_time = time.time() at the top of the notebook before running.')