# Vehicle Type Classification with **XGBoost**

In [12]:
import pandas as pd

# Load the dataset
file_path = "/Users/jakob/Library/CloudStorage/OneDrive-student.kit.edu/Studium/02_Master/4. Semester/seminar/RoadTrafficNoise/IDMT-Traffic/datasets/df_main_encoded_only.csv"

# Load the dataset
df_vehicle_classification = pd.read_csv(file_path)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Extract features and target
#X = df_vehicle_classification.drop(columns=['file', 'vehicle_encoded', 'is_background_encoded', 'date_time_encoded'])  # Drop "file" and "vehicle_encoded"
X = df_vehicle_classification.drop(columns=df_vehicle_classification.loc[:, :'channel_encoded'].columns) # only keep sound features
y = df_vehicle_classification['vehicle_encoded']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize and train the XGBoost classifier
xgb_clf = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_clf.fit(X_train, y_train)

# Make predictions
y_pred = xgb_clf.predict(X_test)

# Evaluate the model
print("Classification Report on test set:")
print(classification_report(y_test, y_pred))

Classification Report on test set:
              precision    recall  f1-score   support

           0       0.87      0.62      0.72        21
           1       0.91      0.98      0.94      1561
           2       0.94      0.93      0.94        86
           3       0.67      0.27      0.39       205

    accuracy                           0.90      1873
   macro avg       0.85      0.70      0.75      1873
weighted avg       0.88      0.90      0.88      1873



In [14]:
import pandas as pd

# Extract feature importances from the trained XGBoost model
feature_importances = pd.Series(xgb_clf.feature_importances_, index=X.columns)

# Sort the feature importances in descending order
sorted_importances = feature_importances.sort_values(ascending=False)

# Print the sorted feature importances
print(sorted_importances)

mfcc_3             0.097257
band_18_dB         0.056902
peak_freq_1        0.056064
band_26_dB         0.055829
band_19_dB         0.045221
band_3_dB          0.030636
band_4_dB          0.026614
octband_dB_mean    0.026291
mfcc_9             0.022527
mfcc_5             0.021532
band_27_dB         0.021077
band_20_dB         0.020969
band_28_dB         0.020612
band_1_dB          0.019877
band_9_dB          0.019482
mfcc_8             0.018983
band_2_dB          0.018804
band_25_dB         0.017604
band_22_dB         0.017254
mfcc_7             0.016700
band_12_dB         0.016129
mfcc_10            0.015981
peak_freq_2        0.015852
peak_dB_3          0.015821
peak_dB_2          0.015509
mfcc_13            0.014579
band_16_dB         0.014412
band_13_dB         0.014314
band_11_dB         0.014163
band_29_dB         0.014056
band_5_dB          0.014000
band_6_dB          0.013925
band_15_dB         0.013421
band_17_dB         0.013389
band_24_dB         0.013310
mfcc_11            0

Key Observations:

    Class 1 Dominance:
        Class 1 has the highest support (1561 samples), and the model performs very well on it with a high F1-score (0.95).
        This may indicate that the model is biased towards this class due to its prevalence in the dataset.

    Poor Performance on Class 3:
        Class 3 has a relatively lower recall (0.36) and F1-score (0.47), indicating that the model struggles to correctly identify samples of this class.
        This could be due to insufficient data for this class or features that do not distinguish it well from others.

    Macro Average vs. Weighted Average:
        The macro average shows the model's performance across all classes equally (unweighted), which is lower (F1-score: 0.79) due to poor performance on minority classes.
        The weighted average is higher (F1-score: 0.90), influenced by the dominance of Class 1.

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from xgboost import XGBClassifier, DMatrix

# Extract features and target
X = df[features]
y = df['vehicle']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the pipeline
pipeline = Pipeline([
    ('xgb', XGBClassifier(eval_metric='mlogloss'))  # Multiclass support
])

# Define a smaller parameter grid for GridSearchCV
param_grid = {
    'xgb__n_estimators': [100],             # Number of boosting rounds
    'xgb__learning_rate': [0.1, 0.3],       # Learning rate (eta)
    'xgb__max_depth': [3],                  # Maximum depth of trees
    'xgb__gamma': [0, 1],                   # Minimum loss reduction to split
    'xgb__reg_alpha': [0, 0.1],             # L1 regularization term
    'xgb__reg_lambda': [1, 1.5]             # L2 regularization term
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,                 # 5-fold cross-validation
    scoring='f1_macro',   # Optimize for macro F1 score for multiclass
    verbose=1,
    n_jobs=-1             # Use all available CPUs
)

# Perform Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Evaluate the model on the test set
y_pred = best_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_mapping.values()))

# Save the label mapping
print("Label Mapping (Numeric to Vehicle):")
print(label_mapping)

NameError: name 'features' is not defined