## Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline


## Load Data

In [9]:
data_path = '../IDMT-Traffic/datasets/df_main_encoded_only.csv'  
df = pd.read_csv(data_path)
#df = df.drop(columns=['file', 'Unnamed: 0', 'is_background_encoded'])
df = df.drop(columns=['file', 'Unnamed: 0', 'is_background_encoded', 'date_time_encoded'])

print(df.head(2))

   location_encoded  speed_kmh_encoded  daytime_encoded  weather_encoded  \
0                 0                  0                1                0   
1                 0                  0                1                0   

   vehicle_encoded  source_direction_encoded  microphone_encoded  \
0                1                         1                   0   
1                1                         1                   1   

   channel_encoded     mfcc_1     mfcc_2  ...  band_27_dB  band_28_dB  \
0                0  55.967507   8.496416  ...   37.024301   38.508511   
1                1  52.986820  12.135305  ...   34.516289   34.960402   

   band_29_dB  peak_dB_1  peak_freq_1  peak_dB_2  peak_freq_2  peak_dB_3  \
0   35.946349  50.180933    31.622777  49.528332  1000.000000  47.901831   
1   33.187933  54.903541  1000.000000  53.196406   794.328235  51.942355   

   peak_freq_3  octband_dB_mean  
0   794.328235        42.383307  
1   501.187234        42.050454  

[2 rows x 57 c

## Data Initialization & Split

In [10]:
# set target
target = 'daytime_encoded'  # Zielvariable
X = df.drop(columns=[target])  # Features (alle Spalten außer 'daytime')
y = df[target]  # Zielvariable

# Train test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Ausgabe der Größen
print(f"Trainingsdaten: {X_train.shape}, Testdaten: {X_test.shape}")


# 1st split: train/ test 
#X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# 2nd split: train / validation 
#X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42, stratify=y_train_full)
# TRAIN 60% VAL 20% TEST 20%
#print(f"Trainingsdaten: {X_train.shape}, Validierungsdaten: {X_val.shape}, Testdaten: {X_test.shape}")
# standardize data
#scaler = StandardScaler() 
#X_filtered_scaled = scaler.fit_transform(X)

Trainingsdaten: (6552, 56), Testdaten: (2809, 56)


### Baseline Approach

In [11]:
# initialize model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# train model
xgb_model.fit(X_train, y_train)

# results
y_pred = xgb_model.predict(X_test)

target_names = ['A', 'M']

print("Accuracy:", accuracy_score(y_test, y_pred))
print('-'*80)

print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names= target_names))
print('-'*80)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('-'*80)

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9409042363830544
--------------------------------------------------------------------------------

Classification Report:
               precision    recall  f1-score   support

           A       0.95      0.95      0.95      1642
           M       0.93      0.92      0.93      1167

    accuracy                           0.94      2809
   macro avg       0.94      0.94      0.94      2809
weighted avg       0.94      0.94      0.94      2809

--------------------------------------------------------------------------------

Confusion Matrix:
 [[1564   78]
 [  88 1079]]
--------------------------------------------------------------------------------


### Pipeline with Scaler, Cross Validation

In [12]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Schritt 1: Standardisierung
    ('classifier', XGBClassifier(  # Schritt 2: XGBoost
        random_state=42,
        eval_metric='logloss'    # Bewertungsmetrik
    ))  
])

# Cross-Validation auf den Trainingsdaten
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')

# Pipeline trainieren auf dem gesamten Training-Set
pipeline.fit(X_train, y_train)

# Vorhersagen auf den Testdaten
y_pred = pipeline.predict(X_test)

# Ergebnisse ausgeben
target_names = ['A', 'M']
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))
print("\nTestdaten-Auswertung:")
print(classification_report(y_test, y_pred, target_names= target_names))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Cross-Validation Accuracy Scores: [0.935164   0.93363844 0.9259542  0.91526718 0.92824427]
Mean CV Accuracy: 0.9276536179479565

Testdaten-Auswertung:
              precision    recall  f1-score   support

           A       0.95      0.95      0.95      1642
           M       0.93      0.92      0.93      1167

    accuracy                           0.94      2809
   macro avg       0.94      0.94      0.94      2809
weighted avg       0.94      0.94      0.94      2809


Confusion Matrix:
 [[1564   78]
 [  88 1079]]
