In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import mutual_info_classif
import os

In [3]:
# --- Step 1: correlation analysis and initial feature selection ---
def remove_highly_correlated_features(df, threshold=0.9):
    """Drop every column that is highly correlated with a column to its left.

    The absolute pairwise correlation (pandas default method) is computed for
    the numeric and boolean columns of ``df``. A column is dropped when its
    absolute correlation with at least one column positioned before it
    exceeds ``threshold``. Non-numeric columns cannot be correlated; they are
    ignored during the analysis and are always kept in the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table to prune. It is not modified.
    threshold : float, default 0.9
        Absolute correlation strictly above which the later column of a pair
        is removed.

    Returns
    -------
    tuple[pandas.DataFrame, list]
        A copy of ``df`` without the dropped columns, and the list of dropped
        column labels in their original column order.
    """
    # Restrict the analysis to columns that can be correlated. Without this,
    # a stray string/object column makes DataFrame.corr() raise on pandas
    # versions where numeric_only defaults to False.
    numeric_df = df.select_dtypes(include=[np.number, "bool"])
    correlation_matrix = numeric_df.corr().abs()

    # Keep only the strict upper triangle: every pair is inspected once and
    # the diagonal (self-correlation of 1.0) is excluded.
    upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    upper_triangle = correlation_matrix.where(upper_mask)

    # Column-wise check; NaN entries (masked cells, constant columns) compare
    # as False and therefore never trigger a drop.
    exceeds_threshold = (upper_triangle > threshold).any(axis=0)
    to_drop = [column for column, flagged in exceeds_threshold.items() if flagged]
    return df.drop(columns=to_drop), to_drop

# --- Step 2: model training and extraction of the important features ---
def train_and_evaluate(X, y, threshold=0.85):
    """Fit a logistic regression on a 70/30 split and report its test metrics.

    Prints the test accuracy and the classification report, then selects the
    features whose coefficient magnitude is above ``threshold``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column labels become the index of the result.
    y : array-like
        Target labels aligned with the rows of ``X``.
    threshold : float, default 0.85
        Absolute coefficient value strictly above which a feature is kept.

    Returns
    -------
    tuple[pandas.Series, LogisticRegression]
        The coefficients of the selected features (indexed by feature name)
        and the fitted model.

    Notes
    -----
    NOTE(review): features are not standardised inside this function, so the
    coefficient magnitudes depend on each feature's scale.
    NOTE(review): only the first row of ``coef_`` is read; for a target with
    more than two classes this presumably covers a single class — confirm.
    """
    # Fixed seed keeps the hold-out set identical between calls
    split = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    # fit() returns the estimator itself, so construction and training chain
    classifier = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

    # Evaluate on the hold-out portion
    predictions = classifier.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

    # Label each coefficient with its feature and keep the large ones
    coefficients = pd.Series(classifier.coef_[0], index=X.columns)
    is_important = coefficients.abs() > threshold
    return coefficients[is_important], classifier

# --- Step 3: analysis of the important features ---
def analyze_important_features(df, important_features):
    """Print and return the correlation matrix of the selected features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains a column for every label in
        ``important_features.index``.
    important_features : pandas.Series
        Series whose index holds the feature names to analyse; its values are
        not used here.

    Returns
    -------
    pandas.DataFrame
        Pairwise correlation matrix of the selected columns.
    """
    selected_columns = important_features.index
    selected_frame = df[selected_columns]
    important_corr = selected_frame.corr()

    # One print call, newline-separated: title line followed by the matrix
    print("Correlation Matrix of Important Features:", important_corr, sep="\n")
    return important_corr


In [6]:
# --- Running the pipeline ---
# Load the encoded dataset and discard the identifier / bookkeeping columns
path = '../IDMT-Traffic/datasets/df_main_encoded_only.csv'
df = pd.read_csv(path).drop(columns=['file', 'Unnamed: 0', 'is_background_encoded'])

# Separate the label column from the predictors
target = 'daytime_encoded'
y = df[target]
X = df.drop(columns=[target])

# Tunable parameters
corr_threshold = 0.95
coef_threshold = 0.85
iterations = 3

# Bookkeeping: feature list used in every iteration, and the working table
used_features = []
current_df = X.copy()

# NOTE(review): only step 1 modifies current_df; the important features found
# in step 2 are never fed back into it. Once step 1 drops nothing, later
# iterations therefore repeat the same fit — confirm whether a feedback step
# is still missing here.
for iteration in range(iterations):
    print(f"\n--- Iteration {iteration + 1} ---")

    # Step 1: prune highly correlated features
    current_df, dropped_features = remove_highly_correlated_features(
        current_df, threshold=corr_threshold
    )
    print(f"Dropped Features due to high correlation: {dropped_features}")

    # Step 2: train the model and collect the important features
    important_features, model = train_and_evaluate(
        current_df, y, threshold=coef_threshold
    )
    print(f"Important Features (>|{coef_threshold}|): {list(important_features.index)}")

    # Step 3: correlation analysis of the important features
    analyze_important_features(df, important_features)

    # Record the features used in this iteration, in memory and on disk
    used_features.append(list(current_df.columns))
    with open(f"features_iteration_{iteration + 1}.txt", "w") as f:
        f.writelines([
            f"Iteration {iteration + 1} Features:\n",
            "\n".join(current_df.columns),
        ])


--- Iteration 1 ---
Dropped Features due to high correlation: ['band_2_dB', 'band_14_dB', 'band_15_dB', 'band_16_dB', 'band_17_dB', 'band_18_dB', 'band_19_dB', 'band_20_dB', 'band_21_dB', 'band_22_dB', 'band_23_dB', 'band_24_dB', 'band_25_dB', 'band_26_dB', 'band_27_dB', 'band_28_dB', 'band_29_dB', 'peak_dB_2', 'peak_dB_3', 'octband_dB_mean']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.893556425774297
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1645
           1       0.88      0.86      0.87      1164

    accuracy                           0.89      2809
   macro avg       0.89      0.89      0.89      2809
weighted avg       0.89      0.89      0.89      2809

Important Features (>|0.85|): ['mfcc_2']
Correlation Matrix of Important Features:
        mfcc_2
mfcc_2     1.0

--- Iteration 2 ---
Dropped Features due to high correlation: []


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.893556425774297
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1645
           1       0.88      0.86      0.87      1164

    accuracy                           0.89      2809
   macro avg       0.89      0.89      0.89      2809
weighted avg       0.89      0.89      0.89      2809

Important Features (>|0.85|): ['mfcc_2']
Correlation Matrix of Important Features:
        mfcc_2
mfcc_2     1.0

--- Iteration 3 ---
Dropped Features due to high correlation: []
Accuracy: 0.893556425774297
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1645
           1       0.88      0.86      0.87      1164

    accuracy                           0.89      2809
   macro avg       0.89      0.89      0.89      2809
weighted avg       0.89      0.89      0.89      2809

Important Features (>|0.85|): ['mfcc_2']
Correlation Matrix of Important Features:
        mfcc_2
mfcc_2     1.0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
