## Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

## Preprocess and Encode Data

In [None]:
# Load the raw table and clean it.
# Open question carried over from the original author: which data do we keep, which not?
# NOTE(review): the later visible cells read a separate CSV into `df`; `data` is not
# referenced again in this view.
path = "../IDMT-Traffic/datasets/df_main.csv"
data = (
    pd.read_csv(path)
    # Turn +/-inf into NaN so the final dropna() removes those rows as well.
    .replace([np.inf, -np.inf], np.nan)
    .drop(columns=["sample_pos", "file"])
    .assign(
        # 'UNK' entries become NaN; anything else that is not numeric is coerced to NaN.
        speed_kmh=lambda frame: pd.to_numeric(
            frame["speed_kmh"].replace('UNK', np.nan), errors="coerce"
        )
    )
    .dropna()
)

# Verify no NaN values remain
print(data.isnull().values.any())

## Load Data

In [None]:
# Read the CSV used for modelling into `df`.
data_path = '../IDMT-Traffic/datasets/df_main_encoded_only.csv'
df = pd.read_csv(filepath_or_buffer=data_path)
# Optional while exploring (left disabled): drop the 'file' column, or peek at df.head(2).

## Data Initialization & Split

In [3]:
target = 'daytime_encoded'  # label column to predict
X = df.drop(target, axis="columns")  # features: every column except the target
y = df[target]  # labels

# 70/30 split, stratified on the labels, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Cut the training set down to its first 200 rows (trial-sized sample).
# The test set is left at full size.
X_train, y_train = X_train.iloc[:200], y_train.iloc[:200]

## Initialize Model

In [4]:
# Logistic-regression classifier; iteration cap raised to 1000 and seed fixed at 42.
logreg = LogisticRegression(
    random_state=42,
    max_iter=1000,
)

## Train Model

In [None]:
# Fit the model on the training subset, then predict labels for the held-out test set.
# fit() returns the fitted estimator, so the two steps can be chained.
y_pred = logreg.fit(X_train, y_train).predict(X_test)

## Results

In [None]:
# Print accuracy, per-class report and confusion matrix for the test predictions,
# each followed by an 80-character separator line.
separator = '-'*80

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(separator)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)
print(separator)

matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", matrix)
print(separator)