In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from w1_feature_fraud_mk import collectAllFeaturesBaseline


# Fetch the feature table from the project helper imported from w1_feature_fraud_mk;
# the helper's internals are not visible in this notebook.
# NOTE(review): later cells read the columns `target` and `client_id` from this
# frame — confirm the helper always returns both.
df_fraud_aggregated = collectAllFeaturesBaseline() 

In [9]:
# Missing-value report: one row per column of df_fraud_aggregated holding the
# NaN count, the NaN share as a percentage of all rows, and a flag that is True
# when the column contains at least one NaN.
total = len(df_fraud_aggregated)

# Count once and reuse the result's index for the names, so that column names
# and counts are guaranteed to stay aligned.
missing_counts = df_fraud_aggregated.isna().sum()

report = pd.DataFrame({
    'column': missing_counts.index,
    'num_missing': missing_counts.values,
}).assign(
    pct_missing=lambda d: d['num_missing'] / total * 100,
    has_missing=lambda d: d['num_missing'] > 0,
)

# Leave the frame as the cell's last expression so Jupyter renders it as a
# table; print() wrapped the wide text output and split the columns apart.
report

                                        column  num_missing  pct_missing   
0                                    client_id            0     0.000000  \
1                                       target            0     0.000000   
2                     f_invoive_date_diff_days         4212     3.108648   
3                 f_invoive_date_median_months         4212     3.108648   
4                  f_invoive_date_median_years         4212     3.108648   
5               f_counter_statue_error_occured            0     0.000000   
6                            f_counter_regions            0     0.000000   
7                          f_region_fraud_rate            0     0.000000   
8        f_region_median_billing_frequence_per            2     0.001476   
9   f_region_std_deviation_consumption_level_1            0     0.000000   
10  f_region_std_deviation_consumption_level_2            0     0.000000   
11  f_region_std_deviation_consumption_level_3            0     0.000000   
12  f_region

In [10]:
# 1. Split the table into label and predictors. `target` is the label;
#    `client_id` is dropped together with it. Remaining NaNs are replaced by 0.
y = df_fraud_aggregated["target"]
X = df_fraud_aggregated.drop(columns=["target", "client_id"]).fillna(0)

# 2. Train/test split: 30 % held out, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 3. Feature scaling: the scaler is fitted on the training rows only and the
#    same fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Model definition and training.
#    class_weight='balanced' -> class weights are adjusted automatically
#    solver='liblinear'      -> suited to small and medium-sized datasets
model = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42)
model.fit(X_train_scaled, y_train)

# 5. Evaluation on the held-out rows: print each heading, then its metric.
y_pred = model.predict(X_test_scaled)

for heading, metric in (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))

Confusion Matrix:
[[25734 12644]
 [  847  1423]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.67      0.79     38378
         1.0       0.10      0.63      0.17      2270

    accuracy                           0.67     40648
   macro avg       0.53      0.65      0.48     40648
weighted avg       0.92      0.67      0.76     40648

