In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score       

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline  


from fraud import collectAllFeaturesBaseline


df_fraud_aggregated = collectAllFeaturesBaseline() 

  self._frames[str(p)] = pd.read_csv(p)


In [3]:


# 0) Daten vorbereiten
X = df_fraud_aggregated.drop(columns=["target", "client_id"]).fillna(0)
y = df_fraud_aggregated["target"]

# 1) Stratified KFold anlegen
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2) Listen für Kennzahlen
precisions, recalls, f1s = [], [], []

# 3) Durch die Folds iterieren
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # class imbalance weight für XGBoost
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    # Pipeline mit ROS + XGBClassifier
    pipeline = Pipeline([
        ("ros", RandomOverSampler(random_state=42)),
        ("xgb", XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            n_estimators=150,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,
            random_state=42
        ))
    ])

    # Trainieren & Vorhersagen
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"--- Fold {fold} ---")
    print("Confusion Matrix:")
    print(cm)

    # Einzel-Metriken berechnen (pos_label=1)
    p = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    r = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    print(f"Precision (1): {p:.3f}, Recall (1): {r:.3f}, F1 (1): {f:.3f}\n")

    precisions.append(p)
    recalls.append(r)
    f1s.append(f)

# 4) Durchschnitt über alle Folds
print("=== Durchschnitt über 5 Stratified Folds (positive Klasse) ===")
print(f"Precision: {np.mean(precisions):.3f}")
print(f"Recall:    {np.mean(recalls):.3f}")
print(f"F1-score:  {np.mean(f1s):.3f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 1 ---
Confusion Matrix:
[[ 6976 18610]
 [   91  1422]]
Precision (1): 0.071, Recall (1): 0.940, F1 (1): 0.132



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 2 ---
Confusion Matrix:
[[ 6850 18736]
 [  107  1406]]
Precision (1): 0.070, Recall (1): 0.929, F1 (1): 0.130



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 3 ---
Confusion Matrix:
[[ 7011 18574]
 [   78  1436]]
Precision (1): 0.072, Recall (1): 0.948, F1 (1): 0.133



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 4 ---
Confusion Matrix:
[[ 6930 18655]
 [  102  1411]]
Precision (1): 0.070, Recall (1): 0.933, F1 (1): 0.131



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 5 ---
Confusion Matrix:
[[ 7161 18424]
 [  105  1408]]
Precision (1): 0.071, Recall (1): 0.931, F1 (1): 0.132

=== Durchschnitt über 5 Stratified Folds (positive Klasse) ===
Precision: 0.071
Recall:    0.936
F1-score:  0.132
