In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix       

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline  # imblearn-eigene Pipeline, kompatibel zu sklearn

from w1_feature_fraud_mk import Fraud
# Murat
from w1_feature_fraud_mk import  left_join_on, add_invoice_frequency_features, add_counter_statue_error_occured_features
from w1_feature_fraud_mk import add_counter_regions_features, add_region_fraud_rate_features
from w1_feature_fraud_mk import add_median_billing_frequence_per_region, add_sdt_dev_consumption_region
# Dana
from w1_feature_fraud_mk import calculate_mutual_information, visualize_mutual_information, add_consump_agg

In [2]:
fraud = Fraud(["./data/train/client_train.csv", "./data/train/invoice_train.csv"], target_column="target")
client  = fraud["./data/train/client_train.csv"]
invoice = fraud["./data/train/invoice_train.csv"]
fraud_merged = left_join_on("client_id", client, invoice)

  self._frames[str(p)] = pd.read_csv(p)


In [3]:
df_fraud_aggregated = fraud.get_target()
df_fraud_aggregated = add_invoice_frequency_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_counter_statue_error_occured_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_counter_regions_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_region_fraud_rate_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_median_billing_frequence_per_region(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_1")
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_2")
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_3")
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_4")
df_fraud_aggregated = add_consump_agg(df_fraud_aggregated, invoice)

In [4]:
total = len(df_fraud_aggregated)
report = (
    pd.DataFrame({
    'column': df_fraud_aggregated.columns,
        'num_missing': df_fraud_aggregated.isna().sum().values
    })
    .assign(
        pct_missing=lambda d: d['num_missing'] / total * 100,
        has_missing=lambda d: d['num_missing'] > 0
    )
)
print (report)

                                        column  num_missing  pct_missing   
0                                    client_id            0     0.000000  \
1                                       target            0     0.000000   
2                     f_invoive_date_diff_days         4212     3.108648   
3                 f_invoive_date_median_months         4212     3.108648   
4                  f_invoive_date_median_years         4212     3.108648   
5               f_counter_statue_error_occured            0     0.000000   
6                            f_counter_regions            0     0.000000   
7                          f_region_fraud_rate            0     0.000000   
8        f_region_median_billing_frequence_per            2     0.001476   
9   f_region_std_deviation_consumption_level_1            0     0.000000   
10  f_region_std_deviation_consumption_level_2            0     0.000000   
11  f_region_std_deviation_consumption_level_3            0     0.000000   
12  f_region

In [28]:


# 0) Daten vorbereiten
X = df_fraud_aggregated.drop(columns=["target", "client_id"]).fillna(0)
y = df_fraud_aggregated["target"]

# 1) Stratified KFold anlegen
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2) Listen für Kennzahlen
precisions, recalls, f1s = [], [], []

# 3) Durch die Folds iterieren
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # class imbalance weight für XGBoost
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    # Pipeline mit ROS + XGBClassifier
    pipeline = Pipeline([
        ("ros", RandomOverSampler(random_state=42)),
        ("xgb", XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            n_estimators=150,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=scale_pos_weight,
            random_state=42
        ))
    ])

    # Trainieren & Vorhersagen
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"--- Fold {fold} ---")
    print("Confusion Matrix:")
    print(cm)

    # Einzel-Metriken berechnen (pos_label=1)
    p = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    r = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
    f = f1_score(y_test, y_pred, pos_label=1, zero_division=0)
    print(f"Precision (1): {p:.3f}, Recall (1): {r:.3f}, F1 (1): {f:.3f}\n")

    precisions.append(p)
    recalls.append(r)
    f1s.append(f)

# 4) Durchschnitt über alle Folds
print("=== Durchschnitt über 5 Stratified Folds (positive Klasse) ===")
print(f"Precision: {np.mean(precisions):.3f}")
print(f"Recall:    {np.mean(recalls):.3f}")
print(f"F1-score:  {np.mean(f1s):.3f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 1 ---
Confusion Matrix:
[[ 7221 18365]
 [   81  1432]]
Precision (1): 0.072, Recall (1): 0.946, F1 (1): 0.134



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 2 ---
Confusion Matrix:
[[ 6898 18688]
 [   95  1418]]
Precision (1): 0.071, Recall (1): 0.937, F1 (1): 0.131



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 3 ---
Confusion Matrix:
[[ 6939 18646]
 [   86  1428]]
Precision (1): 0.071, Recall (1): 0.943, F1 (1): 0.132



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 4 ---
Confusion Matrix:
[[ 7029 18556]
 [  111  1402]]
Precision (1): 0.070, Recall (1): 0.927, F1 (1): 0.131



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Fold 5 ---
Confusion Matrix:
[[ 6845 18740]
 [   92  1421]]
Precision (1): 0.070, Recall (1): 0.939, F1 (1): 0.131

=== Durchschnitt über 5 Stratified Folds (positive Klasse) ===
Precision: 0.071
Recall:    0.939
F1-score:  0.132


In [6]:
# ---------------------------------------------------------------------
# Beispiel: BalancedBaggingClassifier (imblearn) mit XGBClassifier
# ---------------------------------------------------------------------

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.ensemble import BalancedBaggingClassifier
from xgboost import XGBClassifier

# 1) DataFrame laden / bereitstellen
#    Ersetze das hier durch dein echtes df_fraud_aggregated
df = df_fraud_aggregated  

# 2) Features und Target definieren
X = df.drop(columns=["client_id", "target"]).fillna(0)
y = df["target"]

# 3) Train/Test-Split (stratifiziert)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42
)

# 4) Basis-XGBoost-Modell (wichtigste Hyperparameter)
xgb_base = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# 5) Imbalanced-learn Ensemble mit BalancedBagging
bbc = BalancedBaggingClassifier(
    estimator=xgb_base,
    sampling_strategy="auto",   # Minority-Klasse auf Majorität hochziehen
    n_estimators=10,            # Anzahl der Bagging-Modelle
    random_state=42,
    replacement=False           # ohne Zurücklegen
)

# 6) Training
bbc.fit(X_train, y_train)

# 7) Vorhersage & Evaluation
y_pred = bbc.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Confusion Matrix:
[[26951 11427]
 [  796  1474]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.70      0.82     38378
         1.0       0.11      0.65      0.19      2270

    accuracy                           0.70     40648
   macro avg       0.54      0.68      0.50     40648
weighted avg       0.92      0.70      0.78     40648

