In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import make_classification
from xgboost import XGBClassifier


from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline  # imblearn-eigene Pipeline, kompatibel zu sklearn

from w1_feature_fraud_mk import Fraud
# Murat
from w1_feature_fraud_mk import  left_join_on, add_invoice_frequency_features, add_counter_statue_error_occured_features
from w1_feature_fraud_mk import add_counter_regions_features, add_region_fraud_rate_features
from w1_feature_fraud_mk import add_median_billing_frequence_per_region, add_sdt_dev_consumption_region
# Dana
from w1_feature_fraud_mk import calculate_mutual_information, visualize_mutual_information, add_consump_agg

In [2]:
fraud = Fraud(["./data/train/client_train.csv", "./data/train/invoice_train.csv"], target_column="target")
client  = fraud["./data/train/client_train.csv"]
invoice = fraud["./data/train/invoice_train.csv"]
fraud_merged = left_join_on("client_id", client, invoice)

  self._frames[str(p)] = pd.read_csv(p)


In [3]:
df_fraud_aggregated = fraud.get_target()
df_fraud_aggregated = add_invoice_frequency_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_counter_statue_error_occured_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_counter_regions_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_region_fraud_rate_features(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_median_billing_frequence_per_region(fraud_merged, df_fraud_aggregated)
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_1")
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_2")
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_3")
df_fraud_aggregated = add_sdt_dev_consumption_region(fraud_merged, df_fraud_aggregated, postfix_consumption="_level_4")
df_fraud_aggregated = add_consump_agg(df_fraud_aggregated, invoice)

In [4]:
total = len(df_fraud_aggregated)
report = (
    pd.DataFrame({
    'column': df_fraud_aggregated.columns,
        'num_missing': df_fraud_aggregated.isna().sum().values
    })
    .assign(
        pct_missing=lambda d: d['num_missing'] / total * 100,
        has_missing=lambda d: d['num_missing'] > 0
    )
)
print (report)

                                        column  num_missing  pct_missing   
0                                    client_id            0     0.000000  \
1                                       target            0     0.000000   
2                     f_invoive_date_diff_days         4212     3.108648   
3                 f_invoive_date_median_months         4212     3.108648   
4                  f_invoive_date_median_years         4212     3.108648   
5               f_counter_statue_error_occured            0     0.000000   
6                            f_counter_regions            0     0.000000   
7                          f_region_fraud_rate            0     0.000000   
8        f_region_median_billing_frequence_per            2     0.001476   
9   f_region_std_deviation_consumption_level_1            0     0.000000   
10  f_region_std_deviation_consumption_level_2            0     0.000000   
11  f_region_std_deviation_consumption_level_3            0     0.000000   
12  f_region

In [5]:
X = df_fraud_aggregated.drop(columns=["target", "client_id"])
X.fillna(0, inplace=True)  
y = df_fraud_aggregated["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

pipeline_smote = Pipeline([
    ("smote", SMOTE(random_state=42, sampling_strategy="auto")),
    ("xgb", XGBClassifier(
        use_label_encoder=False,   # neuere XGBoost-Versionen
        eval_metric="logloss",     # oder "auc"
        n_estimators=150,          # Anzahl der Bäume
        max_depth=6,               # maximale Tiefe der Bäume
        learning_rate=0.1,         # Shrinkage
        subsample=0.8,             # Bagging-Rate
        colsample_bytree=0.8,      # Feature-Subsampling
        scale_pos_weight=scale_pos_weight,
        random_state=42
    ))
])

pipeline_smote.fit(X_train, y_train)
y_pred_smote = pipeline_smote.predict(X_test)

print(confusion_matrix(y_test, y_pred_smote))
print("=== SMOTE + XGBClassifier ===")
print(classification_report(y_test, y_pred_smote))




Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[[14939 23439]
 [  339  1931]]
=== SMOTE + XGBClassifier ===
              precision    recall  f1-score   support

         0.0       0.98      0.39      0.56     38378
         1.0       0.08      0.85      0.14      2270

    accuracy                           0.42     40648
   macro avg       0.53      0.62      0.35     40648
weighted avg       0.93      0.42      0.53     40648

