In [44]:
%matplotlib inline

import os, sys, json, warnings, time
warnings.filterwarnings("ignore")
os.environ["KERAS_BACKEND"] = "torch"
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn import datasets
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


import torch
from torchvision.transforms import v2

import keras
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

f"PyTorch version: {torch.__version__}"

'PyTorch version: 2.8.0+cpu'

Pfade definieren

In [11]:
# ---------- Paths ----------
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd() if Path.cwd().name not in ("notebooks","tools","tests") else Path.cwd().parent

REPORTS_IN  = Path(os.getenv("REPORTS_IN")  or (ROOT / "reports"))         # shared inputs (split, features)
REPORTS_OUT = Path(os.getenv("REPORTS_OUT") or (ROOT / "reports_Charly"))   # your outputs
REPORTS_OUT.mkdir(parents=True, exist_ok=True)

# import loading method
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

Speed-Profile von Luccas

In [12]:
# ---------- Speed/Profile ----------
SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()
def speed_cfg():
    cfg = dict(CV=5, N_EST=6000, EARLY_STOP=200, MODELS=["lgbm","xgb"], LR=0.03)
    if SPEED == "FAST":
        cfg.update(CV=3, N_EST=2000, EARLY_STOP=50, MODELS=["lgbm"], LR=0.05)
    elif SPEED == "MEDIUM":
        cfg.update(CV=5, N_EST=4000, EARLY_STOP=100)
    elif SPEED == "FULL":
        cfg.update(CV=5, N_EST=8000, EARLY_STOP=300)
    return cfg

CFG        = speed_cfg()
RND        = int(os.getenv("RND", "42"))
CV         = int(os.getenv("CV", str(CFG["CV"])))
N_EST      = int(os.getenv("N_EST", str(CFG["N_EST"])))
ESR        = int(os.getenv("EARLY_STOP", str(CFG["EARLY_STOP"])))
MODELS     = [m.strip() for m in os.getenv("MODELS", ",".join(CFG["MODELS"])).split(",") if m.strip()]
IMB        = os.getenv("IMB", "spw").lower()   # 'iso' (LGBM is_unbalance) or 'spw' (scale_pos_weight)
LR         = float(os.getenv("LR", str(CFG["LR"])))
MEMBER     = os.getenv("MEMBER", "Lucas")

Load the Dataset

In [79]:
data=load_and_save_data()



Lade Datensatz aus dem Cache.


Strukturierung des Datensatzes von Lucas

In [80]:
# ---------- Utils ----------
def split_cols(cols):
    cat = [c for c in cols if str(c).endswith("_cat")]
    bin_ = [c for c in cols if str(c).endswith("_bin")]
    num  = [c for c in cols if c not in cat and c not in bin_ and c != "target"]
    return cat, bin_, num

def load_selected_feature_list():
    f = REPORTS_IN / "features_selected.csv"
    if not f.exists():
        raise FileNotFoundError(f"Missing {f}. Run feature-gate first.")
    s = pd.read_csv(f)
    if "raw_feature" not in s.columns:
        raise ValueError("features_selected.csv must have column 'raw_feature'.")
    return s["raw_feature"].astype(str).tolist()

var_cat, var_bin, var_num = split_cols(data.columns)
var_selected = load_selected_feature_list() # 35 Feature anstatt 57

X = data[var_selected[:-2]]  # alles von Anfang bis zweitletzte Spalte
y = data["target"]

X.shape, y.shape


((595212, 35), (595212,))

### Pre-Processing

Aus der EDA kennen wir die einzelne Variablen. 
- es müssen fehlende Werte ersetzt werden
- es gibt einige kategoriale Variablen mit sehr vielen kategorien -> hier ist ein embedding layer sinnvoller als One-hot encoding
-> ab 10 wird ein Layer eingefügt, für weniger Kategorien wird one-hot-encoding verwendet
- Standardisierung der numerischen Variablen

In [81]:
# Sortieren der Variablen nach vielen und wenigen Kategorien
small_cat = [col for col in var_cat if data[col].nunique() < 10]
large_cat = [col for col in var_cat if data[col].nunique() >= 10]
small_cat,large_cat

(['ps_ind_02_cat',
  'ps_ind_04_cat',
  'ps_ind_05_cat',
  'ps_car_02_cat',
  'ps_car_03_cat',
  'ps_car_05_cat',
  'ps_car_07_cat',
  'ps_car_08_cat',
  'ps_car_09_cat',
  'ps_car_10_cat'],
 ['ps_car_01_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_11_cat'])

In [82]:
# Pipeline für numerische Features
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Pipeline für kleine kategoriale Features
cat_small_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
    ("onehot", OneHotEncoder(sparse_output=False, drop='first', handle_unknown="ignore"))
])

# Pipeline für binäre Features
bin_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="constant", fill_value=-1))
])

ps_preprocessor = ColumnTransformer([
    ("var_num", num_pipeline, var_num),
    ("small_cat", cat_small_pipeline, small_cat),
    ("var_bin", bin_pipeline, var_bin)
])

ps_preprocessor

0,1,2
,transformers,"[('var_num', ...), ('small_cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,-1
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,-1
,copy,True
,add_indicator,False
,keep_empty_features,False


In [100]:
# Datentypen der Spalten für ColumnTransformer vereinheitlichen
data[small_cat] = data[small_cat].astype(str).fillna("-1")
data[large_cat] = data[large_cat].astype(str).fillna("-1")
# transformieren der Daten
X_processed = ps_preprocessor.fit_transform(data)

# große Kategoriale Variablen
X_large_cat = data[large_cat]

Train-test split

In [105]:

X_processed_train, X_processed_test, X_large_train, X_large_test, y_train, y_test = train_test_split(
    X_processed.astype(np.float32),
    X_large_cat.astype(np.int32),
    y,
    test_size=0.2,
    random_state = 42,
    stratify=y,
)

X_processed_train.shape, X_large_train.shape, y_train.shape, X_processed_test.shape, X_large_test.shape, y_test.shape

ValueError: invalid literal for int() with base 10: 'nan'

Torch Tensors

In [93]:
X_processed_train_t, X_processed_test_t = (
    torch.from_numpy(X_processed_train).to(torch.float32),
    torch.from_numpy(X_processed_test).to(torch.float32)
)

X_large_train_t, X_large_test_t = (
    torch.from_numpy(X_large_train).to(torch.int32),
    torch.from_numpy(X_large_test).to(torch.int32)
)

y_train_t, y_test_t = (
    torch.from_numpy(y_train).to(torch.long),
    torch.from_numpy(y_test).to(torch.long)
)

X_processed_train_t.size(), X_large_train_t.size(), y_train_t.size(), X_processed_test_t.size(), X_large_test_t.size(), y_test_t.size()

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

Model Setup

In [None]:
RANDOM_SEED = 42
in_features = 35
# Numerische Spalten standardisieren
X = X.select_dtypes(include='number')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)