In [1]:
%matplotlib inline

import os, sys, json, warnings, time
warnings.filterwarnings("ignore")
os.environ["KERAS_BACKEND"] = "torch"
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn import datasets
from sklearn.metrics import average_precision_score, roc_auc_score, brier_score_loss, precision_recall_curve
from sklearn.calibration import calibration_curve
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


import torch
from torchvision.transforms import v2

import keras
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

f"PyTorch version: {torch.__version__}"

'PyTorch version: 2.8.0+cpu'

Pfade definieren

In [2]:
# ---------- Paths ----------
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[1]
else:
    ROOT = Path.cwd() if Path.cwd().name not in ("notebooks","tools","tests") else Path.cwd().parent

REPORTS_IN  = Path(os.getenv("REPORTS_IN")  or (ROOT / "reports"))         # shared inputs (split, features)
REPORTS_OUT = Path(os.getenv("REPORTS_OUT") or (ROOT / "reports_Charly"))   # your outputs
REPORTS_OUT.mkdir(parents=True, exist_ok=True)

# import loading method
sys.path.insert(0, str(ROOT))
from src.data_loader import load_and_save_data

Speed-Profile erstellen für die Berechnung der Modelle

In [4]:
# ---------- Speed/Profile ----------
SPEED = os.getenv("SPEED", "MEDIUM").upper().strip()
def speed_cfg():
    cfg = dict(CV=5, N_EST=6000, EARLY_STOP=200, MODELS=["lgbm","xgb"], LR=0.03)
    if SPEED == "FAST":
        cfg.update(CV=3, N_EST=2000, EARLY_STOP=50, MODELS=["lgbm"], LR=0.05)
    elif SPEED == "MEDIUM":
        cfg.update(CV=5, N_EST=4000, EARLY_STOP=100)
    elif SPEED == "FULL":
        cfg.update(CV=5, N_EST=8000, EARLY_STOP=300)
    return cfg

CFG        = speed_cfg()
RND        = int(os.getenv("RND", "42"))
CV         = int(os.getenv("CV", str(CFG["CV"])))
N_EST      = int(os.getenv("N_EST", str(CFG["N_EST"])))
ESR        = int(os.getenv("EARLY_STOP", str(CFG["EARLY_STOP"])))
MODELS     = [m.strip() for m in os.getenv("MODELS", ",".join(CFG["MODELS"])).split(",") if m.strip()]
IMB        = os.getenv("IMB", "spw").lower()   # 'iso' (LGBM is_unbalance) or 'spw' (scale_pos_weight)
LR         = float(os.getenv("LR", str(CFG["LR"])))
MEMBER     = os.getenv("MEMBER", "Lucas")

Load the Dataset

In [33]:
# Laden des Datensatzes und alle -1 durch nan ersetzen, da das ML nicht mit nans umgehen kann
data = load_and_save_data().replace(np.nan, -1)

# zur Kontrolle
total_nans = data.isna().sum().sum()
print("total nans: ",total_nans)


Lade Datensatz aus dem Cache.
total nans:  0


Strukturierung des Datensatzes von Lucas

In [49]:
# ---------- Utils ----------
def split_cols(cols):
    cat = [c for c in cols if str(c).endswith("_cat")]
    bin = [c for c in cols if str(c).endswith("_bin")]
    num  = [c for c in cols if c not in cat and c not in bin and c != "target"]
    return cat, bin, num

def load_selected_feature_list():
    f = REPORTS_IN / "features_selected.csv"
    if not f.exists():
        raise FileNotFoundError(f"Missing {f}. Run feature-gate first.")
    s = pd.read_csv(f)
    if "raw_feature" not in s.columns:
        raise ValueError("features_selected.csv must have column 'raw_feature'.")
    return s["raw_feature"].astype(str).tolist()

var_selected = load_selected_feature_list() # 35 Feature anstatt 57

X = data[var_selected[:-2]]  # alles von Anfang bis zweitletzte Spalte
y = data["target"]

var_cat, var_bin, var_num = split_cols(X.columns)

X.shape, y.shape


((595212, 35), (595212,))

In [52]:
# zur Kontrolle
print(var_cat) # 13 Variablen
print(var_num) # 11 Variablen
print(var_bin) # 11 Variablen

['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_11_cat']
['ps_ind_01', 'ps_ind_03', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']
['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin']


Train-Test Split

In [53]:
## Split von split_indices übernommen: 
split_p = REPORTS_IN / "split_indices.json"
split = json.loads(split_p.read_text())

# Datensatz aufsplitten nach der json-Datei
X_train = X.loc[split["train"]]
y_train = y.loc[split["train"]]
X_test = X.loc[split["test"]]
y_test = y.loc[split["test"]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((200000, 35), (200000,), (50000, 35), (50000,))

### Pre-Processing

Aus der EDA kennen wir die einzelne Variablen. 
- es müssen fehlende Werte ersetzt werden
- es gibt einige kategoriale Variablen mit sehr vielen kategorien -> hier ist ein embedding layer sinnvoller als One-hot encoding
-> ab 10 wird ein Layer eingefügt, für weniger Kategorien wird one-hot-encoding verwendet
- Standardisierung der numerischen Variablen

In [54]:
# Sortieren der kategorialen  Variablen nach vielen und wenigen Kategorien
small_cat = [col for col in var_cat if data[col].nunique() < 10]
large_cat = [col for col in var_cat if data[col].nunique() >= 10]
small_cat,large_cat

(['ps_ind_02_cat',
  'ps_ind_04_cat',
  'ps_ind_05_cat',
  'ps_car_02_cat',
  'ps_car_03_cat',
  'ps_car_05_cat',
  'ps_car_07_cat',
  'ps_car_08_cat',
  'ps_car_09_cat'],
 ['ps_car_01_cat', 'ps_car_04_cat', 'ps_car_06_cat', 'ps_car_11_cat'])

In [56]:
# Pipeline für numerische Features
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Pipeline für kleine kategoriale Features
cat_small_pipeline = Pipeline([
    # kein Imputer, da -1 jetzt als eigene Kategorie mit einbezogen wird
    ("onehot", OneHotEncoder(sparse_output=False, drop='first', handle_unknown="ignore"))
])

# Pipeline für binäre Features
bin_pipeline = Pipeline([
    # bei binären Variablen wird die häufigste Kategorie gewählt, damit sie binär bleiben und keine 3. Kategorie dazu kommt
    ("impute", SimpleImputer(strategy="most_frequent", missing_values=-1))
])

ps_preprocessor = ColumnTransformer([
    ("var_num", num_pipeline, var_num),
    ("small_cat", cat_small_pipeline, small_cat),
    ("var_bin", bin_pipeline, var_bin)
])

ps_preprocessor

0,1,2
,transformers,"[('var_num', ...), ('small_cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,-1
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False


In [61]:
# Transformieren der Daten
X_processed_train = ps_preprocessor.fit_transform(X_train)
X_processed_test = ps_preprocessor.fit_transform(X_test)

# große Kategoriale Variablen
X_large_train = X_train[large_cat]
X_large_test = X_test[large_cat]

Torch Tensors

In [64]:
X_processed_train_t, X_processed_test_t = (
    torch.from_numpy(X_processed_train).to(torch.float32),
    torch.from_numpy(X_processed_test).to(torch.float32)
)

# müssen noch von DataFrame zu NumpyArray umgewandelt werden -> daher .values verwendet
X_large_train_t, X_large_test_t = (
    torch.from_numpy(X_large_train.values).to(torch.int32),
    torch.from_numpy(X_large_test.values).to(torch.int32)
)

y_train_t, y_test_t = (
    torch.from_numpy(y_train.values).to(torch.long),
    torch.from_numpy(y_test.values).to(torch.long)
)

X_processed_train_t.size(), X_large_train_t.size(), y_train_t.size(), X_processed_test_t.size(), X_large_test_t.size(), y_test_t.size()

(torch.Size([200000, 49]),
 torch.Size([200000, 4]),
 torch.Size([200000]),
 torch.Size([50000, 49]),
 torch.Size([50000, 4]),
 torch.Size([50000]))

Model Setup

In [None]:
RANDOM_SEED = 42
in_features = 35
# Numerische Spalten standardisieren
X = X.select_dtypes(include='number')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# from Chat gpt


In [None]:
# Embeddings einzeln bestimmen

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Flatten

# Beispiel: X_large_train_t und X_large_test_t als NumPy-Arrays oder DataFrames
# Annahme: jede Spalte enthält integer-codierte Kategorien (0..num_categories-1)

embedding_dim = 8  # typische Wahl

def create_embeddings(X_large, embedding_dim=8):
    embedded_arrays = []
    for i in range(X_large.shape[1]):
        num_categories = int(X_large[:, i].max() + 1)  # Anzahl Kategorien
        # Embedding-Layer temporär erstellen und sofort auf Input anwenden
        emb_layer = Embedding(input_dim=num_categories, output_dim=embedding_dim, input_length=1)
        # Input in Tensor umwandeln
        x_tensor = tf.convert_to_tensor(X_large[:, i], dtype=tf.int32)
        # Embedding anwenden und flach machen
        x_emb = Flatten()(emb_layer(x_tensor[:, tf.newaxis]))
        embedded_arrays.append(x_emb.numpy())
    # Alle Spalten zusammenführen
    return np.concatenate(embedded_arrays, axis=1)

# Embeddings für Training und Test berechnen
X_large_train_emb = create_embeddings(X_large_train_t.values, embedding_dim)
X_large_test_emb  = create_embeddings(X_large_test_t.values, embedding_dim)

In [None]:
# Alle Features zusammen führen für das Modell: 
# X_processed_train_t und X_processed_test_t sollten NumPy-Arrays sein
X_combined_train = np.concatenate([X_processed_train_t.values, X_large_train_emb], axis=1)
X_combined_test  = np.concatenate([X_processed_test_t.values, X_large_test_emb], axis=1)

Model Setup

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(128, activation="relu", input_shape=(X_combined_train.shape[1],)),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")  # Binary classification
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# von Chat gpt gesamte Lösung in einema
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Concatenate, Flatten
from tensorflow.keras.models import Model

# 1️⃣ Input Layer
# Numerische + kleine kategoriale Features
input_processed = Input(shape=(X_processed_train_t.shape[1],), name="processed_features")

# Große kategoriale Features
# Wir nehmen an, X_large_train_t hat 4 Spalten, jede mit integer-codierten Kategorien
input_large = Input(shape=(X_large_train_t.shape[1],), dtype=tf.int32, name="large_cat_features")

# 2️⃣ Embedding Layer für jede große kategoriale Spalte
embedding_layers = []
embedding_dim = 8  # typische Wahl: min(50, (num_categories+1)//2)

for i in range(X_large_train_t.shape[1]):
    # Wir müssen die Anzahl der Kategorien pro Feature kennen
    num_categories = int(X_large_train_t[:, i].max().item() + 1)  # +1 für 0-index
    x = Embedding(input_dim=num_categories, output_dim=embedding_dim, input_length=1)(input_large[:, i])
    x = Flatten()(x)
    embedding_layers.append(x)

# 3️⃣ Alle Embeddings zusammenfassen
x_large = Concatenate()(embedding_layers) if len(embedding_layers) > 1 else embedding_layers[0]

# 4️⃣ Alles zusammenführen
x = Concatenate()([input_processed, x_large])

# 5️⃣ Dense Layers
x = Dense(128, activation="relu")(x)
x = Dense(64, activation="relu")(x)
output = Dense(1, activation="sigmoid")(x)  # binary classification

# 6️⃣ Modell erstellen
model = Model(inputs=[input_processed, input_large], outputs=output)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
model.fit(
    [X_processed_train_t, X_large_train_t],
    y_train_t,
    validation_data=([X_processed_test_t, X_large_test_t], y_test_t),
    batch_size=1024,
    epochs=10
)