# Feature Engineering für Bank-Adviser AI

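Dieses Notebook erstellt die Features für das Machine-Learning-Modell: Es lädt die Kunden- und Ownership-Daten, kodiert den Produktbesitz als One-Hot-Spalten (`has_<prod_id>`) und speichert die ausgewählten Features als Parquet-Datei.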

In [None]:
# Cell 1: import pandas and load the CSV input files
import pandas as pd
import numpy as np

# Read the customer table and report its row count and column names.
customers_df = pd.read_csv('../data/customers.csv')
customer_rows = len(customers_df)
print(f"Customers geladen: {customer_rows} Zeilen")
print(customers_df.columns.tolist())

# Read the ownership table and report its row count and column names.
ownership_df = pd.read_csv('../data/ownership.csv')
ownership_rows = len(ownership_df)
print(f"\nOwnership geladen: {ownership_rows} Zeilen")
print(ownership_df.columns.tolist())

# Preview the first rows of both tables.
# NOTE(review): display() is not imported in this cell; presumably the
# IPython/Jupyter built-in -- confirm when running outside a notebook.
for label, frame in (("Customers", customers_df), ("Ownership", ownership_df)):
    print(f"\n{label}:")
    display(frame.head())

In [None]:
# Cell 2: build one-hot encoded columns for product ownership

# Collect every product ID that occurs in the ownership data.
# Missing prod_id values are dropped first: NaN never compares equal to
# anything, so it would only produce an all-zero `has_nan` column, and mixing
# NaN (a float) with string IDs makes sorted() raise a TypeError.
unique_products = ownership_df['prod_id'].dropna().unique()
print(f"Verfügbare Produkte: {sorted(unique_products)}")

# Start from a copy of the customer data so customers_df stays untouched.
features_df = customers_df.copy()

# Add one has_<prod_id> column per product: 1 if the customer's cust_id
# appears in an ownership row for that product, otherwise 0.
for prod_id in unique_products:
    # cust_ids of all customers that own this product
    owns_product = ownership_df['prod_id'] == prod_id
    customers_with_product = ownership_df.loc[owns_product, 'cust_id'].unique()

    # Membership test against the owner list, stored as 0/1 integers
    column_name = f'has_{prod_id}'
    features_df[column_name] = features_df['cust_id'].isin(customers_with_product).astype(int)

    print(f"Spalte {column_name} erstellt: {features_df[column_name].sum()} Kunden haben dieses Produkt")

# Show the resulting shape, column names and first rows.
# NOTE(review): display() is not imported in this cell; presumably the
# IPython/Jupyter built-in -- confirm when running outside a notebook.
print(f"\nFeatures DataFrame: {features_df.shape}")
print(f"Spalten: {features_df.columns.tolist()}")
display(features_df.head())

In [None]:
# Cell 3: select the model features, print summary statistics, save as Parquet

# Fixed base columns plus every generated has_<prod_id> ownership column.
base_features = ['cust_id', 'age_bucket', 'revenue', 'credit_score']
product_features = [name for name in features_df.columns if name.startswith('has_')]

# Full feature list: base columns first, then the ownership flags.
selected_features = [*base_features, *product_features]
print(f"Gewählte Features: {selected_features}")

# Independent copy restricted to the selected columns.
final_features_df = features_df[selected_features].copy()
n_customers = len(final_features_df)

# Headline statistics: row count, feature count, mean revenue, mean credit score.
mean_revenue = final_features_df['revenue'].mean()
mean_credit_score = final_features_df['credit_score'].mean()
print("\n📊 Feature-Statistiken:")
print(f"   • Anzahl Kunden: {n_customers}")
print(f"   • Anzahl Features: {len(selected_features)}")
print(f"   • Durchschnittliches Revenue: €{mean_revenue:,.0f}")
print(f"   • Durchschnittlicher Credit Score: {mean_credit_score:.0f}")

# Customers per age bucket, ordered by bucket label, with their share in percent.
print("\n🎯 Altersverteilung:")
age_dist = final_features_df['age_bucket'].value_counts().sort_index()
for bucket, bucket_size in age_dist.items():
    share = bucket_size / n_customers * 100
    print(f"   • {bucket}: {bucket_size} Kunden ({share:.1f}%)")

# Owners per product flag (sum of the 0/1 column) with their share in percent.
print("\n📦 Produktbesitz-Verteilung:")
for flag_column in product_features:
    owners = final_features_df[flag_column].sum()
    owner_share = owners / n_customers * 100
    print(f"   • {flag_column}: {owners} Kunden ({owner_share:.1f}%)")

# Persist the feature table as Parquet without writing the index.
final_features_df.to_parquet('../data/features.parquet', index=False)
print("\n✅ Features gespeichert als '../data/features.parquet'")

# Preview the first rows of the saved table.
# NOTE(review): display() is not imported in this cell; presumably the
# IPython/Jupyter built-in -- confirm when running outside a notebook.
print("\n📋 Final Features DataFrame:")
display(final_features_df.head())

# Column dtypes, non-null counts and memory usage.
print("\nDataFrame Info:")
final_features_df.info()