In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Prefer the Apple-silicon GPU backend (MPS) when PyTorch reports it as
# available; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device_name = 'mps'
else:
    device_name = 'cpu'
print(f"ðŸš€ Device selected: {device_name}")

ðŸš€ Device selected: mps


In [None]:
def load_and_prep_data(file_path, target='Churn', max_categorical_cardinality=50):
    """Read the churn CSV and encode its columns for TabNet.

    Processing steps, in order:
      1. ``TotalCharges`` (when present) is coerced to numeric; unparsable
         entries become 0.
      2. ``customerID`` (when present) is dropped.
      3. A non-numeric target is mapped ``'Yes' -> 1`` / ``'No' -> 0``.
      4. Every other column is either label-encoded (non-numeric columns, or
         numeric columns with fewer than ``max_categorical_cardinality``
         distinct values) or kept numeric with missing values replaced by the
         column mean.

    NOTE(review): the label encoders and the fill means are computed on the
    whole frame, i.e. before any train/validation/test split done by the
    caller. Fit them after the split if leakage is a concern.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV, passed straight to ``pandas.read_csv``.
    target : str, default 'Churn'
        Name of the label column; it is never treated as a feature.
    max_categorical_cardinality : int, default 50
        Numeric columns with fewer distinct values than this are treated as
        categorical.

    Returns
    -------
    tuple
        ``(df, categorical_columns, categorical_dims)`` where ``df`` is the
        processed frame, ``categorical_columns`` lists the label-encoded
        column names in frame order, and ``categorical_dims`` maps each of
        those names to its number of distinct encoded classes.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the CSV.
    ValueError
        If a non-numeric target holds labels other than ``'Yes'`` / ``'No'``.
    """
    df = pd.read_csv(file_path)

    # 1. Fix TotalCharges (guarded the same way as customerID below)
    if "TotalCharges" in df.columns:
        df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce").fillna(0)

    # 2. Drop ID
    if 'customerID' in df.columns:
        df = df.drop(columns=['customerID'])

    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in {file_path!r}")

    # 3. Encode Target (Yes/No -> 1/0)
    # Test for "not numeric" rather than "== object" so that pandas string and
    # category dtypes are mapped as well.
    if not pd.api.types.is_numeric_dtype(df[target]):
        encoded_target = df[target].astype(object).map({'Yes': 1, 'No': 0})
        # Genuinely missing labels stay missing; only unknown labels are an error.
        unmapped = df[target].notna() & encoded_target.isna()
        if unmapped.any():
            unexpected = sorted(df.loc[unmapped, target].astype(str).unique())
            raise ValueError(
                f"Target column {target!r} contains labels other than 'Yes'/'No': {unexpected}"
            )
        df[target] = encoded_target

    # 4. Define Lists for TabNet
    nunique = df.nunique()
    categorical_columns = []
    categorical_dims = {}

    # 5. Preprocessing Loop
    for col in df.columns:
        if col == target:
            continue

        # Categorical if non-numeric OR low-cardinality numeric (binary flags, small ints).
        is_numeric = pd.api.types.is_numeric_dtype(df[col])
        if not is_numeric or nunique[col] < max_categorical_cardinality:
            df[col] = df[col].fillna("Unknown")
            l_enc = LabelEncoder()
            # Convert to string first to handle mixed types
            df[col] = l_enc.fit_transform(df[col].astype(str).values)
            categorical_columns.append(col)
            categorical_dims[col] = len(l_enc.classes_)
        else:
            # Treat as Numerical -> Fill with Mean
            df[col] = df[col].fillna(df[col].mean())

    return df, categorical_columns, categorical_dims

# Execute
# The dataset location is machine-specific, so it can be overridden through the
# CHURN_DATA_PATH environment variable; the original local path stays the default.
FILE_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    "/Users/jonaslorler/master-thesis-uq-churn/data/raw/kaggle_churn/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)
if not os.path.isfile(FILE_PATH):
    # Fail early with a hint instead of a bare read_csv error.
    raise FileNotFoundError(
        f"Churn CSV not found at {FILE_PATH!r}; set CHURN_DATA_PATH to the file's location."
    )
df, cat_cols, cat_dims = load_and_prep_data(FILE_PATH)

print(f"âœ… Data Processed. Shape: {df.shape}")
print(f"Categorical columns found: {len(cat_cols)}")

âœ… Data Processed. Shape: (7043, 20)
Categorical columns found: 17


In [None]:
target = 'Churn'
features = [name for name in df.columns if name != target]

# TabNet needs the positions of the categorical features and, in the same
# order, the number of distinct codes each of them takes.
cat_idxs = [idx for idx, name in enumerate(features) if name in cat_cols]
cat_dims_list = [cat_dims[features[idx]] for idx in cat_idxs]

# The feature matrix is cast to 32-bit floats before it is handed to the model.
feature_frame = df[features]
X = feature_frame.values.astype('float32')
y = df[target].values

# 1. Hold out 20% of the rows (stratified on the label); the rest is training data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# 2. Halve the hold-out into validation and test sets (10% of the data each).
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=42,
)

print(f"Train Shape: {X_train.shape}")
print(f"Valid Shape: {X_valid.shape}")
print(f"Test Shape:  {X_test.shape}")
print(f"Data Type:   {X_train.dtype}")  # expected: float32

Train Shape: (5634, 19)
Valid Shape: (704, 19)
Test Shape:  (705, 19)
Data Type:   float32


In [8]:
# Build the TabNet classifier: categorical layout first, then optimisation
# settings, then runtime options.
clf = TabNetClassifier(
    # Which input columns are categorical and how many codes each one has.
    cat_idxs=cat_idxs,
    cat_dims=cat_dims_list,
    cat_emb_dim=1,  # one embedding dimension per categorical feature
    # Adam with a step-decayed learning rate (x0.9 every 10 epochs).
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=10, gamma=0.9),
    mask_type='entmax',  # alternative: 'sparsemax'
    device_name=device_name,  # device chosen in the setup cell
    verbose=1,
)

# Train, reporting AUC on both the training and the validation split.
print("ðŸš€ Training started...")
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=50,  # upper bound on training length for the baseline
    patience=15,    # early-stopping window, in epochs without improvement
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)
print("âœ… Training complete.")



ðŸš€ Training started...




epoch 0  | loss: 0.64894 | train_auc: 0.51074 | valid_auc: 0.49209 |  0:00:00s




epoch 1  | loss: 0.5076  | train_auc: 0.60361 | valid_auc: 0.60491 |  0:00:01s




epoch 2  | loss: 0.48124 | train_auc: 0.66599 | valid_auc: 0.69805 |  0:00:01s




epoch 3  | loss: 0.46494 | train_auc: 0.7017  | valid_auc: 0.72581 |  0:00:01s




epoch 4  | loss: 0.45276 | train_auc: 0.73321 | valid_auc: 0.74413 |  0:00:02s




epoch 5  | loss: 0.44438 | train_auc: 0.75488 | valid_auc: 0.7536  |  0:00:02s




epoch 6  | loss: 0.43339 | train_auc: 0.76153 | valid_auc: 0.765   |  0:00:03s




epoch 7  | loss: 0.42989 | train_auc: 0.76652 | valid_auc: 0.75574 |  0:00:03s




epoch 8  | loss: 0.42378 | train_auc: 0.77784 | valid_auc: 0.76641 |  0:00:03s




epoch 9  | loss: 0.41561 | train_auc: 0.7852  | valid_auc: 0.77219 |  0:00:04s




epoch 10 | loss: 0.41251 | train_auc: 0.78106 | valid_auc: 0.77649 |  0:00:04s




epoch 11 | loss: 0.41434 | train_auc: 0.78464 | valid_auc: 0.78612 |  0:00:05s




epoch 12 | loss: 0.41    | train_auc: 0.79884 | valid_auc: 0.80002 |  0:00:05s




epoch 13 | loss: 0.40905 | train_auc: 0.80039 | valid_auc: 0.80282 |  0:00:05s




epoch 14 | loss: 0.40668 | train_auc: 0.80192 | valid_auc: 0.79296 |  0:00:06s




epoch 15 | loss: 0.40355 | train_auc: 0.80206 | valid_auc: 0.78012 |  0:00:06s




epoch 16 | loss: 0.40297 | train_auc: 0.80695 | valid_auc: 0.78087 |  0:00:07s




epoch 17 | loss: 0.39946 | train_auc: 0.81342 | valid_auc: 0.79944 |  0:00:07s




epoch 18 | loss: 0.39667 | train_auc: 0.81811 | valid_auc: 0.81255 |  0:00:07s




epoch 19 | loss: 0.39697 | train_auc: 0.81319 | valid_auc: 0.80871 |  0:00:08s




epoch 20 | loss: 0.3958  | train_auc: 0.81578 | valid_auc: 0.80778 |  0:00:08s




epoch 21 | loss: 0.39514 | train_auc: 0.82242 | valid_auc: 0.81398 |  0:00:09s




epoch 22 | loss: 0.39101 | train_auc: 0.82435 | valid_auc: 0.81055 |  0:00:09s




epoch 23 | loss: 0.3859  | train_auc: 0.82835 | valid_auc: 0.82301 |  0:00:09s




epoch 24 | loss: 0.38138 | train_auc: 0.83392 | valid_auc: 0.82949 |  0:00:10s




epoch 25 | loss: 0.38161 | train_auc: 0.83657 | valid_auc: 0.82779 |  0:00:10s




epoch 26 | loss: 0.3823  | train_auc: 0.83817 | valid_auc: 0.83172 |  0:00:11s




epoch 27 | loss: 0.38394 | train_auc: 0.84652 | valid_auc: 0.8309  |  0:00:11s




epoch 28 | loss: 0.38112 | train_auc: 0.8508  | valid_auc: 0.82911 |  0:00:12s




epoch 29 | loss: 0.37632 | train_auc: 0.85042 | valid_auc: 0.83279 |  0:00:12s




epoch 30 | loss: 0.37694 | train_auc: 0.85334 | valid_auc: 0.83806 |  0:00:12s




epoch 31 | loss: 0.37237 | train_auc: 0.85289 | valid_auc: 0.83372 |  0:00:13s




epoch 32 | loss: 0.37682 | train_auc: 0.85568 | valid_auc: 0.83554 |  0:00:13s




epoch 33 | loss: 0.37119 | train_auc: 0.85586 | valid_auc: 0.83043 |  0:00:14s




epoch 34 | loss: 0.36995 | train_auc: 0.85866 | valid_auc: 0.82491 |  0:00:14s




epoch 35 | loss: 0.37125 | train_auc: 0.8622  | valid_auc: 0.82727 |  0:00:15s




epoch 36 | loss: 0.3715  | train_auc: 0.86323 | valid_auc: 0.82198 |  0:00:15s




epoch 37 | loss: 0.37145 | train_auc: 0.86602 | valid_auc: 0.82475 |  0:00:16s




epoch 38 | loss: 0.36915 | train_auc: 0.86985 | valid_auc: 0.83791 |  0:00:16s




epoch 39 | loss: 0.36018 | train_auc: 0.8669  | valid_auc: 0.83806 |  0:00:16s




epoch 40 | loss: 0.36238 | train_auc: 0.87585 | valid_auc: 0.83339 |  0:00:17s




epoch 41 | loss: 0.36092 | train_auc: 0.87445 | valid_auc: 0.82927 |  0:00:17s




epoch 42 | loss: 0.36233 | train_auc: 0.87324 | valid_auc: 0.83049 |  0:00:18s




epoch 43 | loss: 0.35883 | train_auc: 0.87517 | valid_auc: 0.82724 |  0:00:18s




epoch 44 | loss: 0.36853 | train_auc: 0.87906 | valid_auc: 0.82535 |  0:00:18s
epoch 45 | loss: 0.35971 | train_auc: 0.88129 | valid_auc: 0.82253 |  0:00:19s

Early stopping occurred at epoch 45 with best_epoch = 30 and best_valid_auc = 0.83806
âœ… Training complete.




In [9]:
def calculate_top_decile_lift(y_true, y_pred_proba):
    """Return the lift of the top-scored 10% of samples over the base rate.

    Samples are ranked by predicted probability (highest first). The positive
    rate among the top ``len // 10`` samples (at least one sample) is divided
    by the positive rate of the whole set.

    Both inputs are converted to flat NumPy arrays first, so lists, arrays and
    pandas Series are all accepted and are paired by position, not by index.
    Ties in the scores are broken by original order (stable sort), which keeps
    the result deterministic.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_pred_proba : array-like of scores, same length as ``y_true``

    Returns
    -------
    float
        Top-decile positive rate divided by the overall positive rate.
        ``nan`` when the input is empty or contains no positives (the lift
        is undefined in both cases).

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    labels = np.asarray(y_true, dtype=float).ravel()
    scores = np.asarray(y_pred_proba, dtype=float).ravel()

    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true and y_pred_proba must have the same length "
            f"(got {labels.shape[0]} and {scores.shape[0]})"
        )

    n_samples = labels.shape[0]
    if n_samples == 0:
        return float("nan")

    # Overall positive rate; without positives the ratio is undefined.
    base_rate = labels.mean()
    if base_rate == 0:
        return float("nan")

    # Always keep at least one sample so small inputs do not average an empty slice.
    top_k = max(1, n_samples // 10)

    # Highest risk first; negating the scores lets a stable ascending sort rank them.
    ranking = np.argsort(-scores, kind="stable")
    top_rate = labels[ranking[:top_k]].mean()

    return float(top_rate / base_rate)

# Score the held-out test set; column 1 of predict_proba is the probability of class 1.
test_proba = clf.predict_proba(X_test)
preds = test_proba[:, 1]

# Ranking metrics on the test split.
lift = calculate_top_decile_lift(y_test, preds)
auc_pr = average_precision_score(y_test, preds)
auc_roc = roc_auc_score(y_test, preds)

print("\n--- ðŸ“Š Thesis Baseline Results ---")
print(f"AUC-ROC: {auc_roc:.4f}")
print(f"AUC-PR:  {auc_pr:.4f} (Primary Metric)")
print(f"Lift:    {lift:.4f}   (Primary Metric)")


--- ðŸ“Š Thesis Baseline Results ---
AUC-ROC: 0.7866
AUC-PR:  0.5357 (Primary Metric)
Lift:    2.3697   (Primary Metric)
