# Training Model 

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [63]:
# Load the feature-engineered splits; files are read in the order
# train, test, valid and bound to the matching names.
train, test, valid = map(
    pd.read_csv,
    (
        "data/feature_engineering/loans_train.csv",
        "data/feature_engineering/loans_test.csv",
        "data/feature_engineering/loans_valid.csv",
    ),
)

### Training Model Preparation

In [64]:
# Fix Datetime Columns Before Training (concise)
import pandas as pd
import numpy as np

DATE_COLS = ['FirstPaymentDate', 'MaturityDate']

def extract_true_datetime_parts(df: pd.DataFrame) -> pd.DataFrame:
    """Replace each column listed in DATE_COLS with `<col>_year` / `<col>_month`.

    Values are parsed strictly as ``%Y-%m-%d``; anything that does not parse
    becomes NaT, so the derived year/month for that row are missing.
    Columns from DATE_COLS that are not in ``df`` are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with the date columns removed and the derived
        year/month columns appended at the end.
    """
    out = df.copy()
    present = [name for name in DATE_COLS if name in out.columns]
    for name in present:
        parsed = pd.to_datetime(out[name], format='%Y-%m-%d', errors='coerce')
        out[f'{name}_year'] = parsed.dt.year
        out[f'{name}_month'] = parsed.dt.month
        # Drop the raw column only after its parts have been appended.
        out = out.drop(columns=[name])
    return out

# Swap the raw date columns for their year/month parts in every split.
train, valid, test = (
    extract_true_datetime_parts(frame) for frame in (train, valid, test)
)

In [65]:
# Extract YYYY, MM, DD from Datetime Columns
import pandas as pd
import numpy as np

def extract_datetime_features(df):
    """Return a copy of ``df`` with the date columns expanded into parts.

    For each of 'FirstPaymentDate' and 'MaturityDate' that exists in ``df``,
    the column is parsed with ``pd.to_datetime(..., errors='coerce')``
    (unparseable values become NaT), three columns `<col>_year`,
    `<col>_month` and `<col>_day` are appended, the original column is
    removed, and a one-line message is printed. Columns that are absent are
    skipped silently. The input frame is not modified.
    """
    result = df.copy()

    for col in ('FirstPaymentDate', 'MaturityDate'):
        if col not in result.columns:
            continue

        # Convert in place on the copy (harmless if the column is already datetime).
        result[col] = pd.to_datetime(result[col], errors='coerce')

        # Append the calendar components in year, month, day order.
        stamps = result[col].dt
        for part in ('year', 'month', 'day'):
            result[f'{col}_{part}'] = getattr(stamps, part)

        # The raw datetime column is no longer needed once its parts exist.
        result = result.drop(columns=[col])
        print(f"Extracted year, month, day from {col}")

    return result

# Run the year/month/day extraction over each split. The extractor skips any
# date column that is absent from a frame, so a frame that no longer contains
# 'FirstPaymentDate' / 'MaturityDate' comes back as an unmodified copy.
train, valid, test = (
    extract_datetime_features(frame) for frame in (train, valid, test)
)

print("\n✅ Datetime extraction completed successfully!")



✅ Datetime extraction completed successfully!


In [66]:
# 1. Remove the raw panel columns, i.e. every column whose name starts with
#    "0_", "1_", ..., "13_". The list is derived from `train` and then applied
#    to all three splits (missing columns are ignored).
_panel_prefixes = tuple(f"{i}_" for i in range(14))
panel_cols = [name for name in train.columns if name.startswith(_panel_prefixes)]
train, valid, test = (
    frame.drop(columns=panel_cols, errors='ignore')
    for frame in (train, valid, test)
)

# 2. Everything that remains (the original note mentions aggregated statistics
#    such as EstimatedLTV_mean, EstimatedLTV_slope, LoanAge_range) becomes the
#    feature matrix; 'index'/'target' (train, valid) and 'Id' (test) are removed.
X_train, y_train = train.drop(columns=['index', 'target']), train['target']
X_valid, y_valid = valid.drop(columns=['index', 'target']), valid['target']
X_test = test.drop(columns=['Id'])


In [67]:
# Use Cleaned Datasets for Model Training

from signal import valid_signals
from sklearn.impute import SimpleImputer

# Build the feature matrices and targets from the cleaned splits (date parts
# were already extracted upstream). errors='ignore' tolerates an id/target
# column that is not present in a split.
X_train = train.drop(columns=['index', 'target'], errors='ignore')
y_train = train['target']
X_valid = valid.drop(columns=['index', 'target'], errors='ignore')
y_valid = valid['target']
X_test = test.drop(columns=['Id'], errors='ignore')

# Ensure all datasets have the same columns, in the same reproducible order.
# Iterating a set of strings yields a different order from one interpreter run
# to the next (hash randomization), which would silently reshuffle the feature
# columns between runs. Keep the set for membership tests only and take the
# ordering from X_train.
common_cols = set(X_train.columns) & set(X_valid.columns) & set(X_test.columns)
feature_cols = [col for col in X_train.columns if col in common_cols]
X_train = X_train[feature_cols]
X_valid = X_valid[feature_cols]
X_test = X_test[feature_cols]

# Drop all string/categorical columns (union across splits): a column is
# removed everywhere if it is object/category typed in any one split.
non_numeric = set()
for frame in (X_train, X_valid, X_test):
    non_numeric |= set(frame.select_dtypes(include=['object', 'category']).columns)
obj_cols = [col for col in feature_cols if col in non_numeric]

X_train = X_train.drop(columns=obj_cols, errors='ignore')
X_valid = X_valid.drop(columns=obj_cols, errors='ignore')
X_test  = X_test.drop(columns=obj_cols, errors='ignore')

# Sanity check: downstream scalers/models rely on identical column layout.
assert list(X_train.columns) == list(X_valid.columns) == list(X_test.columns), \
    "feature columns differ between splits"



In [68]:
# Fill remaining NaN values using statistics learned from the training split only

print("Step 2: Fill remaining NaN values")

# Numeric columns: median imputation.
# The medians are computed for *every* numeric training column, not just the
# ones that contain NaN in train, so that gaps which appear only in valid/test
# are filled as well. A column that is entirely NaN in train has no median and
# falls back to 0.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
numeric_fill = X_train[numeric_cols].median().fillna(0)
# fillna with a Series fills each column with the value stored under its label;
# columns not listed in the Series are left untouched.
X_train = X_train.fillna(numeric_fill)
X_valid = X_valid.fillna(numeric_fill)
X_test = X_test.fillna(numeric_fill)

# Categorical columns: mode imputation (training mode applied to all splits),
# with 'unknown' as the fallback when the training column has no mode at all.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    modes = X_train[col].mode()
    fill_value = modes.iloc[0] if len(modes) > 0 else 'unknown'
    X_train[col] = X_train[col].fillna(fill_value)
    X_valid[col] = X_valid[col].fillna(fill_value)
    X_test[col] = X_test[col].fillna(fill_value)

# 3. Verify the fix results
print("Step 3: Verify fix results")
print(f"NaN count in X_train: {X_train.isna().sum().sum()}")
print(f"NaN count in X_valid: {X_valid.isna().sum().sum()}")
print(f"NaN count in X_test: {X_test.isna().sum().sum()}")

Step 2: Fill remaining NaN values
Step 3: Verify fix results
NaN count in X_train: 0
NaN count in X_valid: 0
NaN count in X_test: 0


### Baseline Model

In [69]:
# Mean of the validation target, reported as the positive ratio
# (this equals the share of target == 1 rows, assuming 0/1 labels).
valid_pos_ratio = y_valid.mean()
print(f"valid positive ratio: {valid_pos_ratio:.4f}")

valid positive ratio: 0.1261


In [73]:
# ===========================================
# Unsupervised anomaly detection baseline (Full-train version)
# Train: all training samples (both target=0 and target=1)
# Valid: mixed samples, used for evaluation
# Metrics: AP (main), ROC-AUC (secondary)
# ===========================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA
from sklearn.svm import OneClassSVM
from sklearn.metrics import average_precision_score, roc_auc_score

# ---------- Data preparation ----------
# Work on copies so the upstream feature matrices / targets stay untouched.
X_train_, y_train_ = X_train.copy(), y_train.copy()
X_valid_, y_valid_ = X_valid.copy(), y_valid.copy()

# Standardization (keeps the pipeline structure uniform): the scaler is
# fitted on the training features and then reused for the validation features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_)
X_valid_scaled = scaler.transform(X_valid_)

# ---------- Model definitions ----------
# Insertion order is kept by the loop below and becomes the row index of the
# results table.
models = {
    "IsolationForest": IsolationForest(n_estimators=200, contamination=0.05, random_state=42),
    "LocalOutlierFactor": LocalOutlierFactor(n_neighbors=10, novelty=True, contamination=0.05),
    "PCA_Reconstruction": PCA(n_components=0.95, random_state=42),
    "OneClassSVM": OneClassSVM(kernel="rbf", gamma="scale", nu=0.05),
}

# ---------- Evaluation ----------
results = []

for name, model in models.items():
    # Every model is fitted on the full scaled training matrix (labels unused).
    model.fit(X_train_scaled)

    if name == "PCA_Reconstruction":
        # PCA is scored by reconstruction error: project, map back, and take
        # the per-row mean squared difference.
        Xv_rec = model.inverse_transform(model.transform(X_valid_scaled))
        scores = np.square(X_valid_scaled - Xv_rec).mean(axis=1)
    else:
        # Negate decision_function so that larger values rank as more
        # anomalous (scikit-learn convention: lower = more abnormal).
        scores = -model.decision_function(X_valid_scaled)

    ap = average_precision_score(y_valid_, scores)
    roc = roc_auc_score(y_valid_, scores)
    row = (name, round(ap, 4), round(roc, 4))
    results.append(row)

# ---------- Report ----------
# One row per model, ranked by average precision (best first).
results_df = pd.DataFrame(results, columns=["Model", "AP", "ROC-AUC"])
results_df = results_df.sort_values("AP", ascending=False)
print("✅ Unsupervised Baseline Results (Full Train):")
print(results_df)


✅ Unsupervised Baseline Results (Full Train):
                Model      AP  ROC-AUC
1  LocalOutlierFactor  0.2567   0.6363
2  PCA_Reconstruction  0.2326   0.6011
3         OneClassSVM  0.2048   0.6026
0     IsolationForest  0.1337   0.5491


In [71]:
# Sweep the LOF neighbourhood size. Each detector is fitted only on the
# training rows with target == 0 and scored on the full validation set.
#
# The training subset does not depend on `n`, so it is prepared once instead
# of re-running scaler.transform inside every iteration. Scaling is row-wise,
# so slicing the already-scaled matrix gives the same values as transforming
# the sliced frame.
normal_mask = (y_train_ == 0).to_numpy()
X_train_normal_scaled = X_train_scaled[normal_mask]

for n in [10, 20, 30, 50]:
    lof = LocalOutlierFactor(n_neighbors=n, novelty=True, contamination=0.05)
    lof.fit(X_train_normal_scaled)
    # Negated so that larger scores rank as more anomalous.
    scores = -lof.decision_function(X_valid_scaled)
    ap = average_precision_score(y_valid_, scores)
    print(f"n_neighbors={n} | AP={ap:.4f}")


n_neighbors=10 | AP=0.2567
n_neighbors=20 | AP=0.2507
n_neighbors=30 | AP=0.2491
n_neighbors=50 | AP=0.2455


In [72]:
# LOF + PCA ensemble, both fitted on the scaled training rows with target == 0.
X_normal_scaled = X_train_scaled[y_train_==0]
lof = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.05).fit(X_normal_scaled)
pca = PCA(n_components=0.95, random_state=42).fit(X_normal_scaled)

# Per-model anomaly scores on the validation set (larger = more anomalous):
# negated LOF decision function and PCA mean squared reconstruction error.
lof_score = -lof.decision_function(X_valid_scaled)
Xv_reconstructed = pca.inverse_transform(pca.transform(X_valid_scaled))
pca_score = np.square(X_valid_scaled - Xv_reconstructed).mean(axis=1)

# Score fusion: divide each score by its own standard deviation so the two are
# on a comparable scale, then add them (no mean-centering is applied).
scores = lof_score / lof_score.std() + pca_score / pca_score.std()

ap = average_precision_score(y_valid_, scores)
roc = roc_auc_score(y_valid_, scores)
print(f"Ensemble — AP={ap:.4f}, ROC-AUC={roc:.4f}")


Ensemble — AP=0.2507, ROC-AUC=0.6309
