# Training Model 

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
# Load the three feature-engineered splits; note the assignment order is
# train, test, valid to match the order of the paths below.
train, test, valid = map(
    pd.read_csv,
    (
        "data/feature_engineering/loans_train.csv",
        "data/feature_engineering/loans_test.csv",
        "data/feature_engineering/loans_valid.csv",
    ),
)

### Training Model Preparation

In [42]:
# Fix Datetime Columns Before Training (concise)
import pandas as pd
import numpy as np

# Columns that hold ISO-formatted (YYYY-MM-DD) date strings.
DATE_COLS = ['FirstPaymentDate', 'MaturityDate']


def extract_true_datetime_parts(df: pd.DataFrame) -> pd.DataFrame:
    """Replace each column listed in DATE_COLS with ``<col>_year`` / ``<col>_month``.

    The input frame is left untouched; a modified copy is returned.
    Values that do not parse with the ``%Y-%m-%d`` format are coerced to NaT,
    so their year/month parts come out as missing. Columns from DATE_COLS
    that are absent from ``df`` are skipped.
    """
    result = df.copy()
    for col in DATE_COLS:
        if col not in result.columns:
            continue
        parsed = pd.to_datetime(result[col], format='%Y-%m-%d', errors='coerce')
        result[f'{col}_year'] = parsed.dt.year
        result[f'{col}_month'] = parsed.dt.month
        # The raw date column is no longer needed once its parts are extracted.
        result = result.drop(columns=[col])
    return result

# Apply the same date expansion to every split so their columns stay aligned.
train, valid, test = (
    extract_true_datetime_parts(frame) for frame in (train, valid, test)
)

In [43]:
# Extract YYYY, MM, DD from Datetime Columns
import pandas as pd
import numpy as np

def extract_datetime_features(df):
    """Expand 'FirstPaymentDate' / 'MaturityDate' into year, month and day columns.

    Works on a copy of ``df``. For each of the two columns that is present,
    the values are parsed with ``pd.to_datetime`` (unparseable entries become
    NaT), ``<col>_year``, ``<col>_month`` and ``<col>_day`` are appended, the
    original column is removed and a progress line is printed. Columns that
    are absent are skipped silently.
    """
    out = df.copy()

    for col in ('FirstPaymentDate', 'MaturityDate'):
        if col not in out.columns:
            continue

        # Parse once, then read every component from the parsed values.
        stamps = pd.to_datetime(out[col], errors='coerce')
        for part in ('year', 'month', 'day'):
            out[col + '_' + part] = getattr(stamps.dt, part)

        # The raw column has been fully replaced by its components.
        out = out.drop(columns=[col])
        print(f"Extracted year, month, day from {col}")

    return out

# Run the extraction on every split (train, valid, test — in that order).
# NOTE(review): extract_datetime_features skips columns that are absent, so if
# an earlier cell has already dropped 'FirstPaymentDate' / 'MaturityDate' this
# call changes nothing and no "_day" features are created — confirm intent.
train, valid, test = (
    extract_datetime_features(frame) for frame in (train, valid, test)
)

print("\n✅ Datetime extraction completed successfully!")



✅ Datetime extraction completed successfully!


In [44]:
# 1. Remove the raw panel columns: every column whose name starts with
#    "0_", "1_", ..., "13_". The list is derived from the train split and the
#    same names are removed from the other splits when present.
panel_cols = [
    name for name in train.columns
    if name.startswith(tuple(f"{i}_" for i in range(14)))
]
train, valid, test = (
    frame.drop(columns=panel_cols, errors='ignore') for frame in (train, valid, test)
)

# 2. What remains are the aggregated statistical features
#    (e.g. EstimatedLTV_mean, EstimatedLTV_slope, LoanAge_range, etc.).
#    Split them into model inputs and labels; the test split has no target.
X_train = train.drop(columns=['index', 'target'])
y_train = train['target']
X_valid = valid.drop(columns=['index', 'target'])
y_valid = valid['target']
X_test = test.drop(columns=['Id'])


In [45]:
# Use Cleaned Datasets for Model Training

from signal import valid_signals
from sklearn.impute import SimpleImputer

# Build the model inputs from the cleaned splits (datetime parts already extracted).
X_train = train.drop(columns=['index', 'target'], errors='ignore')
y_train = train['target']
X_valid = valid.drop(columns=['index', 'target'], errors='ignore')
y_valid = valid['target']
X_test = test.drop(columns=['Id'], errors='ignore')

# Keep only the columns that exist in all three splits.
common_cols = set(X_train.columns) & set(X_valid.columns) & set(X_test.columns)

# Select them in X_train's original column order. Iterating the set directly
# would give a hash-dependent order that can change between kernel restarts,
# which makes the seeded models downstream non-reproducible.
ordered_common_cols = [c for c in X_train.columns if c in common_cols]
X_train = X_train[ordered_common_cols]
X_valid = X_valid[ordered_common_cols]
X_test = X_test[ordered_common_cols]

# Drop all string/categorical columns (union across splits), in a stable order.
obj_cols = sorted(
    set(X_train.select_dtypes(include=['object', 'category']).columns)
    | set(X_valid.select_dtypes(include=['object', 'category']).columns)
    | set(X_test.select_dtypes(include=['object', 'category']).columns)
)

X_train = X_train.drop(columns=obj_cols, errors='ignore')
X_valid = X_valid.drop(columns=obj_cols, errors='ignore')
X_test  = X_test.drop(columns=obj_cols, errors='ignore')



In [46]:
# Fill remaining NaN values using statistics computed on the train split only
# (so no information from valid/test leaks into the imputation values).

print("Step 2: Fill remaining NaN values")

_splits = (X_train, X_valid, X_test)

# For numeric columns, use median imputation.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    # Impute when ANY split has a gap: a column that is complete in train can
    # still contain NaN in valid/test and must not be skipped.
    if any(frame[col].isna().any() for frame in _splits):
        median_val = X_train[col].median()
        if pd.isna(median_val):  # If median is also NaN, use 0
            median_val = 0
        X_train[col] = X_train[col].fillna(median_val)
        X_valid[col] = X_valid[col].fillna(median_val)
        X_test[col] = X_test[col].fillna(median_val)

# For categorical columns, use mode imputation.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    if any(frame[col].isna().any() for frame in _splits):
        mode_val = X_train[col].mode()
        # If no mode exists (column entirely NaN in train), use 'unknown'.
        fill_val = mode_val[0] if len(mode_val) > 0 else 'unknown'
        X_train[col] = X_train[col].fillna(fill_val)
        X_valid[col] = X_valid[col].fillna(fill_val)
        X_test[col] = X_test[col].fillna(fill_val)

# 3. Verify the fix results
print("Step 3: Verify fix results")
print(f"NaN count in X_train: {X_train.isna().sum().sum()}")
print(f"NaN count in X_valid: {X_valid.isna().sum().sum()}")
print(f"NaN count in X_test: {X_test.isna().sum().sum()}")

Step 2: Fill remaining NaN values
Step 3: Verify fix results
NaN count in X_train: 0
NaN count in X_valid: 0
NaN count in X_test: 0


### Baseline Model

In [47]:
import pandas as pd
import numpy as np

# Make sure all three inputs are DataFrames.
X_train = pd.DataFrame(X_train)
X_valid = pd.DataFrame(X_valid)
X_test  = pd.DataFrame(X_test)

# Align column order with the train split.
X_valid = X_valid[X_train.columns]
X_test  = X_test[X_train.columns]

# Ensure categorical features are strings and numeric features are floats.
for df in [X_train, X_valid, X_test]:
    for c in df.columns:
        if pd.api.types.is_numeric_dtype(df[c]):
            df[c] = df[c].astype(float)
        else:
            # Substitute the placeholder BEFORE casting to str: astype(str)
            # turns NaN into the literal string "nan", after which fillna()
            # has nothing left to fill. Going through object dtype also avoids
            # category-dtype errors for a value that is not a known category.
            df[c] = df[c].astype(object).where(df[c].notna(), "missing").astype(str)


In [48]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelEncoder


In [49]:
class RFOD:
    """
    RFOD: Random Forest-based Outlier Detection for Tabular Data

    - Feature-wise conditional modeling: one forest per column, trained to
      predict that column from all the other columns.
    - OOB-based tree pruning: every tree is scored on its own out-of-bag rows
      and only the best ``beta`` fraction of trees is kept.
    - Adjusted Gower's Distance (AGD): per-cell deviation between the observed
      value and the prediction of the kept trees.
    - Uncertainty Weighted Averaging (UWA): cells on which the kept trees
      disagree receive a lower weight in the per-row score.

    NOTE(review): predictor columns are handed to scikit-learn unchanged, so
    non-numeric predictors presumably have to be encoded by the caller before
    ``fit`` — confirm.
    """

    def __init__(self, n_estimators=300, max_depth=None, beta=0.5, alpha=0.02, n_jobs=-1, random_state=42):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.beta = beta                  # keep top β fraction of trees
        self.alpha = alpha                # quantile for AGD normalization
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.models = {}                  # store per-feature models

    # ===============================
    # Helper: detect numeric/categorical
    # ===============================
    def _split_features(self, X):
        """Return ``(numeric_columns, other_columns)`` of ``X`` as two lists."""
        num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
        num_set = set(num_cols)
        cat_cols = [c for c in X.columns if c not in num_set]
        return num_cols, cat_cols

    # ===============================
    # Helper: OOB-based forest pruning
    # ===============================
    def _prune_forest(self, forest, X_others, y_values, metric):
        """Return the best ``beta`` fraction of ``forest``'s trees.

        Each tree is scored with ``metric(y_true, y_pred)`` on the rows that
        were not drawn into its bootstrap sample (its out-of-bag rows).
        Trees with no OOB rows, a failing metric, or a non-finite score are
        ranked last. At least one tree is always kept.
        """
        trees = forest.estimators_
        in_bag_samples = forest.estimators_samples_
        # Individual trees are fitted on plain arrays, so predict on arrays too
        # (avoids a feature-name warning per tree).
        X_arr = np.asarray(X_others)
        y_arr = np.asarray(y_values)
        n_samples = len(y_arr)

        tree_scores = np.full(len(trees), -np.inf)
        for i, tree in enumerate(trees):
            oob_mask = np.ones(n_samples, dtype=bool)
            oob_mask[in_bag_samples[i]] = False
            if not oob_mask.any():
                continue
            try:
                score = metric(y_arr[oob_mask], tree.predict(X_arr[oob_mask]))
            except ValueError:
                continue
            # A NaN score would sort to the end under argsort and, after the
            # reversal below, be ranked as the BEST tree; keep it at -inf.
            if np.isfinite(score):
                tree_scores[i] = score

        sorted_indices = np.argsort(tree_scores)[::-1]
        # Never prune down to an empty forest: prediction needs >= 1 tree.
        num_to_keep = max(1, int(np.floor(self.beta * len(trees))))
        return [trees[i] for i in sorted_indices[:num_to_keep]]

    # ===============================
    # Fit phase
    # ===============================
    def fit(self, X: pd.DataFrame):
        """
        Train a separate RF for each feature using leave-one-feature-out scheme.
        Includes OOB-based Forest Pruning.

        For every column of ``X`` the entry ``self.models[col]`` is a dict with
        ``"type"`` (``"num"`` or ``"cat"``) and ``"model"`` (list of kept
        trees), plus ``"q_low"``/``"q_high"`` (the ``alpha`` and ``1 - alpha``
        quantiles of the column) for numeric columns or ``"encoder"`` (the
        fitted LabelEncoder) for the others. Returns ``self``.
        """
        self.num_cols, self.cat_cols = self._split_features(X)
        self.models = {}
        numeric = set(self.num_cols)

        for col in X.columns:
            X_others = X.drop(columns=[col])
            y = X[col]

            # Forest-level oob_score is not requested: pruning computes its own
            # per-tree OOB scores from estimators_samples_, which only needs
            # bootstrap=True.
            if col in numeric:
                model = RandomForestRegressor(
                    n_estimators=self.n_estimators,
                    max_depth=self.max_depth,
                    n_jobs=self.n_jobs,
                    random_state=self.random_state,
                    bootstrap=True,
                )
                model.fit(X_others, y)

                # Per-tree OOB quality is measured with R^2.
                pruned_trees = self._prune_forest(model, X_others, y.to_numpy(), r2_score)
                q_low, q_high = y.quantile([self.alpha, 1 - self.alpha])

                # Store the pruned list of trees, not the full forest object.
                self.models[col] = {"type": "num", "model": pruned_trees, "q_low": q_low, "q_high": q_high}

            else:
                le = LabelEncoder()
                y_enc = le.fit_transform(y.astype(str))
                model = RandomForestClassifier(
                    n_estimators=self.n_estimators,
                    max_depth=self.max_depth,
                    n_jobs=self.n_jobs,
                    random_state=self.random_state,
                    bootstrap=True,
                )
                model.fit(X_others, y_enc)

                # Per-tree OOB quality is measured with accuracy on the
                # label-encoded target.
                pruned_trees = self._prune_forest(model, X_others, y_enc, accuracy_score)

                # Store the pruned list of trees.
                self.models[col] = {"type": "cat", "model": pruned_trees, "encoder": le}

        return self

    # ===============================
    # Predict phase
    # ===============================
    def _predict_numeric(self, model, X, y_true, q_low, q_high):
        """Return ``(agd, std_pred)`` for one numeric column.

        ``model`` is the list of kept trees. ``agd`` is the absolute error of
        the mean tree prediction divided by ``q_high - q_low`` (plus a small
        epsilon); ``std_pred`` is the per-row standard deviation of the tree
        predictions.
        """
        trees = model
        X_arr = np.asarray(X)
        preds = np.column_stack([t.predict(X_arr) for t in trees])
        mean_pred = preds.mean(axis=1)
        std_pred = preds.std(axis=1)
        agd = np.abs(y_true - mean_pred) / (q_high - q_low + 1e-9)
        return agd, std_pred

    def _predict_categorical(self, model, le, X, y_true):
        """Return ``(agd, std_uncert)`` for one categorical column.

        ``model`` is the list of kept trees and ``le`` the encoder fitted on
        the training labels. ``agd`` is ``1 - mean probability of the observed
        class``; ``std_uncert`` is the per-row standard deviation of the
        per-tree correctness indicator. A label the encoder has never seen
        cannot be predicted by any tree, so it gets ``agd = 1`` and counts as
        incorrect for every tree instead of raising.
        """
        trees = model
        X_arr = np.asarray(X)

        labels = np.asarray(y_true.astype(str), dtype=object)
        known = pd.Index(labels).isin(le.classes_)
        y_true_enc = np.zeros(len(labels), dtype=int)
        if known.any():
            y_true_enc[known] = le.transform(labels[known])

        # per-tree correctness variance (uncertainty)
        tree_preds = np.column_stack([t.predict(X_arr) for t in trees])
        correct = ((tree_preds == y_true_enc[:, None]) & known[:, None]).astype(float)
        std_uncert = correct.std(axis=1)

        # mean prob of true class
        avg_proba = np.mean([t.predict_proba(X_arr) for t in trees], axis=0)
        p_true = np.where(known, avg_proba[np.arange(len(y_true_enc)), y_true_enc], 0.0)
        agd = 1 - p_true
        return agd, std_uncert

    # ===============================
    # Compute anomaly scores
    # ===============================
    def _compute_scores(self, X: pd.DataFrame):
        """Return ``(S, U)``: per-cell anomaly scores and per-cell uncertainties.

        Both frames share ``X``'s index and columns.
        """
        S = pd.DataFrame(np.zeros(X.shape, dtype=float), index=X.index, columns=X.columns)
        U = pd.DataFrame(np.zeros(X.shape, dtype=float), index=X.index, columns=X.columns)

        for col in X.columns:
            model_info = self.models[col]
            X_others = X.drop(columns=[col])
            y_true = X[col]

            if model_info["type"] == "num":
                agd, uncert = self._predict_numeric(
                    model_info["model"], X_others, y_true.astype(float),
                    model_info["q_low"], model_info["q_high"]
                )
            else:
                agd, uncert = self._predict_categorical(
                    model_info["model"], model_info["encoder"], X_others, y_true
                )

            # Assign positionally. Assigning a Series would align on index
            # labels and silently produce NaN whenever the labels differ.
            S[col] = np.asarray(agd, dtype=float)
            U[col] = np.asarray(uncert, dtype=float)

        return S, U

    @staticmethod
    def _aggregate_scores(S, U):
        """Combine cell scores ``S`` into one score per row, weighted by ``U``.

        Uncertainties are normalised per row; each cell's weight is
        ``1 - normalised uncertainty`` and the row score is the mean of the
        weighted cell scores.
        """
        # Rows with zero total uncertainty would divide by zero.
        U_sum = U.sum(axis=1).replace(0, np.finfo(float).eps)
        U_norm = U.div(U_sum, axis=0)
        W = 1 - U_norm
        return (W.values * S.values).mean(axis=1)

    # ===============================
    # Public APIs
    # ===============================
    def cell_anomaly_scores(self, X):
        """Return the per-cell anomaly score frame for ``X``."""
        S, _ = self._compute_scores(X)
        return S

    def score_samples(self, X):
        """Return a NumPy array with one anomaly score per row of ``X``."""
        S, U = self._compute_scores(X)
        return self._aggregate_scores(S, U)

    def fit_score(self, X_train_normal, X_test):
        """Fit on ``X_train_normal`` and return ``(cell_scores, row_scores)`` for ``X_test``."""
        self.fit(X_train_normal)
        # Score once and reuse the result for both outputs.
        S, U = self._compute_scores(X_test)
        return S, self._aggregate_scores(S, U)

In [58]:
import warnings
# 忽略这个特定的"feature names"警告 (UserWarning 是 Python 内置的)
warnings.filterwarnings("ignore", category=UserWarning, message=".*X has feature names.*")


from sklearn.metrics import roc_auc_score, average_precision_score

# Build the detector and fit it on normal samples only (target == 0).
rfod = RFOD(n_estimators=150, beta=0.5, alpha=0.02, n_jobs=-1, random_state=42)
X_train_normal = X_train.loc[y_train == 0]
rfod.fit(X_train_normal)

# Validation split: per-cell scores and one aggregated score per row.
S_valid = rfod.cell_anomaly_scores(X_valid)
s_valid = rfod.score_samples(X_valid)

# Evaluate how well the row scores rank the validation labels.
roc, ap = (
    metric(y_valid, s_valid) for metric in (roc_auc_score, average_precision_score)
)
print(f"Validation ROC-AUC={roc:.4f}, AP={ap:.4f}")


Validation ROC-AUC=0.5609, AP=0.1956


In [None]:
# Reuse the detector fitted above. Refitting on the same data with the same
# random_state only repeats the training, so fit here only if it has not
# been fitted yet.
if not rfod.models:
    rfod.fit(X_train_normal)
S_test = rfod.cell_anomaly_scores(X_test)
s_test = rfod.score_samples(X_test)

# Use the real identifiers from the test split. Fabricating 0..n-1 is only
# correct if the file's Ids happen to be exactly that sequence.
if "Id" in test.columns:
    test_ids = test["Id"].to_numpy()
else:
    test_ids = np.arange(len(s_test))
assert len(test_ids) == len(s_test), "Id / score length mismatch"

submission = pd.DataFrame({
    "Id": test_ids,
    "target": s_test
})
submission.to_csv("submission_stage2.csv", index=False)
print(submission.head())
