In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# path used in the loading cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
## --- Load dataset and calculate returns ---

import pandas as pd
import numpy as np

# Load dataset
# NOTE(security): unpickling can execute arbitrary code embedded in the file.
# Only load pickles that come from a trusted source.
path = "/content/drive/MyDrive/lending_club_dataset.pickle"
data = pd.read_pickle(path)
# The pickle holds an indexable container; its first element is used as the
# loan-level DataFrame.
df = data[0]

# Calculate return metrics
# raw_roi: (total_pymnt + recoveries) relative to loan_amnt, minus 1.
df['raw_roi'] = (df['total_pymnt'] + df['recoveries']) / df['loan_amnt'] - 1
# ann_roi: raw_roi divided by (term_num / 12) -- a simple (non-compounded)
# scaling; presumably term_num is the loan term in months (confirm).
df['ann_roi'] = df['raw_roi'] / (df['term_num'] / 12)
# log1p is only finite for ann_roi > -1.
df['log_return'] = np.log1p(df['ann_roi'])

# Feature engineering
# A zero income would turn these ratios into +/-inf, which median imputation
# does not repair and StandardScaler rejects. Treat zero income as missing so
# the ratios become NaN and flow through the normal missing-value handling.
income = df['annual_inc'].replace(0, np.nan)
df['installment_to_income'] = df['installment'] / income
df['loan_to_income'] = df['loan_amnt'] / income

# Drop leaky features
# These columns are only known after the loan outcome (or are precomputed
# return columns), so they must not be available as model inputs.
leak_features = ['total_pymnt', 'recoveries', 'last_pymnt_d', 'ret_PESS', 'ret_OPT', 'ret_INTa', 'ret_INTb', 'ret_INTc']
df = df.drop(columns=[col for col in leak_features if col in df.columns])



In [None]:
# --- Setup preprocessing ---

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Features
# NOTE(review): if "loan_length" is derived from the actual last-payment date
# it leaks the outcome in the same way as the dropped `last_pymnt_d` column --
# confirm how it is defined before trusting the scores below.
numerical = ["loan_amnt", "funded_amnt", "installment", "int_rate", "annual_inc", "loan_length", "term_num",
             "installment_to_income", "loan_to_income"]
categorical = ["home_ownership", "grade", "emp_length"]
features = numerical + categorical

X = df[features]
y_raw_roi = df["raw_roi"]

# Split the data once based on raw_roi
X_train, X_test, y_train_raw_roi, y_test_raw_roi = train_test_split(X, y_raw_roi, test_size=0.2, random_state=42)

# Fill missing values
# Imputation statistics are learned from the training split only and then
# applied to both splits, so no information from the test rows leaks into
# training. `fillna` returns new frames; `df` itself is left untouched.
fill_values = X_train[numerical].median().to_dict()
fill_values.update({col: "Missing" for col in categorical})
X_train = X_train.fillna(value=fill_values)
X_test = X_test.fillna(value=fill_values)

# Now calculate ann_roi and log_return correctly
# The targets are derived per split from the split's own (imputed) term_num so
# that every target stays row-aligned with its feature matrix.
term_num_train = X_train["term_num"]
term_num_test = X_test["term_num"]

y_train_ann_roi = y_train_raw_roi / (term_num_train / 12)
y_test_ann_roi = y_test_raw_roi / (term_num_test / 12)

y_train_log_return = np.log1p(y_train_ann_roi)
y_test_log_return = np.log1p(y_test_ann_roi)

# Setup preprocessor
# Numerical columns are standardised; categorical columns are one-hot encoded.
# handle_unknown="ignore" encodes categories unseen during fit as all-zero rows
# instead of raising at transform time.
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numerical),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
])

# Fit preprocessor
# Fit on the training split only; the test split is transformed with the same
# fitted parameters.
preprocessor.fit(X_train)

# Transform the data
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)


In [None]:
# --- Train Random Forest Models Separately (with subsampling) ---

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Subsampling training function
def train_rf_on_sample(X_full, y_full, sample_size=50000, random_state=42):
    """Fit a random forest regressor on a random row subsample.

    Parameters
    ----------
    X_full : array-like supporting integer-array row indexing (``X_full[idx]``)
        Processed feature matrix.
    y_full : pandas Series
        Target values, positionally aligned with the rows of ``X_full``.
    sample_size : int, default 50000
        Maximum number of rows to draw without replacement. If the data has
        fewer rows, all rows are used instead of raising.
    random_state : int or None, default 42
        Seed for both the row subsample and the forest. With a fixed seed,
        repeated calls on data of the same size draw the same rows, so models
        trained on different targets are compared on identical samples.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    n_rows = X_full.shape[0]
    # Sampling without replacement cannot draw more rows than exist.
    n_draw = min(sample_size, n_rows)

    # A dedicated seeded generator keeps the subsample reproducible and
    # independent of the global NumPy random state.
    rng = np.random.default_rng(random_state)
    idx = rng.choice(n_rows, size=n_draw, replace=False)

    X_sample = X_full[idx]
    # Positional indexing keeps y aligned with the rows selected from X.
    y_sample = y_full.iloc[idx]

    model = RandomForestRegressor(n_estimators=30, max_depth=10, random_state=random_state)
    model.fit(X_sample, y_sample)
    return model

def _fit_and_report(header, label, y_train, y_test):
    """Train one forest on the given target, score it on the test split, print the result.

    Uses the already-processed train/test feature matrices from the
    preprocessing cell. Returns the fitted model and its test predictions.
    """
    fitted = train_rf_on_sample(X_train_processed, y_train)
    predictions = fitted.predict(X_test_processed)

    print(header)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"{label} - MSE: {mse:.6f} | R²: {r2:.4f}")
    return fitted, predictions


# Model 1: RAW ROI
model_raw, pred_raw = _fit_and_report(
    "Model 1: RAW ROI", "RAW ROI", y_train_raw_roi, y_test_raw_roi
)

# Model 2: ANN ROI
model_ann, pred_ann = _fit_and_report(
    "\nModel 2: ANN ROI", "ANN ROI", y_train_ann_roi, y_test_ann_roi
)

# Model 3: LOG RETURN
model_log, pred_log = _fit_and_report(
    "\nModel 3: LOG RETURN", "LOG RETURN", y_train_log_return, y_test_log_return
)


Model 1: RAW ROI
RAW ROI - MSE: 0.053504 | R²: 0.2710

Model 2: ANN ROI
ANN ROI - MSE: 0.004253 | R²: 0.2826

Model 3: LOG RETURN
LOG RETURN - MSE: 0.005018 | R²: 0.2683
