# Dimensionality Reduction

# Dependencies

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from catboost import CatBoostClassifier, CatBoostRegressor

from config import TREATMENT, OUTCOME
from utils.preprocessing import create_stratification_split_col
from modules.dimensionality_reduction.sequential_forward_selector import SFSCV
from modules.dimensionality_reduction.partial_correlation_filter import PartialCorrelationFeatureSelector
from modules.dimensionality_reduction.partial_correlation_outcome_predictors import OutcomePartialCorrelationFeatureSelector

from modules.dimensionality_reduction.hybrid_selector import HybridSelector

# Env vars

In [2]:
CONTROLS_CANDIDATES_PATH = "data/controls_candidates.txt"

# Data

In [3]:
df = pd.read_parquet("data/synthetic_data.parquet")

In [4]:
strat_feature = create_stratification_split_col(df, TREATMENT, OUTCOME)
train, test = train_test_split(
    df, test_size=0.2, random_state=42, 
    stratify=strat_feature,
)

# $P(Y|T, X)$ - Partial correlation based filters

In [5]:
X_train, y_train = (
    train.drop(columns=[OUTCOME]),
    train[[OUTCOME]],
)
X_test, y_test = (
    test.drop(columns=[OUTCOME]),
    test[[OUTCOME]],
)
print(X_train.shape, X_test.shape)

(12000, 411) (3000, 411)


## Partial correlation between 𝑇 and 𝑌 controlling for 𝑋𝑗: ρ(𝑇,𝑌∣ 𝑋𝑗)

**confounder-detection perspective**: we want to see whether controlling for 𝑋𝑗 changes the 𝑇--𝑌 relationship. If it does, 𝑋𝑗 is a candidate confounder. (Especially if 𝑋𝑗 also correlates with 𝑇, that is the reason why we add a  two-stage feature selection later).

In [6]:
selector_confounders = PartialCorrelationFeatureSelector(
    correlation_threshold=0.6,
    treatment_variable=TREATMENT,
    min_partial_correlation=0.01 
)
X_transformed = selector_confounders.fit_transform(X_train, y_train)
selected_features_confounders = list(X_transformed.columns)
print(len(selected_features_confounders))
print(selected_features_confounders)

# You can also inspect the partial correlations
print("Partial correlations improvemnt:")
for feat, corr in selector_confounders.feature_partial_correlations_.items():
    if abs(corr) > 0.01:
        print(f"{feat}: {corr:.3f}")

12
['n_loans', 'years_since_default', 'loss_given_default', 'default_debt_amount', 'n_cards', 'n_refin', 'debt_cirbe', 'perc_debt_forgivness', 'redundantfeature4_n_loans', 'redundantfeature28_n_loans', 'redundantfeature37_n_loans', 'redundantfeature93_years_since_default']
Partial correlations with outcome:
n_loans: 0.060
years_since_default: 0.042
loss_given_default: 0.046
default_debt_amount: 0.033
n_cards: 0.015
n_refin: 0.018
debt_cirbe: 0.021
redundantfeature1_n_cards: 0.011
redundantfeature4_n_loans: 0.014
redundantfeature14_default_debt_amount: 0.022
redundantfeature20_n_refin: 0.012
redundantfeature26_years_since_default: 0.022
redundantfeature28_n_loans: 0.012
redundantfeature30_years_since_default: 0.022
redundantfeature34_n_loans: 0.025
redundantfeature35_loss_given_default: 0.018
redundantfeature37_n_loans: 0.013
redundantfeature41_years_since_default: 0.014
redundantfeature53_loss_given_default: 0.033
redundantfeature57_n_refin: 0.013
redundantfeature77_debt_cirbe: 0.013
r

## Partial correlation between 𝑋𝑗 and 𝑌 controlling for 𝑇: ρ(𝑋𝑗,𝑌∣ 𝑇)

**outcome-predictors perspective**: may tell you which features are good predictors of the outcome above and beyond treatment, but does not directly show how 𝑋𝑗 affects the relationship between 𝑇 and 𝑌.

In [9]:

selector_y_predictors = OutcomePartialCorrelationFeatureSelector(
    correlation_threshold=0.6,
    treatment_variable=TREATMENT,
    min_partial_correlation=0.1  # minimum required partial correlation with OUTCOME
)
X_transformed = selector_y_predictors.fit_transform(X_train, y_train)
selected_features_y_predictors = list(X_transformed.columns)
print(len(selected_features_y_predictors))
print(selected_features_y_predictors)

# You can also inspect the partial correlations
print("Partial correlations with outcome:")
for feat, corr in selector_y_predictors.feature_partial_correlations_.items():
    if abs(corr) > 0.2:
        print(f"{feat}: {corr:.3f}")

5
['years_history', 'n_accounts', 'n_loans', 'loss_given_default', 'perc_debt_forgivness']
Partial correlations with outcome:
years_history: 0.245


# $P(T|X)$ - Fast Correlation Based Filter + Sequential Forward Seletector

In [10]:
X_train_treatment, y_train_treatment = (
    train.drop(columns=[TREATMENT, OUTCOME]).copy(),
    train[[TREATMENT]],
)
X_test_treatment, y_test_treatment = (
    test[X_train_treatment.columns],
    test[[TREATMENT]],
)
print(X_train_treatment.shape, X_test_treatment.shape)

(12000, 410) (3000, 410)


In [11]:
init_params_reg = {
    "n_estimators": 400,
    "depth": 4,
    "min_data_in_leaf": round(X_train_treatment.shape[0]*(2/3)*0.01),
    "learning_rate": 0.01,
    "subsample": 1,
    "rsm": 1,
    "objective": "RMSE",
    "silent": True,
    "l2_leaf_reg": 3,
    "random_seed": 42
}
reg = CatBoostRegressor(**init_params_reg)
scorer = metrics.make_scorer(metrics.root_mean_squared_error)

In [12]:
selector_t = HybridSelector(
    model=reg,
    scorer=scorer,
    metric_direction='minimize',
    cv=KFold(n_splits=3, random_state=42, shuffle=True),
    categorical_features=None,
    min_improvement=0.00,
    task='regression',
    fcbf_threshold=0.0001,
    include_treatment=False,
)
X_train_treatment_selected = selector_t.fit_transform(X_train_treatment, y_train_treatment)
selected_features_ptx = list(X_train_treatment_selected.columns)
print(len(selected_features_ptx))
print(selected_features_ptx)

Selected columns by FCBF: 8
['years_since_default', 'n_loans', 'default_debt_amount', 'loss_given_default', 'debt_cirbe', 'n_refin', 'n_cards', 'noise*feature*12']

Initial score with no features: inf
Iteration 1: Added 'years_since_default' (Score: 17.693293, Performance gain: inf)
Iteration 2: Added 'default_debt_amount' (Score: 15.261596, Performance gain: 2.431696)
Iteration 3: Added 'n_loans' (Score: 13.153244, Performance gain: 2.108352)
Iteration 4: Added 'loss_given_default' (Score: 11.718335, Performance gain: 1.434909)
Iteration 5: Added 'debt_cirbe' (Score: 10.233927, Performance gain: 1.484408)
Iteration 6: Added 'n_refin' (Score: 9.716674, Performance gain: 0.517254)
Iteration 7: Added 'n_cards' (Score: 9.272555, Performance gain: 0.444119)

Stopping: No features can be added that improve performance

SFS Feature selection completed:
- Started with 0 features
- Added 7 features
- Left 1 features unused
- Final score: 9.2726
7
['years_since_default', 'default_debt_amount', 

# Save features

In [14]:
controls_candidates = list(set(selected_features_confounders + selected_features_y_predictors + selected_features_ptx))
controls_candidates = [c for c in controls_candidates if c not in [TREATMENT, OUTCOME]]
len(controls_candidates)
controls_candidates

['redundantfeature28_n_loans',
 'n_accounts',
 'default_debt_amount',
 'years_since_default',
 'n_loans',
 'loss_given_default',
 'redundantfeature4_n_loans',
 'n_refin',
 'years_history',
 'debt_cirbe',
 'redundantfeature37_n_loans',
 'redundantfeature93_years_since_default',
 'n_cards']

In [15]:
!rm -rf file_name
confounder_candidates = [c for c in controls_candidates if c not in [TREATMENT, OUTCOME]]
with open(CONTROLS_CANDIDATES_PATH, "w") as output:
    for row in controls_candidates:
        output.write(str(row) + "\n")

"rm" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
