# Dimensionality Reduction

# Dependencies

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from catboost import CatBoostClassifier, CatBoostRegressor

from config import TREATMENT, OUTCOME, SEED
from utils.preprocessing import create_stratification_split_col
from modules.dimensionality_reduction.partial_correlation_filter import PartialCorrelationFeatureSelector
from modules.dimensionality_reduction.partial_correlation_outcome_predictors import OutcomePartialCorrelationFeatureSelector

from modules.dimensionality_reduction.hybrid_selector import HybridSelector

# Env vars

In [2]:
CONTROLS_CANDIDATES_PATH = "data/controls_candidates.txt"
np.random.seed(SEED)

# Data

In [3]:
df = pd.read_parquet("data/synthetic_data.parquet")

In [4]:
strat_feature = create_stratification_split_col(df, TREATMENT, OUTCOME)
train, test = train_test_split(
    df, 
    test_size=0.2, 
    random_state=SEED, 
    stratify=strat_feature,
)

# $P(Y|T, X)$ - Partial correlation based filters

In [5]:
X_train, y_train = (
    train.drop(columns=[OUTCOME]),
    train[[OUTCOME]],
)
X_test, y_test = (
    test.drop(columns=[OUTCOME]),
    test[[OUTCOME]],
)
print(X_train.shape, X_test.shape)

(12000, 411) (3000, 411)


## Partial correlation between 𝑇 and 𝑌 controlling for 𝑋𝑗: ρ(𝑇,𝑌∣ 𝑋𝑗)

**confounder-detection perspective**: we want to see whether controlling for 𝑋𝑗 changes the 𝑇--𝑌 relationship. If it does, 𝑋𝑗 is a candidate confounder. (Especially if 𝑋𝑗 also correlates with 𝑇, that is the reason why we add a  two-stage feature selection later).

In [6]:
selector_confounders = PartialCorrelationFeatureSelector(
    correlation_threshold=0.5,
    treatment_variable=TREATMENT,
    min_partial_correlation=0.01 
)
X_transformed = selector_confounders.fit_transform(X_train, y_train)
selected_features_confounders = list(X_transformed.columns)
print(len(selected_features_confounders))
print(selected_features_confounders)

# You can also inspect the partial correlations
print("Partial correlations improvemnt:")
for feat, corr in selector_confounders.feature_partial_correlations_.items():
    if abs(corr) > 0.01:
        print(f"{feat}: {corr:.3f}")

9
['n_loans', 'years_since_default', 'loss_given_default', 'default_debt_amount', 'n_cards', 'n_refin', 'debt_cirbe', 'perc_debt_loss', 'redundantfeature45_n_loans']
Partial correlations improvemnt:
n_loans: 0.053
years_since_default: 0.056
loss_given_default: 0.050
default_debt_amount: 0.022
n_cards: 0.015
n_refin: 0.020
debt_cirbe: 0.024
redundantfeature13_loss_given_default: 0.014
redundantfeature16_debt_cirbe: 0.017
redundantfeature19_n_loans: 0.019
redundantfeature23_loss_given_default: 0.013
redundantfeature24_loss_given_default: 0.036
redundantfeature31_default_debt_amount: 0.016
redundantfeature33_n_refin: 0.012
redundantfeature38_n_loans: 0.025
redundantfeature39_n_refin: 0.010
redundantfeature45_n_loans: 0.012
redundantfeature47_n_loans: 0.021
redundantfeature52_loss_given_default: 0.013
redundantfeature59_years_since_default: 0.016
redundantfeature62_n_refin: 0.014
redundantfeature66_loss_given_default: 0.021
redundantfeature67_loss_given_default: 0.033
redundantfeature75_ye

## Partial correlation between 𝑋𝑗 and 𝑌 controlling for 𝑇: ρ(𝑋𝑗,𝑌∣ 𝑇)

**outcome-predictors perspective**: may tell you which features are good predictors of the outcome above and beyond treatment, but does not directly show how 𝑋𝑗 affects the relationship between 𝑇 and 𝑌.

In [7]:
selector_y_predictors = OutcomePartialCorrelationFeatureSelector(
    correlation_threshold=0.5,
    treatment_variable=TREATMENT,
    min_partial_correlation=0.1  # minimum required partial correlation with OUTCOME
)
X_transformed = selector_y_predictors.fit_transform(X_train, y_train)
selected_features_y_predictors = list(X_transformed.columns)
print(len(selected_features_y_predictors))
print(selected_features_y_predictors)

# You can also inspect the partial correlations
print("Partial correlations with outcome:")
for feat, corr in selector_y_predictors.feature_partial_correlations_.items():
    if abs(corr) > 0.2:
        print(f"{feat}: {corr:.3f}")

10
['years_history', 'n_accounts', 'n_loans', 'loss_given_default', 'n_refin', 'months_since_first_payment', 'perc_debt_loss', 'redundantfeature1_years_history', 'redundantfeature29_years_history', 'redundantfeature69_years_history']
Partial correlations with outcome:
years_history: 0.246
redundantfeature42_years_history: 0.218
redundantfeature60_years_history: 0.210


# $P(T|X)$ - Fast Correlation Based Filter + Sequential Forward Seletector

In [8]:
X_train_treatment, y_train_treatment = (
    train.drop(columns=[TREATMENT, OUTCOME]).copy(),
    train[[TREATMENT]],
)
X_test_treatment, y_test_treatment = (
    test[X_train_treatment.columns],
    test[[TREATMENT]],
)
print(X_train_treatment.shape, X_test_treatment.shape)

(12000, 410) (3000, 410)


In [9]:
init_params_reg = {
    "n_estimators": 400,
    "depth": 4,
    "min_data_in_leaf": round(X_train_treatment.shape[0]*(2/3)*0.01),
    "learning_rate": 0.01,
    "subsample": 1,
    "rsm": 1,
    "objective": "RMSE",
    "silent": True,
    "l2_leaf_reg": 3,
    "random_seed": SEED,
}
reg = CatBoostRegressor(**init_params_reg)
scorer = metrics.make_scorer(metrics.root_mean_squared_error)

In [10]:
selector_t = HybridSelector(
    model=reg,
    scorer=scorer,
    metric_direction='minimize',
    cv=KFold(n_splits=3, random_state=SEED, shuffle=True),
    categorical_features=None,
    min_improvement=0.00,
    task='regression',
    fcbf_threshold=0.0001,
    include_treatment=False,
)
X_train_treatment_selected = selector_t.fit_transform(X_train_treatment, y_train_treatment)
selected_features_ptx = list(X_train_treatment_selected.columns)
print(len(selected_features_ptx))
print(selected_features_ptx)

Selected columns by FCBF: 12
['years_since_default', 'n_loans', 'default_debt_amount', 'loss_given_default', 'debt_cirbe', 'n_cards', 'n_refin', 'noise*feature*204', 'noise*feature*176', 'noise*feature*185', 'noise*feature*44', 'noise*feature*181']

Initial score with no features: inf
Iteration 1: Added 'years_since_default' (Score: 17.728896, Performance gain: inf)
Iteration 2: Added 'default_debt_amount' (Score: 15.273726, Performance gain: 2.455169)
Iteration 3: Added 'n_loans' (Score: 13.153448, Performance gain: 2.120278)
Iteration 4: Added 'loss_given_default' (Score: 11.737268, Performance gain: 1.416180)
Iteration 5: Added 'debt_cirbe' (Score: 10.293105, Performance gain: 1.444163)
Iteration 6: Added 'n_cards' (Score: 9.747022, Performance gain: 0.546083)
Iteration 7: Added 'n_refin' (Score: 9.273867, Performance gain: 0.473155)
Iteration 8: Added 'noise*feature*204' (Score: 9.265855, Performance gain: 0.008013)

Stopping: No features can be added that improve performance

SFS 

# Save features

In [11]:
controls_candidates = list(set(selected_features_confounders + selected_features_y_predictors + selected_features_ptx))
controls_candidates = [c for c in controls_candidates if c not in [TREATMENT, OUTCOME]]
print(len(controls_candidates))
controls_candidates

15


['months_since_first_payment',
 'redundantfeature29_years_history',
 'debt_cirbe',
 'n_loans',
 'n_cards',
 'years_history',
 'default_debt_amount',
 'years_since_default',
 'noise*feature*204',
 'redundantfeature1_years_history',
 'n_accounts',
 'redundantfeature69_years_history',
 'loss_given_default',
 'n_refin',
 'redundantfeature45_n_loans']

In [12]:
!rm -rf file_name
confounder_candidates = [c for c in controls_candidates if c not in [TREATMENT, OUTCOME]]
with open(CONTROLS_CANDIDATES_PATH, "w") as output:
    for row in controls_candidates:
        output.write(str(row) + "\n")

"rm" no se reconoce como un comando interno o externo,
programa o archivo por lotes ejecutable.
