In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from scipy import stats

In [3]:
# Read the dataset from a Feather file; the path is relative to the
# directory the notebook kernel is started in.
df = pd.read_feather("data/dataset.feather")

In [4]:
def remove_outliers(X, y, z_threshold=3):
    """Drop rows whose label is an outlier in at least one label column.

    A z-score is computed for every label column over all rows; a row is
    kept only if the absolute z-score is strictly below ``z_threshold`` in
    every column.

    Args:
        X: Feature array (or any object supporting boolean-mask row
            indexing) with one row per sample.
        y: Labels aligned with ``X``; either 1-D ``(n,)`` or 2-D
            ``(n, n_labels)``.
        z_threshold: Exclusive upper bound on the absolute z-score.

    Returns:
        Tuple ``(X_kept, y_kept)`` containing only the rows that passed.

    Notes:
        * A constant label column has zero standard deviation, so its
          z-scores are undefined (NaN). Such a column carries no outlier
          information and is ignored instead of rejecting every row.
        * NaN label values still propagate through the z-score, so a
          column containing NaN rejects all rows; clean NaNs beforehand.
    """
    labels = np.asarray(y, dtype=float)
    if labels.shape[0] == 0:
        # Nothing to filter; avoids reductions over an empty axis.
        return X, y

    # Treat a 1-D label vector as a single label column.
    labels = labels.reshape(labels.shape[0], -1)

    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = np.abs(stats.zscore(labels, axis=0))

    # Constant columns yield NaN z-scores, and NaN < threshold is False,
    # which would silently discard the whole dataset. Neutralize them.
    is_constant = labels.max(axis=0) == labels.min(axis=0)
    z_scores[:, is_constant] = 0.0

    mask = (z_scores < z_threshold).all(axis=1)
    return X[mask], y[mask]

In [5]:
# Columns the models are trained to predict.
label_cols = ["kl_div_blowup_step", "plateau_length"]

# Columns kept out of the feature matrix: the ones named here plus the labels.
exclude_cols = {
    "id",
    "start_prompt",
    "end_prompt",
    "perturbation_type",
    "final_kl_div",
    *label_cols,
}

# Every remaining column, in the frame's own column order, is a feature.
feature_cols = [name for name in df.columns if name not in exclude_cols]

In [6]:
# Materialize the feature and label columns as NumPy arrays.
X = df[feature_cols].to_numpy()
y = df[label_cols].to_numpy()
print(X.shape, y.shape)

(20000, 1310) (20000, 2)


In [7]:
# Filter from the unfiltered frame rather than from X, y themselves, so
# re-running this cell cannot filter already-filtered data a second time.
X, y = remove_outliers(df[feature_cols].to_numpy(), df[label_cols].to_numpy())

In [8]:
# Hold out a test_size=0.1 fraction of the rows; the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=324592)
print(X_train.shape, X_test.shape)

(17630, 1310) (1959, 1310)


In [11]:
from sklearn.feature_selection import SelectFromModel


def select_features(X, y, estimator, threshold="median"):
    """Fit a ``SelectFromModel`` feature selector and return it.

    Args:
        X: Training feature matrix.
        y: Training targets passed through to the estimator's ``fit``.
        estimator: Unfitted estimator whose fitted importances drive the
            selection.
        threshold: Forwarded to ``SelectFromModel(threshold=...)``. The
            default ``"median"`` matches the previous hard-coded value.

    Returns:
        The fitted ``SelectFromModel`` instance; call ``.transform`` on it
        to reduce a matrix to the selected columns.
    """
    selector = SelectFromModel(estimator, threshold=threshold)
    selector.fit(X, y)
    return selector


# Fit the selector on the training split only, then reduce both splits with
# the same fitted selector. y_train is passed whole (both label columns),
# so selection is not restricted to the first target.
# NOTE(review): if selection was meant to use only the first target, pass
# y_train[:, 0] instead; confirm the intent.
importance_model = xgb.XGBRegressor()
selector = select_features(X_train, y_train, importance_model)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)