In [None]:
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

# -----------------------
# 1. Load CSV & prepare X, y
# -----------------------
# Purpose of this cell: fit a Ridge regression on the numeric features of the
# engineered dataset and print the features whose |coefficient| is at least
# the median |coefficient|, formatted as `'feature',` lines.
file_path = "./feature_engineered_data.csv"
TARGET_COL = "thai_price"

df = pd.read_csv(file_path)

# drop rows with missing target (a regressor cannot be fit on NaN targets)
df = df.dropna(subset=[TARGET_COL])

y = df[TARGET_COL]

# Remove target & date-like columns from features
date_like_cols = ["year_month", "date", "Year", "Month"]
drop_cols = [TARGET_COL] + [c for c in date_like_cols if c in df.columns]

# keep only numeric features
X_raw = df.drop(columns=drop_cols).select_dtypes(include=["number"])

# SimpleImputer rejects +/-inf; treat them as missing so they get imputed
# like any other gap instead of aborting the run.
X_raw = X_raw.replace([float("inf"), float("-inf")], float("nan"))

# SimpleImputer discards columns that are entirely NaN (no median exists),
# which would make the imputed array narrower than X_raw.columns and break
# the DataFrame reconstruction below. Drop such columns explicitly first.
empty_cols = X_raw.columns[X_raw.isna().all()]
if len(empty_cols) > 0:
    print("Dropping all-NaN feature columns:", list(empty_cols))
    X_raw = X_raw.drop(columns=empty_cols)

if X_raw.shape[0] == 0 or X_raw.shape[1] == 0:
    raise ValueError(
        f"No usable data after cleaning: X_raw shape is {X_raw.shape}"
    )

print("Before imputation:")
print("X_raw shape:", X_raw.shape)
print("NaN per column (top 10):")
print(X_raw.isna().sum().sort_values(ascending=False).head(10))

# -----------------------
# 2. Impute NaN in X (median)
# -----------------------
imputer = SimpleImputer(strategy="median")
X_imputed_array = imputer.fit_transform(X_raw)

# Put back into DataFrame with same columns/index
X = pd.DataFrame(X_imputed_array, columns=X_raw.columns, index=X_raw.index)

print("\nAfter imputation:")
print("Any NaN left?", X.isna().any().any())

# -----------------------
# 3. Standardize & fit Ridge
# -----------------------
# Ridge coefficients are expressed in the units of each feature, so on raw
# data |coef| mostly reflects feature scale rather than importance.
# Standardizing first makes the coefficient magnitudes comparable, which is
# what the median threshold below relies on.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X), columns=X.columns, index=X.index
)

# NOTE: alpha now penalizes coefficients of standardized features.
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(X_scaled, y)

# -----------------------
# 4. Feature selection with SelectFromModel
# -----------------------
selector = SelectFromModel(
    ridge,
    threshold="median",   # keep features with |coef| >= median
    prefit=True
)

selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]

# -----------------------
# 5. Print in `'feature',` format
# -----------------------
print("\nSelected Ridge Regression features:\n")
for f in selected_features:
    print(f"'{f}',")

print(f"\nTotal selected: {len(selected_features)} / {X.shape[1]}")



X shape: (222, 104)
y shape: (222,)


ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values