In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# Toy dataset: ten students, one per row, with hours studied, the score on
# the previous exam, and whether they passed.
data = {
    'StudyHours': list(range(1, 11)),
    'PrevExamScore': [30, 40, 45, 50, 60, 65, 70, 75, 80, 85],
    'Pass': [0] * 5 + [1] * 5,  # 0 = Fail, 1 = Pass
}
df = pd.DataFrame(data)

# Predictor columns go into X, the label column into y.
X = df.loc[:, ['StudyHours', 'PrevExamScore']]
y = df.loc[:, 'Pass']

In [3]:
def forward_selection(X, y, test_size=0.2, random_state=42):
    """Greedy forward feature selection scored by hold-out R-squared.

    Starting from an empty selection, each round fits a ``LinearRegression``
    for every not-yet-selected column added to the current selection, scores
    it with ``r2_score`` on one hold-out split, and keeps the best candidate
    only if it strictly beats the score of the current selection. The search
    stops at the first round that brings no improvement.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Target values aligned with the rows of ``X``.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=42
        Forwarded to ``train_test_split``.

    Returns
    -------
    list
        Selected column labels in the order they were added. Empty when ``X``
        has no columns or when no single feature scores above 0.0.
    """
    # Unique labels in column order (the original used a set of the labels).
    remaining_features = list(dict.fromkeys(X.columns))
    selected_features = []
    if not remaining_features:
        return selected_features

    # The shuffled row indices depend only on the number of rows and the
    # seed, not on which columns are passed in, so one split serves every
    # candidate. It also guarantees all candidates are compared on the same
    # rows even when random_state is None.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # A candidate has to score above 0.0 to be accepted in the first round.
    current_score = 0.0

    while remaining_features:
        scores_with_candidates = []

        for feature in remaining_features:
            features_to_test = selected_features + [feature]

            model = LinearRegression()
            model.fit(X_train[features_to_test], y_train)

            y_pred = model.predict(X_test[features_to_test])
            score = r2_score(y_test, y_pred)
            scores_with_candidates.append((score, feature))

        # Highest score wins; equal scores fall back to comparing the labels,
        # exactly as sorting the (score, feature) tuples in reverse did.
        best_score, best_feature = max(scores_with_candidates)

        if current_score < best_score:
            remaining_features.remove(best_feature)
            selected_features.append(best_feature)
            current_score = best_score
        else:
            # No remaining feature improves on the current selection.
            break

    return selected_features

# Run the hand-rolled forward selection on the full feature frame and print
# the chosen column labels in the order they were added.
best_features = forward_selection(X, y)
print("Selected features using Forward Selection:", best_features)

Selected features using Forward Selection: ['PrevExamScore']


## 为什么用 SequentialFeatureSelector + Pipeline 更好？ / Why SFS + Pipeline is better

- **避免数据泄漏 / Avoid data leakage**  
  通过 `Pipeline` 在**每个交叉验证折内**拟合 `StandardScaler`，不把测试信息泄漏到训练里。

- **用交叉验证驱动选择 / Selection driven by CV**  
  不再在同一 hold-out 集上反复试探（乐观偏差），而是用 K 折平均性能做稳健评估。

- **尺度公平 & 可比的系数 / Scale fairness & comparable effects**  
  标准化让 L1/L2 惩罚对不同量纲**一视同仁**；系数/重要性可以跨特征比较。

- **与正则化模型无缝结合 / Works seamlessly with regularized estimators**  
  可直接换成 `RidgeCV` / `LassoCV` / `ElasticNetCV`，在**强共线**场景更稳定，还能自动调参。

- **更可靠的选择结果 / More reliable selections**  
  CV 平均减少偶然性，小样本下也更稳；支持 `scoring` 自定义指标（R²、AUC、F1 等）。

- **工程友好 / Engineering-friendly**  
  `get_support()` 一步拿到所选特征，`n_jobs` 并行更快；结果可复现（默认的 K 折划分不打乱数据，若改用打乱的划分器，可用其 `random_state` 固定随机种子）；代码更少、维护更容易。

> 简而言之：**SFS + Pipeline = 无泄漏 + 公平正则 + CV 稳健评估 + 可复现**  
> In short: **SFS + Pipeline = no leakage + fair regularization + CV-robust evaluation + reproducibility**.


In [7]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV  # 可换成 LassoCV/ElasticNetCV
import numpy as np

import warnings
from sklearn.exceptions import UndefinedMetricWarning

# No global warning filter is installed here. A blanket
# warnings.filterwarnings("ignore") silences every warning for the rest of
# the kernel session, and in this cell it was hiding undefined-score warnings
# whose cause is fixed below instead.

# RidgeCV keeps its default leave-one-out scheme (cv=None). The dataset has
# 10 rows, so each outer training fold has 8; an inner cv=5 would cut those
# into validation folds of one or two rows, and R-squared is undefined on a
# single row, which leaves the alpha search with nothing to rank.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('est', RidgeCV(alphas=np.logspace(-3, 3, 13))),
])

# The rows are ordered by label (five 0s, then five 1s), so the unshuffled
# 5-fold split yields 2-row test folds of which four have a constant target.
# R-squared carries no information on those folds; mean squared error is
# defined on every fold, so all five contribute to the selection.
sfs = SequentialFeatureSelector(
    pipe,
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
sfs.fit(X, y)

# get_support() is a boolean mask over the columns of X.
selected = X.columns[sfs.get_support()]
print("Selected features:", list(selected))

Selected features: ['StudyHours']
