## Recursive Feature Elimination with Cross-Validation (RFECV)

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFECV
import warnings
warnings.filterwarnings("ignore")
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", ConvergenceWarning)


# Load dataset
dataset_path = '/content/drive/MyDrive/Machine Learning/scaled_dataset.csv'
df = pd.read_csv(dataset_path)

X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---- STEP 1: FILTER METHOD (Mutual Information) ----
k = 40  # Select top k features based on mutual information
selector = SelectKBest(score_func=mutual_info_regression, k=k)
X_train_val_selected = selector.fit_transform(X_train_val, y_train_val)
X_test_selected = selector.transform(X_test)
selected_features = X.columns[selector.get_support()]
print("Filter Selected Features:", selected_features)

# ---- STEP 2: WRAPPER METHOD (Recursive Feature Elimination with Cross-Validation) ----
base_model = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rfecv = RFECV(base_model, step=1, cv=kf, scoring='r2')
X_train_val_selected = rfecv.fit_transform(X_train_val_selected, y_train_val)
X_test_selected = rfecv.transform(X_test_selected)

print("Optimal number of features selected:", rfecv.n_features_)

# ---- STEP 3: EMBEDDED METHOD (Lasso Regression for Feature Selection) ----
def lasso_feature_selection(X_train, y_train, min_features=20):
    alpha = 0.001  # Start with a small alpha
    while True:
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train, y_train)
        important_features = np.where(lasso.coef_ != 0)[0]
        if len(important_features) >= min_features:
            break  # Stop when we have at least `min_features`
        alpha *= 0.5  # Reduce alpha to be less aggressive in feature elimination
    return X_train[:, important_features], important_features

# Apply Lasso feature selection
X_train_val_selected, selected_feature_indices = lasso_feature_selection(X_train_val_selected, y_train_val, min_features=20)
X_test_selected = X_test_selected[:, selected_feature_indices]

print("Final Selected Feature Count:", len(selected_feature_indices))

# Train and evaluate models using K-Fold Cross-Validation
models = {
    "Linear Regression": LinearRegression(),
    "Polynomial Regression (Degree=2)": make_pipeline(PolynomialFeatures(2), LinearRegression()),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso()
}

results = {}
for name, model in models.items():
    mae_list, mse_list, rmse_list, r2_list = [], [], [], []
    for train_idx, val_idx in kf.split(X_train_val):
        X_train, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
        y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mae_list.append(mean_absolute_error(y_val, y_pred))
        mse_list.append(mean_squared_error(y_val, y_pred))
        rmse_list.append(np.sqrt(mse_list[-1]))
        r2_list.append(r2_score(y_val, y_pred))
    results[name] = (np.mean(mae_list), np.mean(mse_list), np.mean(rmse_list), np.mean(r2_list))

# Display results
print("\nModel Performance on Validation Set (K-Fold CV):")
for name, (mae, mse, rmse, r2) in results.items():
    print(f"{name}:\n  MAE: {mae:.4f}\n  MSE: {mse:.4f}\n  RMSE: {rmse:.4f}\n  R2 Score: {r2:.4f}\n")

# Select best model (highest average R2 score)
best_model_name = max(results, key=lambda x: results[x][3])
best_model = models[best_model_name]
print(f"Best Model Before Tuning: {best_model_name}\n")


Filter Selected Features: Index(['MSSubClass', 'LotFrontage', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtQual', 'GarageYrBlt', 'GarageCars', 'SalePrice', 'TotalBath',
       'TotalPorchSF', 'HouseAge', 'TotalSqFt', 'QualitySize',
       'TotalBsmtFinScore', 'NeighborhoodQuality', 'ExterQual_ordinal',
       'KitchenQual_ordinal', 'LandContour_HLS', 'LandContour_Lvl',
       'Condition1_Norm', 'Condition1_RRNn', 'Condition2_PosA',
       'BldgType_TwnhsE', 'HouseStyle_2Story', 'HouseStyle_SFoyer',
       'Exterior1st_Plywood', 'Exterior1st_Stone', 'Exterior1st_Stucco',
       'Exterior1st_VinylSd', 'Exterior2nd_HdBoard', 'Exterior2nd_VinylSd',
       'Foundation_CBlock', 'Foundation_PConc', 'Functional_Typ',
       'GarageFinish_Unf', 'SaleType_ConLD', 'SaleType_New', 'SaleType_WD',
       'SaleCondition_Normal'],
      dtype='object')
Optimal number of features selected: 1


KeyboardInterrupt: 

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE

# Load dataset
dataset_path = '/content/drive/MyDrive/Machine Learning/scaled_dataset.csv'
df = pd.read_csv(dataset_path)

X = df.iloc[:, :-1]  # Features
y = df.iloc[:, -1]   # Target

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ---- STEP 1: FILTER METHOD (Mutual Information) ----
k = 40  # Select top k features based on mutual information
selector = SelectKBest(score_func=mutual_info_regression, k=k)
X_train_val_selected = selector.fit_transform(X_train_val, y_train_val)
X_test_selected = selector.transform(X_test)
selected_features = X.columns[selector.get_support()]
print("Filter Selected Features:", selected_features)

# ---- STEP 2: WRAPPER METHOD (Recursive Feature Elimination) ----
base_model = LinearRegression()
rfe = RFE(base_model, n_features_to_select=20)
X_train_val_selected = rfe.fit_transform(X_train_val_selected, y_train_val)
X_test_selected = rfe.transform(X_test_selected)

# ---- STEP 3: EMBEDDED METHOD (Lasso Regression for Feature Selection) ----
# ---- STEP 3: EMBEDDED METHOD (Lasso with Dynamic Alpha) ----
def lasso_feature_selection(X_train, y_train, min_features=20):
    alpha = 0.001  # Start with a small alpha
    while True:
        lasso = Lasso(alpha=alpha)
        lasso.fit(X_train, y_train)

        important_features = np.where(lasso.coef_ != 0)[0]
        if len(important_features) >= min_features:
            break  # Stop when we have at least `min_features`
        alpha *= 0.5  # Reduce alpha to be less aggressive in feature elimination

    return X_train[:, important_features], important_features

# Apply Lasso feature selection
X_train_val_selected, selected_feature_indices = lasso_feature_selection(X_train_val_selected, y_train_val, min_features=20)
X_test_selected = X_test_selected[:, selected_feature_indices]

print("Final Selected Feature Count:", len(selected_feature_indices))


# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# Define models
models = {
    "Linear Regression": LinearRegression(),
    "Polynomial Regression (Degree=2)": make_pipeline(PolynomialFeatures(2), LinearRegression()),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso()
}

# Train and evaluate models using K-Fold Cross-Validation
results = {}
for name, model in models.items():
    mae_list, mse_list, rmse_list, r2_list = [], [], [], []

    for train_idx, val_idx in kf.split(X_train_val):
        X_train, X_val = X_train_val.iloc[train_idx], X_train_val.iloc[val_idx]
        y_train, y_val = y_train_val.iloc[train_idx], y_train_val.iloc[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mae_list.append(mean_absolute_error(y_val, y_pred))
        mse_list.append(mean_squared_error(y_val, y_pred))
        rmse_list.append(np.sqrt(mse_list[-1]))
        r2_list.append(r2_score(y_val, y_pred))

    results[name] = (np.mean(mae_list), np.mean(mse_list), np.mean(rmse_list), np.mean(r2_list))

# Display results
print("\nModel Performance on Validation Set (K-Fold CV):")
for name, (mae, mse, rmse, r2) in results.items():
    print(f"{name}:\n  MAE: {mae:.4f}\n  MSE: {mse:.4f}\n  RMSE: {rmse:.4f}\n  R2 Score: {r2:.4f}\n")

# Select best model (highest average R2 score)
best_model_name = max(results, key=lambda x: results[x][3])
best_model = models[best_model_name]
print(f"Best Model Before Tuning: {best_model_name}\n")

# Hyperparameter tuning for the best model
if isinstance(best_model, Ridge) or isinstance(best_model, Lasso):
    param_grid = {"alpha": [0.01, 0.1, 1, 10, 100]}
elif "Polynomial" in best_model_name:
    param_grid = {"polynomialfeatures__degree": [2, 3, 4]}
else:
    param_grid = {}

if param_grid:
    grid_search = GridSearchCV(best_model, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train_val, y_train_val)
    tuned_model = grid_search.best_estimator_

    # Training Error
    y_train_pred = tuned_model.predict(X_train_val)
    train_mae = mean_absolute_error(y_train_val, y_train_pred)
    train_mse = mean_squared_error(y_train_val, y_train_pred)
    train_rmse = np.sqrt(train_mse)
    train_r2 = r2_score(y_train_val, y_train_pred)

    # Test Error
    y_test_pred = tuned_model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, y_test_pred)

    print("\nModel Performance After Tuning:")
    print(f"{best_model_name} (Training Set):")
    print(f"  MAE: {train_mae:.4f}\n  MSE: {train_mse:.4f}\n  RMSE: {train_rmse:.4f}\n  R2 Score: {train_r2:.4f}\n")

    print(f"{best_model_name} (Test Set):")
    print(f"  MAE: {test_mae:.4f}\n  MSE: {test_mse:.4f}\n  RMSE: {test_rmse:.4f}\n  R2 Score: {test_r2:.4f}\n")

    print(f"Best Parameters: {grid_search.best_params_}")



Filter Selected Features: Index(['YearBuilt', 'YearRemodAdd', 'BsmtQual', 'BedroomAbvGr', 'KitchenAbvGr',
       'FireplaceQu', 'GarageYrBlt', 'GarageCars', 'GarageCond', 'SalePrice',
       'TotalBath', 'TotalPorchSF', 'HouseAge', 'TotalSqFt', 'QualitySize',
       'TotalBsmtFinScore', 'NeighborhoodQuality', 'ExterQual_ordinal',
       'HeatingQC_ordinal', 'LotShape_IR3', 'LotConfig_Inside',
       'LandSlope_Mod', 'Condition1_PosN', 'Condition2_Feedr',
       'HouseStyle_SLvl', 'Exterior1st_Stone', 'Exterior1st_Stucco',
       'Exterior1st_VinylSd', 'Exterior2nd_AsphShn', 'Exterior2nd_HdBoard',
       'Exterior2nd_Stone', 'Foundation_CBlock', 'Foundation_PConc',
       'Heating_Wall', 'Electrical_FuseP', 'Functional_Sev',
       'GarageFinish_RFn', 'SaleType_New', 'SaleType_WD',
       'SaleCondition_Normal'],
      dtype='object')
Final Selected Feature Count: 20

Model Performance on Validation Set (K-Fold CV):
Linear Regression:
  MAE: 0.0107
  MSE: 0.0015
  RMSE: 0.0333
  R2 Scor