In [125]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

# Load the processed vehicles dataset
df = pd.read_csv('../data/vehicles_processed.csv')

# Display basic information about the dataset
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
df.head()

Dataset shape: (312846, 14)
Columns: ['id', 'region', 'price', 'year', 'manufacturer', 'model', 'condition', 'fuel', 'odometer', 'title_status', 'transmission', 'type', 'paint_color', 'state']

First few rows:


Unnamed: 0,id,region,price,year,manufacturer,model,condition,fuel,odometer,title_status,transmission,type,paint_color,state
0,7316814884,other_al,33590,2014.0,gmc,other_gmc,good,gas,57923.0,clean,other,pickup,white,al
1,7316814758,other_al,22590,2010.0,chevrolet,silverado 1500,good,gas,71229.0,clean,other,pickup,blue,al
2,7316814989,other_al,39590,2020.0,chevrolet,other_chevrolet,good,gas,19160.0,clean,other,pickup,red,al
3,7316743432,other_al,30990,2017.0,toyota,other_toyota,good,gas,41124.0,clean,other,pickup,red,al
4,7316356412,other_al,15000,2013.0,ford,other_ford,excellent,gas,128000.0,clean,automatic,truck,black,al


In [126]:
# Split into X and Y
# Identify categorical and numerical columns
categorical_cols = ['region', 'manufacturer', 'model', 'condition', 'fuel',  
                   'title_status', 'transmission', 'type', 'paint_color', 'state']
numerical_cols = ['year', 'odometer']

# Prepare features and target
X = df[categorical_cols + numerical_cols]
y = df['price']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

Features shape: (312846, 12)
Target shape: (312846,)
Categorical columns: ['region', 'manufacturer', 'model', 'condition', 'fuel', 'title_status', 'transmission', 'type', 'paint_color', 'state']
Numerical columns: ['year', 'odometer']


In [127]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

Training set size: 250276
Test set size: 62570


## Preprocessing pipeline
- Create age instead of year by subtracting current year
- create age² and odo²

In [128]:
current_year = dt.date.today().year           # use 2025 if you prefer a fixed ref

def year_to_age(x, *, current_year=current_year):
    """Convert model-year → age in years."""
    return current_year - x

age_tf = FunctionTransformer(
    year_to_age,               # ← named function
    feature_names_out='one-to-one'
)

age_pipeline = Pipeline([
    ('to_age',  age_tf),                                             # year  → age
    ('poly',    PolynomialFeatures(degree=2, include_bias=False)),   # add age²
    ("scaler",  StandardScaler())                     # scale age, age²
])

odometer_pipeline = Pipeline([
    ('poly',    PolynomialFeatures(
                    degree=2, include_bias=False)),   # add odo²
    ("scaler",  StandardScaler())                     # scale age, age²
])

def age_odometer_product(X, *, current_year=current_year):
    """
    X has two columns: [year, odometer].
    Returns one column: (age * odometer).
    Works whether X is a DataFrame or an ndarray.
    """
    if hasattr(X, "to_numpy"):                 # pandas -> ndarray
        X = X.to_numpy()

    yr, odo = X[:, 0], X[:, 1]
    age = current_year - yr
    return (age * odo).reshape(-1, 1)

interaction_pipeline = Pipeline([
    ('age_x_odo',
        FunctionTransformer(
            age_odometer_product,
            feature_names_out=lambda _: ['age*odometer']                # nice label
        )
    ),
    ('scaler',  StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat',      OneHotEncoder(drop='first',
                                   sparse_output=False,
                                   handle_unknown='ignore'),
                     categorical_cols),
        ('age', age_pipeline,   ['year']),      # year → age, age²
        ('odometer',  odometer_pipeline, ['odometer']),# odo, odo²
        ('ageXodo',    interaction_pipeline,  ['year', 'odometer']) # age*odo
    ],
    remainder='drop',
    verbose_feature_names_out=False   # cleaner column names downstream
)

## Testing 3 models
1. RidgeCV
2. 

In [129]:
# Use a smaller subset for hyperparameter tuning to speed up the process
sample_size = 10000 # around 10% of the training set
X_train_sample = X_train.sample(n=sample_size, random_state=42)
y_train_sample = y_train[X_train_sample.index]

### RidgeCV

In [130]:
ridge_cv = RidgeCV(
    alphas=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],  # Equivalent to np.logspace(-3, 3, 7)
    scoring='neg_mean_squared_error',
    cv=5,
)

pipeline1 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', ridge_cv)
])

pipeline1

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('age', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function yea...0012B37FCDEE0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,func,<function age...0012B37FCDB20>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...0012B37FCDE40>
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,alphas,"[0.001, 0.01, ...]"
,fit_intercept,True
,scoring,'neg_mean_squared_error'
,cv,5
,gcv_mode,
,store_cv_results,False
,alpha_per_target,False


In [131]:
# Fit the pipeline and get results
pipeline1.fit(X_train_sample, y_train_sample)

print("Training completed successfully!")
print(f"Best alpha selected by RidgeCV: {pipeline1.named_steps['model'].alpha_}")

# Make test predictions to verify it works
y_sample_pred = pipeline1.predict(X_train_sample)
sample_rmse = np.sqrt(mean_squared_error(y_train_sample, y_sample_pred))
print(f"Sample RMSE: ${sample_rmse:,.2f}")

InvalidIndexError: (slice(None, None, None), 0)

### Lasso selector with Ridge

In [120]:
# Create a base pipeline (we'll tune the alpha parameters)
pipeline2 = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectFromModel(Lasso(random_state=42))),
    ('model', Ridge(random_state=42))
])

pipeline2

0,1,2
,steps,"[('preprocessor', ...), ('selector', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('age', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function yea...0012B2F6FA520>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,degree,2
,interaction_only,False
,include_bias,False
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,estimator,Lasso(random_state=42)
,threshold,
,prefit,False
,norm_order,1
,max_features,
,importance_getter,'auto'

0,1,2
,alpha,1.0
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42
,selection,'cyclic'

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,42


In [97]:
# Define parameter grid for GridSearchCV
param_grid = {
    'selector__estimator__alpha': np.logspace(-2, 2, 5),    # 0.01 … 100
    'model__alpha': np.logspace(-2, 4, 7)                   # 0.01 … 10_000
}

print("Starting GridSearchCV hyperparameter search...")
print(f"Testing {len(param_grid['model__alpha'])} Model alpha values: {param_grid['model__alpha']}")
print(f"Testing {len(param_grid['selector__estimator__alpha'])} Selector alpha values: {param_grid['selector__estimator__alpha']}")
print(f"Total combinations: {len(param_grid['model__alpha']) * len(param_grid['selector__estimator__alpha'])}")

print(f"Using sample size: {sample_size} for hyperparameter tuning")

# Perform GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline2,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available processors
    verbose=1
)

print("\nFitting GridSearchCV...")
grid_search.fit(X_train_sample, y_train_sample)
print("GridSearchCV completed!")

Starting GridSearchCV hyperparameter search...
Testing 7 Ridge alpha values: [1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04]
Testing 5 Lasso alpha values: [1.e-02 1.e-01 1.e+00 1.e+01 1.e+02]
Total combinations: 35
Using sample size: 10000 for hyperparameter tuning

Fitting GridSearchCV...
Fitting 3 folds for each of 35 candidates, totalling 105 fits
GridSearchCV completed!


In [98]:
# Extract best parameters and score
best_params   = grid_search.best_params_
best_score    = grid_search.best_score_                 # negative MSE
best_std_mse  = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
best_rmse     = (-best_score) ** 0.5
best_rmse_sd  = best_std_mse / (2 * best_rmse)

print("="*60)
print("GRIDSEARCHCV RESULTS")
print("="*60)
print("Best parameters found:")
print(f"  Model  alpha: {best_params['model__alpha']}")
print(f"  Feature selection  alpha: {best_params['selector__estimator__alpha']}")
print(f"  Best cross-validated RMSE      : ${best_rmse:,.2f} (±${best_rmse_sd:,.2f})")

# Show some additional results
print(f"\nTop 5 parameter combinations:")
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nlargest(5, 'mean_test_score')[['params',
                                                   'mean_test_score',
                                                   'std_test_score']]

for i, (_, row) in enumerate(top_5.iterrows(), 1):
    rid_alpha  = row['params']['model__alpha']
    las_alpha  = row['params']['selector__estimator__alpha']
    mse_mean   = -row['mean_test_score']            # positive MSE
    mse_sd     =  row['std_test_score']             # SD of MSE
    rmse_mean  = mse_mean ** 0.5
    rmse_sd    = mse_sd / (2 * rmse_mean)

    print(f"  {i}. | "
          f"Model α={rid_alpha:7.1f} | "
          f"Selector α={las_alpha:7.1f} | "
          f"→ RMSE: ${rmse_mean:8,.2f} (±${rmse_sd:6,.2f})")

print("="*60)

GRIDSEARCHCV RESULTS
Best parameters found:
  Model  alpha: 1.0
  Feature selection  alpha: 1.0
  Best cross-validated RMSE      : $7,397.56 (±$67.60)

Top 5 parameter combinations:
  1. | Model α=    1.0 | Selector α=    1.0 | → RMSE: $7,397.56 (±$ 67.60)
  2. | Model α=    1.0 | Selector α=    0.0 | → RMSE: $7,397.90 (±$ 65.68)
  3. | Model α=    1.0 | Selector α=    0.1 | → RMSE: $7,397.98 (±$ 66.49)
  4. | Model α=   10.0 | Selector α=    0.0 | → RMSE: $7,402.77 (±$ 61.95)
  5. | Model α=   10.0 | Selector α=    0.1 | → RMSE: $7,403.30 (±$ 62.38)


# Old stuff below

In [None]:
# Create the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)  # Keep numerical columns as-is for now
    ],
    remainder='drop'
)

# keep 30 best features (or tune n_features_to_select in GridSearch)
sfs = SequentialFeatureSelector(
    Ridge(alpha=1.0, random_state=42),
    n_features_to_select=20,
    direction='forward',
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1
)

# Create a base pipeline (we'll tune the alpha parameters)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', sfs),
    ('ridge', Ridge(random_state=42))
])

pipeline

In [None]:
# Find best hyperparameters using GridSearchCV

# Define parameter grid for GridSearchCV
param_grid = {
    'ridge__alpha': np.logspace(-2, 3, 6)                    # 0.01 … 1000
}

print("Starting GridSearchCV hyperparameter search...")
print(f"Testing {len(param_grid['ridge__alpha'])} Ridge alpha values: {param_grid['ridge__alpha']}")
print(f"Total combinations: {len(param_grid['ridge__alpha'])}")

# Use a smaller subset for hyperparameter tuning to speed up the process
sample_size = 1000
X_train_sample = X_train.sample(n=sample_size, random_state=42)
y_train_sample = y_train[X_train_sample.index]

print(f"Using sample size: {sample_size} for hyperparameter tuning")

# Perform GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available processors
    verbose=1
)

print("\nFitting GridSearchCV...")
grid_search.fit(X_train_sample, y_train_sample)
print("GridSearchCV completed!")

In [None]:
# Extract best parameters and score
best_params   = grid_search.best_params_
best_score    = grid_search.best_score_                 # negative MSE
best_std_mse  = grid_search.cv_results_['std_test_score'][grid_search.best_index_]
best_rmse     = (-best_score) ** 0.5
best_rmse_sd  = best_std_mse / (2 * best_rmse)

print("="*60)
print("GRIDSEARCHCV RESULTS")
print("="*60)
print("Best parameters found:")
print(f"  Ridge  alpha (final model)     : {best_params['ridge__alpha']}")
print(f"  Best cross-validated RMSE      : ${best_rmse:,.2f} (±${best_rmse_sd:,.2f})")

# Show some additional results
print(f"\nTop 5 parameter combinations:")
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nlargest(5, 'mean_test_score')[['params',
                                                   'mean_test_score',
                                                   'std_test_score']]

for i, (_, row) in enumerate(top_5.iterrows(), 1):
    rid_alpha  = row['params']['ridge__alpha']
    mse_mean   = -row['mean_test_score']            # positive MSE
    mse_sd     =  row['std_test_score']             # SD of MSE
    rmse_mean  = mse_mean ** 0.5
    rmse_sd    = mse_sd / (2 * rmse_mean)

    print(f"  {i}. | "
          f"Ridge α={rid_alpha:7.1f} "
          f"→ RMSE: ${rmse_mean:8,.2f} (±${rmse_sd:6,.2f})")

print("="*60)


In [None]:
# results_df
results_df[results_df['param_selector__threshold'] == 'mean']

In [None]:
# ── Chart 1 – Lasso α (feature selector) vs mean_test_score ───────────────────
plt.figure(figsize=(6, 4))
plt.scatter(
    results_df['param_selector__estimator__alpha'],
    results_df['mean_test_score'],
    marker='o'
)
plt.xscale('log')                       # ← log-scale on x-axis
plt.xlabel('Lasso α (log scale)')
plt.ylabel('Mean CV score')
plt.title('Effect of Lasso α on CV score')
plt.grid(True, which='both', ls='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# ── Chart 2 – Ridge α (final model) vs mean_test_score ────────────────────────
plt.figure(figsize=(6, 4))
plt.scatter(
    results_df['param_ridge__alpha'],
    results_df['mean_test_score'],
    marker='o'
)
plt.xscale('log')                       # ← log-scale on x-axis
plt.xlabel('Ridge α (log scale)')
plt.ylabel('Mean CV score')
plt.title('Effect of Ridge α on CV score')
plt.grid(True, which='both', ls='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Measure performance with best hyperparameters
from sklearn.metrics import mean_squared_error, r2_score

# Train final model with best hyperparameters on full training set
print("Training final model with best hyperparameters...")
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(
        Lasso(alpha=best_params['selector__estimator__alpha'], random_state=42), 
        threshold='median'
    )),
    ('ridge', Ridge(alpha=best_params['ridge__alpha'], random_state=42))
])

# Fit on full training data
final_pipeline.fit(X_train, y_train)
print("Training completed!")

# Make predictions
y_train_pred = final_pipeline.predict(X_train)
y_test_pred = final_pipeline.predict(X_test)

# Calculate metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"\nModel Performance:")
print(f"Training RMSE: ${train_rmse:,.2f}")
print(f"Test RMSE: ${test_rmse:,.2f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")

# Check feature selection results
n_features_before = final_pipeline.named_steps['poly'].n_output_features_
n_features_after = final_pipeline.named_steps['selector'].transform(
    final_pipeline.named_steps['scaler'].transform(
        final_pipeline.named_steps['poly'].transform(
            final_pipeline.named_steps['preprocessor'].transform(X_train[:1])
        )
    )
).shape[1]

print(f"\nFeature Selection Results:")
print(f"Features before selection: {n_features_before}")
print(f"Features after selection: {n_features_after}")
print(f"Features removed: {n_features_before - n_features_after}")

# Visualize model performance
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Actual vs Predicted for test set
axes[0].scatter(y_test, y_test_pred, alpha=0.3, s=1)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual Price ($)')
axes[0].set_ylabel('Predicted Price ($)')
axes[0].set_title(f'Actual vs Predicted Prices (Test Set)\nR² = {test_r2:.4f}')
axes[0].grid(True, alpha=0.3)

# Plot 2: Residuals plot
residuals = y_test - y_test_pred
axes[1].scatter(y_test_pred, residuals, alpha=0.3, s=1)
axes[1].axhline(y=0, color='r', linestyle='--')
axes[1].set_xlabel('Predicted Price ($)')
axes[1].set_ylabel('Residuals ($)')
axes[1].set_title('Residuals Plot (Test Set)')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Display pipeline summary
print("\n" + "="*60)
print("PIPELINE SUMMARY")
print("="*60)
print("1. ✅ One-hot encoding for categorical variables")
print("2. ✅ Polynomial features (degree 2)")
print("3. ✅ Standard scaling")
print("4. ✅ Feature selection using Lasso")
print("5. ✅ Ridge regression")
print(f"\nOptimized hyperparameters:")
print(f"  - Lasso alpha (feature selection): {best_params['selector__estimator__alpha']}")
print(f"  - Ridge alpha (final model): {best_params['ridge__alpha']}")
print("="*60)


In [None]:
# Measure performance with best hyperparameters
from sklearn.metrics import mean_squared_error, r2_score

# Train final model with best hyperparameters on full training set
print("Training final model with best hyperparameters...")
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(
        Lasso(alpha=best_params['selector__estimator__alpha'], random_state=42), 
        threshold='median'
    )),
    ('ridge', Ridge(alpha=best_params['ridge__alpha'], random_state=42))
])

# Fit on full training data
final_pipeline.fit(X_train, y_train)
print("Training completed!")

# Make predictions
y_train_pred = final_pipeline.predict(X_train)
y_test_pred = final_pipeline.predict(X_test)

# Calculate metrics
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"\nModel Performance:")
print(f"Training RMSE: ${train_rmse:,.2f}")
print(f"Test RMSE: ${test_rmse:,.2f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")

# Check feature selection results
n_features_before = final_pipeline.named_steps['poly'].n_output_features_
n_features_after = final_pipeline.named_steps['selector'].transform(
    final_pipeline.named_steps['scaler'].transform(
        final_pipeline.named_steps['poly'].transform(
            final_pipeline.named_steps['preprocessor'].transform(X_train[:1])
        )
    )
).shape[1]

print(f"\nFeature Selection Results:")
print(f"Features before selection: {n_features_before}")
print(f"Features after selection: {n_features_after}")
print(f"Features removed: {n_features_before - n_features_after}")

# Visualize model performance
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Actual vs Predicted for test set
axes[0].scatter(y_test, y_test_pred, alpha=0.3, s=1)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual Price ($)')
axes[0].set_ylabel('Predicted Price ($)')
axes[0].set_title(f'Actual vs Predicted Prices (Test Set)\nR² = {test_r2:.4f}')
axes[0].grid(True, alpha=0.3)

# Plot 2: Residuals plot
residuals = y_test - y_test_pred
axes[1].scatter(y_test_pred, residuals, alpha=0.3, s=1)
axes[1].axhline(y=0, color='r', linestyle='--')
axes[1].set_xlabel('Predicted Price ($)')
axes[1].set_ylabel('Residuals ($)')
axes[1].set_title('Residuals Plot (Test Set)')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Display pipeline summary
print("\n" + "="*60)
print("PIPELINE SUMMARY")
print("="*60)
print("1. ✅ One-hot encoding for categorical variables")
print("2. ✅ Polynomial features (degree 2)")
print("3. ✅ Standard scaling")
print("4. ✅ Feature selection using Lasso")
print("5. ✅ Ridge regression")
print(f"\nOptimized hyperparameters:")
print(f"  - Lasso alpha (feature selection): {best_params['selector__estimator__alpha']}")
print(f"  - Ridge alpha (final model): {best_params['ridge__alpha']}")
print("="*60)
