In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
warnings.filterwarnings('ignore')

# Read the processed vehicles CSV (the path is relative to the notebook's working directory)
df = pd.read_csv('../data/vehicles_processed.csv')

# Report the frame's dimensions and column names in a single call, then leave
# the preview as the cell's final expression so it renders as a table
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()


Dataset shape: (421344, 14)
Columns: ['id', 'region', 'price', 'year', 'manufacturer', 'model', 'condition', 'fuel', 'odometer', 'title_status', 'transmission', 'type', 'paint_color', 'state']

First few rows:


Unnamed: 0,id,region,price,year,manufacturer,model,condition,fuel,odometer,title_status,transmission,type,paint_color,state
0,7316814884,other_al,33590,2014.0,gmc,other_gmc,good,gas,57923.0,clean,other,pickup,white,al
1,7316814758,other_al,22590,2010.0,chevrolet,silverado 1500,good,gas,71229.0,clean,other,pickup,blue,al
2,7316814989,other_al,39590,2020.0,chevrolet,other_chevrolet,good,gas,19160.0,clean,other,pickup,red,al
3,7316743432,other_al,30990,2017.0,toyota,other_toyota,good,gas,41124.0,clean,other,pickup,red,al
4,7316356412,other_al,15000,2013.0,ford,other_ford,excellent,gas,128000.0,clean,automatic,truck,black,al


In [2]:
# Separate the predictors (X) from the target (y)
# Column roles: these two lists drive the later preprocessing step
categorical_cols = [
    'region', 'manufacturer', 'model', 'condition', 'fuel',
    'title_status', 'transmission', 'type', 'paint_color', 'state',
]
numerical_cols = ['year', 'odometer']

# The feature matrix lists the categorical columns first, then the numerical ones;
# the target is the `price` column
X = df[[*categorical_cols, *numerical_cols]]
y = df['price']

print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Categorical columns: {categorical_cols}",
    f"Numerical columns: {numerical_cols}",
    sep="\n",
)


Features shape: (421344, 12)
Target shape: (421344,)
Categorical columns: ['region', 'manufacturer', 'model', 'condition', 'fuel', 'title_status', 'transmission', 'type', 'paint_color', 'state']
Numerical columns: ['year', 'odometer']


In [3]:
# Holdout train split: 20% of the rows are set aside for testing, and the fixed
# random_state makes the split repeatable across runs.
# (`train_test_split` is already imported in the notebook's first cell, so it is
# not re-imported here.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Check for missing values in the training features (per-column null counts)
print("\nMissing values in training set:")
print(X_train.isnull().sum())


Training set size: 337075
Test set size: 84269

Missing values in training set:
region          0
manufacturer    0
model           0
condition       0
fuel            0
title_status    0
transmission    0
type            0
paint_color     0
state           0
year            0
odometer        0
dtype: int64


In [6]:
# Column-wise preprocessing: one-hot encode the categorical columns and forward
# the numerical columns unchanged; any column not listed is dropped.
# NOTE(review): the encoder emits a dense array and the degree-2 polynomial step
# below expands every encoded column, so the feature count grows roughly
# quadratically with the number of dummy columns — confirm that memory and
# runtime are acceptable before fitting on the full training set.
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
            categorical_cols,
        ),
        ('num', 'passthrough', numerical_cols),  # numericals are not transformed at this stage
    ],
)

# Baseline pipeline: the Lasso and Ridge alphas are left at their defaults here
# because they are meant to be tuned afterwards
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False, degree=2)),
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(Lasso(random_state=42), threshold='median')),
    ('ridge', Ridge(random_state=42)),
])

# Human-readable recap of the pipeline stages, in execution order
print(
    "Pipeline structure defined with the following steps:",
    "1. One-hot encoding for categorical variables",
    "2. Polynomial features (degree 2)",
    "3. Standard scaling",
    "4. Feature selection using Lasso",
    "5. Ridge regression",
    sep="\n",
)


Pipeline structure defined with the following steps:
1. One-hot encoding for categorical variables
2. Polynomial features (degree 2)
3. Standard scaling
4. Feature selection using Lasso
5. Ridge regression


In [None]:
# Find best hyperparameters using GridSearchCV
# The grid covers the Lasso alpha inside the feature selector and the Ridge alpha
# of the final regressor; keys follow the `<step>__<param>` naming of `pipeline`.
param_grid = {
    'selector__estimator__alpha': [0.001, 0.01, 0.1, 1.0, 10.0],  # Lasso alpha for feature selection
    'ridge__alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]              # Ridge alpha for final model
}

# Use scikit-learn's built-in negated-RMSE scorer rather than a hand-rolled lambda.
# GridSearchCV maximises its score, so the RMSE is reported negated (higher = better);
# the sign is flipped back when results are printed below.
rmse_scorer = 'neg_root_mean_squared_error'

lasso_alphas = param_grid['selector__estimator__alpha']
ridge_alphas = param_grid['ridge__alpha']

print("Starting GridSearchCV hyperparameter search...")
print(f"Testing {len(lasso_alphas)} Lasso alpha values: {lasso_alphas}")
print(f"Testing {len(ridge_alphas)} Ridge alpha values: {ridge_alphas}")
print(f"Total combinations: {len(lasso_alphas) * len(ridge_alphas)}")

# Use a smaller subset for hyperparameter tuning to speed up the process
sample_size = min(5000, len(X_train))
X_train_sample = X_train.sample(n=sample_size, random_state=42)
# Align the target to the sampled rows explicitly by index label (`.loc`), so the
# lookup can never be mistaken for positional indexing on the integer index
y_train_sample = y_train.loc[X_train_sample.index]

print(f"Using sample size: {sample_size} for hyperparameter tuning")

# Perform GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring=rmse_scorer,
    n_jobs=-1,  # Use all available processors
    verbose=1
)

print("\nFitting GridSearchCV...")
grid_search.fit(X_train_sample, y_train_sample)

# Extract best parameters and score (the score is the negated RMSE)
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("\n" + "="*60)
print("GRIDSEARCHCV RESULTS")
print("="*60)
print("Best parameters found:")
print(f"  Lasso alpha (feature selection): {best_params['selector__estimator__alpha']}")
print(f"  Ridge alpha (final model): {best_params['ridge__alpha']}")
print(f"  Best cross-validated RMSE: ${-best_score:,.2f}")

# Show the five best candidates; the largest negated RMSE is the smallest RMSE
print("\nTop 5 parameter combinations:")
results_df = pd.DataFrame(grid_search.cv_results_)
top_5 = results_df.nlargest(5, 'mean_test_score')[['params', 'mean_test_score', 'std_test_score']]
for i, (_, row) in enumerate(top_5.iterrows(), 1):
    lasso_alpha = row['params']['selector__estimator__alpha']
    ridge_alpha = row['params']['ridge__alpha']
    mean_score = row['mean_test_score']
    std_score = row['std_test_score']
    print(f"  {i}. Lasso α={lasso_alpha:5.3f}, Ridge α={ridge_alpha:7.1f} → RMSE: ${-mean_score:8,.2f} (±${std_score:6,.2f})")

print("="*60)


Starting GridSearchCV hyperparameter search...
Testing 5 Lasso alpha values: [0.001, 0.01, 0.1, 1.0, 10.0]
Testing 5 Ridge alpha values: [0.1, 1.0, 10.0, 100.0, 1000.0]
Total combinations: 25
Using sample size: 20000 for hyperparameter tuning

Fitting GridSearchCV...
Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [9]:
# Measure performance with best hyperparameters
# NOTE: this cell reads `best_params`, which is produced by the grid-search cell;
# that cell must have finished in the current kernel session before running this one.

# Train final model with best hyperparameters on full training set.
# Cloning the pipeline that was tuned (instead of re-declaring every step) keeps the
# final model structurally identical to what the grid search evaluated, and leaves
# the shared `pipeline` / `preprocessor` objects unfitted.
print("Training final model with best hyperparameters...")
final_pipeline = clone(pipeline).set_params(**best_params)

# Fit on full training data
final_pipeline.fit(X_train, y_train)
print("Training completed!")

# Make predictions
y_train_pred = final_pipeline.predict(X_train)
y_test_pred = final_pipeline.predict(X_test)

# Calculate metrics (RMSE is in the same units as the target; R² is unitless)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nModel Performance:")
print(f"Training RMSE: ${train_rmse:,.2f}")
print(f"Test RMSE: ${test_rmse:,.2f}")
print(f"Training R²: {train_r2:.4f}")
print(f"Test R²: {test_r2:.4f}")

# Check feature selection results.
# The selector's boolean support mask marks the columns it keeps, so summing it
# gives the post-selection feature count without re-running any transforms.
n_features_before = final_pipeline.named_steps['poly'].n_output_features_
n_features_after = int(final_pipeline.named_steps['selector'].get_support().sum())

print("\nFeature Selection Results:")
print(f"Features before selection: {n_features_before}")
print(f"Features after selection: {n_features_after}")
print(f"Features removed: {n_features_before - n_features_after}")

# Visualize model performance
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot 1: Actual vs Predicted for test set (the dashed diagonal marks perfect predictions)
axes[0].scatter(y_test, y_test_pred, alpha=0.3, s=1)
axes[0].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
axes[0].set_xlabel('Actual Price ($)')
axes[0].set_ylabel('Predicted Price ($)')
axes[0].set_title(f'Actual vs Predicted Prices (Test Set)\nR² = {test_r2:.4f}')
axes[0].grid(True, alpha=0.3)

# Plot 2: Residuals plot (actual minus predicted, against the prediction)
residuals = y_test - y_test_pred
axes[1].scatter(y_test_pred, residuals, alpha=0.3, s=1)
axes[1].axhline(y=0, color='r', linestyle='--')
axes[1].set_xlabel('Predicted Price ($)')
axes[1].set_ylabel('Residuals ($)')
axes[1].set_title('Residuals Plot (Test Set)')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Display pipeline summary
print("\n" + "="*60)
print("PIPELINE SUMMARY")
print("="*60)
print("1. ✅ One-hot encoding for categorical variables")
print("2. ✅ Polynomial features (degree 2)")
print("3. ✅ Standard scaling")
print("4. ✅ Feature selection using Lasso")
print("5. ✅ Ridge regression")
print("\nOptimized hyperparameters:")
print(f"  - Lasso alpha (feature selection): {best_params['selector__estimator__alpha']}")
print(f"  - Ridge alpha (final model): {best_params['ridge__alpha']}")
print("="*60)


Training final model with best hyperparameters...


NameError: name 'best_params' is not defined