In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:

# Load the dataset
housing = pd.read_csv('Ames_HousePrice.csv', index_col=0)

# Define the target variable
y = housing['SalePrice']

# Select all features excluding the target variable
X = housing.drop('SalePrice', axis=1)

# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Preprocessing pipelines for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [3]:
# Define the Lasso model with increased iterations and set alpha to 10
from sklearn.linear_model import Ridge, Lasso

lasso_model = Lasso(max_iter=10000, alpha=10)



In [4]:
# Create the final pipeline

lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso_model)
])

In [5]:
# Perform cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

lasso_scores = cross_val_score(lasso_pipeline, X, y, cv=5, scoring='r2')
lasso_mean_score = lasso_scores.mean()

print("Mean R^2 cross-validation score for Lasso:", lasso_mean_score)


Mean R^2 cross-validation score for Lasso: 0.9122683119876338


In [6]:
#use LassoCV
from sklearn.linear_model import LassoCV

# Define the LassoCV model
lasso_cv_model = LassoCV(cv=5, max_iter=10000)

# Create the final pipeline
lasso_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lasso_cv_model)
])

# Perform cross-validation
lasso_scores = cross_val_score(lasso_pipeline, X, y, cv=5, scoring='r2')
lasso_mean_score = lasso_scores.mean()

print("Mean R^2 cross-validation score for LassoCV:", lasso_mean_score)

Mean R^2 cross-validation score for LassoCV: 0.913162393485754


In [14]:
#check the best alpha for LassoCV

# Fit the pipeline to the entire dataset
lasso_pipeline.fit(X, y)

# Extract the alpha value
best_alpha = lasso_cv_model.alpha_

print("Mean R^2 cross-validation score for LassoCV:", lasso_mean_score)
print("Best alpha value selected by LassoCV:", best_alpha)


Mean R^2 cross-validation score for LassoCV: 0.913162393485754
Best alpha value selected by LassoCV: 59.30068897867746


In [7]:
# Define the Ridge model
ridge_model = Ridge()

# Create the final pipeline
ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ridge_model)
])

# Perform cross-validation
ridge_scores = cross_val_score(ridge_pipeline, X, y, cv=5, scoring='r2')
ridge_mean_score = ridge_scores.mean()

print("Mean R^2 cross-validation score for Ridge:", ridge_mean_score)

Mean R^2 cross-validation score for Ridge: 0.9133295773863545


In [12]:
#use ridgeCV
from sklearn.linear_model import RidgeCV

# Define the LassoCV model
Ridge_cv_model = RidgeCV(cv=5)

# Create the final pipeline
Ridge_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', Ridge_cv_model)
])

# Perform cross-validation
Ridge_scores = cross_val_score(Ridge_pipeline, X, y, cv=5, scoring='r2')
Ridge_mean_score = Ridge_scores.mean()

print("Mean R^2 cross-validation score for RidgeCV:", Ridge_mean_score)

Mean R^2 cross-validation score for RidgeCV: 0.9115027211501705


In [16]:
#check the best alpha for ridgeCV

# Fit the pipeline to the entire dataset
Ridge_pipeline.fit(X, y)

# Extract the alpha value
best_alpha = Ridge_cv_model.alpha_

print("Mean R^2 cross-validation score for RidgeCV:", lasso_mean_score)
print("Best alpha value selected by RidgeCV:", best_alpha)

Mean R^2 cross-validation score for RidgeCV: 0.913162393485754
Best alpha value selected by RidgeCV: 10.0
