In [194]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import PredictionErrorDisplay
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [80]:
ames = pd.read_csv('ames.csv', index_col=0)
ames.head(10)

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,SaleType,SaleCondition,TotalBath,TotalSF,HQSF,yrsbtwn,BedxBath,RoomsxBath,FireplaceYN,TotalPorchSF
1,909176150,856,126000,30,RL,58.769231,7890,Pave,No Alley,Reg,...,WD,Normal,2.0,1712.0,1712.0,11,4.0,8.0,1,166
2,905476230,1049,139500,120,RL,42.0,4235,Pave,No Alley,Reg,...,WD,Normal,3.0,2098.0,2098.0,0,6.0,15.0,0,105
3,911128020,1001,124900,30,C (all),60.0,6060,Pave,No Alley,Reg,...,WD,Normal,1.0,1838.0,1838.0,77,2.0,5.0,0,282
4,535377150,1039,114000,70,RL,80.0,8146,Pave,No Alley,Reg,...,WD,Normal,1.0,1444.0,1444.0,103,2.0,6.0,0,279
5,534177230,1665,227000,60,RL,70.0,8400,Pave,No Alley,Reg,...,WD,Normal,3.5,2475.0,2475.0,0,10.5,21.0,0,45
6,908128060,1922,198500,85,RL,64.0,7301,Pave,No Alley,Reg,...,ConLD,Normal,3.0,1922.0,1922.0,0,12.0,21.0,1,177
7,902135020,936,93000,20,RM,60.0,6000,Pave,Pave,Reg,...,WD,Normal,1.0,1872.0,1872.0,0,2.0,4.0,0,144
8,528228540,1246,187687,20,RL,53.0,3710,Pave,No Alley,Reg,...,New,Partial,2.0,2392.0,2392.0,1,4.0,10.0,1,124
9,923426010,889,137500,20,RL,74.0,12395,Pave,No Alley,Reg,...,WD,Normal,1.0,1753.0,1753.0,0,3.0,6.0,0,0
10,908186050,1072,140000,180,RM,35.0,3675,Pave,No Alley,Reg,...,WD,Normal,2.0,1619.0,1619.0,0,4.0,10.0,0,44


### I. Setup
Load the dataset- remove columns with missing values. Identify numeric and categorical features and target.

In [81]:
# Load the dataset and remove columns with missing values
ames = pd.read_csv('ames.csv')

# Identify numeric and categorical features, excluding 'PID' and 'SalePrice'
ames['MSSubClass'] = ames['MSSubClass'].astype(str) #Nominal variable of 'string' integers
numeric_features = ames.select_dtypes(include=['int64', 'float64']).drop(columns=['PID', 'SalePrice', 'Unnamed: 0']).columns
categorical_features = ames.select_dtypes(include=['object']).columns
X = ames[numeric_features.tolist() + categorical_features.tolist()]


# Target variable
y = ames['SalePrice']

### II. Set up Pipeline:
For pre-processing and regression with 5-fold cross-validation. Pass through numerical data, ordinally encode ordinal categorical variables, one-hot encode all other categorical variables and drop none. Instantiate RandomForest model.

In [142]:
# Define the ordinal_categories dictionary
ordinal_categories = {
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    ('ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual'): ['Po', 'Fa', 'TA', 'Gd', 'Ex'],
    ('BsmtQual', 'BsmtCond'): ['No Bsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'BsmtExposure': ['No Bsmt', 'No', 'Mn', 'Av', 'Gd'],
    ('BsmtFinType1', 'BsmtFinType2'): ['No Bsmt', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'Electrical': ['FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    'FireplaceQu': ['No Fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'GarageFinish': ['No Garage', 'Unf', 'RFn', 'Fin'],
    ('GarageQual', 'GarageCond'): ['No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    'PavedDrive': ['N', 'P', 'Y'],
    'PoolQC': ['No Pool', 'Fa', 'TA', 'Gd', 'Ex'],
    'Fence': ['No Fence', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']
}


# Function to extract categories for each feature
def get_categories_dict(ordinal_categories):
    categories_dict = {}
    for key, value in ordinal_categories.items():
        if isinstance(key, tuple):
            for sub_key in key:
                categories_dict[sub_key] = value
        else:
            categories_dict[key] = value
    return categories_dict

# Extract categories for each feature
categories_dict = get_categories_dict(ordinal_categories)

# Separate feature names and their corresponding categories
feature_names = list(categories_dict.keys())
categories = [categories_dict[feature] for feature in feature_names]

# Define transformers for numerical and categorical features
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
ordinal_categorical_features = feature_names

non_ordinal_categorical_features = [feature for feature in categorical_features if feature not in ordinal_categorical_features]

ordinal_encoder = OrdinalEncoder(categories=categories)

categorical_transformer = OneHotEncoder(drop=None, handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))
])

### III. Run Pipeline: 
Report the untuned RandomForest R2- basic idea of how the model performs.

In [127]:
import time
start_time = time.time()

# Perform cross-validation and store results in a dictionary
cv_results = {}
scores = cross_val_score(pipeline, X, y)
cv_results = round(scores.mean(), 6)
# Output the mean cross-validation scores
print(cv_results)

#an untuned RandomForestRegressor has an R2 of .9024

print(f"{time.time() - start_time} seconds")

0.90241
27.17897605895996 seconds


### IV. Tuning: Hyperparameters (attempt 1, unsuccessful: 
(Tune from 0.9032 *down* to .9007--- what did I do wrong?)
Find optimal parameters for RandomForest using GridSearchCV.

In [139]:
# Define the parameter grid
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 400, 500, 600], # # of trees in forest
}

#{'regressor__n_estimators': 200}
#0.9031478722700008

param_grid_1 = {
    'regressor__min_samples_leaf': [1, 2, 3, 4],
    'regressor__n_estimators': [125, 150, 175, 200, 225, 250, 275]
}


#Best Parameters: {'regressor__min_samples_leaf': 1, 
#'regressor__n_estimators': 200}
#Best R² Score: 0.9031478722700008

param_grid_2 = {
    'regressor__min_samples_leaf': [1, 2],
    'regressor__n_estimators': [180, 190, 200, 210, 220]
}

#Best Parameters: {'regressor__min_samples_leaf': 1, 
#'regressor__n_estimators': 190}
#Best R² Score: 0.9032154211851081

param_grid_3 = {
    'regressor__min_samples_leaf': [1, 2],
    'regressor__n_estimators': [185, 190, 195],
    'regressor__max_depth': [None, 1, 5, 10],
}

#Best Parameters: {'regressor__max_depth': None, 
#'regressor__min_samples_leaf': 1, 
#'regressor__n_estimators': 190} Best R² Score: 0.9032154211851081

param_grid_4 = {
    'regressor__min_samples_leaf': [1],
    'regressor__n_estimators': [188, 190, 192],
    'regressor__max_depth': [None, 1, 5],
}

#Best Parameters: {'regressor__max_depth': None, 
#'regressor__min_samples_leaf': 1, 
#'regressor__n_estimators': 192}
#Best R² Score: 0.9032616311709502

###############
param_grid_5 = {
    'regressor__min_samples_leaf': [1],
    'regressor__n_estimators': [188, 189, 190, 191, 192, 193, 194, 195],
    'regressor__max_depth': [None, 1, 2],
    'regressor__max_features': ['sqrt', 'log2']
}
##HMMMMMMMMMMMMMMMMMMMMMMMM WHAT??


# Initialize the RandomForestRegressor
random_forest = RandomForestRegressor(random_state=0)

# Set up the pipeline (assuming preprocessor is already defined)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', random_forest)
])

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid_5, 
                           cv=5, n_jobs=-1, verbose=2, scoring='r2')

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)
print(f"{time.time() - start_time} seconds")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'regressor__max_depth': None, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 1, 'regressor__n_estimators': 189}
Best R² Score: 0.9007973972353549
44.06819987297058 seconds
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=188; total time=   1.4s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=188; total time=   1.3s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=189; total time=   1.3s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=190; total time=   1.3s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=191; total time=   1.4s
[CV]

[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=188; total time=   1.5s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=189; total time=   1.4s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=189; total time=   1.4s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=190; total time=   1.3s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=191; total time=   1.3s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=192; total time=   1.4s
[CV] END regressor__max_depth=None, regressor__max_features=sqrt, regressor__min_samples_leaf=1, regressor__n_estimators=193; tota

In [None]:
#Features to tun
#     'regressor__n_estimators': [500, 550, 600, 650, 700, 750, 800], # # of trees in forest
#     'regressor__max_depth': [None, 10], #The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
#     'regressor__min_samples_split': [2, 10], #The minimum number of samples required to split an internal node
#     'regressor__min_samples_leaf': [1], #default 1
#     'regressor__max_features': ['sqrt', 'log2'] # 

## V. Tuning Hyperparameters (attempt 2)


In [143]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [100, 200, 300, 400, 500],
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters (Grid Search): {'regressor__n_estimators': 200}
Best score (Grid Search): 0.9031
438.27464294433594 seconds


In [155]:
def report_importances(mygridsearch):
    """ report_importances: returns df of importances of best model from a GridSearchCV
        mygridsearch: gridsearch object """
    best_model = mygridsearch.best_estimator_
    best_rf = best_model.named_steps['regressor']
    feature_importances = best_rf.feature_importances_
    preprocessor = best_model.named_steps['preprocessor']
    transformed_feature_names = preprocessor.get_feature_names_out()

    # Create a DataFrame for better readability
    import pandas as pd

    feature_importances_df = pd.DataFrame({
        'feature': transformed_feature_names,
        'importance': feature_importances
    })

    # Sort features by importance
    feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

    return feature_importances_df

In [157]:
report_importances(grid_search).head(20)

Unnamed: 0,feature,importance
250,num__HQSF,0.370433
216,num__OverallQual,0.356069
249,num__TotalSF,0.075099
218,num__YearBuilt,0.025838
219,num__YearRemodAdd,0.009622
215,num__LotArea,0.009439
238,num__GarageArea,0.009237
213,num__GrLivArea,0.008514
199,ord__KitchenQual,0.008005
221,num__BsmtFinSF1,0.00707


In [158]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [5, 10, 15]
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters (Grid Search): {'regressor__max_depth': 15, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9034
5.668258051077525 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.382148
216,num__OverallQual,0.35053
249,num__TotalSF,0.068687
218,num__YearBuilt,0.027987
215,num__LotArea,0.009271
213,num__GrLivArea,0.008913
219,num__YearRemodAdd,0.008639
238,num__GarageArea,0.008382
221,num__BsmtFinSF1,0.00729
223,num__BsmtUnfSF,0.007289


In [159]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [80, 120, 160, 200, 300],
    'regressor__max_depth': [12, 15, 17, 20, 25]
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters (Grid Search): {'regressor__max_depth': 15, 'regressor__n_estimators': 80}
Best score (Grid Search): 0.9037
18.870924401283265 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.376098
216,num__OverallQual,0.354317
249,num__TotalSF,0.07149
218,num__YearBuilt,0.027847
215,num__LotArea,0.009369
213,num__GrLivArea,0.008739
219,num__YearRemodAdd,0.008678
238,num__GarageArea,0.008185
223,num__BsmtUnfSF,0.007335
221,num__BsmtFinSF1,0.006901


In [164]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [60, 80, 100, 120],
    'regressor__max_depth': [10, 15, 20],
    'regressor__max_features': [.5, 'sqrt']
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9061
3.1776959180831907 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [165]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [60, 100, 140],
    'regressor__max_depth': [15, 20, 25],
    'regressor__max_features': [.5, 'sqrt'],
    'regressor__min_samples_split': [2, 5, 10]
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9061
7.3056085507074995 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [167]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [60, 100, 140],
    'regressor__max_depth': [15, 20, 25],
    'regressor__max_features': [.5, 'sqrt'],
    'regressor__min_samples_split': [2, 5]
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9061
5.275697465737661 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [168]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [80, 100, 120],
    'regressor__max_depth': [15, 20, 25],
    'regressor__max_features': [.5, 'sqrt'],
    'regressor__min_samples_split': [2, 3],
    'regressor__min_samples_leaf': [1, 2, 3]
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9061
13.028723514080047 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [169]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [90, 100, 110],
    'regressor__max_depth': [18, 20, 22],
    'regressor__max_features': [.25, .5, .75],
    'regressor__min_samples_split': [2],
    'regressor__min_samples_leaf': [1, 2]
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9061
11.932632112503052 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [170]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [90, 100, 110],
    'regressor__max_depth': [20],
    'regressor__max_features': [.4, .5, .6],
    'regressor__min_samples_split': [2],
    'regressor__min_samples_leaf': [1]
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters (Grid Search): {'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Best score (Grid Search): 0.9061
2.448677186171214 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [172]:
pipeline_tuned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, max_depth = 20, max_features = .5, min_samples_leaf = 1,
                                        min_samples_split = 2, random_state=0))
])

# Perform cross-validation and store results in a dictionary
cv_results = {}
scores = cross_val_score(pipeline_tuned, X, y)
cv_results = round(scores.mean(), 6)
# Output the mean cross-validation scores
print(cv_results)

0.906054


In [184]:
# Fit the pipeline with the best hyperparameters
pipeline_tuned.fit(X, y)

# Get the feature importances from the best model
best_rf = pipeline_tuned.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# Print the feature importances
feature_importances_df.head(20)

Unnamed: 0,feature,importance
250,num__HQSF,0.2898
216,num__OverallQual,0.229792
249,num__TotalSF,0.160171
237,num__GarageCars,0.027073
218,num__YearBuilt,0.026291
196,ord__ExterQual,0.026284
200,ord__BsmtQual,0.019138
199,ord__KitchenQual,0.016928
248,num__TotalBath,0.014949
213,num__GrLivArea,0.014862


In [None]:
##EXPERIMENT WITH OPTUNA

In [191]:
start_time = time.time()
import optuna

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    n_estimators = trial.suggest_int('regressor__n_estimators', 50, 500, step=50)
    max_depth = trial.suggest_int('regressor__max_depth', 2, 32, step=2)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 2, 20, step=2)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 1, 9, step=2)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .5, 1])
    # Set the suggested hyperparameters in the pipeline
    pipeline.set_params(
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Set the best hyperparameters in the pipeline
pipeline.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline.fit(X, y)

# Get the feature importances from the best model
best_rf = pipeline.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

[I 2024-06-18 16:03:34,696] A new study created in memory with name: no-name-e5ea64c1-9e34-49c8-bfb0-5edb8433b5f8
[I 2024-06-18 16:03:37,189] Trial 0 finished with value: 0.8750638198084367 and parameters: {'regressor__n_estimators': 150, 'regressor__max_depth': 22, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf': 7, 'regressor__max_features': 'sqrt'}. Best is trial 0 with value: 0.8750638198084367.
[I 2024-06-18 16:03:56,952] Trial 1 finished with value: 0.8878886533080749 and parameters: {'regressor__n_estimators': 350, 'regressor__max_depth': 6, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 0.5}. Best is trial 1 with value: 0.8878886533080749.
[I 2024-06-18 16:04:08,457] Trial 2 finished with value: 0.8985173194695895 and parameters: {'regressor__n_estimators': 150, 'regressor__max_depth': 22, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 7, 'regressor__max_features': 0.5}. Best is trial 2 with v

[I 2024-06-18 16:11:35,567] Trial 27 finished with value: 0.9042392001697618 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 32, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 25 with value: 0.905232426190614.
[I 2024-06-18 16:11:40,845] Trial 28 finished with value: 0.8848354813094635 and parameters: {'regressor__n_estimators': 300, 'regressor__max_depth': 24, 'regressor__min_samples_split': 12, 'regressor__min_samples_leaf': 5, 'regressor__max_features': 'sqrt'}. Best is trial 25 with value: 0.905232426190614.
[I 2024-06-18 16:11:43,576] Trial 29 finished with value: 0.3256423859685663 and parameters: {'regressor__n_estimators': 450, 'regressor__max_depth': 18, 'regressor__min_samples_split': 18, 'regressor__min_samples_leaf': 7, 'regressor__max_features': 1}. Best is trial 25 with value: 0.905232426190614.
[I 2024-06-18 16:11:50,148] Trial 30 finished with value: 0.9011204713294975 and para

[I 2024-06-18 16:22:17,531] Trial 54 finished with value: 0.9036100321197488 and parameters: {'regressor__n_estimators': 200, 'regressor__max_depth': 32, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 43 with value: 0.9055325648321754.
[I 2024-06-18 16:22:45,936] Trial 55 finished with value: 0.9050036344263038 and parameters: {'regressor__n_estimators': 250, 'regressor__max_depth': 30, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 43 with value: 0.9055325648321754.
[I 2024-06-18 16:23:14,335] Trial 56 finished with value: 0.9047664727075005 and parameters: {'regressor__n_estimators': 250, 'regressor__max_depth': 30, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 43 with value: 0.9055325648321754.
[I 2024-06-18 16:23:44,121] Trial 57 finished with value: 0.9033885512455703 and par

[I 2024-06-18 16:33:53,187] Trial 81 finished with value: 0.9045673282923816 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 28, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 43 with value: 0.9055325648321754.
[I 2024-06-18 16:34:46,381] Trial 82 finished with value: 0.9050047565956509 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 28, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 43 with value: 0.9055325648321754.
[I 2024-06-18 16:35:38,276] Trial 83 finished with value: 0.9043807392990626 and parameters: {'regressor__n_estimators': 450, 'regressor__max_depth': 30, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 43 with value: 0.9055325648321754.
[I 2024-06-18 16:36:17,939] Trial 84 finished with value: 0.9039118781057425 and para

Best hyperparameters: {'regressor__n_estimators': 250, 'regressor__max_depth': 32, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}
41.17306566635768 minutes


Unnamed: 0,feature,importance
250,num__HQSF,3.017455e-01
216,num__OverallQual,2.450887e-01
249,num__TotalSF,1.532325e-01
196,ord__ExterQual,3.081399e-02
218,num__YearBuilt,2.680392e-02
...,...,...
1,cat__MSSubClass_150,1.947311e-08
33,cat__Utilities_NoSewr,5.936749e-09
16,cat__MSZoning_A (agr),0.000000e+00
106,cat__RoofMatl_Roll,0.000000e+00


In [187]:
pipeline_tuned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(max_depth=16, min_samples_split=4, n_estimators=250))
])

# Fit the pipeline with the best hyperparameters
pipeline_tuned.fit(X, y)

# Get the feature importances from the best model
best_rf = pipeline.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)

# Print the feature importances
feature_importances_df

Unnamed: 0,feature,importance
216,num__OverallQual,3.725238e-01
250,num__HQSF,3.577721e-01
249,num__TotalSF,7.102200e-02
218,num__YearBuilt,2.649099e-02
219,num__YearRemodAdd,9.407810e-03
...,...,...
106,cat__RoofMatl_Roll,1.524897e-08
186,cat__SaleType_VWD,1.237317e-08
114,cat__Exterior1st_CBlock,3.095026e-09
1,cat__MSSubClass_150,1.044043e-09


In [188]:
feature_importances_df.head(20)

Unnamed: 0,feature,importance
216,num__OverallQual,0.372524
250,num__HQSF,0.357772
249,num__TotalSF,0.071022
218,num__YearBuilt,0.026491
219,num__YearRemodAdd,0.009408
215,num__LotArea,0.008947
213,num__GrLivArea,0.00844
199,ord__KitchenQual,0.008168
238,num__GarageArea,0.007903
221,num__BsmtFinSF1,0.007706


In [177]:
study.best_trial

FrozenTrial(number=43, state=1, values=[0.9031357972416861], datetime_start=datetime.datetime(2024, 6, 18, 13, 56, 15, 60376), datetime_complete=datetime.datetime(2024, 6, 18, 13, 57, 10, 936187), params={'regressor__n_estimators': 250, 'regressor__max_depth': 16, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'regressor__n_estimators': IntDistribution(high=500, log=False, low=50, step=50), 'regressor__max_depth': IntDistribution(high=32, log=False, low=2, step=2), 'regressor__min_samples_split': IntDistribution(high=20, log=False, low=2, step=2), 'regressor__min_samples_leaf': IntDistribution(high=19, log=False, low=1, step=2)}, trial_id=43, value=None)

RandomForestRegressor(max_depth=16, min_samples_split=4, n_estimators=250), 90.3.

In [180]:
pipeline['regressor']

NEXT STEPS: Try dropping Qual, try log of y

In [192]:
#log transformation of y

start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    n_estimators = trial.suggest_int('regressor__n_estimators', 50, 500, step=50)
    max_depth = trial.suggest_int('regressor__max_depth', 2, 32, step=2)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 2, 20, step=2)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 1, 9, step=2)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .5, 1])
    # Set the suggested hyperparameters in the pipeline
    pipeline.set_params(
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, X, np.log(y), cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Set the best hyperparameters in the pipeline
pipeline.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline.fit(X, np.log(y))

# Get the feature importances from the best model
best_rf = pipeline.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

[I 2024-06-19 08:44:59,456] A new study created in memory with name: no-name-73e0a1a8-9ff4-4e4f-a936-5797dea0450c
[I 2024-06-19 08:45:07,475] Trial 0 finished with value: 0.8717373918714768 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 12, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf': 9, 'regressor__max_features': 'sqrt'}. Best is trial 0 with value: 0.8717373918714768.
[I 2024-06-19 08:45:11,649] Trial 1 finished with value: 0.8131731522812553 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 4, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt'}. Best is trial 0 with value: 0.8717373918714768.
[I 2024-06-19 08:45:53,982] Trial 2 finished with value: 0.9005806294414199 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 24, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 2 wit

[I 2024-06-19 08:54:04,717] Trial 27 finished with value: 0.7427889363037586 and parameters: {'regressor__n_estimators': 250, 'regressor__max_depth': 2, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 0.5}. Best is trial 13 with value: 0.9017439199005641.
[I 2024-06-19 08:54:06,370] Trial 28 finished with value: 0.8147837751585423 and parameters: {'regressor__n_estimators': 150, 'regressor__max_depth': 26, 'regressor__min_samples_split': 12, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 1}. Best is trial 13 with value: 0.9017439199005641.
[I 2024-06-19 08:54:30,013] Trial 29 finished with value: 0.8884461710640312 and parameters: {'regressor__n_estimators': 350, 'regressor__max_depth': 14, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf': 9, 'regressor__max_features': 0.5}. Best is trial 13 with value: 0.9017439199005641.
[I 2024-06-19 08:54:34,787] Trial 30 finished with value: 0.8934594438636948 and param

[I 2024-06-19 09:07:11,431] Trial 54 finished with value: 0.9007207276520355 and parameters: {'regressor__n_estimators': 350, 'regressor__max_depth': 22, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 41 with value: 0.9018783351441305.
[I 2024-06-19 09:08:16,327] Trial 55 finished with value: 0.9006914239253776 and parameters: {'regressor__n_estimators': 450, 'regressor__max_depth': 28, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 41 with value: 0.9018783351441305.
[I 2024-06-19 09:08:54,910] Trial 56 finished with value: 0.8995001504712793 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 26, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 0.5}. Best is trial 41 with value: 0.9018783351441305.
[I 2024-06-19 09:09:30,947] Trial 57 finished with value: 0.9011014544498112 and para

[I 2024-06-19 09:22:52,321] Trial 81 finished with value: 0.9014010355965661 and parameters: {'regressor__n_estimators': 300, 'regressor__max_depth': 30, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 41 with value: 0.9018783351441305.
[I 2024-06-19 09:23:29,430] Trial 82 finished with value: 0.900714870914968 and parameters: {'regressor__n_estimators': 300, 'regressor__max_depth': 32, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 41 with value: 0.9018783351441305.
[I 2024-06-19 09:24:00,664] Trial 83 finished with value: 0.9012381729624185 and parameters: {'regressor__n_estimators': 250, 'regressor__max_depth': 30, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 41 with value: 0.9018783351441305.
[I 2024-06-19 09:24:40,095] Trial 84 finished with value: 0.9003790772270384 and param

Best hyperparameters: {'regressor__n_estimators': 300, 'regressor__max_depth': 30, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}
46.76164090236028 minutes


Unnamed: 0,feature,importance
250,num__HQSF,2.620348e-01
216,num__OverallQual,2.550627e-01
249,num__TotalSF,1.441834e-01
196,ord__ExterQual,3.367337e-02
218,num__YearBuilt,3.278520e-02
...,...,...
117,cat__Exterior1st_ImStucc,8.828986e-08
177,cat__MiscFeature_TenC,5.706126e-08
16,cat__MSZoning_A (agr),4.766141e-08
104,cat__RoofMatl_Membran,3.785849e-08


In [193]:
#log transformation of y- finer tuning
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    n_estimators = trial.suggest_int('regressor__n_estimators', 200, 400, step=20)
    max_depth = trial.suggest_int('regressor__max_depth', 26, 36, step=2)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 2, 8, step=2)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 1, 7, step=2)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .2, .5, .75, 1])
    # Set the suggested hyperparameters in the pipeline
    pipeline.set_params(
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline, X, np.log(y), cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Set the best hyperparameters in the pipeline
pipeline.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline.fit(X, np.log(y))

# Get the feature importances from the best model
best_rf = pipeline.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

[I 2024-06-19 09:47:21,247] A new study created in memory with name: no-name-10f19157-f905-46be-98c6-5549a9ec23c7
[I 2024-06-19 09:47:24,823] Trial 0 finished with value: 0.45826567522179706 and parameters: {'regressor__n_estimators': 200, 'regressor__max_depth': 30, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 1}. Best is trial 0 with value: 0.45826567522179706.
[I 2024-06-19 09:47:29,252] Trial 1 finished with value: 0.8317924193291881 and parameters: {'regressor__n_estimators': 400, 'regressor__max_depth': 36, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 1}. Best is trial 1 with value: 0.8317924193291881.
[I 2024-06-19 09:48:05,544] Trial 2 finished with value: 0.8969966173313102 and parameters: {'regressor__n_estimators': 260, 'regressor__max_depth': 34, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 0.75}. Best is trial 2 with value

[I 2024-06-19 09:57:37,297] Trial 27 finished with value: 0.9017072444642731 and parameters: {'regressor__n_estimators': 340, 'regressor__max_depth': 28, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.2}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 09:57:52,708] Trial 28 finished with value: 0.8951234826442226 and parameters: {'regressor__n_estimators': 340, 'regressor__max_depth': 30, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 5, 'regressor__max_features': 0.2}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 09:57:54,543] Trial 29 finished with value: 0.4658882969621031 and parameters: {'regressor__n_estimators': 220, 'regressor__max_depth': 26, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 1}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 09:58:07,666] Trial 30 finished with value: 0.8996951287056003 and parame

[I 2024-06-19 10:07:22,888] Trial 54 finished with value: 0.9015551722989571 and parameters: {'regressor__n_estimators': 340, 'regressor__max_depth': 30, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.2}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 10:08:02,591] Trial 55 finished with value: 0.9018555242374546 and parameters: {'regressor__n_estimators': 320, 'regressor__max_depth': 32, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 10:08:08,681] Trial 56 finished with value: 0.888824733400147 and parameters: {'regressor__n_estimators': 300, 'regressor__max_depth': 28, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 3, 'regressor__max_features': 'sqrt'}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 10:08:48,020] Trial 57 finished with value: 0.9005676865717073 and pa

[I 2024-06-19 10:17:43,991] Trial 81 finished with value: 0.9001038285278511 and parameters: {'regressor__n_estimators': 320, 'regressor__max_depth': 36, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 10:18:27,138] Trial 82 finished with value: 0.8997365164754709 and parameters: {'regressor__n_estimators': 320, 'regressor__max_depth': 34, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 10:19:11,995] Trial 83 finished with value: 0.9010967188726665 and parameters: {'regressor__n_estimators': 340, 'regressor__max_depth': 34, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.5}. Best is trial 20 with value: 0.9021761568191826.
[I 2024-06-19 10:19:51,616] Trial 84 finished with value: 0.9007300964235337 and para

Best hyperparameters: {'regressor__n_estimators': 260, 'regressor__max_depth': 26, 'regressor__min_samples_split': 4, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 0.2}
37.904890954494476 minutes


Unnamed: 0,feature,importance
250,num__HQSF,1.540255e-01
249,num__TotalSF,1.418890e-01
216,num__OverallQual,1.368044e-01
218,num__YearBuilt,5.186780e-02
196,ord__ExterQual,4.879064e-02
...,...,...
106,cat__RoofMatl_Roll,2.522858e-07
82,cat__Condition2_RRAn,3.064913e-08
177,cat__MiscFeature_TenC,0.000000e+00
104,cat__RoofMatl_Membran,0.000000e+00


In [None]:
#Experiment: GradientBoostingRegressor

In [197]:
#Untuned GBR

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

gbr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=0))
])

start_time = time.time()

# Perform cross-validation and store results in a dictionary
cv_results = {}
scores = cross_val_score(gbr_pipeline, X, y)
cv_results = round(scores.mean(), 6)
# Output the mean cross-validation scores
print(cv_results)

#an untuned GBR has an R2 of .9121(!)

print(f"{time.time() - start_time} seconds")

0.912101
9.05811095237732 seconds


In [196]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [90, 100, 200, 300, 400, 500, 600]
}

# Setup the GridSearchCV
grid_search = GridSearchCV(gbr_pipeline, param_grid, cv=5, scoring='r2', verbose=1)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best parameters (Grid Search): {'regressor__n_estimators': 500}
Best score (Grid Search): 0.9184
3.124719965457916 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.358184
216,num__OverallQual,0.31903
249,num__TotalSF,0.055743
248,num__TotalBath,0.034942
218,num__YearBuilt,0.031026
199,ord__KitchenQual,0.019627
200,ord__BsmtQual,0.016305
221,num__BsmtFinSF1,0.010814
215,num__LotArea,0.010586
237,num__GarageCars,0.010379


In [198]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [400, 500, 600],
    'regressor__max_depth': [5, 10, 15]
}

# Setup the GridSearchCV
grid_search = GridSearchCV(gbr_pipeline, param_grid, cv=5, scoring='r2', verbose=2)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)
#Went Down! Eek need to change up depth- drop in learning rate see what happens

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END regressor__max_depth=5, regressor__n_estimators=400; total time=  10.9s
[CV] END regressor__max_depth=5, regressor__n_estimators=400; total time=  10.3s
[CV] END regressor__max_depth=5, regressor__n_estimators=400; total time=  10.4s
[CV] END regressor__max_depth=5, regressor__n_estimators=400; total time=  10.4s
[CV] END regressor__max_depth=5, regressor__n_estimators=400; total time=  10.2s
[CV] END regressor__max_depth=5, regressor__n_estimators=500; total time=  12.9s
[CV] END regressor__max_depth=5, regressor__n_estimators=500; total time=  12.8s
[CV] END regressor__max_depth=5, regressor__n_estimators=500; total time=  12.9s
[CV] END regressor__max_depth=5, regressor__n_estimators=500; total time=  13.0s
[CV] END regressor__max_depth=5, regressor__n_estimators=500; total time=  12.8s
[CV] END regressor__max_depth=5, regressor__n_estimators=600; total time=  15.4s
[CV] END regressor__max_depth=5, regressor__n_est

Unnamed: 0,feature,importance
216,num__OverallQual,0.387862
250,num__HQSF,0.314062
249,num__TotalSF,0.078744
218,num__YearBuilt,0.024679
199,ord__KitchenQual,0.013745
248,num__TotalBath,0.01284
215,num__LotArea,0.010257
213,num__GrLivArea,0.009889
238,num__GarageArea,0.009647
200,ord__BsmtQual,0.008592


In [200]:
start_time = time.time()
param_grid = {
    'regressor__n_estimators': [400, 500, 600],
    'regressor__max_depth': [2, 5],
    'regressor__learning_rate': [0.01, 0.1, 0.2],
    
}

# Setup the GridSearchCV
grid_search = GridSearchCV(gbr_pipeline, param_grid, cv=5, scoring='r2', verbose=2)

# Fit the GridSearchCV to the data
grid_search.fit(X, y)

# Best parameters and best score from Grid Search
print("Best parameters (Grid Search):", grid_search.best_params_)
print("Best score (Grid Search):", round(grid_search.best_score_, 4))
print(f"{(time.time() - start_time)/60} minutes")

report_importances(grid_search).head(20)
#HMMMM .9159 again--- i'm not improving this model

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=400; total time=   5.1s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=400; total time=   4.4s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=400; total time=   4.5s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=400; total time=   4.8s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=400; total time=   4.7s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=500; total time=   5.9s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=500; total time=   6.0s
[CV] END regressor__learning_rate=0.01, regressor__max_depth=2, regressor__n_estimators=500; total time=   6.0s
[CV] END regressor__learning_rate=0.01, reg

[CV] END regressor__learning_rate=0.2, regressor__max_depth=2, regressor__n_estimators=600; total time=   6.4s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=2, regressor__n_estimators=600; total time=   6.3s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=400; total time=  10.3s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=400; total time=  10.5s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=400; total time=  10.6s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=400; total time=  11.0s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=400; total time=  10.6s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=500; total time=  14.0s
[CV] END regressor__learning_rate=0.2, regressor__max_depth=5, regressor__n_estimators=500; total time=  16.9s
[

Unnamed: 0,feature,importance
216,num__OverallQual,0.387862
250,num__HQSF,0.314062
249,num__TotalSF,0.078744
218,num__YearBuilt,0.024679
199,ord__KitchenQual,0.013745
248,num__TotalBath,0.01284
215,num__LotArea,0.010257
213,num__GrLivArea,0.009889
238,num__GarageArea,0.009647
200,ord__BsmtQual,0.008592


In [None]:
#OPTUNA AND GBReg

In [207]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline_gbr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state = 0))
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    learning_rate = trial.suggest_float('regressor__learning_rate', .01, .21, step = .02)
    n_estimators = trial.suggest_int('regressor__n_estimators', 50, 500, step=50)
    max_depth = trial.suggest_int('regressor__max_depth', 2, 32, step=2)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 2, 20, step=2)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 1, 9, step=2)
    subsample = trial.suggest_float('regressor__subsample', .8, 1, step = .1)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .5, 1])
   
# Set the suggested hyperparameters in the pipeline
    pipeline_gbr.set_params(
        regressor__learning_rate = learning_rate,
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__subsample = subsample,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_gbr, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# # Set the best hyperparameters in the pipeline
# pipeline_gbr.set_params(**best_params)

# # Fit the pipeline with the best hyperparameters
# pipeline_gbr.fit(X, y)

# # Get the feature importances from the best model
# best_rf = pipeline.named_steps['regressor']
# feature_importances = best_rf.feature_importances_

# # Get the feature names after preprocessing
# preprocessor = pipeline.named_steps['preprocessor']
# transformed_feature_names = preprocessor.get_feature_names_out()

# # Create a DataFrame for better readability
# feature_importances_df = pd.DataFrame({
#     'feature': transformed_feature_names,
#     'importance': feature_importances
# })

# # Sort features by importance
# feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
# print(f"{(time.time() - start_time)/60} minutes")
# # Print the feature importances
# feature_importances_df

[I 2024-06-19 12:19:22,411] A new study created in memory with name: no-name-dfb132c8-6105-47e9-8ed6-1d2372039d0a
[I 2024-06-19 12:19:23,086] Trial 0 finished with value: 0.5148937659018131 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 50, 'regressor__max_depth': 28, 'regressor__min_samples_split': 12, 'regressor__min_samples_leaf': 3, 'regressor__subsample': 0.9, 'regressor__max_features': 1}. Best is trial 0 with value: 0.5148937659018131.
[I 2024-06-19 12:21:01,451] Trial 1 finished with value: 0.9025746856860952 and parameters: {'regressor__learning_rate': 0.17, 'regressor__n_estimators': 400, 'regressor__max_depth': 20, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 7, 'regressor__subsample': 1.0, 'regressor__max_features': 0.5}. Best is trial 1 with value: 0.9025746856860952.
[I 2024-06-19 12:21:10,521] Trial 2 finished with value: 0.8987426430447221 and parameters: {'regressor__learning_rate': 0.13, 'regressor__n_estimators': 50

[I 2024-06-19 12:24:52,396] Trial 22 finished with value: 0.9158729391104394 and parameters: {'regressor__learning_rate': 0.09, 'regressor__n_estimators': 350, 'regressor__max_depth': 8, 'regressor__min_samples_split': 18, 'regressor__min_samples_leaf': 7, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 15 with value: 0.9179721453368705.
[I 2024-06-19 12:25:01,114] Trial 23 finished with value: 0.9151560260565607 and parameters: {'regressor__learning_rate': 0.09, 'regressor__n_estimators': 350, 'regressor__max_depth': 14, 'regressor__min_samples_split': 18, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 15 with value: 0.9179721453368705.
[I 2024-06-19 12:25:07,294] Trial 24 finished with value: 0.9225572909502151 and parameters: {'regressor__learning_rate': 0.05, 'regressor__n_estimators': 450, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 9, 

[I 2024-06-19 12:29:55,374] Trial 45 finished with value: 0.9139774114157355 and parameters: {'regressor__learning_rate': 0.01, 'regressor__n_estimators': 450, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__subsample': 0.8, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9232340769170817.
[I 2024-06-19 12:30:04,722] Trial 46 finished with value: 0.9045829451862033 and parameters: {'regressor__learning_rate': 0.21, 'regressor__n_estimators': 500, 'regressor__max_depth': 10, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.8, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9232340769170817.
[I 2024-06-19 12:30:14,074] Trial 47 finished with value: 0.921580751441786 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 400, 'regressor__max_depth': 14, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 7, 'r

[I 2024-06-19 12:32:56,101] Trial 68 finished with value: 0.920547156975554 and parameters: {'regressor__learning_rate': 0.06999999999999999, 'regressor__n_estimators': 500, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 50 with value: 0.9247057546774785.
[I 2024-06-19 12:33:01,263] Trial 69 finished with value: 0.9191610606065078 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 500, 'regressor__max_depth': 4, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 50 with value: 0.9247057546774785.
[I 2024-06-19 12:33:09,486] Trial 70 finished with value: 0.9153115126051301 and parameters: {'regressor__learning_rate': 0.05, 'regressor__n_estimators': 450, 'regressor__max_depth': 8, 'regressor__min_samples_split': 4, 'regressor__min_samples_

[I 2024-06-19 12:35:33,059] Trial 91 finished with value: 0.9242372172170141 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 450, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 50 with value: 0.9247057546774785.
[I 2024-06-19 12:35:38,900] Trial 92 finished with value: 0.9242372172170141 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 450, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 50 with value: 0.9247057546774785.
[I 2024-06-19 12:35:46,214] Trial 93 finished with value: 0.9203930228185854 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 450, 'regressor__max_depth': 8, 'regressor__min_samples_split': 6, 'regressor__min_samples_leaf': 5, 'reg

Best hyperparameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 500, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}


ValueError: Invalid parameter 'learning_rate' for estimator RandomForestRegressor(max_depth=26, max_features=0.2, min_samples_split=4,
                      n_estimators=260). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

In [208]:
# Set the best hyperparameters in the pipeline
pipeline_gbr.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline_gbr.fit(X, y)

# Get the feature importances from the best model
best_rf = pipeline.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

56.12329216798147 minutes


Unnamed: 0,feature,importance
250,num__HQSF,1.540255e-01
249,num__TotalSF,1.418890e-01
216,num__OverallQual,1.368044e-01
218,num__YearBuilt,5.186780e-02
196,ord__ExterQual,4.879064e-02
...,...,...
106,cat__RoofMatl_Roll,2.522858e-07
82,cat__Condition2_RRAn,3.064913e-08
177,cat__MiscFeature_TenC,0.000000e+00
104,cat__RoofMatl_Membran,0.000000e+00


.9247
Best hyperparameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 500, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}

In [210]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline_gbr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state = 0))
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    learning_rate = trial.suggest_float('regressor__learning_rate', .01, .07, step = .01)
    n_estimators = trial.suggest_int('regressor__n_estimators', 400, 600, step=20)
    max_depth = trial.suggest_int('regressor__max_depth', 2, 8, step=1)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 6, 10, step=1)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 3, 7, step=1)
    subsample = trial.suggest_float('regressor__subsample', .8, 1, step = .1)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .5, 1])
   
# Set the suggested hyperparameters in the pipeline
    pipeline_gbr.set_params(
        regressor__learning_rate = learning_rate,
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__subsample = subsample,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_gbr, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Set the best hyperparameters in the pipeline
pipeline_gbr.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline_gbr.fit(X, y)

# Get the feature importances from the best model
best_rf = pipeline_gbr.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline_gbr.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
     'feature': transformed_feature_names,
     'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

[I 2024-06-19 13:20:22,382] A new study created in memory with name: no-name-b16ef60f-9844-49c5-a3ce-c353c93438b5
[I 2024-06-19 13:20:25,071] Trial 0 finished with value: 0.8922436993147841 and parameters: {'regressor__learning_rate': 0.060000000000000005, 'regressor__n_estimators': 500, 'regressor__max_depth': 5, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 7, 'regressor__subsample': 0.8, 'regressor__max_features': 1}. Best is trial 0 with value: 0.8922436993147841.
[I 2024-06-19 13:20:30,538] Trial 1 finished with value: 0.9220209342460981 and parameters: {'regressor__learning_rate': 0.06999999999999999, 'regressor__n_estimators': 440, 'regressor__max_depth': 5, 'regressor__min_samples_split': 9, 'regressor__min_samples_leaf': 4, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 1 with value: 0.9220209342460981.
[I 2024-06-19 13:21:21,577] Trial 2 finished with value: 0.9150693919574489 and parameters: {'regressor__learning_rate': 0.0

[I 2024-06-19 13:25:18,375] Trial 22 finished with value: 0.9228887631974338 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 480, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 6, 'regressor__subsample': 0.8, 'regressor__max_features': 'sqrt'}. Best is trial 18 with value: 0.9246718143832912.
[I 2024-06-19 13:25:25,995] Trial 23 finished with value: 0.9215294558726868 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 480, 'regressor__max_depth': 7, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 18 with value: 0.9246718143832912.
[I 2024-06-19 13:25:32,047] Trial 24 finished with value: 0.918978982524758 and parameters: {'regressor__learning_rate': 0.02, 'regressor__n_estimators': 520, 'regressor__max_depth': 5, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regr

[I 2024-06-19 13:29:48,260] Trial 45 finished with value: 0.9214488911007969 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 520, 'regressor__max_depth': 8, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 4, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 29 with value: 0.9247692572721296.
[I 2024-06-19 13:29:56,718] Trial 46 finished with value: 0.9197671982083072 and parameters: {'regressor__learning_rate': 0.05, 'regressor__n_estimators': 480, 'regressor__max_depth': 8, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 29 with value: 0.9247692572721296.
[I 2024-06-19 13:30:03,913] Trial 47 finished with value: 0.9219838932032884 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 460, 'regressor__max_depth': 7, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 4, 'reg

[I 2024-06-19 13:32:49,699] Trial 68 finished with value: 0.9246718143832912 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 480, 'regressor__max_depth': 6, 'regressor__min_samples_split': 9, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 52 with value: 0.9248003820835873.
[I 2024-06-19 13:33:21,773] Trial 69 finished with value: 0.9186310860887754 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 420, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 6, 'regressor__subsample': 0.9, 'regressor__max_features': 0.5}. Best is trial 52 with value: 0.9248003820835873.
[I 2024-06-19 13:33:28,152] Trial 70 finished with value: 0.9245320441269433 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 460, 'regressor__max_depth': 6, 'regressor__min_samples_split': 9, 'regressor__min_samples_leaf': 5, 'regres

[I 2024-06-19 13:36:24,733] Trial 91 finished with value: 0.9249129369149033 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 540, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 85 with value: 0.9249749822247795.
[I 2024-06-19 13:36:32,203] Trial 92 finished with value: 0.9249129369149033 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 540, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 85 with value: 0.9249749822247795.
[I 2024-06-19 13:36:39,603] Trial 93 finished with value: 0.9216066489223087 and parameters: {'regressor__learning_rate': 0.05, 'regressor__n_estimators': 540, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'reg

Best hyperparameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 560, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}
17.000131114323935 minutes


Unnamed: 0,feature,importance
250,num__HQSF,0.091428
249,num__TotalSF,0.082211
236,num__GarageYrBlt,0.066469
200,ord__BsmtQual,0.052865
216,num__OverallQual,0.050532
...,...,...
151,cat__Heating_Floor,0.000000
104,cat__RoofMatl_Membran,0.000000
117,cat__Exterior1st_ImStucc,0.000000
156,cat__Heating_Wall,0.000000


In [214]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline_gbr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state = 0))
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    learning_rate = trial.suggest_float('regressor__learning_rate', .03, .05, step = .01)
    n_estimators = trial.suggest_int('regressor__n_estimators', 520, 600, step=10)
    max_depth = trial.suggest_int('regressor__max_depth', 4, 8, step=1)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 6, 10, step=1)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 4, 6, step=1)
    subsample = trial.suggest_float('regressor__subsample', .8, 1, step = .05)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .5, 1])
   
# Set the suggested hyperparameters in the pipeline
    pipeline_gbr.set_params(
        regressor__learning_rate = learning_rate,
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__subsample = subsample,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_gbr, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Set the best hyperparameters in the pipeline
pipeline_gbr.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline_gbr.fit(X, y)

# Get the feature importances from the best model
best_rf = pipeline_gbr.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline_gbr.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
     'feature': transformed_feature_names,
     'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

[I 2024-06-19 13:48:08,959] A new study created in memory with name: no-name-a4cfc196-9da7-40fd-b55d-0e00fbe9ca90
[I 2024-06-19 13:48:19,311] Trial 0 finished with value: 0.9200274841370175 and parameters: {'regressor__learning_rate': 0.05, 'regressor__n_estimators': 580, 'regressor__max_depth': 7, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9500000000000001, 'regressor__max_features': 'sqrt'}. Best is trial 0 with value: 0.9200274841370175.
[I 2024-06-19 13:48:21,668] Trial 1 finished with value: 0.8867201979502275 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 540, 'regressor__max_depth': 8, 'regressor__min_samples_split': 9, 'regressor__min_samples_leaf': 6, 'regressor__subsample': 0.8, 'regressor__max_features': 1}. Best is trial 0 with value: 0.9200274841370175.
[I 2024-06-19 13:48:28,769] Trial 2 finished with value: 0.9221865000659168 and parameters: {'regressor__learning_rate': 0.03, 'regressor__

[I 2024-06-19 13:53:15,361] Trial 22 finished with value: 0.9220548838021305 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 550, 'regressor__max_depth': 5, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 18 with value: 0.9249146529617622.
[I 2024-06-19 13:53:23,554] Trial 23 finished with value: 0.9232810501197559 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 560, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9500000000000001, 'regressor__max_features': 'sqrt'}. Best is trial 18 with value: 0.9249146529617622.
[I 2024-06-19 13:53:29,507] Trial 24 finished with value: 0.9224468779548097 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 570, 'regressor__max_depth': 4, 'regressor__min_samples_split': 9, 'regressor__min_samples

[I 2024-06-19 13:57:59,134] Trial 44 finished with value: 0.9230857938771889 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 580, 'regressor__max_depth': 6, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 4, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9250318233082581.
[I 2024-06-19 13:58:06,827] Trial 45 finished with value: 0.9243643662358458 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 580, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.8500000000000001, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9250318233082581.
[I 2024-06-19 13:58:14,606] Trial 46 finished with value: 0.924340699238992 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 590, 'regressor__max_depth': 6, 'regressor__min_samples_split': 8, 'regressor__min_samples_

[I 2024-06-19 14:01:12,572] Trial 66 finished with value: 0.9248691718345388 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 530, 'regressor__max_depth': 6, 'regressor__min_samples_split': 9, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9250318233082581.
[I 2024-06-19 14:01:19,841] Trial 67 finished with value: 0.9248691718345388 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 530, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9250318233082581.
[I 2024-06-19 14:02:00,507] Trial 68 finished with value: 0.9177025748745098 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 530, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 5, 'r

[I 2024-06-19 14:05:08,677] Trial 89 finished with value: 0.9249146529617622 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 550, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9250318233082581.
[I 2024-06-19 14:05:16,442] Trial 90 finished with value: 0.9229392644144152 and parameters: {'regressor__learning_rate': 0.03, 'regressor__n_estimators': 550, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9500000000000001, 'regressor__max_features': 'sqrt'}. Best is trial 43 with value: 0.9250318233082581.
[I 2024-06-19 14:05:23,954] Trial 91 finished with value: 0.9249146529617622 and parameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 550, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samp

Best hyperparameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 580, 'regressor__max_depth': 6, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 4, 'regressor__subsample': 0.9500000000000001, 'regressor__max_features': 'sqrt'}
18.246113435427347 minutes


Unnamed: 0,feature,importance
216,num__OverallQual,0.101346
250,num__HQSF,0.065859
236,num__GarageYrBlt,0.062010
249,num__TotalSF,0.057123
225,num__1stFlrSF,0.051168
...,...,...
177,cat__MiscFeature_TenC,0.000000
175,cat__MiscFeature_Othr,0.000000
173,cat__MiscFeature_Gar2,0.000000
164,cat__Functional_Sal,0.000000


.925
Best hyperparameters: {'regressor__learning_rate': 0.04, 'regressor__n_estimators': 580, 'regressor__max_depth': 6, 'regressor__min_samples_split': 7, 'regressor__min_samples_leaf': 4, 'regressor__subsample': 0.9500000000000001, 'regressor__max_features': 'sqrt'}

In [215]:
feature_importances_df.head(20)

Unnamed: 0,feature,importance
216,num__OverallQual,0.101346
250,num__HQSF,0.065859
236,num__GarageYrBlt,0.06201
249,num__TotalSF,0.057123
225,num__1stFlrSF,0.051168
238,num__GarageArea,0.047056
235,num__Fireplaces,0.040857
218,num__YearBuilt,0.035595
199,ord__KitchenQual,0.03341
200,ord__BsmtQual,0.032917


In [217]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline_gbr_tuned = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state = 0, learning_rate = .04, 
                                            n_estimators = 580, max_depth = 6, min_samples_split = 7,
                                           min_samples_leaf = 4, subsample = .95, max_features = 'sqrt'))
])

scores = cross_val_score(pipeline_gbr_tuned, X, y, cv=5, scoring='r2')
np.mean(scores)

0.9250318233082581

In [None]:
#optuna GBR and log(y)

In [216]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features ),
        ('num', 'passthrough', numerical_features)  # Pass through numerical features unchanged
    ]
)

pipeline_gbr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state = 0))
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the RandomForestRegressor
    learning_rate = trial.suggest_float('regressor__learning_rate', .01, .21, step = .02)
    n_estimators = trial.suggest_int('regressor__n_estimators', 50, 500, step=50)
    max_depth = trial.suggest_int('regressor__max_depth', 2, 32, step=2)
    min_samples_split = trial.suggest_int('regressor__min_samples_split', 2, 20, step=2)
    min_samples_leaf = trial.suggest_int('regressor__min_samples_leaf', 1, 9, step=2)
    subsample = trial.suggest_float('regressor__subsample', .8, 1, step = .1)
    max_features = trial.suggest_categorical('regressor__max_features', ['sqrt', .5, 1])
   
# Set the suggested hyperparameters in the pipeline
    pipeline_gbr.set_params(
        regressor__learning_rate = learning_rate,
        regressor__n_estimators=n_estimators,
        regressor__max_depth=max_depth,
        regressor__min_samples_split=min_samples_split,
        regressor__min_samples_leaf=min_samples_leaf,
        regressor__subsample = subsample,
        regressor__max_features = max_features
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_gbr, X, np.log(y), cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

# Set the best hyperparameters in the pipeline
pipeline_gbr.set_params(**best_params)

# Fit the pipeline with the best hyperparameters
pipeline_gbr.fit(X, np.log(y))

# Get the feature importances from the best model
best_rf = pipeline.named_steps['regressor']
feature_importances = best_rf.feature_importances_

# Get the feature names after preprocessing
preprocessor = pipeline.named_steps['preprocessor']
transformed_feature_names = preprocessor.get_feature_names_out()

# Create a DataFrame for better readability
feature_importances_df = pd.DataFrame({
    'feature': transformed_feature_names,
    'importance': feature_importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='importance', ascending=False)
print(f"{(time.time() - start_time)/60} minutes")
# Print the feature importances
feature_importances_df

[I 2024-06-19 14:17:18,173] A new study created in memory with name: no-name-ae45b5f2-0245-4973-a7d1-9c6f7f1a7cbf
[I 2024-06-19 14:17:21,134] Trial 0 finished with value: 0.8919263366231082 and parameters: {'regressor__learning_rate': 0.11, 'regressor__n_estimators': 450, 'regressor__max_depth': 6, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf': 9, 'regressor__subsample': 0.9, 'regressor__max_features': 1}. Best is trial 0 with value: 0.8919263366231082.
[I 2024-06-19 14:17:25,824] Trial 1 finished with value: 0.9077852854837152 and parameters: {'regressor__learning_rate': 0.13, 'regressor__n_estimators': 100, 'regressor__max_depth': 20, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 9, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 1 with value: 0.9077852854837152.
[I 2024-06-19 14:17:27,271] Trial 2 finished with value: 0.883708989649952 and parameters: {'regressor__learning_rate': 0.19, 'regressor__n_estimators': 

[I 2024-06-19 14:27:14,769] Trial 22 finished with value: 0.9096383985557835 and parameters: {'regressor__learning_rate': 0.19, 'regressor__n_estimators': 500, 'regressor__max_depth': 4, 'regressor__min_samples_split': 18, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.8, 'regressor__max_features': 0.5}. Best is trial 5 with value: 0.9220087240314033.
[I 2024-06-19 14:27:37,248] Trial 23 finished with value: 0.9108321026386165 and parameters: {'regressor__learning_rate': 0.21, 'regressor__n_estimators': 450, 'regressor__max_depth': 4, 'regressor__min_samples_split': 16, 'regressor__min_samples_leaf': 3, 'regressor__subsample': 0.8, 'regressor__max_features': 0.5}. Best is trial 5 with value: 0.9220087240314033.
[I 2024-06-19 14:28:08,639] Trial 24 finished with value: 0.9082173433446215 and parameters: {'regressor__learning_rate': 0.15000000000000002, 'regressor__n_estimators': 350, 'regressor__max_depth': 8, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf

[I 2024-06-19 14:37:36,835] Trial 45 finished with value: 0.9154975929630391 and parameters: {'regressor__learning_rate': 0.13, 'regressor__n_estimators': 400, 'regressor__max_depth': 6, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf': 9, 'regressor__subsample': 1.0, 'regressor__max_features': 0.5}. Best is trial 25 with value: 0.9233058616703635.
[I 2024-06-19 14:37:39,956] Trial 46 finished with value: 0.9160487626102057 and parameters: {'regressor__learning_rate': 0.17, 'regressor__n_estimators': 450, 'regressor__max_depth': 2, 'regressor__min_samples_split': 16, 'regressor__min_samples_leaf': 9, 'regressor__subsample': 1.0, 'regressor__max_features': 'sqrt'}. Best is trial 25 with value: 0.9233058616703635.
[I 2024-06-19 14:38:45,333] Trial 47 finished with value: 0.9089198310480106 and parameters: {'regressor__learning_rate': 0.11, 'regressor__n_estimators': 300, 'regressor__max_depth': 16, 'regressor__min_samples_split': 20, 'regressor__min_samples_leaf': 9, 're

[I 2024-06-19 14:49:04,538] Trial 68 finished with value: 0.9190152234160948 and parameters: {'regressor__learning_rate': 0.19, 'regressor__n_estimators': 350, 'regressor__max_depth': 4, 'regressor__min_samples_split': 8, 'regressor__min_samples_leaf': 1, 'regressor__subsample': 0.9, 'regressor__max_features': 0.5}. Best is trial 57 with value: 0.9234283710953555.
[I 2024-06-19 14:49:06,995] Trial 69 finished with value: 0.8893946863291389 and parameters: {'regressor__learning_rate': 0.19, 'regressor__n_estimators': 500, 'regressor__max_depth': 12, 'regressor__min_samples_split': 12, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 1}. Best is trial 57 with value: 0.9234283710953555.
[I 2024-06-19 14:49:40,387] Trial 70 finished with value: 0.91738947851762 and parameters: {'regressor__learning_rate': 0.13, 'regressor__n_estimators': 450, 'regressor__max_depth': 6, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 3, 'regressor_

[I 2024-06-19 15:00:25,173] Trial 91 finished with value: 0.9219639700098501 and parameters: {'regressor__learning_rate': 0.21, 'regressor__n_estimators': 500, 'regressor__max_depth': 2, 'regressor__min_samples_split': 14, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 0.5}. Best is trial 57 with value: 0.9234283710953555.
[I 2024-06-19 15:00:39,872] Trial 92 finished with value: 0.9227466449714535 and parameters: {'regressor__learning_rate': 0.21, 'regressor__n_estimators': 500, 'regressor__max_depth': 2, 'regressor__min_samples_split': 16, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 1.0, 'regressor__max_features': 0.5}. Best is trial 57 with value: 0.9234283710953555.
[I 2024-06-19 15:01:07,102] Trial 93 finished with value: 0.9162972009736325 and parameters: {'regressor__learning_rate': 0.21, 'regressor__n_estimators': 500, 'regressor__max_depth': 4, 'regressor__min_samples_split': 16, 'regressor__min_samples_leaf': 5, 'regres

Best hyperparameters: {'regressor__learning_rate': 0.17, 'regressor__n_estimators': 500, 'regressor__max_depth': 2, 'regressor__min_samples_split': 12, 'regressor__min_samples_leaf': 5, 'regressor__subsample': 0.9, 'regressor__max_features': 0.5}
48.27990358273188 minutes


Unnamed: 0,feature,importance
250,num__HQSF,1.540255e-01
249,num__TotalSF,1.418890e-01
216,num__OverallQual,1.368044e-01
218,num__YearBuilt,5.186780e-02
196,ord__ExterQual,4.879064e-02
...,...,...
106,cat__RoofMatl_Roll,2.522858e-07
82,cat__Condition2_RRAn,3.064913e-08
177,cat__MiscFeature_TenC,0.000000e+00
104,cat__RoofMatl_Membran,0.000000e+00


In [220]:
from sklearn.svm import SVR

ss = StandardScaler()


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features),
        ('num', ss, numerical_features)  # scale numerical features
    ]
)

pipeline_SVR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])


# Perform cross-validation and store results in a dictionary
cv_results = {}
scores = cross_val_score(pipeline, X, y)
cv_results = round(scores.mean(), 6)
# Output the mean cross-validation scores
print(cv_results)

#an untuned SVR has an R2 of .9054

0.905722


In [223]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features),
        ('num', ss, numerical_features)  # scale numerical features
    ]
)

pipeline_SVR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the SVR
    kernel = trial.suggest_categorical('regressor__kernel', ['rbf', 'linear', 'poly'])
    epsilon = trial.suggest_float('regressor__epsilon', .001, .991, step = .01)
    C = trial.suggest_int('regressor__C', 1, 100, step = 1)
    gamma = trial.suggest_categorical('regressor__gamma', ['scale', 'auto', .2, .4, .6, .8])
    
    
# Set the suggested hyperparameters in the pipeline
    pipeline_SVR.set_params(
        regressor__kernel = kernel,
        regressor__epsilon = epsilon,
        regressor__C = C,
        regressor__gamma = gamma
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_SVR, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")
print(f"{(time.time() - start_time)/60} minutes")

[I 2024-06-21 12:22:07,504] A new study created in memory with name: no-name-e5ad39b8-e86e-4be8-b0e9-20e840eb2c6d
[I 2024-06-21 12:23:04,959] Trial 0 finished with value: 0.8427663218621767 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.811, 'regressor__C': 97, 'regressor__gamma': 0.6}. Best is trial 0 with value: 0.8427663218621767.
[I 2024-06-21 12:24:02,367] Trial 1 finished with value: 0.8418365541093772 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.9510000000000001, 'regressor__C': 62, 'regressor__gamma': 0.8}. Best is trial 0 with value: 0.8427663218621767.
[I 2024-06-21 12:24:05,232] Trial 2 finished with value: 0.8712411084686469 and parameters: {'regressor__kernel': 'linear', 'regressor__epsilon': 0.381, 'regressor__C': 70, 'regressor__gamma': 0.8}. Best is trial 2 with value: 0.8712411084686469.
[I 2024-06-21 12:24:08,166] Trial 3 finished with value: 0.8550661715391641 and parameters: {'regressor__kernel': 'linear', 'regressor__

[I 2024-06-21 12:33:36,523] Trial 33 finished with value: 0.8457985841962021 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.371, 'regressor__C': 6, 'regressor__gamma': 0.8}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 12:34:09,333] Trial 34 finished with value: 0.8766163633680091 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.461, 'regressor__C': 26, 'regressor__gamma': 0.2}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 12:34:23,738] Trial 35 finished with value: 0.8865895147089397 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.371, 'regressor__C': 7, 'regressor__gamma': 0.2}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 12:34:33,569] Trial 36 finished with value: 0.8925983970336115 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.361, 'regressor__C': 4, 'regressor__gamma': 0.2}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 1

[I 2024-06-21 12:43:01,132] Trial 66 finished with value: 0.3699683917029318 and parameters: {'regressor__kernel': 'linear', 'regressor__epsilon': 0.14100000000000001, 'regressor__C': 1, 'regressor__gamma': 0.2}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 12:43:53,595] Trial 67 finished with value: 0.8484726967251571 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.101, 'regressor__C': 9, 'regressor__gamma': 0.6}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 12:44:18,546] Trial 68 finished with value: 0.8828889816807338 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.041, 'regressor__C': 13, 'regressor__gamma': 0.2}. Best is trial 18 with value: 0.9027128430670983.
[I 2024-06-21 12:45:09,739] Trial 69 finished with value: 0.8547104413985164 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.181, 'regressor__C': 16, 'regressor__gamma': 0.4}. Best is trial 18 with value: 0.9027128430670983

Best hyperparameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.451, 'regressor__C': 1, 'regressor__gamma': 0.2}
31.196287635962168 minutes


In [224]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features),
        ('num', ss, numerical_features)  # scale numerical features
    ]
)

pipeline_SVR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the SVR
    kernel = trial.suggest_categorical('regressor__kernel', ['poly'])
    epsilon = trial.suggest_float('regressor__epsilon', .301, .601, step = .01)
    C = trial.suggest_float('regressor__C', .1, 2, step = .1)
    gamma = trial.suggest_categorical('regressor__gamma', ['scale', 'auto', .2, .4, .6, .8])
    
    
# Set the suggested hyperparameters in the pipeline
    pipeline_SVR.set_params(
        regressor__kernel = kernel,
        regressor__epsilon = epsilon,
        regressor__C = C,
        regressor__gamma = gamma
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_SVR, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")
print(f"{(time.time() - start_time)/60} minutes")

[I 2024-06-21 12:57:03,537] A new study created in memory with name: no-name-633b2f14-b9e5-44ca-b95e-f037271757ae
[I 2024-06-21 12:57:06,866] Trial 0 finished with value: -0.05968580814982496 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.471, 'regressor__C': 0.9, 'regressor__gamma': 'scale'}. Best is trial 0 with value: -0.05968580814982496.
[I 2024-06-21 12:57:52,304] Trial 1 finished with value: 0.8752504639895233 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.361, 'regressor__C': 1.4000000000000001, 'regressor__gamma': 0.6}. Best is trial 1 with value: 0.8752504639895233.
[I 2024-06-21 12:58:31,229] Trial 2 finished with value: 0.8733323172913776 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.481, 'regressor__C': 1.9000000000000001, 'regressor__gamma': 0.6}. Best is trial 1 with value: 0.8752504639895233.
[I 2024-06-21 12:58:55,951] Trial 3 finished with value: 0.8825716742779358 and parameters: {'regressor__kerne

[I 2024-06-21 13:03:14,224] Trial 32 finished with value: 0.9055035727672294 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.421, 'regressor__C': 0.4, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:03:17,467] Trial 33 finished with value: 0.9050101551911898 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.45099999999999996, 'regressor__C': 0.2, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:03:23,351] Trial 34 finished with value: 0.9015412643327542 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.491, 'regressor__C': 1.6, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:03:50,304] Trial 35 finished with value: 0.8806067091714971 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.471, 'regressor__C': 0.6, 'regressor__gamma': 0.6}. Best is trial 23 with value: 0.905823869505

[I 2024-06-21 13:07:27,785] Trial 65 finished with value: -0.06613435879703404 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.45099999999999996, 'regressor__C': 0.2, 'regressor__gamma': 'auto'}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:07:30,811] Trial 66 finished with value: 0.9008030658960082 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.431, 'regressor__C': 0.1, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:07:34,530] Trial 67 finished with value: 0.9047197196423122 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.361, 'regressor__C': 0.6, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:07:37,897] Trial 68 finished with value: 0.905503549445692 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.391, 'regressor__C': 0.4, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.90582386

[I 2024-06-21 13:10:53,975] Trial 98 finished with value: 0.9058238215660651 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.381, 'regressor__C': 0.30000000000000004, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.
[I 2024-06-21 13:10:57,010] Trial 99 finished with value: 0.9008030478077812 and parameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.391, 'regressor__C': 0.1, 'regressor__gamma': 0.2}. Best is trial 23 with value: 0.9058238695051809.


Best hyperparameters: {'regressor__kernel': 'poly', 'regressor__epsilon': 0.511, 'regressor__C': 0.30000000000000004, 'regressor__gamma': 0.2}
13.891284930706025 minutes


In [None]:
start_time = time.time()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, non_ordinal_categorical_features),
        ('ord', ordinal_encoder, ordinal_categorical_features),
        ('num', ss, numerical_features)  # scale numerical features
    ]
)

pipeline_SVR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR())
])

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for the SVR
    kernel = trial.suggest_categorical('regressor__kernel', ['poly'])
    degree = trial.suggest_int('regressor__degree', 2, 5, step = 1)
    epsilon = trial.suggest_float('regressor__epsilon', .401, .601, step = .01)
    C = trial.suggest_float('regressor__C', .1, 2, step = .1)
    gamma = trial.suggest_float('regressor__gamma', .1, .3, step = .01)
    
    
# Set the suggested hyperparameters in the pipeline
    pipeline_SVR.set_params(
        regressor__kernel = kernel,
        regressor__degree = degree,
        regressor__epsilon = epsilon,
        regressor__C = C,
        regressor__gamma = gamma
    )
    
    # Perform cross-validation
    scores = cross_val_score(pipeline_SVR, X, y, cv=5, scoring='r2')
    
    # Return the mean of the cross-validation scores
    return np.mean(scores)

# Create a study
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")
print(f"{(time.time() - start_time)/60} minutes")