## Housing 2º Competition, Kaggle Regression

In [29]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

#import set_config
from sklearn import set_config
set_config(transform_output="pandas")

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 300)

In [30]:
# check version of sklearn to verify is bigger than 1.2 to include OrdinalEncoder(categories=ordinal_rankings, encoded_missing_value=-1, handle_unknown='use_encoded_value', unknown_value=-1)

import sklearn
print(sklearn.__version__)

1.4.1.post1


In [31]:
# update sklearn for using root_mean_squared_error
#!pip install --upgrade scikit-learn


In [32]:
# Load data (iteration 6)
url = "https://drive.google.com/file/d/1c5210z-HSQ-Kv4EExbYIjjPq6lT0UYZW/view?usp=drive_link"
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
data = pd.read_csv(path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Splitting the data

In [82]:

# Separate the features and target variable
X = data.drop("Id", axis=1).copy()
y = X.pop("SalePrice")

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)


Let's analyze the ordinal categories to determine if any include 'NA' values that should be explicitly added because they differ from missing values:

1. **Alley**: Includes 'NA' for "No alley access". This is a meaningful attribute rather than just missing data.
2. **BsmtQual**: 'NA' signifies "No Basement", which is a different condition than just an unknown basement quality.
3. **BsmtCond**: Similar to `BsmtQual`, 'NA' means "No Basement".
4. **BsmtExposure**: 'NA' again indicates "No Basement".
5. **BsmtFinType1**: 'NA' implies "No Basement".
6. **BsmtFinType2**: 'NA' also implies "No Basement".
7. **FireplaceQu**: 'NA' indicates "No Fireplace", which is distinct from not knowing the quality of the fireplace.
8. **GarageFinish**: 'NA' signifies "No Garage", not just an unobserved or missing garage finish.
9. **GarageQual**: 'NA' signifies "No Garage".
10. **GarageCond**: Similarly, 'NA' here means "No Garage".
11. **PoolQC**: 'NA' stands for "No Pool".
12. **Fence**: 'NA' means "No Fence".

In each of these cases, 'NA' should not simply be treated as a missing value, because it provides explicit information about the absence of a feature, which is valuable in the context of house pricing.


## Selecting types of features

In [83]:
# List of columns containing ordered categorical features

ordinal_categories_column_names = [
    "Street", "Alley", "LotShape", "Utilities", "LandSlope", "ExterQual", "ExterCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "HeatingQC", "KitchenQual", "Functional", "FireplaceQu", "GarageFinish",
    "GarageQual", "GarageCond", "PavedDrive", "PoolQC", "Fence"]


# Separate numeric columns and categorical columns
X_num_columns = X_train.select_dtypes(include='number')
X_cat_columns = X_train.select_dtypes(exclude='number')
X_cat_ordered_columns = X_cat_columns[ordinal_categories_column_names]
X_cat_unordered_columns = X_cat_columns.drop(ordinal_categories_column_names, axis=1)

# Define categories for ordered categorical features

ordinal_rankings = [
    ['Grvl', 'Pave'],  # Street
    ['NA', 'Grvl', 'Pave'],  # Alley
    ['IR3', 'IR2', 'IR1', 'Reg'],  # LotShape
    ["ELO", "NoSeWa", "NoSewr", "AllPub"],  # Utilities
    ["Sev", "Mod", "Gtl"],  # LandSlope
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterQual
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # ExterCond
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtQual
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # BsmtCond
    ['NA', 'No', 'Mn', 'Av', 'Gd'],  # BsmtExposure
    ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # BsmtFinType1
    ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],  # BsmtFinType2
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # HeatingQC
    ['Po', 'Fa', 'TA', 'Gd', 'Ex'],  # KitchenQual
    ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],  # Functional
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # FireplaceQu
    ['NA', 'Unf', 'RFn', 'Fin'],  # GarageFinish
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # GarageQual
    ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],  # GarageCond
    ['N', 'P', 'Y'],  # PavedDrive
    ['NA', 'Fa', 'TA', 'Gd', 'Ex'],  # PoolQC
    ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']  # Fence
]


# Create pipelines for preprocessing the data
# (1) For unordered categorical features, fill missing values with "N_A" and perform one-hot encoding
unordered_categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)

# (2) For ordered categorical features, use ordinal encoding with the defined categories
ordered_categoric_pipe = make_pipeline(
    OrdinalEncoder(categories=ordinal_rankings, encoded_missing_value=-1, handle_unknown='use_encoded_value', unknown_value=-1)
)

# (3) For numeric features, fill missing values with the mean
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean")
)

# Create a ColumnTransformer to apply the appropriate preprocessing to different feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipe, X_num_columns.columns),
        ('unordered', unordered_categoric_pipe, X_cat_unordered_columns.columns),
        ('ordered', ordered_categoric_pipe, X_cat_ordered_columns.columns)
    ]
)


In [35]:
preprocessor

## Understanding the pipeline

We are aiming to have a pipeline that will have this structure:
```python
pipeline = make_pipeline(
    preprocessor,
    MinMaxScaler(),
    SelectKBest(score_func=mutual_info_regression),
    RandomForestRegressor()
)
```

Let's understand what each step does:

1. **preprocessor**: This step will be responsible for encoding the categorical features and imputing missing values.
2. **MinMaxScaler**: This step will scale the features to a range of [0, 1].
3. **SelectKBest**: This step will select the best features based on a scoring function.
4. **RandomForestRegressor**: This step will be the final estimator.




In [84]:
# 1.Preprocess the training data
X_train_processed = preprocessor.fit_transform(X_train)

X_train_processed

Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,ordered__HeatingQC,ordered__KitchenQual,ordered__Functional,ordered__FireplaceQu,ordered__GarageFinish,ordered__GarageQual,ordered__GarageCond,ordered__PavedDrive,ordered__PoolQC,ordered__Fence
413,30.0,56.000000,8960.0,5.0,6.0,1927.0,1950.0,0.0,0.0,0.0,...,3.0,2.0,7.0,4.0,1.0,3.0,3.0,2.0,-1.0,-1.0
522,50.0,50.000000,5000.0,6.0,7.0,1947.0,1950.0,0.0,399.0,0.0,...,4.0,2.0,7.0,4.0,1.0,3.0,3.0,2.0,-1.0,-1.0
1036,20.0,89.000000,12898.0,9.0,5.0,2007.0,2008.0,70.0,1022.0,0.0,...,4.0,4.0,7.0,5.0,3.0,3.0,3.0,2.0,-1.0,-1.0
614,180.0,21.000000,1491.0,4.0,6.0,1972.0,1972.0,0.0,150.0,480.0,...,4.0,2.0,7.0,-1.0,-1.0,-1.0,-1.0,2.0,-1.0,-1.0
218,50.0,70.026689,15660.0,7.0,9.0,1939.0,2006.0,312.0,341.0,0.0,...,4.0,3.0,7.0,3.0,1.0,3.0,3.0,2.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,20.0,78.000000,9317.0,6.0,5.0,2006.0,2006.0,0.0,24.0,0.0,...,3.0,3.0,7.0,4.0,2.0,3.0,3.0,2.0,-1.0,-1.0
1130,50.0,65.000000,7804.0,4.0,3.0,1928.0,1950.0,0.0,622.0,0.0,...,2.0,3.0,5.0,3.0,1.0,3.0,3.0,2.0,-1.0,3.0
1294,20.0,60.000000,8172.0,5.0,7.0,1955.0,1990.0,0.0,167.0,0.0,...,2.0,2.0,7.0,-1.0,1.0,3.0,3.0,0.0,-1.0,-1.0
860,50.0,55.000000,7642.0,7.0,8.0,1918.0,1998.0,0.0,0.0,0.0,...,3.0,3.0,7.0,4.0,1.0,3.0,3.0,2.0,-1.0,4.0



Certain scaling processes can result in transforming features to have the same variance, like the standard scaler, which changes the standard deviation to 1. However, this is **not** our desired outcome. We need to choose a scaler that preserves variance, and for this purpose, we will use min-max scaling.

In [85]:
# 2.Manually apply MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_train_scaled


Unnamed: 0,num__MSSubClass,num__LotFrontage,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__MasVnrArea,num__BsmtFinSF1,num__BsmtFinSF2,...,ordered__HeatingQC,ordered__KitchenQual,ordered__Functional,ordered__FireplaceQu,ordered__GarageFinish,ordered__GarageQual,ordered__GarageCond,ordered__PavedDrive,ordered__PoolQC,ordered__Fence
413,0.058824,0.119863,0.035804,0.444444,0.625,0.398551,0.000000,0.00000,0.000000,0.000000,...,0.75,0.333333,1.000000,0.833333,0.50,0.666667,0.666667,1.0,0.0,0.0
522,0.176471,0.099315,0.017294,0.555556,0.750,0.543478,0.000000,0.00000,0.070695,0.000000,...,1.00,0.333333,1.000000,0.833333,0.50,0.666667,0.666667,1.0,0.0,0.0
1036,0.000000,0.232877,0.054210,0.888889,0.500,0.978261,0.966667,0.04375,0.181077,0.000000,...,1.00,1.000000,1.000000,1.000000,1.00,0.666667,0.666667,1.0,0.0,0.0
614,0.941176,0.000000,0.000893,0.333333,0.625,0.724638,0.366667,0.00000,0.026577,0.325645,...,1.00,0.333333,1.000000,0.000000,0.00,0.000000,0.000000,1.0,0.0,0.0
218,0.176471,0.167900,0.067120,0.666667,1.000,0.485507,0.933333,0.19500,0.060418,0.000000,...,1.00,0.666667,1.000000,0.666667,0.50,0.666667,0.666667,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1095,0.000000,0.195205,0.037472,0.555556,0.500,0.971014,0.933333,0.00000,0.004252,0.000000,...,0.75,0.666667,1.000000,0.833333,0.75,0.666667,0.666667,1.0,0.0,0.0
1130,0.176471,0.150685,0.030400,0.333333,0.250,0.405797,0.000000,0.00000,0.110206,0.000000,...,0.50,0.666667,0.666667,0.666667,0.50,0.666667,0.666667,1.0,0.0,0.8
1294,0.000000,0.133562,0.032120,0.444444,0.750,0.601449,0.666667,0.00000,0.029589,0.000000,...,0.50,0.333333,1.000000,0.000000,0.50,0.666667,0.666667,0.0,0.0,0.0
860,0.176471,0.116438,0.029643,0.666667,0.875,0.333333,0.800000,0.00000,0.000000,0.000000,...,0.75,0.666667,1.000000,0.833333,0.50,0.666667,0.666667,1.0,0.0,1.0


In [86]:
X_train_scaled.shape

(1458, 223)

- If you expect or know that your data exhibits mostly linear relationships, or you're working with relatively simple models and want to keep computational costs low, `f_regression` might be the more appropriate choice.
    
- If you're dealing with complex data where features might have nonlinear effects on the target, or when you don't want to miss out on any potential relationships (regardless of their nature), `mutual_info_regression` can be more effective, despite its higher computational demand.

In [87]:
# 4. Apply SelectKBest
selector = SelectKBest(score_func=mutual_info_regression, k=10)  # Choose k as desired
X_train_kbest = selector.fit_transform(X_train_scaled, y_train)

# Get selected feature indices from SelectKBest
selected_indices = selector.get_support(indices=True)

# Translate selected indices to feature names
feature_names_processed= preprocessor.get_feature_names_out()
selected_feature_names = [feature_names_processed[i] for i in selected_indices]

print("Top features selected by SelectKBest:", selected_feature_names)

X_train_kbest

Top features selected by SelectKBest: ['num__OverallQual', 'num__YearBuilt', 'num__TotalBsmtSF', 'num__1stFlrSF', 'num__GrLivArea', 'num__GarageCars', 'num__GarageArea', 'ordered__ExterQual', 'ordered__BsmtQual', 'ordered__KitchenQual']


Unnamed: 0,num__OverallQual,num__YearBuilt,num__TotalBsmtSF,num__1stFlrSF,num__GrLivArea,num__GarageCars,num__GarageArea,ordered__ExterQual,ordered__BsmtQual,ordered__KitchenQual
413,0.444444,0.398551,0.164975,0.159247,0.130746,0.50,0.253879,0.333333,0.666667,0.333333
522,0.555556,0.543478,0.164321,0.153740,0.250565,0.50,0.296192,0.333333,0.666667,0.333333
1036,0.888889,0.978261,0.265139,0.295089,0.242276,0.75,0.643159,0.666667,1.000000,1.000000
614,0.333333,0.724638,0.103110,0.067921,0.055765,0.00,0.000000,0.333333,0.833333,0.333333
218,0.666667,0.485507,0.130606,0.184259,0.305200,0.50,0.303949,0.666667,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...
1095,0.555556,0.971014,0.215057,0.224874,0.184627,0.50,0.310296,0.666667,0.833333,0.666667
1130,0.333333,0.405797,0.183633,0.228086,0.310286,0.50,0.406206,0.333333,0.666667,0.666667
1294,0.444444,0.601449,0.141408,0.121615,0.099849,0.50,0.403385,0.333333,0.666667,0.333333
860,0.666667,0.333333,0.149264,0.132630,0.205727,0.25,0.152327,0.666667,0.666667,0.666667


In [88]:
from sklearn.feature_selection import f_regression
# 4. Apply SelectKBest
selector = SelectKBest(score_func=f_regression, k=10)  # Choose k as desired
X_train_kbest = selector.fit_transform(X_train_scaled, y_train)

# Get selected feature indices from SelectKBest
selected_indices = selector.get_support(indices=True)

# Translate selected indices to feature names
selected_feature_names = [feature_names_processed[i] for i in selected_indices]

print("Top features selected by SelectKBest:", selected_feature_names)

X_train_kbest

Top features selected by SelectKBest: ['num__OverallQual', 'num__TotalBsmtSF', 'num__1stFlrSF', 'num__GrLivArea', 'num__FullBath', 'num__GarageCars', 'num__GarageArea', 'ordered__ExterQual', 'ordered__BsmtQual', 'ordered__KitchenQual']


Unnamed: 0,num__OverallQual,num__TotalBsmtSF,num__1stFlrSF,num__GrLivArea,num__FullBath,num__GarageCars,num__GarageArea,ordered__ExterQual,ordered__BsmtQual,ordered__KitchenQual
413,0.444444,0.164975,0.159247,0.130746,0.333333,0.50,0.253879,0.333333,0.666667,0.333333
522,0.555556,0.164321,0.153740,0.250565,0.666667,0.50,0.296192,0.333333,0.666667,0.333333
1036,0.888889,0.265139,0.295089,0.242276,0.666667,0.75,0.643159,0.666667,1.000000,1.000000
614,0.333333,0.103110,0.067921,0.055765,0.333333,0.00,0.000000,0.333333,0.833333,0.333333
218,0.666667,0.130606,0.184259,0.305200,0.333333,0.50,0.303949,0.666667,0.666667,0.666667
...,...,...,...,...,...,...,...,...,...,...
1095,0.555556,0.215057,0.224874,0.184627,0.666667,0.50,0.310296,0.666667,0.833333,0.666667
1130,0.333333,0.183633,0.228086,0.310286,0.666667,0.50,0.406206,0.333333,0.666667,0.666667
1294,0.444444,0.141408,0.121615,0.099849,0.333333,0.50,0.403385,0.333333,0.666667,0.333333
860,0.666667,0.149264,0.132630,0.205727,0.333333,0.25,0.152327,0.666667,0.666667,0.666667


When using cross-validation with Scikit-Learn, the scoring mechanism is designed with a convention that a bigger (higher) score indicates a better model 📈. However, with error metrics such as Root Mean Absolute Error (RMAE) or Root Mean Squared Log Error (RMSLE), we are in a situation where a good model is indicated by a lower value since we want to minimize error, not maximize it 📉. Scikit-Learn addresses this by using negated versions of these metrics, like 'neg_root_mean_squared_log_error'.

Using the negated version means that the error is multiplied by -1, essentially flipping the sign 🔁. So, during the cross-validation process, when Scikit-Learn tries to maximize the scoring function, it's effectively minimizing the original error metric.

Here's a straightforward example:

- **Model 1** has an RMSLE of approximately 0.00576 📊.
- **Model 2** has an RMSLE of approximately 0.00235 📈.

When these MSLE scores are used in a cross-validation context within Scikit-Learn, we use their negated values:

- **Model 1** has a negated RMSLE of approximately -0.00576 ➖.
- **Model 2** has a negated RMSLE of approximately -0.00235 ➖.

In cross-validation, we look for the model that has the "highest" score because Scikit-Learn's convention is "higher is better." Since these are negated errors, a higher value (closer to zero) means a lower actual error. Therefore, Model 2, with a negated RMSLE of -0.00235, will be determined correctly as better than Model 1 with -0.00576 ✅.

Additionally, when you use cross-validation with 'neg_root_mean_squared_log_error' as your scoring function, you are being consistent with the competition's evaluation metric. This consistency is critical because it ensures that the model selection and tuning process during development are in direct correlation with the performance metric that determines your final position in the competition 🏆.

In [41]:

pipeline = make_pipeline(
    preprocessor,
    MinMaxScaler(),
    SelectKBest(score_func=mutual_info_regression), # or score_func=f_regression
    RandomForestRegressor()
)

# Define the hyperparameter grid for RandomizedSearchCV
param_grid = {
    "columntransformer__num__simpleimputer__strategy": ["mean", "median"],
    "randomforestregressor__n_estimators": range(50, 400, 50),
    "randomforestregressor__max_depth": range(1, 15, 2),
    "randomforestregressor__min_samples_split": range(2, 20, 2),
    "selectkbest__k": range(1, 70, 2)
}

# Perform RandomizedSearchCV for hyperparameter tuning
search = RandomizedSearchCV(pipeline,
                            param_distributions=param_grid,
                            n_iter=40,
                            cv=5,
                            scoring="neg_root_mean_squared_log_error",
                            verbose=1,
                            n_jobs=-1)

# Fit the model to the training data
search.fit(X_train, y_train)

#creating the dictionary to store the average scores
avg_scores = {"rforest" : search.best_score_}

avg_scores

Fitting 5 folds for each of 40 candidates, totalling 200 fits


{'rforest': -0.1435075492060407}

However, feel free to use other metrics to evaluate your model's performance during development. Metrics like R2 and RMSE will allow you to understand your model's behavior and performance in a more human-readable way 🚀.

In [42]:

# Make predictions on training and test sets
y_train_pred = search.predict(X_train)
y_test_pred = search.predict(X_test)

# Calculate R-squared scores and root mean squared errors for training and test sets
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

rmse_train = root_mean_squared_error(y_train, y_train_pred)
rmse_test = root_mean_squared_error(y_test, y_test_pred)

# Create a DataFrame to store the model evaluation results
score_data = {
    'RF_R2_Score': [r2_train, r2_test],
    'RF_RMSE': [rmse_train, rmse_test]
}

index_values = ['Train', 'Test']

df_scores = pd.DataFrame(score_data, index=index_values)

# Display the model evaluation results
df_scores

Unnamed: 0,RF_R2_Score,RF_RMSE
Train,0.969508,13511.306828
Test,0.884088,30676.330607


## Gradient Boosting

XGBoost
Extreme Gradient Boost (XGB) is a boosting algorithm that uses the gradient boosting framework; where gradient descent algorithm is employed to minimize the errors in the sequential model. It improves on the gradient boosting framework with faster execution speed and improved performance.

![image.png](https://miro.medium.com/max/1400/1*QJZ6W-Pck_W7RlIDwUIN9Q.jpeg)

In [89]:
xgb_pipeline = make_pipeline(
    preprocessor,
    MinMaxScaler(),
    SelectKBest(score_func=mutual_info_regression),
    XGBRegressor()
)

xgb_param_grid = {
    "columntransformer__num__simpleimputer__strategy": ["mean", "median"],
    "selectkbest__k": range(60, 70, 1),
    "xgbregressor__learning_rate": [0.001, 0.01, 0.015],
    "xgbregressor__n_estimators": [1900, 2000, 2100],
    "xgbregressor__max_depth": [2, 3],
    "xgbregressor__min_child_weight": [0.8, 1, 1.2],
    "xgbregressor__reg_alpha": [0.0008, 0.001, 0.003],
    "xgbregressor__reg_lambda": [0.003, 0.005, 0.007]
}

xgb_search = RandomizedSearchCV(xgb_pipeline,
                            param_distributions=xgb_param_grid,
                            n_iter=50,
                            cv=10,
                            scoring="neg_root_mean_squared_log_error",
                            verbose=1,
                            n_jobs=-1)

xgb_search.fit(X_train, y_train)



print("Best Parameters:", xgb_search.best_params_)

avg_scores["xgb"] = xgb_search.best_score_
avg_scores


Fitting 10 folds for each of 50 candidates, totalling 500 fits
Best Parameters: {'xgbregressor__reg_lambda': 0.005, 'xgbregressor__reg_alpha': 0.001, 'xgbregressor__n_estimators': 1900, 'xgbregressor__min_child_weight': 1, 'xgbregressor__max_depth': 3, 'xgbregressor__learning_rate': 0.015, 'selectkbest__k': 60, 'columntransformer__num__simpleimputer__strategy': 'median'}


{'rforest': -0.1435075492060407,
 'xgb': -0.12940491722125969,
 'lgbm': -0.13103865879736942}

In [81]:
xgb_search.best_params_

{'xgbregressor__reg_lambda': 0.005,
 'xgbregressor__reg_alpha': 0.001,
 'xgbregressor__n_estimators': 2000,
 'xgbregressor__min_child_weight': 1,
 'xgbregressor__max_depth': 3,
 'xgbregressor__learning_rate': 0.01,
 'selectkbest__k': 68,
 'columntransformer__num__simpleimputer__strategy': 'median'}

In [90]:

# Make predictions on training and test sets
y_train_pred = xgb_search.predict(X_train)
y_test_pred = xgb_search.predict(X_test)

# Calculate R-squared scores and root mean squared errors for training and test sets
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

rmse_train = root_mean_squared_error(y_train, y_train_pred)
rmse_test = root_mean_squared_error(y_test, y_test_pred)

# Create a DataFrame to store the model evaluation results
score_data = {
    'XGB_R2_Score': [r2_train, r2_test],
    'XGB_RMSE': [rmse_train, rmse_test]
}

index_values = ['Train', 'Test']

# Concatanate the results to the previous DataFrame
df_scores = pd.concat([df_scores, pd.DataFrame(score_data, index=index_values)], axis=1)

# Display the model evaluation results
df_scores

Unnamed: 0,RF_R2_Score,RF_RMSE,XGB_R2_Score,XGB_RMSE,Lgbm_R2_Score,Lgbm_RMSE,XGB_R2_Score.1,XGB_RMSE.1,XGB_R2_Score.2,XGB_RMSE.2,XGB_R2_Score.3,XGB_RMSE.3,XGB_R2_Score.4,XGB_RMSE.4,XGB_R2_Score.5,XGB_RMSE.5
Train,0.969508,13511.306828,0.963858,14709.892556,0.946508,17895.661145,0.963858,14709.892556,0.963858,14709.892556,0.978076,11797.519891,0.976727,12111.469364,0.982338,10548.971306
Test,0.884088,30676.330607,0.908096,27315.423095,0.875995,31729.176758,0.908096,27315.423095,0.908096,27315.423095,0.865109,27224.91214,0.908623,24530.294838,0.995683,5601.308667


https://neptune.ai/blog/lightgbm-parameters-guide
LightBGM is another gradient boosting framework developed by Microsoft that is based on decision tree algorithm, designed to be efficient and distributed. Some of the advantages of implementing LightBGM compared to other boosting frameworks include:

1. Faster training speed and higher efficiency (use histogram based algorithm i.e it buckets continuous feature values into discrete bins which fasten the training procedure)
2. Lower memory usage (Replaces continuous values to discrete bins which result in lower memory usage)
3. Better accuracy
4. Support of parallel and GPU learning
5. Capable of handling large-scale data (capable of performing equally good with large datasets with a significant reduction in training time as compared to XGBOOST)

LightGBM splits the tree leaf wise with the best fit whereas other boosting algorithms split the tree depth wise or level wise rather than leaf-wise. This leaf-wise algorithm reduces more loss than the level-wise algorithm, hence resulting in much better accuracy which can rarely be achieved by any of the existing boosting algorithms.

![image.png](https://rohitgr7.github.io/content/images/2019/03/Screenshot-from-2019-03-27-23-09-47-1.png)

In [45]:
# LightGBM Pipeline
lgbm_pipeline = make_pipeline(
    preprocessor,
    MinMaxScaler(),
    SelectKBest(score_func=mutual_info_regression),
    LGBMRegressor()
)


lgbm_param_grid = {
    'columntransformer__num__simpleimputer__strategy': ['mean', 'median'],
    'lgbmregressor__num_leaves': [15, 31, 63],
    'lgbmregressor__max_depth': [5, 10, -1],  # Include no limit (-1) as an option
    'lgbmregressor__learning_rate': [0.005, 0.01, 0.05, 0.1],
    'lgbmregressor__n_estimators': [100, 300, 500, 1000, 1500],
    'lgbmregressor__reg_alpha': [0, 0.001, 0.01, 0.1],
    'lgbmregressor__reg_lambda': [0, 0.001, 0.01, 0.1],
    'lgbmregressor__feature_fraction': [0.5, 0.6, 0.7, 0.8],  # Or 'colsample_bytree' if appropriate
    'lgbmregressor__min_child_samples': [20, 30, 40, 50],
    "selectkbest__k": range(50, 70, 2) # 'all' to test without feature selection as well

}


# Hyperparameter Tuning with RandomizedSearchCV
lgbm_search = RandomizedSearchCV(lgbm_pipeline,
                            param_distributions=lgbm_param_grid,
                            n_iter=40,
                            cv=5,
                            scoring="neg_root_mean_squared_log_error",
                            verbose=1,
                            n_jobs=-1)


lgbm_search.fit(X_train, y_train)



Fitting 5 folds for each of 40 candidates, totalling 200 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2883
[LightGBM] [Info] Number of data points in the train set: 1241, number of used features: 64
[LightGBM] [Info] Start training from score 181017.845286


In [46]:
print("Best Parameters:", lgbm_search.best_params_)
avg_scores["lgbm"] = lgbm_search.best_score_
avg_scores

Best Parameters: {'selectkbest__k': 64, 'lgbmregressor__reg_lambda': 0.1, 'lgbmregressor__reg_alpha': 0.001, 'lgbmregressor__num_leaves': 63, 'lgbmregressor__n_estimators': 1500, 'lgbmregressor__min_child_samples': 30, 'lgbmregressor__max_depth': 10, 'lgbmregressor__learning_rate': 0.005, 'lgbmregressor__feature_fraction': 0.5, 'columntransformer__num__simpleimputer__strategy': 'median'}


{'rforest': -0.1435075492060407,
 'xgb': -0.13431425081932363,
 'lgbm': -0.13103865879736942}

In [47]:
# Make predictions on training and test sets
y_train_pred = lgbm_search.predict(X_train)
y_test_pred = lgbm_search.predict(X_test)

# Calculate R-squared scores and root mean squared errors for training and test sets
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

rmse_train = root_mean_squared_error(y_train, y_train_pred)
rmse_test = root_mean_squared_error(y_test, y_test_pred)

# Create a DataFrame to store the model evaluation results
score_data = {
    'Lgbm_R2_Score': [r2_train, r2_test],
    'Lgbm_RMSE': [rmse_train, rmse_test]
}

index_values = ['Train', 'Test']

# Concatanate the results to the previous DataFrame
df_scores = pd.concat([df_scores, pd.DataFrame(score_data, index=index_values)], axis=1)

# Display the model evaluation results
df_scores



Unnamed: 0,RF_R2_Score,RF_RMSE,XGB_R2_Score,XGB_RMSE,Lgbm_R2_Score,Lgbm_RMSE
Train,0.969508,13511.306828,0.963858,14709.892556,0.946508,17895.661145
Test,0.884088,30676.330607,0.908096,27315.423095,0.875995,31729.176758


## Import kaggle data

In [91]:
competition_data = pd.read_csv(r"C:\Users\Marvin\Documents\WBS\Data-Science-Bootcamp\7_Supervised ML\Data\competition_test_data.csv")
competition_ids = competition_data.pop('Id')
competition_ids

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

In [92]:
my_submission = pd.DataFrame({"Id": competition_ids})
my_submission

Unnamed: 0,Id
0,1461
1,1462
2,1463
3,1464
4,1465
...,...
1454,2915
1455,2916
1456,2917
1457,2918


In [93]:
my_submission["SalePrice"] = xgb_search.predict(competition_data)
my_submission.head(20)

Unnamed: 0,Id,SalePrice
0,1461,123574.203125
1,1462,162009.34375
2,1463,184460.4375
3,1464,187866.03125
4,1465,189557.546875
5,1466,173157.546875
6,1467,170721.0
7,1468,163808.578125
8,1469,183405.375
9,1470,126417.859375


In [94]:
my_submission.to_csv("ML_xgb_submission_3.csv", index=False)