In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# the below extension properly formats a cell after it is run
%load_ext nb_black

# Set the maximum number of rows to 200
pd.set_option("display.max_rows", 200)


# Set the maximum number of columns to 200
pd.set_option("display.max_columns", 200)

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [28]:
# Load the housing table; the relative path is resolved against the kernel's
# working directory.
housing = pd.read_csv("../data/housing_corr.csv")

<IPython.core.display.Javascript object>

In [29]:
# Report how many distinct (non-null) values each column contains
for column_name in housing.columns:
    print(column_name, ": number of values", housing[column_name].nunique())

PID : number of values 2558
SalePrice : number of values 914
MSSubClass : number of values 16
MSZoning : number of values 6
LotFrontage : number of values 127
LotArea : number of values 1741
Street : number of values 2
Alley : number of values 3
LotShape : number of values 4
LandContour : number of values 4
Utilities : number of values 2
LotConfig : number of values 5
LandSlope : number of values 3
Neighborhood : number of values 26
Condition1 : number of values 9
Condition2 : number of values 8
BldgType : number of values 5
HouseStyle : number of values 8
OverallQual : number of values 10
OverallCond : number of values 9
RoofStyle : number of values 6
RoofMatl : number of values 7
Exterior1st : number of values 15
Exterior2nd : number of values 16
MasVnrType : number of values 4
MasVnrArea : number of values 414
ExterCond : number of values 5
Foundation : number of values 6
BsmtQual : number of values 6
BsmtCond : number of values 6
BsmtExposure : number of values 5
BsmtFinSF1 : numbe

<IPython.core.display.Javascript object>

In [30]:
# Drop PID and Neighborhood_st in place so they are not fed to the models that
# are trained on `housing` below.
housing.drop(columns=["PID", "Neighborhood_st"], inplace=True)

<IPython.core.display.Javascript object>

## Ordinal encoding, since this is the preferred method for Random Forest over dummifying (one-hot encoding) ##

In [31]:
# Names of the object-dtype (categorical) columns. "Neighborhood" is taken out
# of the list because it is kept as text and used to filter rows in the
# per-neighborhood loop further down.
cat_features = housing.select_dtypes(include=["object"]).columns.tolist()
cat_features.remove("Neighborhood")

<IPython.core.display.Javascript object>

In [32]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each categorical column with numeric codes. Fit and transform are
# done in a single call because the same frame is used for both steps.
ordinal_encoder = OrdinalEncoder()
housing[cat_features] = ordinal_encoder.fit_transform(housing[cat_features])


<IPython.core.display.Javascript object>

In [33]:
# RandomForestRegressor, train_test_split and LabelEncoder are imported in the
# first cell of the notebook, so they are not re-imported here.

# Encode the neighborhood names as integers and keep the name -> code mapping
# so it can be printed for reference at the end of the cell.
label_encoder = LabelEncoder()
housing["NeighborhoodEncoded"] = label_encoder.fit_transform(housing["Neighborhood"])
neighborhood_mapping = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)

# Every neighborhood present in the data, in order of first appearance
unique_neighborhoods = housing["Neighborhood"].unique()

# train_test_split(test_size=0.2) needs at least two rows: with a single row the
# training part would be empty and scikit-learn raises ValueError, which would
# abort the whole loop. Such neighborhoods are reported and skipped instead.
MIN_NEIGHBORHOOD_ROWS = 2

# Fit one Random Forest per neighborhood and show its five most important features
for neighborhood in unique_neighborhoods:
    print(f"Analyzing neighborhood: {neighborhood}")

    # Rows that belong to the current neighborhood only
    neighborhood_data = housing[housing["Neighborhood"] == neighborhood]

    if len(neighborhood_data) < MIN_NEIGHBORHOOD_ROWS:
        print(f"Skipped: only {len(neighborhood_data)} row(s), too few to split")
        print("\n")
        continue

    # The neighborhood columns are constant inside the subset, so they are
    # removed from the feature matrix together with the target.
    X_train, X_test, y_train, y_test = train_test_split(
        neighborhood_data.drop(
            columns=["SalePrice", "Neighborhood", "NeighborhoodEncoded"]
        ),
        neighborhood_data["SalePrice"],
        test_size=0.2,
        random_state=42,
    )

    # Train the Random Forest model on the 80% training part
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # Impurity-based importances, one value per feature column
    feature_importances = rf.feature_importances_

    # Keep the five highest-ranked features
    important_features = (
        pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importances})
        .sort_values("Importance", ascending=False)
        .head(5)
    )

    print(important_features)
    print("\n")

# Print the mapping between neighborhood names and encoded values
print("Neighborhood mapping:")
print(neighborhood_mapping)


Analyzing neighborhood: SWISU
         Feature  Importance
35      2ndFlrSF    0.157702
61       TotalSF    0.112373
16   OverallCond    0.104800
42  TotRmsAbvGrd    0.071526
3        LotArea    0.070801


Analyzing neighborhood: Edwards
       Feature  Importance
61     TotalSF    0.380209
3      LotArea    0.138225
47  GarageArea    0.082139
59         Age    0.053246
60    RemodAge    0.031181


Analyzing neighborhood: IDOTRR
        Feature  Importance
61      TotalSF    0.355999
15  OverallQual    0.207659
60     RemodAge    0.073947
49   PavedDrive    0.064591
35     2ndFlrSF    0.054537


Analyzing neighborhood: OldTown
        Feature  Importance
61      TotalSF    0.351097
35     2ndFlrSF    0.162087
15  OverallQual    0.079209
16  OverallCond    0.070586
18     RoofMatl    0.039303


Analyzing neighborhood: NWAmes
          Feature  Importance
61        TotalSF    0.672153
35       2ndFlrSF    0.059292
58  SaleCondition    0.033609
47     GarageArea    0.032596
42   TotRmsAbv

<IPython.core.display.Javascript object>

## Random Forest to determine each feature's importance for predicting sale price ##

In [36]:


# Encode Neighborhood numerically so it can take part in the model as a
# feature; the result is stored in (or overwrites) the NeighborhoodEncoded column.
ordinal_encoder = OrdinalEncoder()
housing["NeighborhoodEncoded"] = ordinal_encoder.fit_transform(housing[["Neighborhood"]])

# Hold out 20% of the rows; the text Neighborhood column is left out of the
# feature matrix because its encoded counterpart is used instead.
X_train, X_test, y_train, y_test = train_test_split(
    housing.drop(columns=["SalePrice", "Neighborhood"]),
    housing["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training part
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Rank every feature by its importance, highest first
feature_importances = rf.feature_importances_
important_features = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
).sort_values("Importance", ascending=False)

print(important_features)


                Feature  Importance
15          OverallQual    0.489539
61              TotalSF    0.310159
59                  Age    0.020941
35             2ndFlrSF    0.016088
41          KitchenQual    0.015583
47           GarageArea    0.012832
3               LotArea    0.011795
60             RemodAge    0.011290
64            TotalBath    0.011272
28           BsmtFinSF1    0.010304
30            BsmtUnfSF    0.009602
16          OverallCond    0.006532
44           Fireplaces    0.005701
25             BsmtQual    0.004743
63         TotalPorchSF    0.004146
50           WoodDeckSF    0.003957
67  NeighborhoodEncoded    0.003761
2           LotFrontage    0.003724
22           MasVnrArea    0.003724
51          OpenPorchSF    0.003231
55               MoSold    0.002918
1              MSZoning    0.002534
36         BsmtFullBath    0.001862
65       MSSubClass_cat    0.001802
45           GarageType    0.001731
53          ScreenPorch    0.001588
46         GarageFinish    0

<IPython.core.display.Javascript object>

## Using Random Forest to Look at the most important feature for each street grouping ##

In [38]:
# Repeat the process from the Creating_test_train notebook to build the
# street-level price quantiles.

# Reload the dataset so that the columns dropped or encoded above are restored
housing = pd.read_csv("../data/housing_corr.csv")

# 80/20 split, stratified on Neighborhood because that is the variable we are
# interested in
train_set, test_set = train_test_split(
    housing,
    stratify=housing["Neighborhood"],
    test_size=0.2,
    random_state=42,
)


def group_neighbor_streets_by_saleprice(
    traindf=train_set,
    testdf=test_set,
    num_quantiles=10,
):
    """Bucket streets into sale-price quantiles and tag both frames in place.

    The mean ``SalePrice`` of every ``Neighborhood_st`` value is computed from
    ``traindf`` only; the streets are then cut into ``num_quantiles``
    equal-sized groups labelled ``1 .. num_quantiles`` (1 = lowest mean price).

    Parameters
    ----------
    traindf : DataFrame
        Must contain ``Neighborhood_st`` and ``SalePrice``. A
        ``StreetPriceGroup`` column is added to it in place.
    testdf : DataFrame
        Must contain ``Neighborhood_st``. A ``StreetPriceGroup`` column is
        added in place; streets that do not occur in ``traindf`` get NaN.
    num_quantiles : int
        Number of price groups to create.

    Returns
    -------
    dict
        Mapping ``Neighborhood_st`` value -> price group label.

    Notes
    -----
    The default frames are bound when the function is defined, so they keep
    pointing at the objects that ``train_set`` / ``test_set`` referred to at
    that moment even if those names are reassigned later.
    """
    # Mean sale price per street, from the training rows only
    street_prices = traindf.groupby("Neighborhood_st")["SalePrice"].mean()
    # Assign each street to a quantile bucket labelled 1..num_quantiles
    groups = pd.qcut(street_prices, q=num_quantiles, labels=range(1, num_quantiles + 1))
    # Street name -> bucket label
    street_group_dict = dict(zip(street_prices.index, groups))
    # Tag both frames; the test frame reuses the training-derived mapping
    traindf["StreetPriceGroup"] = traindf["Neighborhood_st"].map(street_group_dict)
    testdf["StreetPriceGroup"] = testdf["Neighborhood_st"].map(street_group_dict)
    return street_group_dict


# this will use the dictionary created to fill in the missing values in the test df with
# another group in the same neighborhood


def fill_na(testdf=test_set, d=None):
    """Fill missing ``StreetPriceGroup`` values of ``testdf`` in place.

    Rows whose ``StreetPriceGroup`` is NaN receive the group of a street from
    the same neighborhood, where the neighborhood is the part of
    ``Neighborhood_st`` before the first underscore.

    Parameters
    ----------
    testdf : DataFrame
        Must contain ``Neighborhood_st`` and ``StreetPriceGroup``.
    d : dict, optional
        Mapping ``Neighborhood_st`` value -> price group, as returned by
        ``group_neighbor_streets_by_saleprice``. ``None`` is treated as an
        empty mapping, in which case nothing is filled.

    Notes
    -----
    When a neighborhood has several streets in ``d`` the group of the last one
    in the dict's iteration order is used. Rows whose neighborhood is absent
    from ``d`` stay NaN.
    """
    if d is None:
        d = {}
    # Collapse street-level keys to neighborhood level (later keys overwrite
    # earlier ones that share the same prefix)
    neighborhood_groups = {key.split("_")[0]: group for key, group in d.items()}
    # Select the rows to fill directly from the NaN test. Going through a list
    # of PIDs instead would also select already-filled rows that happen to
    # share a PID with a missing one, and would require a PID column.
    missing = testdf["StreetPriceGroup"].isna()
    prefixes = testdf.loc[missing, "Neighborhood_st"].str.split("_").str[0]
    # Only the missing rows are written; no helper column is added to testdf
    testdf.loc[missing, "StreetPriceGroup"] = prefixes.map(neighborhood_groups)


# Build the street -> price-group mapping (num_quantiles can be changed) and
# keep it in `d`, which is then used to fill the StreetPriceGroup values that
# are still missing in the test set.
d = group_neighbor_streets_by_saleprice(train_set, test_set, num_quantiles=10)

fill_na(testdf=test_set, d=d)


# Remove the column that was used to create groupings
# train_set.drop("Neighborhood_st", axis=1, inplace=True)
# test_set.drop("Neighborhood_st", axis=1, inplace=True)


# Remove PID and move SalePrice to the first column of both splits.
# TODO: do this once and save the CSVs so it does not have to be repeated.
train_set = train_set[["SalePrice"]].join(
    train_set.drop(columns=["SalePrice", "PID"])
)
test_set = test_set[["SalePrice"]].join(
    test_set.drop(columns=["SalePrice", "PID"])
)

<IPython.core.display.Javascript object>

In [45]:
# Object-dtype columns of the training set are the categorical features
cat_features = train_set.select_dtypes(include=["object"]).columns.tolist()

# Learn the category -> code mapping from the training set and apply it in the
# same step, replacing the text columns with their numeric codes.
ordinal_encoder = OrdinalEncoder()
train_set[cat_features] = ordinal_encoder.fit_transform(train_set[cat_features])


<IPython.core.display.Javascript object>

In [55]:
# Encode the test set with the SAME category -> code mapping that was learned
# from the training set. Fitting a fresh encoder on the test set would number
# the categories independently, so one label could get different codes in the
# two splits and a model trained on train_set would misread test_set.
#
# This cell relies on `cat_features` and the fitted `ordinal_encoder` from the
# training-set encoding cell directly above.
test_encoder = OrdinalEncoder(
    categories=ordinal_encoder.categories_,
    # Categories that never occurred in the training set are encoded as -1
    # instead of raising an error.
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# With explicit categories, fit only validates the input; the codes are the
# positions inside the training-set category lists.
test_set[cat_features] = test_encoder.fit_transform(test_set[cat_features])

<IPython.core.display.Javascript object>

In [42]:
# Neighborhood_st was only needed to create the groupings; drop it from both splits
for split_frame in (train_set, test_set):
    split_frame.drop(columns="Neighborhood_st", inplace=True)

<IPython.core.display.Javascript object>

In [48]:
# RandomForestRegressor and train_test_split come from the import cell at the
# top of the notebook.

# Order the StreetPriceGroup values from the lowest to the highest mean SalePrice
group_means = train_set.groupby("StreetPriceGroup")["SalePrice"].mean()
unique_groups = group_means.sort_values().index

# Fit one Random Forest per price group and show its five strongest features
for group in unique_groups:
    print(f"Analyzing group: {group}")

    # Rows that fall in the current price group
    group_data = train_set[train_set["StreetPriceGroup"] == group]

    print(f"Mean SalePrice: {group_data['SalePrice'].mean()}")

    # StreetPriceGroup is constant inside the subset, so it is removed from the
    # features together with the target.
    X_train, X_test, y_train, y_test = train_test_split(
        group_data.drop(columns=["SalePrice", "StreetPriceGroup"]),
        group_data["SalePrice"],
        test_size=0.2,
        random_state=42,
    )

    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    # One importance value per feature column
    feature_importances = rf.feature_importances_

    # Keep the five highest-ranked features
    important_features = (
        pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importances})
        .sort_values("Importance", ascending=False)
        .head(5)
    )

    print(important_features)
    print("\n")


Analyzing group: 1
Mean SalePrice: 96447.10714285714
          Feature  Importance
62        TotalSF    0.309368
16    OverallQual    0.198242
17    OverallCond    0.075767
59  SaleCondition    0.040282
61       RemodAge    0.035571


Analyzing group: 2
Mean SalePrice: 120687.70319634704
        Feature  Importance
62      TotalSF    0.365843
34   CentralAir    0.062086
17  OverallCond    0.057945
31    BsmtUnfSF    0.053413
61     RemodAge    0.044905


Analyzing group: 3
Mean SalePrice: 131822.11219512194
        Feature  Importance
16  OverallQual    0.225205
62      TotalSF    0.163845
48   GarageArea    0.087980
36     2ndFlrSF    0.050391
29   BsmtFinSF1    0.049707


Analyzing group: 4
Mean SalePrice: 141645.2448275862
       Feature  Importance
62     TotalSF    0.326022
36    2ndFlrSF    0.105220
60         Age    0.073954
29  BsmtFinSF1    0.047697
51  WoodDeckSF    0.044348


Analyzing group: 5
Mean SalePrice: 153341.14736842105
       Feature  Importance
62     TotalSF    0

<IPython.core.display.Javascript object>

In [None]:
## Looking at the feature importance on SalePrice when adding the StreetPriceGroup variable. 

In [54]:
# NOTE(review): StreetPriceGroup was built from the mean SalePrice of these
# same train_set rows (see group_neighbor_streets_by_saleprice), so the 20%
# held out below already contributed to that feature. Its importance, and any
# score measured on X_test, is therefore likely optimistic.

# Hold out 20% of the training set for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    train_set.drop(columns="SalePrice"),
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the remaining 80%
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Rank all features by importance, highest first
feature_importances = rf.feature_importances_
important_features = pd.DataFrame(
    {"Feature": X_train.columns, "Importance": feature_importances}
).sort_values("Importance", ascending=False)

print(important_features)


             Feature  Importance
68  StreetPriceGroup    0.658141
62           TotalSF    0.180576
16       OverallQual    0.033746
26          BsmtQual    0.009102
29        BsmtFinSF1    0.008874
48        GarageArea    0.008305
36          2ndFlrSF    0.008026
61          RemodAge    0.007759
23        MasVnrArea    0.007466
3            LotArea    0.006697
65         TotalBath    0.006286
31         BsmtUnfSF    0.005421
60               Age    0.005041
17       OverallCond    0.004357
64      TotalPorchSF    0.003726
51        WoodDeckSF    0.003237
42       KitchenQual    0.002851
2        LotFrontage    0.002566
54       ScreenPorch    0.002385
52       OpenPorchSF    0.002282
43      TotRmsAbvGrd    0.002102
56            MoSold    0.001971
11      Neighborhood    0.001753
49        GarageQual    0.001693
47      GarageFinish    0.001394
21       Exterior2nd    0.001322
40      BedroomAbvGr    0.001320
19          RoofMatl    0.001213
57            YrSold    0.001191
45        

<IPython.core.display.Javascript object>

## Using the top 20 features from above to predict SalePrice ##

In [61]:
from sklearn.metrics import mean_absolute_error, r2_score

# Restrict the feature matrix to the 20 highest-ranked features from the
# importance table computed above, then hold out 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(
    train_set[important_features["Feature"].head(20).tolist()],
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training part
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the predictions for the held-out rows
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("R-squared score:", r2)



Mean Absolute Error: 14018.455731707316
R-squared score: 0.9160173334216787


<IPython.core.display.Javascript object>

## Trying again with bagging regressor ##

In [63]:

# BaggingRegressor is not imported anywhere else in the notebook; without this
# line the cell raises NameError on a fresh kernel.
from sklearn.ensemble import BaggingRegressor

# Select the top 20 features from the importance table computed above
top_20_features = important_features.head(20)["Feature"].tolist()

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    train_set[top_20_features],
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Bag 10 Random Forests, each trained on a bootstrap sample of the training part.
# NOTE: scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4 removed
# the old keyword; switch the keyword if this line raises TypeError.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
bagging_rf = BaggingRegressor(base_estimator=rf, n_estimators=10, random_state=42)
bagging_rf.fit(X_train, y_train)

# Predict SalePrice on the held-out rows
y_pred = bagging_rf.predict(X_test)

# Calculate Mean Absolute Error (MAE) and R-squared score
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("R-squared score:", r2)


Mean Absolute Error: 13902.682904878047
R-squared score: 0.916587279890754


<IPython.core.display.Javascript object>

## Now trying Stacking and Boosting with top 20 features ##

In [66]:
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Use the 20 highest-ranked features and hold out 20% of the rows
top_20_features = important_features.head(20)["Feature"].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    train_set[top_20_features],
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# --- Stacking: a forest and a single tree feed a gradient-boosting meta-model ---
rf = RandomForestRegressor(n_estimators=100, random_state=42)
dt = DecisionTreeRegressor(random_state=42)
estimators = [("rf", rf), ("dt", dt)]
stack_reg = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42),
)
stack_reg.fit(X_train, y_train)

# Score the stacked model on the held-out rows
y_pred_stack = stack_reg.predict(X_test)
mae_stack = mean_absolute_error(y_test, y_pred_stack)
r2_stack = r2_score(y_test, y_pred_stack)

print("Stacking Regressor Results:")
print("Mean Absolute Error:", mae_stack)
print("R-squared score:", r2_stack)

# --- Boosting: a stand-alone gradient-boosting model on the same split ---
boost_reg = GradientBoostingRegressor(n_estimators=100, random_state=42)
boost_reg.fit(X_train, y_train)

# Score the boosted model on the held-out rows
y_pred_boost = boost_reg.predict(X_test)
mae_boost = mean_absolute_error(y_test, y_pred_boost)
r2_boost = r2_score(y_test, y_pred_boost)

print("\nBoosting Regressor Results:")
print("Mean Absolute Error:", mae_boost)
print("R-squared score:", r2_boost)


Stacking Regressor Results:
Mean Absolute Error: 15146.804259213986
R-squared score: 0.9029411144822934

Boosting Regressor Results:
Mean Absolute Error: 13328.839803176925
R-squared score: 0.9290692109057372


<IPython.core.display.Javascript object>

## Seeing if there is a difference when using all features and not top 20 ##

In [67]:
# Random Forest

# Use every column except the target as a feature; hold out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    train_set.drop(columns="SalePrice"),
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Fit the forest on the training part
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the predictions for the held-out rows
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("R-squared score:", r2)


Mean Absolute Error: 14087.587317073172
R-squared score: 0.9141713429944978


<IPython.core.display.Javascript object>

In [68]:
# Bagging

# BaggingRegressor is not imported anywhere else in the notebook; without this
# line the cell raises NameError on a fresh kernel.
from sklearn.ensemble import BaggingRegressor

# Use every column except the target as a feature; hold out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    train_set.drop("SalePrice", axis=1),
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Bag 10 Random Forests, each trained on a bootstrap sample of the training part.
# NOTE: scikit-learn 1.2 renamed `base_estimator` to `estimator` and 1.4 removed
# the old keyword; switch the keyword if this line raises TypeError.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
bagging = BaggingRegressor(base_estimator=rf, n_estimators=10, random_state=42)
bagging.fit(X_train, y_train)

# Make predictions and calculate scores
y_pred = bagging.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Bagging - Mean Absolute Error:", mae)
print("Bagging - R-squared score:", r2)


Bagging - Mean Absolute Error: 13996.095329268293
Bagging - R-squared score: 0.913565262760414


<IPython.core.display.Javascript object>

In [70]:
# Stacking

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Use every column except the target as a feature; hold out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    train_set.drop(columns="SalePrice"),
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Three base learners whose predictions are combined by a linear regression
estimators = [
    ("rf", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("knn", KNeighborsRegressor()),
    ("svr", SVR()),
]
stacking = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
)
stacking.fit(X_train, y_train)

# Score the stacked model on the held-out rows
y_pred = stacking.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Stacking - Mean Absolute Error:", mae)
print("Stacking - R-squared score:", r2)


Stacking - Mean Absolute Error: 14067.524329716342
Stacking - R-squared score: 0.9142560891429595


<IPython.core.display.Javascript object>

In [71]:
# Boosting

# Use every column except the target as a feature; hold out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    train_set.drop(columns="SalePrice"),
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# Gradient boosting with 100 trees and a learning rate of 0.1
gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
gbr.fit(X_train, y_train)

# Score the boosted model on the held-out rows
y_pred = gbr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Boosting - Mean Absolute Error:", mae)
print("Boosting - R-squared score:", r2)

Boosting - Mean Absolute Error: 12704.623040466531
Boosting - R-squared score: 0.935640719186301


<IPython.core.display.Javascript object>

## XGBoost ##

In [75]:
# XGBoost is first needed here; train_test_split comes from the import cell at
# the top of the notebook.
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score

# Use every column except the target as a feature; hold out 20% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    train_set.drop(columns="SalePrice"),
    train_set["SalePrice"],
    test_size=0.2,
    random_state=42,
)

# XGBoost regressor with squared-error objective and otherwise default settings
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_model.fit(X_train, y_train)

# Score the predictions for the held-out rows
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("XGBoost - Mean Absolute Error:", mae)
print("XGBoost - R-squared score:", r2)


XGBoost - Mean Absolute Error: 12184.325304878048
XGBoost - R-squared score: 0.9370510881429757


<IPython.core.display.Javascript object>

## Hyperparameter tuning with GridSearchCV ##

In [77]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Candidate values to try: 2 x 3 x 3 = 18 combinations
param_grid = {
    "learning_rate": [0.1, 0.01],
    "max_depth": [3, 5, 7],
    "n_estimators": [100, 200, 500],
}

# Base model whose hyperparameters are tuned
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# 5-fold cross-validated search that selects the combination with the lowest
# mean absolute error, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb_reg,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
)

# The search reuses X_train / y_train from the train/test split in the
# XGBoost cell above.
grid_search.fit(X_train, y_train)

# Winning parameter combination and the model refitted with it
best_params = grid_search.best_params_
best_xgb = grid_search.best_estimator_

# Score the tuned model on the held-out rows from that same split
y_pred = best_xgb.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the results
print("Best Hyperparameters:", best_params)
print("Mean Absolute Error:", mae)
print("R-squared score:", r2)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Mean Absolute Error: 11577.684041539635
R-squared score: 0.9471523720329054


<IPython.core.display.Javascript object>