In [144]:
import sys
import os
sys.path.append(os.path.abspath('../'))

import time
from tqdm import tqdm
import requests
import json
import numpy as np
import pandas as pd 
import seaborn as sns
import optuna
import shap
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from boruta import BorutaPy
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    silhouette_score
)
from sklearn.preprocessing import TargetEncoder
from sklearn.feature_selection import VarianceThreshold
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from sklearn.model_selection import train_test_split, KFold

shap.initjs()

In [145]:
from src import Root
root = Root()
root.lib.jupyter_settings()

In [146]:
df = pd.read_csv("../data/housing_geo.csv")

In [147]:
df.shape

(21135, 186)

In [148]:
df.sample(10)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,total_rooms_per_households,total_bedrooms_per_households,income_per_household,total_rooms_not_bedrooms,population_per_hosehold,income_per_population,total_rooms_not_bedrooms_per_households,distance_to_los_angeles,distance_to_san_francisco,distance_to_san_diego,distance_to_sacramento,distance_to_silicon_valley,distance_to_fresno,distance_to_santa_barbara,distance_to_san_jose,distance_to_oakland,distance_to_anaheim,distance_to_golden_gate_bridge,distance_to_yosemite_valley,distance_to_death_valley,distance_to_big_sur,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,is_high_value,target,neighborhood,city,county,state,postcode,distance_to_school,distance_to_university,distance_to_college,distance_to_hospital,distance_to_supermarket,distance_to_convenience,distance_to_department_store,distance_to_restaurant,distance_to_cafe,distance_to_park,distance_to_bus_station,distance_to_train_station,distance_to_subway_station,distance_to_beach,distance_to_bank,distance_to_police,distance_to_fire_station,distance_to_retail,distance_to_education,distance_to_Los_Angeles,distance_to_San_Francisco,distance_to_San_Diego,distance_to_Sacramento,distance_to_San_Jose,distance_to_Fresno,distance_to_Oakland,distance_to_Bakersfield,distance_to_Irvine,distance_to_Riverside,distance_to_nearest_major_city,nearest_major_city,has_los_angeles_within_500m,has_los_angeles_within_1000m,has_los_angeles_within_2000m,has_san_francisco_within_500m,has_san_francisco_within_1000m,has_san_francisco_within_2000m,has_san_diego_within_500m,has_san_diego_within_1000m,has_san_diego_within_2000m,has_sacramento_within_500m,has_sacramento_within_1000m,has_sacramento_within_2000m,has_silicon_valley_within_500m,has_silicon_valley_within_1000m,has_silicon_valley_within_2000m,has_fresno_within_500m,has_fresno_within_1000m,has_fresno_within_2000m,has_santa_barbara_within_500m,has_santa_barbara_within_1000m,has_santa_barbara_within_2000m,has_san_jose_within_500m,has_san_jose_within_1000m,has_san_jose_within_2000m,has_oakland_within_500m,has_oakland_within_1000m,has_oakland_within_2000m,has_anaheim_within_500m,has_anaheim_within_1000m,has_anaheim_within_2000m,has_golden_gate_bridge_within_500m,has_golden_gate_bridge_within_1000m,has_golden_gate_bridge_within_2000m,has_yosemite_valley_within_500m,has_yosemite_valley_within_1000m,has_yosemite_valley_within_2000m,has_death_valley_within_500m,has_death_valley_within_1000m,has_death_valley_within_2000m,has_big_sur_within_500m,has_big_sur_within_1000m,has_big_sur_within_2000m,has_school_within_500m,has_school_within_1000m,has_school_within_2000m,has_university_within_500m,has_university_within_1000m,has_university_within_2000m,has_college_within_500m,has_college_within_1000m,has_college_within_2000m,has_hospital_within_500m,has_hospital_within_1000m,has_hospital_within_2000m,has_supermarket_within_500m,has_supermarket_within_1000m,has_supermarket_within_2000m,has_convenience_within_500m,has_convenience_within_1000m,has_convenience_within_2000m,has_department_store_within_500m,has_department_store_within_1000m,has_department_store_within_2000m,has_restaurant_within_500m,has_restaurant_within_1000m,has_restaurant_within_2000m,has_cafe_within_500m,has_cafe_within_1000m,has_cafe_within_2000m,has_park_within_500m,has_park_within_1000m,has_park_within_2000m,has_bus_station_within_500m,has_bus_station_within_1000m,has_bus_station_within_2000m,has_train_station_within_500m,has_train_station_within_1000m,has_train_station_within_2000m,has_subway_station_within_500m,has_subway_station_within_1000m,has_subway_station_within_2000m,has_beach_within_500m,has_beach_within_1000m,has_beach_within_2000m,has_bank_within_500m,has_bank_within_1000m,has_bank_within_2000m,has_police_within_500m,has_police_within_1000m,has_police_within_2000m,has_fire_station_within_500m,has_fire_station_within_1000m,has_fire_station_within_2000m,has_retail_within_500m,has_retail_within_1000m,has_retail_within_2000m,has_education_within_500m,has_education_within_1000m,has_education_within_2000m,has_nearest_major_city_within_500m,has_nearest_major_city_within_1000m,has_nearest_major_city_within_2000m,amenity_count_500m,amenity_diversity_500m,amenity_count_1000m,amenity_diversity_1000m,amenity_count_2000m,amenity_diversity_2000m,walkability_score,amenity_data_completeness,avg_distance_to_key_amenities,distance_to_bus_station_norm,distance_to_train_station_norm,distance_to_subway_station_norm,transportation_accessibility
7875,-118.14,33.88,41.0,1531.0,343.0,1119.0,341.0,4.36,4.49,1.01,0.01,1188.0,3.28,0.0,3.48,13.3,360.31,98.32,374.73,327.24,218.19,96.61,318.34,356.37,13.29,364.78,286.25,195.52,265.1,False,False,False,False,0.01,161400.0,,Bellflower,Los Angeles County,California,90706.0,968.64,4009.09,4006.79,1140.3,915.76,285.07,1413.27,1361.23,1513.69,459.85,4009.81,20000.0,20000.0,17637.24,1400.04,1390.93,932.37,285.07,968.64,21.37,579.76,158.07,602.47,512.19,350.65,573.35,184.21,36.23,71.56,21.37,Los_Angeles,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,0,0,1,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,1,1,1,1,0,1,1,1,1,1,18,18,22,22,28,28,41.8,1.0,1021.1,0.0,0.0,0.0,0.0
20345,-119.04,34.22,18.0,3117.0,583.0,2079.0,545.0,4.65,5.72,1.07,0.01,2534.0,3.81,0.0,4.65,46.99,309.81,150.1,330.79,276.53,178.97,40.04,268.06,306.45,69.69,314.25,253.39,199.04,210.85,False,False,False,False,0.08,222800.0,Old Town,Camarillo,Ventura County,California,93010.0,674.74,4009.09,4006.79,3998.09,97.81,370.66,4168.57,147.02,339.5,526.83,4009.81,20000.0,20000.0,17637.24,248.16,4010.81,4010.06,97.81,674.74,75.77,498.35,241.53,531.66,431.14,287.46,492.9,127.96,126.92,155.87,75.77,Los_Angeles,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,1,1,0,1,1,1,1,1,21,21,24,24,24,24,45.0,1.0,338.91,0.0,0.0,0.0,0.0
10534,-117.69,33.51,4.0,1223.0,275.0,505.0,244.0,4.66,5.01,1.13,0.02,948.0,2.07,0.01,3.89,49.14,396.56,62.84,409.73,363.51,252.53,131.16,354.57,392.54,25.99,401.03,318.26,213.17,301.35,False,False,False,False,0.46,173000.0,,Laguna Niguel,Orange County,California,92677.0,4010.5,4009.09,4006.79,3998.09,1576.01,4041.52,4168.57,1426.26,1650.25,236.15,4009.81,20000.0,20000.0,17637.24,1890.72,4010.81,4010.06,1576.01,4010.5,79.04,638.08,100.98,658.78,570.5,405.89,631.57,240.07,23.14,59.78,23.14,Irvine,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,16,16,16,16,21,21,33.5,1.0,1827.93,0.0,0.0,0.0,0.0
11974,-117.42,34.0,32.0,1617.0,346.0,1153.0,385.0,3.02,4.2,0.9,0.01,1271.0,2.99,0.0,3.3,47.31,382.43,89.99,389.38,349.91,231.43,133.38,340.58,377.79,30.51,386.9,292.14,177.11,293.35,True,False,False,False,0.0,96600.0,Rubidoux,Jurupa Valley,Riverside County,California,92509.0,4010.5,4009.09,4006.79,3998.09,1373.49,439.7,4168.57,840.08,4009.83,591.42,4009.81,20000.0,20000.0,17637.24,4011.5,4010.81,943.58,439.7,4010.5,76.29,615.6,144.47,626.28,548.23,372.18,608.07,211.35,51.38,4.64,4.64,Riverside,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,1,1,1,17,17,20,20,21,21,37.3,1.0,2165.4,0.0,0.0,0.0,0.0
2428,-119.52,36.61,33.0,1225.0,275.0,1065.0,248.0,1.9,4.94,1.11,0.01,950.0,4.29,0.0,3.83,190.8,178.72,300.59,173.89,149.97,17.24,151.59,139.97,172.41,211.97,182.98,86.72,143.68,129.32,True,False,False,False,0.0,55100.0,,,Fresno County,California,93648.0,4010.5,4009.09,4006.79,3998.09,981.19,4041.52,4168.57,4010.74,4009.83,741.4,4009.81,20000.0,20000.0,17637.24,4011.5,4010.81,4010.06,981.19,4010.5,306.58,288.05,483.05,279.78,225.66,27.77,277.85,144.48,359.35,350.9,27.77,Fresno,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,1,1,15,15,18,18,18,18,33.0,1.0,2751.07,0.0,0.0,0.0,0.0
11827,-121.02,39.01,17.0,4786.0,799.0,2066.0,770.0,3.97,6.22,1.04,0.01,3987.0,2.68,0.0,5.18,375.58,114.13,485.49,39.1,125.46,170.78,325.41,124.73,107.36,396.9,113.99,112.64,281.27,194.13,True,False,False,False,0.0,185400.0,,Meadow Vista,Placer County,California,95722.0,1585.95,4009.09,4006.79,3998.09,1114.63,1768.54,4168.57,4010.74,4009.83,1965.29,4009.81,20000.0,20000.0,17637.24,4011.5,4010.81,4010.06,1114.63,1585.95,603.65,183.69,780.32,62.93,200.49,274.53,172.77,440.96,656.68,646.46,62.93,Sacramento,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,15,15,15,15,21,21,31.8,1.0,2537.62,0.0,0.0,0.0,0.0
15704,-122.43,37.79,52.0,3219.0,969.0,1152.0,830.0,4.2,3.88,1.17,0.01,2250.0,1.39,0.0,2.71,348.58,1.19,459.49,74.65,34.49,162.49,278.26,43.14,8.73,372.23,3.35,157.89,314.88,110.46,False,False,True,False,0.94,500001.0,,San Francisco,,California,94115.0,294.63,4009.09,893.26,3998.09,428.55,287.12,1750.15,340.64,366.51,270.08,4009.81,20000.0,20000.0,2050.54,432.96,4010.81,4010.06,287.12,294.63,560.9,1.92,739.21,120.16,69.44,261.89,14.09,406.3,617.0,621.86,1.92,San_Francisco,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,0,0,0,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,24,24,25,25,26,26,49.3,1.0,353.37,0.0,0.0,0.0,0.0
17343,-120.41,34.86,15.0,978.0,187.0,407.0,182.0,4.38,5.37,1.03,0.02,791.0,2.24,0.01,4.35,135.45,230.37,238.21,264.05,197.35,134.36,50.58,190.02,228.28,158.95,234.66,213.22,226.75,125.18,False,False,False,False,0.34,158000.0,,Orcutt,Santa Barbara County,California,93455.0,604.39,4009.09,4006.79,3998.09,775.62,1053.03,4168.57,719.64,895.57,674.76,4009.81,20000.0,20000.0,17637.24,837.7,4010.81,977.1,775.62,604.39,218.29,370.35,383.5,424.19,305.42,215.82,366.95,139.02,271.28,295.5,139.02,Bakersfield,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,0,1,1,1,1,1,15,15,24,24,25,25,39.3,1.0,722.42,0.0,0.0,0.0,0.0
6466,-118.04,34.09,34.0,2597.0,461.0,1542.0,470.0,4.62,5.53,0.98,0.01,2136.0,3.28,0.0,4.54,11.95,353.25,107.64,365.0,320.36,207.71,97.41,311.27,349.02,18.93,357.73,273.95,179.93,260.68,True,False,False,False,0.01,248900.0,,El Monte,Los Angeles County,California,91731.0,1174.57,4009.09,4006.79,3998.09,1389.56,787.18,1374.51,350.64,1456.13,807.35,4009.81,20000.0,20000.0,17637.24,1211.88,4010.81,1661.65,787.18,1174.57,19.26,568.5,172.97,586.91,500.92,333.87,561.64,168.23,49.11,62.55,19.26,Los_Angeles,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,1,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,1,1,1,1,16,16,19,19,26,26,37.1,1.0,986.8,0.0,0.0,0.0,0.0
2128,-119.72,36.8,16.0,2396.0,526.0,1338.0,518.0,2.17,4.63,1.02,0.0,1870.0,2.58,0.0,3.61,207.24,162.95,317.35,156.72,135.06,5.68,164.39,125.08,156.44,228.65,167.15,74.27,155.59,121.55,True,False,False,False,0.0,78800.0,,Clovis,Fresno County,California,93612.0,912.21,4009.09,4006.79,3998.09,919.43,762.78,2015.18,1305.03,1026.33,637.68,4009.81,20000.0,20000.0,17637.24,4011.5,4010.81,1127.93,762.78,912.21,333.03,262.67,510.03,252.17,201.69,9.14,252.14,170.44,386.2,378.41,9.14,Fresno,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,1,1,1,1,15,15,21,21,24,24,36.9,1.0,1557.17,0.0,0.0,0.0,0.0


In [149]:
df = df.drop(columns = ["neighborhood", "postcode", "state"])
df["city"] = df["city"].fillna("not_found")
df["county"] = df["county"].fillna("not_found")

In [150]:
# Removes features with variance < 0.1
selector = VarianceThreshold(threshold=0.1)  

In [151]:
numeric_att = df.select_dtypes(include = np.number)
cat_att = df.select_dtypes(exclude = np.number)

In [152]:
feature_names = numeric_att.columns.tolist()

selector = VarianceThreshold(threshold=0.1)
selector.fit(numeric_att)

selected_features = [feature_names[i] for i in selector.get_support(indices=True)]

# set(feature_names) - set(selected_features)
variance_treshold_features = selected_features + cat_att.columns.tolist() + ["is_high_value"]

In [153]:
df_selected = df[variance_treshold_features]

In [154]:
df.shape

(21135, 183)

In [155]:
df_selected.shape

(21135, 104)

In [156]:
df_selected = df_selected.copy()
df_selected.loc[:, "lat_sin"] = np.sin(np.radians(df_selected["latitude"]))
df_selected.loc[:, "lat_cos"] = np.cos(np.radians(df_selected["latitude"]))
df_selected.loc[:, "lon_sin"] = np.sin(np.radians(df_selected["longitude"]))
df_selected.loc[:, "lon_cos"] = np.cos(np.radians(df_selected["longitude"]))

In [157]:
X = df_selected.drop(columns = "target")
y = df_selected["target"]

In [158]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [159]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,total_rooms_per_households,total_bedrooms_per_households,total_rooms_not_bedrooms,population_per_hosehold,total_rooms_not_bedrooms_per_households,distance_to_los_angeles,distance_to_san_francisco,distance_to_san_diego,distance_to_sacramento,distance_to_silicon_valley,distance_to_fresno,distance_to_santa_barbara,distance_to_san_jose,distance_to_oakland,distance_to_anaheim,distance_to_golden_gate_bridge,distance_to_yosemite_valley,distance_to_death_valley,distance_to_big_sur,distance_to_school,distance_to_university,distance_to_college,distance_to_hospital,distance_to_supermarket,distance_to_convenience,distance_to_department_store,distance_to_restaurant,distance_to_cafe,distance_to_park,distance_to_bus_station,distance_to_beach,distance_to_bank,distance_to_police,distance_to_fire_station,distance_to_retail,distance_to_education,distance_to_Los_Angeles,distance_to_San_Francisco,distance_to_San_Diego,distance_to_Sacramento,distance_to_San_Jose,distance_to_Fresno,distance_to_Oakland,distance_to_Bakersfield,distance_to_Irvine,distance_to_Riverside,distance_to_nearest_major_city,has_school_within_500m,has_school_within_1000m,has_school_within_2000m,has_supermarket_within_500m,has_supermarket_within_1000m,has_supermarket_within_2000m,has_convenience_within_500m,has_convenience_within_1000m,has_convenience_within_2000m,has_department_store_within_1000m,has_department_store_within_2000m,has_restaurant_within_500m,has_restaurant_within_1000m,has_restaurant_within_2000m,has_cafe_within_500m,has_cafe_within_1000m,has_cafe_within_2000m,has_park_within_500m,has_park_within_1000m,has_bus_station_within_2000m,has_bank_within_1000m,has_bank_within_2000m,has_police_within_2000m,has_fire_station_within_1000m,has_fire_station_within_2000m,has_retail_within_500m,has_retail_within_1000m,has_retail_within_2000m,has_education_within_500m,has_education_within_1000m,has_education_within_2000m,amenity_count_500m,amenity_diversity_500m,amenity_count_1000m,amenity_diversity_1000m,amenity_count_2000m,amenity_diversity_2000m,walkability_score,avg_distance_to_key_amenities,transportation_accessibility,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,city,county,nearest_major_city,is_high_value,lat_sin,lat_cos,lon_sin,lon_cos
11396,-117.97,33.73,19.0,4154.0,560.0,2130.0,589.0,7.28,7.05,0.95,3594.0,3.62,6.1,27.24,374.53,84.25,388.54,341.46,231.73,109.82,332.55,370.57,8.03,379.0,298.91,202.3,279.23,1526.45,4009.09,4006.79,3998.09,1249.04,265.73,1535.93,1222.27,1396.79,325.38,4009.81,17637.24,1619.54,4010.81,4010.06,265.73,1526.45,43.8,602.63,135.43,624.68,535.06,372.42,596.2,206.14,14.22,61.64,14.22,0,0,1,0,0,1,1,1,1,0,1,0,0,1,0,0,1,1,1,0,0,1,0,0,0,1,1,1,0,0,1,18,18,18,18,25,25,38.1,1188.54,0.0,False,False,False,False,Fountain Valley,Orange County,Irvine,0.02,0.56,0.83,-0.88,-0.47
9734,-121.64,36.82,18.0,1819.0,283.0,919.0,295.0,4.17,6.17,0.96,1536.0,3.12,5.21,270.38,78.66,380.74,121.97,45.46,102.7,198.42,38.29,76.35,294.22,83.03,136.17,261.59,39.1,4010.5,4009.09,4006.79,3998.09,4145.77,4041.52,4168.57,4010.74,4009.83,27004.31,4009.81,17637.24,4011.5,4010.81,4010.06,4144.85,4010.5,435.14,126.52,612.56,195.94,61.54,165.65,122.74,285.45,491.29,499.21,61.54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,15,15,15,15,15,30.0,8636.56,0.0,False,False,False,False,Prunedale,Monterey County,San_Jose,0.01,0.6,0.8,-0.85,-0.52
3579,-118.54,34.23,35.0,3422.0,601.0,1690.0,574.0,4.38,5.96,1.05,2821.0,2.94,4.91,20.93,327.06,131.38,342.57,293.96,186.93,67.39,285.09,323.16,44.97,331.52,257.27,183.04,232.09,774.82,4009.09,4006.79,3998.09,699.04,1054.82,1016.26,351.77,803.29,583.6,4009.81,17637.24,1343.83,4010.81,4010.06,699.04,774.82,33.7,526.25,211.27,550.74,458.69,300.37,519.92,134.19,89.49,110.95,33.7,0,1,1,0,1,1,0,0,1,0,1,1,1,1,0,1,1,0,1,0,0,1,0,0,0,0,1,1,0,1,1,16,16,22,22,25,25,38.9,750.61,0.0,False,False,False,False,Los Angeles,Los Angeles County,Los_Angeles,0.02,0.56,0.83,-0.88,-0.48
19637,-120.96,37.48,32.0,1256.0,212.0,682.0,236.0,2.98,5.32,0.9,1044.0,2.89,4.42,281.54,82.42,392.95,81.48,60.55,82.5,222.83,51.77,75.15,304.27,86.31,82.18,231.65,95.82,4010.5,4009.09,4006.79,3998.09,4145.77,1493.23,4168.57,4010.74,4009.83,27004.31,4009.81,17637.24,4011.5,4010.81,4010.06,1493.23,4010.5,452.7,132.92,631.8,130.97,83.5,132.86,121.19,291.46,507.85,505.79,83.5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,15,15,15,15,17,17,30.6,8636.56,0.0,True,False,False,False,not_found,Stanislaus County,San_Jose,0.0,0.61,0.79,-0.86,-0.51
10924,-117.88,33.73,36.0,2471.0,498.0,2594.0,475.0,3.75,5.2,1.05,1973.0,5.46,4.15,30.51,377.91,81.47,391.1,344.88,234.02,114.48,335.92,373.87,7.62,382.38,300.43,200.88,283.21,871.15,4009.09,4006.79,3998.09,1485.44,4041.52,4168.57,1414.85,1617.81,510.69,4009.81,17637.24,1590.14,4010.81,4010.06,1485.44,871.15,49.08,608.09,130.94,628.83,540.51,376.13,601.54,210.13,7.07,54.34,7.07,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,1,15,15,18,18,23,23,34.5,1174.45,0.0,False,False,False,False,Santa Ana,Orange County,Irvine,0.0,0.56,0.83,-0.88,-0.47


In [160]:
categorical_cols = ["city", "county", "nearest_major_city"]

encoder = TargetEncoder(smooth = "auto")
encoder.fit(X_train[categorical_cols], y_train)

X_train[categorical_cols] = encoder.transform(X_train[categorical_cols])
X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])

In [161]:
xgb = XGBRegressor(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

boruta_selector = BorutaPy(
    estimator=xgb,
    n_estimators='auto',
    max_iter=100,  # Number of iterations
    verbose=0,    # Print progress
    random_state=42
)


boruta_selector.fit(np.array(X_train), np.array(y_train))
selected_features = X_train.columns[boruta_selector.support_].tolist()

# Get feature rankings
feature_rankings = boruta_selector.ranking_

ranking = pd.DataFrame({
    'Feature': X_train.columns,
    'Ranking': feature_rankings
})

In [162]:
selected_features

['housing_median_age',
 'total_rooms',
 'median_income',
 'total_rooms_per_households',
 'total_rooms_not_bedrooms',
 'population_per_hosehold',
 'total_rooms_not_bedrooms_per_households',
 'distance_to_los_angeles',
 'distance_to_anaheim',
 'distance_to_death_valley',
 'distance_to_convenience',
 'distance_to_cafe',
 'distance_to_beach',
 'distance_to_retail',
 'distance_to_Los_Angeles',
 'distance_to_Bakersfield',
 'distance_to_Riverside',
 'has_cafe_within_1000m',
 'ocean_proximity_ISLAND',
 'city',
 'county',
 'nearest_major_city',
 'is_high_value',
 'lat_cos',
 'lon_sin',
 'lon_cos']

In [163]:
ranking.sort_values('Ranking')

Unnamed: 0,Feature,Ranking
53,distance_to_Riverside,1
32,distance_to_convenience,1
35,distance_to_cafe,1
38,distance_to_beach,1
42,distance_to_retail,1
44,distance_to_Los_Angeles,1
51,distance_to_Bakersfield,1
25,distance_to_death_valley,1
105,lon_sin,1
96,ocean_proximity_ISLAND,1


In [164]:
X_train = X_train[selected_features]
X_test = X_test[selected_features]

In [165]:
def train_xgboost_with_cv(X_train, y_train, X_test=None, y_test=None, n_folds=5, log_transform_target=False):
    """
    Trains an XGBoost regressor with cross-validation and returns the model and metrics.
    
    Parameters:
    -----------
    X_train : DataFrame or array-like
        Training features
    y_train : Series or array-like
        Training target
    X_test : DataFrame or array-like, optional
        Test features
    y_test : Series or array-like, optional
        Test target
    n_folds : int, default=5
        Number of cross-validation folds
    log_transform_target : bool, default=False
        If True, applies np.log1p transformation to the target variable for training
        and reverts the transformation with np.expm1 for prediction and metrics calculation
        
    Returns:
    --------
    model : XGBRegressor
        Trained XGBoost regressor model
    metrics_df : DataFrame
        DataFrame containing MAE, MAPE, RMSE, and R² for each fold, along with mean and std
    """
    # Create the XGBoost regressor with specified parameters
    model = XGBRegressor(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    # Apply log transformation to target if requested
    if log_transform_target:
        # Convert to numpy array if it's not already
        if hasattr(y_train, 'values'):
            y_train_orig = y_train.values.copy()
            y_train = np.log1p(y_train.values)
        else:
            y_train_orig = y_train.copy()
            y_train = np.log1p(y_train)
        
        if y_test is not None:
            if hasattr(y_test, 'values'):
                y_test_orig = y_test.values.copy()
                y_test = np.log1p(y_test.values)
            else:
                y_test_orig = y_test.copy()
                y_test = np.log1p(y_test)
    else:
        y_train_orig = y_train.values.copy() if hasattr(y_train, 'values') else y_train.copy()
        y_test_orig = y_test.values.copy() if y_test is not None and hasattr(y_test, 'values') else None if y_test is None else y_test.copy()
    
    # Initialize KFold cross-validator
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    
    # Initialize lists to store metrics for each fold
    fold_mae = []
    fold_mape = []
    fold_rmse = []
    fold_r2 = []
    
    # Perform cross-validation
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        # Split the data for this fold
        X_fold_train = X_train.iloc[train_idx] if hasattr(X_train, 'iloc') else X_train[train_idx]
        y_fold_train = y_train[train_idx] if isinstance(y_train, np.ndarray) else y_train.iloc[train_idx] if hasattr(y_train, 'iloc') else y_train[train_idx]
        
        X_fold_val = X_train.iloc[val_idx] if hasattr(X_train, 'iloc') else X_train[val_idx]
        y_fold_val = y_train[val_idx] if isinstance(y_train, np.ndarray) else y_train.iloc[val_idx] if hasattr(y_train, 'iloc') else y_train[val_idx]
        
        # Get original validation targets for metrics calculation
        y_fold_val_orig = y_train_orig[val_idx] if isinstance(y_train_orig, np.ndarray) else y_train_orig.iloc[val_idx] if hasattr(y_train_orig, 'iloc') else y_train_orig[val_idx]
        
        # Train the model on this fold
        model.fit(X_fold_train, y_fold_train)
        
        # Make predictions on the validation set
        y_pred = model.predict(X_fold_val)
        
        # Revert log transformation if applied
        if log_transform_target:
            y_pred = np.expm1(y_pred)
        
        # Calculate metrics on the original scale
        mae = mean_absolute_error(y_fold_val_orig, y_pred)
        
        # Calculate MAPE, handling zero values
        mask = y_fold_val_orig != 0
        if mask.any():
            mape = np.mean(np.abs((y_fold_val_orig[mask] - y_pred[mask]) / y_fold_val_orig[mask])) * 100
        else:
            mape = np.nan
            
        rmse = np.sqrt(mean_squared_error(y_fold_val_orig, y_pred))
        r2 = r2_score(y_fold_val_orig, y_pred)
        
        # Append metrics to lists
        fold_mae.append(mae)
        fold_mape.append(mape)
        fold_rmse.append(rmse)
        fold_r2.append(r2)
    
    # Train the final model on the entire training set
    model.fit(X_train, y_train)
    
    # Create a DataFrame with the metrics for each fold
    metrics_df = pd.DataFrame({
        'Fold': range(1, n_folds + 1),
        'MAE': fold_mae,
        'MAPE': fold_mape,
        'RMSE': fold_rmse,
        'R²': fold_r2
    })
    
    # Calculate mean and standard deviation for each metric
    mean_metrics = metrics_df[['MAE', 'MAPE', 'RMSE', 'R²']].mean()
    std_metrics = metrics_df[['MAE', 'MAPE', 'RMSE', 'R²']].std()
    
    # Add mean and standard deviation rows to the DataFrame
    metrics_df = pd.concat([
        metrics_df,
        pd.DataFrame({'Fold': ['Mean'], 'MAE': [mean_metrics['MAE']], 
                     'MAPE': [mean_metrics['MAPE']], 'RMSE': [mean_metrics['RMSE']],
                     'R²': [mean_metrics['R²']]}),
        pd.DataFrame({'Fold': ['Std'], 'MAE': [std_metrics['MAE']], 
                     'MAPE': [std_metrics['MAPE']], 'RMSE': [std_metrics['RMSE']],
                     'R²': [std_metrics['R²']]})
    ])
    
    # If test data is provided, calculate metrics on test set
    if X_test is not None and y_test is not None:
        # Make predictions on the test set
        y_pred_test = model.predict(X_test)
        
        # Revert log transformation if applied
        if log_transform_target:
            y_pred_test = np.expm1(y_pred_test)
            
        test_mae = mean_absolute_error(y_test_orig, y_pred_test)
        
        # Calculate MAPE on test set, handling zero values
        mask = y_test_orig != 0
        if mask.any():
            test_mape = np.mean(np.abs((y_test_orig[mask] - y_pred_test[mask]) / y_test_orig[mask])) * 100
        else:
            test_mape = np.nan
            
        test_rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_test))
        test_r2 = r2_score(y_test_orig, y_pred_test)
        
        # Add test metrics to the DataFrame
        metrics_df = pd.concat([
            metrics_df,
            pd.DataFrame({'Fold': ['Test'], 'MAE': [test_mae], 
                         'MAPE': [test_mape], 'RMSE': [test_rmse],
                         'R²': [test_r2]})
        ])
    
    return model, metrics_df

In [166]:
model, metrics_df = train_xgboost_with_cv(
    X_train, 
    y_train, 
    X_test, 
    y_test, 
    log_transform_target=True
)

In [167]:
metrics_df

Unnamed: 0,Fold,MAE,MAPE,RMSE,R²
0,1,25766.1,13.34,42372.1,0.87
1,2,26116.71,13.39,41968.2,0.87
2,3,25888.08,13.71,41418.18,0.88
3,4,25944.39,13.71,40495.5,0.88
4,5,26104.04,14.02,41820.75,0.87
0,Mean,25963.87,13.63,41614.94,0.87
0,Std,148.53,0.28,712.8,0.01
0,Test,25315.2,13.52,41304.26,0.88


In [170]:
def tune_xgboost(X_train, y_train, X_test, y_test, n_trials=100, log_transform=False, random_state=42):
    """
    Perform hyperparameter tuning for XGBoost regression using Optuna.
    
    Parameters:
    -----------
    X_train : array-like
        Training features
    y_train : array-like
        Training target
    X_test : array-like
        Test features
    y_test : array-like
        Test target
    n_trials : int, default=100
        Number of Optuna trials
    log_transform : bool, default=False
        Whether to apply log1p transformation to the target variable
    random_state : int, default=42
        Random state for reproducibility
    
    Returns:
    --------
    dict
        Dictionary containing best hyperparameters and evaluation metrics
    """
    # Handle log transformation of target using log1p
    if log_transform:
        # Check for negative values
        if np.any(y_train < 0) or np.any(y_test < 0):
            raise ValueError("Cannot apply log1p transform to negative values")
        
        # Apply log1p transformation
        y_train_transformed = np.log1p(y_train)
        y_test_original = y_test.copy()  # Store original for later metric calculation
        y_test_transformed = np.log1p(y_test)
    else:
        y_train_transformed = y_train
        y_test_transformed = y_test
        y_test_original = y_test
    
    def objective(trial):
        # Define the hyperparameter search space
        param = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'random_state': random_state
        }
        
        # Add booster-specific parameters
        if param['booster'] == 'gbtree' or param['booster'] == 'dart':
            param['max_depth'] = trial.suggest_int('max_depth', 3, 10)
            param['min_child_weight'] = trial.suggest_float('min_child_weight', 1, 10)
            param['gamma'] = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
            
            # DART specific params
            if param['booster'] == 'dart':
                param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
                param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
                param['rate_drop'] = trial.suggest_float('rate_drop', 0.0, 0.5)
                param['skip_drop'] = trial.suggest_float('skip_drop', 0.0, 0.5)
        
        # Create and train the model
        model = XGBRegressor(**param)
        model.fit(X_train, y_train_transformed)
        
        # Make predictions
        y_pred_transformed = model.predict(X_test)
        
        # Convert predictions back to original scale if log transformed
        if log_transform:
            y_pred = np.expm1(y_pred_transformed)
            # Compute validation RMSE on transformed scale for optimization
            rmse_transformed = np.sqrt(mean_squared_error(y_test_transformed, y_pred_transformed))
            return rmse_transformed
        else:
            # Regular RMSE
            rmse = np.sqrt(mean_squared_error(y_test_transformed, y_pred_transformed))
            return rmse
    
    # Create Optuna study
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    
    # Get the best parameters
    best_params = study.best_params
    best_params['objective'] = 'reg:squarederror'
    
    # Train model with best parameters
    model = XGBRegressor(**best_params, random_state=random_state)
    model.fit(X_train, y_train_transformed)
    
    # Make predictions
    y_pred_transformed = model.predict(X_test)
    
    # Convert predictions back to original scale if log transformed
    if log_transform:
        y_pred = np.expm1(y_pred_transformed)
    else:
        y_pred = y_pred_transformed
    
    # Calculate metrics on the original scale
    rmse = np.sqrt(mean_squared_error(y_test_original, y_pred))
    mae = mean_absolute_error(y_test_original, y_pred)
    r2 = r2_score(y_test_original, y_pred)
    
    # Calculate MAPE (Mean Absolute Percentage Error)
    # Adding a small epsilon to avoid division by zero
    epsilon = 1e-10
    mape = np.mean(np.abs((y_test_original - y_pred) / (np.abs(y_test_original) + epsilon))) * 100
    
    # Create results dictionary
    results = {
        'best_params': best_params,
        'metrics': {
            'rmse': rmse,
            'mae': mae,
            'mape': mape,
            'r2': r2
        },
        'model': model,
        'log_transform_used': log_transform
    }
    
    return results

# Example usage:
# results = xgboost_hyperparameter_tuning(X_train, y_train, X_test, y_test, n_trials=100, log_transform=True)
# print("Best Parameters:", results['best_params'])
# print("Metrics:", results['metrics'])
# best_model = results['model']  # Access the trained model

In [172]:
results = tune_xgboost(X_train, y_train, X_test, y_test, n_trials=500, log_transform=True ,random_state=42)

[I 2025-04-28 12:43:34,677] A new study created in memory with name: no-name-bd8acd4d-0a7d-4047-83dd-68eef817ad5c
Parameters: { "colsample_bytree", "subsample" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-04-28 12:43:35,576] Trial 0 finished with value: 0.28480023439012225 and parameters: {'booster': 'gblinear', 'lambda': 8.456526417921191e-07, 'alpha': 1.529528923235762e-08, 'subsample': 0.6595373005516678, 'colsample_bytree': 0.826079529665461, 'n_estimators': 445, 'learning_rate': 0.2669231544900607}. Best is trial 0 with value: 0.28480023439012225.
Parameters: { "colsample_bytree", "subsample" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-04-28 12:43:36,803] Trial 1 finished with value: 0.3022019181609059 and parameters: {'booster': 'gblinear', 'lambda': 0.0003976134356050937, 'alpha': 0.07956060091649471, 'subsample': 0.5579015175240261, 'colsample_bytree': 0.8696067275555125, 'n_estimators': 602, 'learning_rate': 0.21033936782231086}

In [173]:
results['best_params']

{'booster': 'dart',
 'lambda': 0.7109123298889988,
 'alpha': 0.00010671619871214839,
 'subsample': 0.7348513660194621,
 'colsample_bytree': 0.5098990519649645,
 'n_estimators': 654,
 'learning_rate': 0.034762745976783416,
 'max_depth': 10,
 'min_child_weight': 5.475404655344538,
 'gamma': 3.145749776610941e-07,
 'sample_type': 'uniform',
 'normalize_type': 'forest',
 'rate_drop': 0.027315148913931372,
 'skip_drop': 0.28911491328632233,
 'objective': 'reg:squarederror'}

In [174]:
results['metrics']

{'rmse': 39950.96439317332,
 'mae': 24081.435478342984,
 'mape': 12.922034451162242,
 'r2': 0.8856849891587978}

In [175]:
best_model = results['model']

In [None]:
best_model._xgb_dmatrix_props = {'enable_categorical': True}

explainer = shap.Explainer(best_model)
shap_values = explainer(X_train)

In [None]:
shap.plots.waterfall(shap_values[10])

In [None]:
shap.summary_plot(shap_values, X_train)