In [27]:
# 2.2m housing data

# cleaning -drop nas

# modeling
# linear regression
# ridge regression w/o optimizaton
# compare performance metrics

# hypertuning for ridge
# compare with w/o hypertuning

# observations about process

In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy.sparse import hstack, csr_matrix
from scipy.stats import uniform, loguniform

from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingRandomSearchCV, HalvingGridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, precision_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
housing_data = pd.read_csv('realtor-data.csv')
housing_data = housing_data.dropna()

# extract year from time column
housing_data['prev_sold_date'] = pd.to_datetime(housing_data['prev_sold_date'])
housing_data['prev_sold_date'] = housing_data['prev_sold_date'].dt.year

# remove outliers
housing_data = housing_data[housing_data['price'] < 150000000.0]
housing_data = housing_data[housing_data['price'] > 2]

housing_data = housing_data[housing_data['bed'] < 100]

In [94]:
# feature/target split
x = housing_data.drop(['price'], axis=1)    

#y = housing_data['price']
y = np.log(housing_data['price'])

categorical_features = ['status', 'city', 'state', 'zip_code'] # removed street,  brokered_by
numeric_features = [ 'house_size', 'bed', 'bath'] # 'acre_lot', 'prev_sold_date' out due to lack of correlation

x = x[categorical_features + numeric_features]

# can place sparse matrix directly into the models data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state = 42)

# set what columns to be transformed
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ]
)

x_train_processed = preprocessor.fit_transform(x_train)  # fit + transform on training
x_test_processed = preprocessor.transform(x_test)        # transform only on test



In [95]:
corr_matrix = housing_data[numeric_features + ["price"]].corr()
corr_matrix

Unnamed: 0,house_size,bed,bath,price
house_size,1.0,0.267685,0.32735,0.208842
bed,0.267685,1.0,0.648367,0.263986
bath,0.32735,0.648367,1.0,0.439145
price,0.208842,0.263986,0.439145,1.0


In [96]:
LinearModel = LinearRegression()
LinearModel.fit(x_train_processed, y_train)

# predict
linear_y_pred = LinearModel.predict(x_test_processed)

# model metrics
#print("Intercept:", LinearModel.intercept_)
#print("Coefficients:", model.coef_)

linear_rmse = np.sqrt(mean_squared_error(y_test, linear_y_pred))
linear_r2 = r2_score(y_test, linear_y_pred)

print("Root Mean Squared Error:", linear_rmse)
print("R² Score:", linear_r2)

Root Mean Squared Error: 0.3857903754162343
R² Score: 0.7693024514764026


In [97]:
# fit and predict
RidgeModel = Ridge(alpha=1.0)
RidgeModel.fit(x_train_processed, y_train)

ridge_y_pred = RidgeModel.predict(x_test_processed)

# model metrics
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))
ridge_r2 = r2_score(y_test, ridge_y_pred)

print("Root Mean Squared Error:", ridge_rmse)
print("R² Score:", ridge_r2)

Root Mean Squared Error: 0.3834872485760397
R² Score: 0.7720487083619157


In [None]:
ridge = Ridge()

param_dist = {'alpha': loguniform(.1, 100)}
param_grid = {'alpha': np.logspace(-2, 2, 20)} # 0.01 to 100

grid = HalvingGridSearchCV(Ridge(), 
                             param_grid,
                             factor=2, 
                             #n_candidates=100, 
                             cv=5, 
                             scoring = 'r2',
                             n_jobs = -1)

grid.fit(x_train_processed, y_train)

print("Best alpha:", grid.best_params_['alpha'])
print("Best CV score:", grid.best_score_)

In [92]:
# fit and predict
RidgeModel = Ridge(alpha=grid.best_params_['alpha'])
RidgeModel.fit(x_train_processed, y_train)

ridge_y_pred = RidgeModel.predict(x_test_processed)

# model metrics
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))
ridge_r2 = r2_score(y_test, ridge_y_pred)

print("Root Mean Squared Error:", ridge_rmse)
print("R² Score:", ridge_r2)

Root Mean Squared Error: 760358.7887835274
R² Score: 0.48349712844610626


In [60]:
pd.DataFrame(grid.cv_results_)

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0,10,0.007156,0.002515,0.001648,0.0004827046,11.192956,{'alpha': 11.192955932157956},-2.508585,-12.716551,...,,,11,0.981635,0.619929,0.356037,0.389903,0.499334,0.569367,0.225895
1,0,10,0.007548,0.003132,0.000797,0.0007461981,80.483652,{'alpha': 80.48365151109027},-0.012393,-13.023483,...,,,11,0.877825,0.212934,0.067767,0.097729,0.148566,0.280964,0.302466
2,0,10,0.005808,0.00172,0.001994,0.001993156,96.877564,{'alpha': 96.8775636114776},-0.268876,-13.015332,...,,,11,0.849458,0.184369,0.056878,0.083247,0.127919,0.260374,0.2977
3,0,10,0.005203,0.000391,0.000999,0.0006306067,32.784828,{'alpha': 32.784828344101875},-0.772566,-13.025459,...,,,11,0.955707,0.387283,0.153035,0.199052,0.282298,0.395475,0.291237
4,0,10,0.004435,0.001322,0.000997,7.979012e-07,7.448112,{'alpha': 7.4481120096003615},-2.918691,-12.430104,...,,,11,0.985706,0.698265,0.462475,0.478025,0.590054,0.642905,0.191369
5,0,10,0.005212,0.001444,0.001493,0.0003760538,18.143146,{'alpha': 18.143146015065806},-1.822456,-12.923858,...,,,11,0.974202,0.517986,0.24946,0.295608,0.396353,0.486722,0.260614
6,0,10,0.0044,0.00034,0.000969,0.0006335662,73.010655,{'alpha': 73.01065481740048},0.044206,-13.027386,...,,,11,0.890767,0.229107,0.074247,0.106155,0.160383,0.292132,0.303893
7,0,10,0.005448,0.001851,0.001049,0.0007162713,95.416192,{'alpha': 95.41619163817516},-0.239915,-13.016022,...,,,11,0.851974,0.186601,0.057704,0.084361,0.129523,0.262032,0.298193
8,0,10,0.005422,0.000464,0.001996,0.0006334688,93.314072,{'alpha': 93.31407187891537},-0.200127,-13.017029,...,,,11,0.855599,0.189908,0.058936,0.086016,0.131901,0.264472,0.298884
9,0,10,0.004213,0.001142,0.001196,0.0003999528,66.351531,{'alpha': 66.35153080273517},0.053891,-13.0308,...,,,11,0.902219,0.245734,0.081164,0.114998,0.172635,0.30335,0.304596
