In [27]:
# 2.2m housing data

# cleaning -drop nas

# modeling
# linear regression
# ridge regression w/o optimization
# compare performance metrics

# hypertuning for ridge
# compare with w/o hypertuning

# observations about process

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy.sparse import hstack, csr_matrix
from scipy.stats import uniform, loguniform

from sklearn.svm import SVR, LinearSVR
from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, HalvingRandomSearchCV, HalvingGridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, precision_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
# Load the realtor listings and keep only rows with no missing values.
housing_data = pd.read_csv('realtor-data.csv').dropna()

# Reduce the previous-sale timestamp to its calendar year.
housing_data['prev_sold_date'] = pd.to_datetime(housing_data['prev_sold_date']).dt.year

# Remove outliers in one pass: keep prices strictly between 2 and 150,000,000
# and listings with fewer than 100 bedrooms.
listing_price = housing_data['price']
plausible_rows = (
    (listing_price < 150000000.0)
    & (listing_price > 2)
    & (housing_data['bed'] < 100)
)
housing_data = housing_data[plausible_rows]

In [4]:
# Columns used by the models.
# street and brokered_by were left out (small improvement, massive time increase);
# acre_lot and prev_sold_date were left out due to lack of correlation.
categorical_features = ['status', 'city', 'state', 'zip_code']
numeric_features = ['house_size', 'bed', 'bath']

# Feature/target split. Both the target and the numeric features are modelled
# on the natural-log scale.
y = np.log(housing_data['price'])

x = housing_data[categorical_features + numeric_features].copy()
x[numeric_features] = np.log(x[numeric_features])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=42
)

# Per-column-group transforms: standardise the numeric columns and one-hot
# encode the categorical ones.
# NOTE(review): with drop='first' and handle_unknown='ignore', a category unseen
# during fit is encoded as all zeros, i.e. the same as the dropped first
# category -- confirm this is acceptable for rare cities / zip codes.
numeric_scaler = StandardScaler()
category_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_scaler, numeric_features),
        ('cat', category_encoder, categorical_features),
    ]
)

# Fit on the training rows only, then reuse the fitted transforms on the test
# rows. The processed matrices are passed directly to the models below.
x_train_processed = preprocessor.fit_transform(x_train)
x_test_processed = preprocessor.transform(x_test)



In [115]:
# Correlations between the numeric features and price, first on the raw scale
# and then on the log scale (corr_matrix ends up holding the log-scale table).
corr_inputs = housing_data[numeric_features + ["price"]]

for frame in (corr_inputs, np.log(corr_inputs)):
    corr_matrix = frame.corr()
    print(corr_matrix)

            house_size       bed      bath     price
house_size    1.000000  0.267685  0.327350  0.208842
bed           0.267685  1.000000  0.648367  0.263986
bath          0.327350  0.648367  1.000000  0.439145
price         0.208842  0.263986  0.439145  1.000000
            house_size       bed      bath     price
house_size    1.000000  0.689217  0.763688  0.570796
bed           0.689217  1.000000  0.593114  0.368189
bath          0.763688  0.593114  1.000000  0.574984
price         0.570796  0.368189  0.574984  1.000000


In [116]:
# Baseline: ordinary least squares on the processed training features.
LinearModel = LinearRegression().fit(x_train_processed, y_train)

# Predictions for the held-out rows (log-price scale, like y_test).
linear_y_pred = LinearModel.predict(x_test_processed)

# Uncomment to inspect the fitted parameters:
# print("Intercept:", LinearModel.intercept_)
# print("Coefficients:", LinearModel.coef_)

# Hold-out metrics.
linear_mse = mean_squared_error(y_test, linear_y_pred)
linear_rmse = np.sqrt(linear_mse)
linear_r2 = r2_score(y_test, linear_y_pred)

print("Root Mean Squared Error:", linear_rmse)
print("R² Score:", linear_r2)

Root Mean Squared Error: 0.3235713405262727
R² Score: 0.8377142831175598


In [117]:
# Ridge regression with an untuned penalty of alpha=1.0.
RidgeModel = Ridge(alpha=1.0).fit(x_train_processed, y_train)

# Predictions for the held-out rows (log-price scale, like y_test).
ridge_y_pred = RidgeModel.predict(x_test_processed)

# Hold-out metrics.
ridge_mse = mean_squared_error(y_test, ridge_y_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_r2 = r2_score(y_test, ridge_y_pred)

print("Root Mean Squared Error:", ridge_rmse)
print("R² Score:", ridge_r2)

Root Mean Squared Error: 0.32147049782841736
R² Score: 0.8398147779721221


In [None]:
# Tune the ridge penalty with a successive-halving grid search.
ridge = Ridge()

# 20 log-spaced candidate penalties from 0.01 to 100.
param_grid = {'alpha': np.logspace(-2, 2, 20)}

# The search clones `ridge` for every candidate, so the instance above is only
# a template. factor=2 keeps the best half of the candidates at each iteration.
# random_state seeds the row subsampling done at each iteration so the selected
# alpha is reproducible; 42 matches the seed used for the train/test split.
grid = HalvingGridSearchCV(ridge,
                           param_grid,
                           factor=2,
                           cv=5,
                           scoring='r2',
                           random_state=42,
                           n_jobs=-1)

grid.fit(x_train_processed, y_train)

print("Best alpha:", grid.best_params_['alpha'])
print("Best CV score:", grid.best_score_)

Best alpha: 2.06913808111479
Best CV score: 0.8369177756679751


In [119]:
# Refit ridge with the penalty chosen by the halving search. This rebinds
# RidgeModel / ridge_y_pred / ridge_rmse / ridge_r2 from the untuned run.
# NOTE(review): with the search's default refit behaviour, grid.best_estimator_
# should already be an equivalent fitted model -- confirm before dropping this refit.
best_alpha = grid.best_params_['alpha']
RidgeModel = Ridge(alpha=best_alpha).fit(x_train_processed, y_train)

# Predictions for the held-out rows (log-price scale, like y_test).
ridge_y_pred = RidgeModel.predict(x_test_processed)

# Hold-out metrics.
ridge_mse = mean_squared_error(y_test, ridge_y_pred)
ridge_rmse = np.sqrt(ridge_mse)
ridge_r2 = r2_score(y_test, ridge_y_pred)

print("Root Mean Squared Error:", ridge_rmse)
print("R² Score:", ridge_r2)

Root Mean Squared Error: 0.32141380797588054
R² Score: 0.839871268853812


In [None]:
# Observations
# Data is very messy, lots of outliers and missing values 
# Heavy right skew in target and numerical features
# Ridge regression relies on scaling numeric factors to run within a reasonable amount of time and have reliable results
# Without tuning, and with a naively selected alpha of 1, ridge regression performs about the same as linear regression and fits much faster, due to the large number of features produced by OneHotEncoder.
# Tuning seemingly provides marginal increases if any in this case


In [120]:
# Tabulate every candidate the halving search evaluated, with per-fold scores
# and timings.
# NOTE(review): the saved output shows *_train_score columns, which the search
# only records when built with return_train_score=True; the search cell as
# written does not set it -- confirm which version produced this table.
halving_results = pd.DataFrame(data=grid.cv_results_)
halving_results

Unnamed: 0,iter,n_resources,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0,54239,1.678656,0.088559,0.004028,0.000937,0.01,{'alpha': 0.01},0.777297,0.784686,...,0.774192,0.007297,34,0.903814,0.902089,0.903317,0.905118,0.902754,0.903419,0.001026
1,0,54239,1.646,0.082429,0.003876,0.000987,0.016238,{'alpha': 0.016237767391887217},0.777673,0.784936,...,0.774532,0.007261,33,0.903805,0.902088,0.903304,0.90511,0.902747,0.903411,0.001024
2,0,54239,1.559808,0.038405,0.003907,0.000974,0.026367,{'alpha': 0.026366508987303583},0.778419,0.785455,...,0.77511,0.007251,32,0.903745,0.902044,0.903284,0.905091,0.902723,0.903378,0.001028
3,0,54239,1.569007,0.026748,0.004003,0.001059,0.042813,{'alpha': 0.04281332398719394},0.779294,0.786078,...,0.775913,0.007163,31,0.903684,0.901999,0.903229,0.905038,0.902674,0.903325,0.001025
4,0,54239,1.434565,0.038933,0.005238,0.00288,0.069519,{'alpha': 0.06951927961775606},0.780592,0.78709,...,0.777183,0.007021,30,0.903548,0.901845,0.903069,0.904897,0.90254,0.90318,0.001028
5,0,54239,1.408163,0.077883,0.003925,0.001096,0.112884,{'alpha': 0.11288378916846889},0.782365,0.788417,...,0.77888,0.006804,28,0.903203,0.901563,0.902801,0.904612,0.902256,0.902887,0.001023
6,0,54239,1.216538,0.135248,0.003239,0.000739,0.183298,{'alpha': 0.18329807108324356},0.78474,0.790257,...,0.781178,0.006546,27,0.902553,0.900886,0.902151,0.903932,0.901555,0.902215,0.001026
7,0,54239,1.143741,0.078474,0.004225,0.000736,0.297635,{'alpha': 0.29763514416313175},0.787713,0.792335,...,0.783988,0.006196,26,0.901115,0.899525,0.90083,0.902696,0.9003,0.900893,0.001052
8,0,54239,1.046229,0.047251,0.003632,0.000999,0.483293,{'alpha': 0.4832930238571752},0.790746,0.794459,...,0.787056,0.005703,24,0.898494,0.896926,0.898292,0.900222,0.897752,0.898337,0.001088
9,0,54239,0.94727,0.064111,0.004087,0.001064,0.78476,{'alpha': 0.7847599703514611},0.793284,0.796081,...,0.789766,0.005127,23,0.893752,0.892206,0.893648,0.895704,0.893063,0.893675,0.001154


In [8]:
# Linear support-vector regression on the same processed training features.
# random_state pins the estimator's internal data shuffling so repeated runs
# give the same fit; 42 matches the seed used for the train/test split.
SVRModel = LinearSVR(random_state=42)
SVRModel.fit(x_train_processed, y_train)



In [9]:
# Predictions from the linear SVR for the held-out rows (log-price scale).
SVR_y_pred = SVRModel.predict(x_test_processed)

# Hold-out metrics.
SVR_mse = mean_squared_error(y_test, SVR_y_pred)
SVR_rmse = np.sqrt(SVR_mse)
SVR_r2 = r2_score(y_test, SVR_y_pred)

print("Root Mean Squared Error:", SVR_rmse)
print("R² Score:", SVR_r2)

Root Mean Squared Error: 0.326869004097609
R² Score: 0.8343895712016223
