In [27]:
# 2.2m housing data

# cleaning -drop nas

# modeling
# linear regression
# ridge regression w/o optimizaton
# compare performance metrics

# hypertuning for ridge
# compare with w/o hypertuning

# observations about process

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from scipy.sparse import hstack, csr_matrix

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, precision_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [26]:
housing_data = pd.read_csv('realtor-data.csv')
housing_data = housing_data.dropna()

# extract year from time column
housing_data['prev_sold_date'] = pd.to_datetime(housing_data['prev_sold_date'])
housing_data['prev_sold_date'] = housing_data['prev_sold_date'].dt.year

housing_data = housing_data[housing_data['price'] < 150000000.0]
housing_data = housing_data[housing_data['price'] > 2]

housing_data = housing_data[housing_data['bed'] < 100]


In [27]:

# feature/target split
x = housing_data.drop(['price'], axis=1)    
y = housing_data['price']

categorical_features = ['brokered_by', 'status', 'street', 'city', 'state', 'zip_code']
numeric_features = ['acre_lot', 'house_size', 'bed', 'bath', 'prev_sold_date'] # maybe categorical year

# can place sparse matrix directly into the models data
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state = 42)

# set what columns to be transformed
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ]
)

x_train_processed = preprocessor.fit_transform(x_train)  # fit + transform on training
x_test_processed = preprocessor.transform(x_test)        # transform only on test



In [None]:
LinearModel = LinearRegression()
LinearModel.fit(x_train_processed, y_train)

# predict
linear_y_pred = LinearModel.predict(x_test_processed)

# model metrics
#print("Intercept:", LinearModel.intercept_)
#print("Coefficients:", model.coef_)

linear_rmse = np.sqrt(mean_squared_error(y_test, linear_y_pred))
linear_r2 = r2_score(y_test, linear_y_pred)

print("Root Mean Squared Error:", linear_rmse)
print("R² Score:", linear_r2)

In [32]:
# fit and predict
RidgeModel = Ridge(alpha=1.0)
RidgeModel.fit(x_train, y_train)

ridge_y_pred = RidgeModel.predict(x_test)

# model metrics
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))
ridge_r2 = r2_score(y_test, ridge_y_pred)

print("Root Mean Squared Error:", ridge_rmse)
print("R² Score:", ridge_r2)

Root Mean Squared Error: 692011.2010981624
R² Score: 0.5671273254480658


In [34]:
ridge = Ridge()
param_grid = {'alpha': [0.01, 0.1, 1, 10, 50, 100, 200]}

In [None]:
grid = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')

grid.fit(x_train, y_train)

print("Best alpha:", grid.best_params_['alpha'])
print("Best CV score:", -grid.best_score_)