# Housing Predictive Analysis in King County

#### Haochen Miao

In [1]:
# necessary libraries
# Dependency imports
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set()

# machine learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import joblib


In [2]:
df = pd.read_csv('/Users/haochenmiao/Documents/School/INFO 371/housing_cost_predictive_analysis/data/cleaned_data.csv')
X = df.drop('price', axis=1)
y = df['price']

In [3]:
df.head()

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,year,month,day,total_sqft,total_sqft15
0,7129300520,221900,3,1.0,1180,5650,1.0,0,0,3,...,98178,47.5112,-122.257,1340,5650,2014,10,13,6830,6990
1,6414100192,538000,3,2.25,2570,7242,2.0,0,0,3,...,98125,47.721,-122.319,1690,7639,2014,12,9,9812,9329
2,5631500400,180000,2,1.0,770,10000,1.0,0,0,3,...,98028,47.7379,-122.233,2720,8062,2015,2,25,10770,10782
3,2487200875,604000,4,3.0,1960,5000,1.0,0,0,5,...,98136,47.5208,-122.393,1360,5000,2014,12,9,6960,6360
4,1954400510,510000,3,2.0,1680,8080,1.0,0,0,3,...,98074,47.6168,-122.045,1800,7503,2015,2,18,9760,9303


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Linear Regression

In [5]:
# Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)

### Ridge Regression

In [6]:
# Ridge Regression
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


### Lasso Regression

In [7]:
# Lasso Regression
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


### Elastic Net

In [8]:
# Elastic Net
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


### Decision Tree

In [9]:
# Decision Tree
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)

### Random Forest

In [10]:
# Random Forest
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(X_train, y_train)

### Gradient Boosting

In [11]:
# Gradient Boosting
gb_reg = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_reg.fit(X_train, y_train)

### Support Vector Regression(SVM)

In [12]:
# Support Vector Regression
svr = SVR(kernel='rbf')
svr.fit(X_train, y_train)

### Neural Networks

In [13]:
# Neural Network
mlp = MLPRegressor(hidden_layer_sizes=(50, 50), activation='relu', solver='adam', random_state=42)
mlp.fit(X_train, y_train)

### Voting Regressor

In [14]:
# Voting Regressor
voting_reg = VotingRegressor(estimators=[('lr', lin_reg), ('rf', forest_reg), ('gb', gb_reg)])
voting_reg.fit(X_train, y_train)

### Stacking Regressor

In [15]:
# Stacking Regressor
stacking_reg = StackingRegressor(estimators=[('rf', forest_reg), ('gb', gb_reg)], final_estimator=lin_reg)
stacking_reg.fit(X_train, y_train)

## Model Evaluation and Selection

In [16]:
# Scale the data before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of variance
X_pca = pca.fit_transform(X_scaled)

In [17]:
# Evaluate using cross-validation
scores = cross_val_score(forest_reg, X_train, y_train, scoring='neg_mean_squared_error', cv=10)

# Convert to RMSE
rmse_scores = np.sqrt(-scores)
print(f'Scores: {rmse_scores}')
print(f'Mean: {rmse_scores.mean()}')
print(f'Standard deviation: {rmse_scores.std()}')

Scores: [143542.6162696  117576.73319902 125064.77512656 139578.43183937
 133891.0333763  128601.90223484 117613.64741305 156306.98179257
 124154.53611454 115901.7810782 ]
Mean: 130223.24384440485
Standard deviation: 12445.27018866576


In [18]:
# Random Forest hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

rf_grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
rf_grid_search.fit(X_train, y_train)