In [1]:

import os

import kagglehub
import numpy as np
import pandas as pd
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


# Step 1: Download the house-sales dataset and load it into a DataFrame.
# kagglehub returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download("harlfoxem/housesalesprediction")
print("Path to dataset files:", path)

# List the downloaded files so the expected CSV can be confirmed by eye.
dataset_files = os.listdir(path)
print("Files in dataset directory:", dataset_files)

# Read the single CSV shipped with the dataset.
csv_file = os.path.join(path, "kc_house_data.csv")
df = pd.read_csv(csv_file)

# Quick sanity check: dimensions and the first few rows.
dataset_shape = df.shape
print("Dataset shape:", dataset_shape)
print(df.head())

# Step 2: Select columns, encode, split and scale.
#
# Columns kept for modelling: the target ('price') followed by the predictors.
# 'sqft_living15' is used here as a proxy for living-room area (in sqft).
# The order of this list fixes the feature order used by every model below.
cols = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
    'waterfront', 'floors', 'condition', 'grade', 'sqft_above',
    'sqft_basement', 'yr_built', 'zipcode', 'sqft_living15',
]
df = df[cols]

# Target vector.
y = df['price']

# Feature matrix: everything except the target, with 'zipcode' one-hot
# encoded. drop_first=True omits the first zipcode level, so that level is
# represented by all zipcode_* columns being 0.
X = pd.get_dummies(
    df.drop(columns='price'),
    columns=['zipcode'],
    drop_first=True,
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Standardise the features using statistics learned from the training split
# only. The scaled matrices feed the linear, polynomial, KNN and SVR models;
# the tree-based models further down are fit on the unscaled X_train / X_test.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Step 3: Build and Train the Models


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model`` on the training data and score it on the held-out data.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_fit, y_fit : features and target used for fitting
    X_eval, y_eval : features and target used for evaluation

    Returns
    -------
    tuple
        ``(predictions, r2, mse)`` where ``predictions`` is the output of
        ``model.predict(X_eval)`` and ``r2`` / ``mse`` are computed against
        ``y_eval``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    return (
        predictions,
        r2_score(y_eval, predictions),
        mean_squared_error(y_eval, predictions),
    )


# 3.1 Multiple Linear Regression
lin_reg = LinearRegression()
y_pred_lin, r2_lin, mse_lin = _fit_and_score(
    lin_reg, X_train_scaled, y_train, X_test_scaled, y_test)

# 3.2 Polynomial Regression (degree 2)
# NOTE(review): the quadratic expansion is applied to every scaled column,
# the one-hot zipcode dummies included, so the design matrix gets very wide.
# Consider expanding only the numeric columns or using a regularised model.
poly_features = PolynomialFeatures(degree=2)
X_train_poly = poly_features.fit_transform(X_train_scaled)
X_test_poly = poly_features.transform(X_test_scaled)
poly_reg = LinearRegression()
y_pred_poly, r2_poly, mse_poly = _fit_and_score(
    poly_reg, X_train_poly, y_train, X_test_poly, y_test)

# 3.3 KNN Regression (using k=5; adjust if needed)
knn_reg = KNeighborsRegressor(n_neighbors=5)
y_pred_knn, r2_knn, mse_knn = _fit_and_score(
    knn_reg, X_train_scaled, y_train, X_test_scaled, y_test)

# 3.4 Linear Support Vector Regression (SVR) with a linear kernel
# SVR's default C=1.0 and epsilon=0.1 are expressed in the units of the
# target. 'price' is in the hundreds of thousands, so fitting on the raw
# target leaves the model close to a constant predictor. Wrapping the SVR in
# TransformedTargetRegressor standardises y for fitting and inverse-transforms
# the predictions, so .predict() still returns values in price units.
svr_lin = TransformedTargetRegressor(
    regressor=SVR(kernel='linear'),
    transformer=StandardScaler(),
)
y_pred_svr_lin, r2_svr_lin, mse_svr_lin = _fit_and_score(
    svr_lin, X_train_scaled, y_train, X_test_scaled, y_test)

# 3.5 Non-Linear Support Vector Regression (SVR) with RBF kernel
# Same target standardisation as the linear SVR above, for the same reason.
svr_rbf = TransformedTargetRegressor(
    regressor=SVR(kernel='rbf'),
    transformer=StandardScaler(),
)
y_pred_svr_rbf, r2_svr_rbf, mse_svr_rbf = _fit_and_score(
    svr_rbf, X_train_scaled, y_train, X_test_scaled, y_test)

# 3.6 Decision Tree Regression
# Trees split on thresholds, so they are fit on the unscaled features.
tree_reg = DecisionTreeRegressor(random_state=42)
y_pred_tree, r2_tree, mse_tree = _fit_and_score(
    tree_reg, X_train, y_train, X_test, y_test)

# 3.7 Random Forest Regression (also on the unscaled features)
forest_reg = RandomForestRegressor(random_state=42, n_estimators=100)
y_pred_forest, r2_forest, mse_forest = _fit_and_score(
    forest_reg, X_train, y_train, X_test, y_test)


# Step 4: Test Models and Compare Results
# One row per model: (printed heading, test-set R2, test-set MSE).
comparison_rows = [
    ("Multiple Linear Regression: R2 =", r2_lin, mse_lin),
    ("Polynomial Regression (degree 2): R2 =", r2_poly, mse_poly),
    ("KNN Regression (k=5): R2 =", r2_knn, mse_knn),
    ("Linear SVR: R2 =", r2_svr_lin, mse_svr_lin),
    ("Non-Linear SVR (RBF): R2 =", r2_svr_rbf, mse_svr_rbf),
    ("Decision Tree Regression: R2 =", r2_tree, mse_tree),
    ("Random Forest Regression: R2 =", r2_forest, mse_forest),
]

print("Model Comparison:")
for heading, r2_value, mse_value in comparison_rows:
    print(heading, r2_value, "MSE =", mse_value)


# Estimate the price of a single new house with every trained model.

# Zipcode of the new house, kept as a string because the one-hot column
# names have the form "zipcode_<code>".
NEW_HOUSE_ZIPCODE = "98028"

# Raw (unscaled) feature values for the new house.
new_house = {
    'bedrooms': 3,
    'bathrooms': 2,
    'sqft_living': 2000,
    'sqft_lot': 5000,
    'waterfront': 0,
    'floors': 1,
    'condition': 3,
    'grade': 7,
    'sqft_above': 1500,
    'sqft_basement': 500,
    'yr_built': 1990,
    'sqft_living15': 1800
}

# Add one entry per zipcode dummy column seen in training, all switched off.
dummy_cols = [col for col in X_train.columns if col.startswith("zipcode_")]
new_house.update(dict.fromkeys(dummy_cols, 0))

# Switch on the dummy for the house's zipcode. The zipcode dummies were built
# with drop_first=True, so a missing column is legitimate only for the dropped
# baseline zipcode. Any other missing zipcode was never seen in the data and
# would silently be scored as the baseline, so warn about it.
zip_dummy = "zipcode_" + NEW_HOUSE_ZIPCODE
if zip_dummy in X_train.columns:
    new_house[zip_dummy] = 1
elif NEW_HOUSE_ZIPCODE not in set(df['zipcode'].astype(str)):
    print("Warning: zipcode", NEW_HOUSE_ZIPCODE,
          "does not occur in the dataset; it is treated as the baseline "
          "zipcode, so the estimates below may be unreliable.")

# Build a one-row DataFrame in exactly the training column order. Selecting by
# X_train.columns raises a KeyError if any expected feature is missing.
new_house_df = pd.DataFrame([new_house])[X_train.columns]

# Models trained on standardised features need the same transformation.
new_house_scaled = scaler.transform(new_house_df)

# Estimate house prices using each model.
price_lin = lin_reg.predict(new_house_scaled)
price_poly = poly_reg.predict(poly_features.transform(new_house_scaled))
price_knn = knn_reg.predict(new_house_scaled)
price_svr_lin = svr_lin.predict(new_house_scaled)
price_svr_rbf = svr_rbf.predict(new_house_scaled)
# Tree-based models were fit on unscaled data, so use new_house_df directly.
price_tree = tree_reg.predict(new_house_df)
price_forest = forest_reg.predict(new_house_df)

print("\nEstimated House Prices for the new sample:")
for label, estimate in [
    ("Multiple Linear Regression:", price_lin),
    ("Polynomial Regression:", price_poly),
    ("KNN Regression:", price_knn),
    ("Linear SVR:", price_svr_lin),
    ("Non-Linear SVR (RBF):", price_svr_rbf),
    ("Decision Tree Regression:", price_tree),
    ("Random Forest Regression:", price_forest),
]:
    # Each predict() call returns a length-1 array for the single sample.
    print(label, estimate[0])


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\justino\.cache\kagglehub\datasets\harlfoxem\housesalesprediction\versions\1
Files in dataset directory: ['kc_house_data.csv']
Dataset shape: (21613, 21)
           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0         3       1.00         1180   
1  6414100192  20141209T000000  538000.0         3       2.25         2570   
2  5631500400  20150225T000000  180000.0         2       1.00          770   
3  2487200875  20141209T000000  604000.0         4       3.00         1960   
4  1954400510  20150218T000000  510000.0         3       2.00         1680   

   sqft_lot  floors  waterfront  view  ...  grade  sqft_above  sqft_basement  \
0      5650     1.0           0     0  ...      7        1180              0   
1      7242     2.0           0     0  ...      7        2170            400   
2     10000     1.0           0     0  ...      6         770              0   
3      5000     1.0       