In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [None]:
# Base path to your data folder 
base_path = "/Users/machome/Downloads/house-prices-advanced-regression-techniques/"

# Load datasets 
train = pd.read_csv(base_path + "train.csv") 
test = pd.read_csv(base_path + "test.csv")
sample_sub = pd.read_csv(base_path + "sample_submission.csv")


In [3]:
# Quick look at each dataset
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("sample_sub shape:", sample_sub.shape)


train.head()

Train shape: (1460, 81)
Test shape: (1459, 80)
sample_sub shape: (1459, 2)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Function to display missing values summary for a dataframe
def missing_summary(df, name):
    print(f"\nMissing values in {name}:")
    missing = df.isna().sum()
    missing = missing[missing > 0]
    if len(missing) == 0:
        print("  No missing values.")
    else:
        print(missing)

# Check each dataset
missing_summary(train, "train")
missing_summary(test, "test")




Missing values in train:
LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Missing values in test:
MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars  

In [5]:
# Log-transform the target
train["LogSalePrice"] = np.log(train["SalePrice"])

# Quick look at the new column
train[["SalePrice", "LogSalePrice"]].head()

Unnamed: 0,SalePrice,LogSalePrice
0,208500,12.247694
1,181500,12.109011
2,223500,12.317167
3,140000,11.849398
4,250000,12.429216


In [6]:
# Features and target (log-transformed SalePrice)
X = train.drop(columns=["SalePrice", "LogSalePrice"])
y = train["LogSalePrice"]

# Identify numeric and categorical columns
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Preprocessing for numeric features: median imputation
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Preprocessing for categorical features: fill missing and one-hot encode (dummies)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# Baseline regression model: Ridge regression with preprocessing
ridge_baseline_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge(alpha=1.0))
])

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the baseline Ridge model
ridge_baseline_model.fit(X_train, y_train)

# Predict on validation set
val_preds = ridge_baseline_model.predict(X_val)

# Compute RMSE on log scale
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print("Baseline Ridge Validation RMSE (log scale):", rmse)



Baseline Ridge Validation RMSE (log scale): 0.14803943999341998


In [7]:
# Create a copy of the training data for the enhanced model
train_enhanced = train.copy()

# Polynomial term
train_enhanced["GrLivArea_sq"] = train_enhanced["GrLivArea"] ** 2

# Interaction term
train_enhanced["Qual_x_GrLivArea"] = train_enhanced["OverallQual"] * train_enhanced["GrLivArea"]

# Features and target for the enhanced model
X2 = train_enhanced.drop(columns=["SalePrice", "LogSalePrice"])
y2 = train_enhanced["LogSalePrice"]

# Identify numeric and categorical columns
num_cols2 = X2.select_dtypes(include=["int64", "float64"]).columns
cat_cols2 = X2.select_dtypes(include=["object"]).columns


In [8]:
# Numeric pipeline: impute -> scale -> PCA (keep 95% variance)
numeric_transformer_pca = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95))
])

# Categorical pipeline: impute -> one-hot encode (dummies)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="None")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing
preprocessor_pca = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_pca, num_cols2),
        ("cat", categorical_transformer, cat_cols2)
    ]
)

# Enhanced regression model: Ridge + PCA + engineered features
ridge_pca_enhanced_model = Pipeline(steps=[
    ("preprocessor", preprocessor_pca),
    ("regressor", Ridge(alpha=1.0))
])


In [9]:
# Train/validation split (same split style as baseline)
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Fit enhanced model
ridge_pca_enhanced_model.fit(X2_train, y2_train)

# Predict on validation set
val_preds2 = ridge_pca_enhanced_model.predict(X2_val)

# Compute RMSE on log scale
rmse2 = np.sqrt(mean_squared_error(y2_val, val_preds2))
print("Enhanced Ridge + PCA Validation RMSE (log scale):", rmse2)


Enhanced Ridge + PCA Validation RMSE (log scale): 0.13735649437829525


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


In [10]:
# Refit baseline model on full training data
ridge_baseline_model.fit(X, y)

# Predict on test set (log scale)
test_preds_log_baseline = ridge_baseline_model.predict(test)

# Convert back to original SalePrice scale
test_preds_baseline = np.exp(test_preds_log_baseline)

# Create submission DataFrame
submission_baseline = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds_baseline
})

# Save file
submission_baseline.to_csv("submission_ridge_baseline.csv", index=False)

print("Saved: submission_ridge_baseline.csv")


Saved: submission_ridge_baseline.csv


In [11]:
# Create enhanced version of test data
test_enhanced = test.copy()

test_enhanced["GrLivArea_sq"] = test_enhanced["GrLivArea"] ** 2
test_enhanced["Qual_x_GrLivArea"] = test_enhanced["OverallQual"] * test_enhanced["GrLivArea"]

# Refit enhanced model on full enhanced training data
ridge_pca_enhanced_model.fit(X2, y2)

# Predict on enhanced test set (log scale)
test_preds_log_enhanced = ridge_pca_enhanced_model.predict(test_enhanced)

# Convert back to original SalePrice scale
test_preds_enhanced = np.exp(test_preds_log_enhanced)

# Create submission DataFrame
submission_enhanced = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": test_preds_enhanced
})

# Save file
submission_enhanced.to_csv("submission_ridge_enhanced.csv", index=False)

print("Saved: submission_ridge_enhanced.csv")


Saved: submission_ridge_enhanced.csv


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
