In [1]:
!pip install category_encoders



In [2]:
# ==========================
# Import Libraries
# ==========================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ==========================
# Load Dataset
# ==========================
data = pd.read_csv('kc_house_data.csv')

# ==========================
# Drop Unnecessary Columns
# ==========================
# Columns like date and certain square footage features may not be useful for prediction
drop_cols = ['date', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15']
data = data.drop(columns=drop_cols)

# ==========================
# Feature Engineering
# ==========================
# 1. Create a binary indicator for basement existence
data['basement_exists'] = data['sqft_basement'].apply(lambda x: 1 if x > 0 else 0)

# 2. Impute zero basements with median of non-zero basements
median_nonzero = data.loc[data['sqft_basement'] > 0, 'sqft_basement'].median()
data['sqft_basement_imputed'] = data['sqft_basement'].replace(0, median_nonzero)

# 3. Encode ordinal columns (condition, grade)
ordinal_cols = ['condition', 'grade']
le = LabelEncoder()
for col in ordinal_cols:
    data[col] = le.fit_transform(data[col])

# ==========================
# Separate Features and Target
# ==========================
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols.remove('price')  # Remove target column

X = data[numeric_cols]
y = data['price']

# ==========================
# Train-Validation-Test Split
# ==========================
# First split: train+val vs test (15% test)
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Second split: train vs validation (~15% validation of original dataset)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.1765, random_state=42
)

# ==========================
# Target Encoding for Categorical Feature
# ==========================
# Target encoding for 'zipcode' column
encoder = TargetEncoder(cols=['zipcode'])
encoder.fit(X_train['zipcode'], y_train)

X_train['zipcode'] = encoder.transform(X_train['zipcode'])
X_val['zipcode'] = encoder.transform(X_val['zipcode'])
X_test['zipcode'] = encoder.transform(X_test['zipcode'])

# ==========================
# Feature Scaling
# ==========================
# Standardize numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# ==========================
# Train Decision Tree Regressor
# ==========================
dt_model = DecisionTreeRegressor(
    random_state=42,
    max_depth=14,           # Maximum tree depth
    min_samples_split=20,   # Minimum samples required to split a node
    min_samples_leaf=10     # Minimum samples required at a leaf node
)

dt_model.fit(X_train, y_train)

# ==========================
# Predictions
# ==========================
y_train_pred = dt_model.predict(X_train)
y_val_pred = dt_model.predict(X_val)
y_test_pred = dt_model.predict(X_test)

# ==========================
# Evaluate Model
# ==========================
print("Training MSE:", mean_squared_error(y_train, y_train_pred))
print("Validation MSE:", mean_squared_error(y_val, y_val_pred))
print("Test MSE:", mean_squared_error(y_test, y_test_pred))
print("Training R2:", r2_score(y_train, y_train_pred))
print("Validation R2:", r2_score(y_val, y_val_pred))
print("Test R2:", r2_score(y_test, y_test_pred))


Training MSE: 14440044229.133648
Validation MSE: 22901222907.045574
Test MSE: 28380132245.565525
Training R2: 0.8903204358456701
Validation R2: 0.8256640898965067
Test R2: 0.8140462415018328
