In [None]:
# OneHotEncoder compatibility wrapper for different scikit-learn versions.
#
# scikit-learn 1.2 renamed the encoder's `sparse` keyword to `sparse_output`,
# and later releases dropped the old spelling. This cell lets the rest of the
# notebook use either spelling regardless of the installed version.
import sklearn.preprocessing as _skp

# When this cell is re-run, `_skp.OneHotEncoder` is already our wrapper.
# Unwrap it so `_orig_ohe` always points at the real class; capturing the
# wrapper here would make it call itself and recurse until RecursionError.
_orig_ohe = getattr(_skp.OneHotEncoder, '__wrapped__', _skp.OneHotEncoder)

# Each spelling of the keyword mapped to the other one.
_OHE_KWARG_ALIASES = {'sparse': 'sparse_output', 'sparse_output': 'sparse'}


def _compat_ohe(*args, **kwargs):
    """Build a OneHotEncoder, translating `sparse` <-> `sparse_output` if needed.

    The arguments are first passed to the real constructor unchanged. Only if
    that raises ``TypeError`` and exactly one of the two spellings was given is
    the keyword renamed to the other spelling and the call retried once.

    Args:
        *args: Positional arguments forwarded to ``OneHotEncoder``.
        **kwargs: Keyword arguments forwarded to ``OneHotEncoder``.

    Returns:
        The constructed ``OneHotEncoder`` instance.

    Raises:
        TypeError: If the constructor rejects the arguments and there is no
            keyword to translate, or if the translated retry is rejected too.
    """
    try:
        return _orig_ohe(*args, **kwargs)
    except TypeError:
        for given, alias in _OHE_KWARG_ALIASES.items():
            if given in kwargs and alias not in kwargs:
                kwargs[alias] = kwargs.pop(given)
                break
        else:
            # Nothing to translate, so retrying the same call cannot succeed.
            raise
        return _orig_ohe(*args, **kwargs)


# Remember the real class on the wrapper so a re-run of this cell can find it.
_compat_ohe.__wrapped__ = _orig_ohe
_skp.OneHotEncoder = _compat_ohe

# Decision Tree Regressor: House Price Prediction
This notebook loads `House_Price_Prediction_Dataset.csv`, trains a Decision Tree Regressor, evaluates it, and shows visualizations.

In [None]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE: with no category or module given, this 'ignore' filter applies to every
# warning raised after this point, including deprecation notices from the
# libraries imported above. Narrow it when debugging version issues.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV (relative path, resolved against the working directory)
# and preview the first rows
dataset_file = 'House_Price_Prediction_Dataset.csv'
df = pd.read_csv(dataset_file)
df.head()

In [None]:
# Quick EDA: print the column summary, then show the missing-value count per column
df.info()
df.isnull().sum()

In [None]:
# Separate the regression target (y) from the predictors (X);
# 'Id' is dropped together with the target column
y = df['Price']
X = df.drop(columns=['Id', 'Price'])
# Partition the predictor columns by dtype: object-dtype vs. numeric
categorical_cols = list(X.select_dtypes(include=['object']).columns)
numeric_cols = list(X.select_dtypes(include=[np.number]).columns)
categorical_cols, numeric_cols

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Preprocessing: one-hot encode the categorical columns, pass the rest through.
# `sparse=False` is handled by the compatibility wrapper in the first cell.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_cols),
    ],
    remainder='passthrough',
)
# Chain preprocessing and a default Decision Tree into a single estimator
model = Pipeline([
    ('pre', preprocessor),
    ('dt', DecisionTreeRegressor(random_state=42)),
])
# Fit on the training split, then score on the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
for label, metric in (
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
    ('R2 :', r2_score),
):
    print(label, metric(y_test, y_pred))

In [None]:
# Feature importance (map back after one-hot)
# The ColumnTransformer has a single 'cat' transformer plus a passthrough
# remainder, so its output is the one-hot columns followed by every remaining
# input column in original order. The names below are rebuilt in that order.
ohe = model.named_steps['pre'].named_transformers_['cat']
ohe_features = []
if categorical_cols:  # with no categorical columns the encoder was never fitted
    if hasattr(ohe, 'get_feature_names_out'):
        ohe_features = list(ohe.get_feature_names_out(categorical_cols))
    else:
        # Fallback for encoders without get_feature_names_out: use the fitted
        # categories_, which match the encoder's actual output columns. The raw
        # frame's unique values can differ from them in both order and content.
        ohe_features = [
            f'{col}_{cat}'
            for col, cats in zip(categorical_cols, ohe.categories_)
            for cat in cats
        ]
# The remainder is every non-encoded column, not only the numeric ones
passthrough_cols = [c for c in X_train.columns if c not in categorical_cols]
all_features = ohe_features + passthrough_cols
dt = model.named_steps['dt']
importances = dt.feature_importances_
if len(all_features) != len(importances):
    raise ValueError(
        f'Reconstructed {len(all_features)} feature names but the tree '
        f'reports {len(importances)} importances'
    )
feat_imp = pd.Series(importances, index=all_features)
feat_imp = feat_imp.sort_values(ascending=False).head(20)
plt.figure(figsize=(10,6))
sns.barplot(x=feat_imp.values, y=feat_imp.index, palette='viridis')
plt.title('Top 20 Feature Importances')
plt.show()

In [None]:
# Actual vs Predicted plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, ax=ax)
# Dashed red diagonal: points on it are predicted exactly right
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, 'r--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Prices')
plt.show()

## Notes
- This notebook trains a Decision Tree Regressor and provides visual diagnostics.
- The Streamlit app (`app.py`) in the repository will train the model live on the same dataset and provide an interactive frontend for exploring predictions and model behavior.