# Predicting House Prices — End-to-End Regression Workflow
This notebook walks through data loading, EDA, baselines, linear and regularized regression models, diagnostics, cross-validation, and saving the champion model and artifacts as requested.

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import json
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data (assumes the CSV is in the same folder as the notebook)
df = pd.read_csv('House Price India.csv')
print('rows,cols:', df.shape)
df.head()

rows,cols: (14620, 23)


Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810145,42491,5,2.5,3650,9050,2.0,0,4,5,...,1921,0,122003,52.8645,-114.557,2880,5400,2,58,2380000
1,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
2,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
3,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
4,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000


In [3]:
# Quick overview: types, nulls, basic stats
df.info()
df.isna().sum()
df.describe(include='all').T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     14620 non-null  int64  
 1   Date                                   14620 non-null  int64  
 2   number of bedrooms                     14620 non-null  int64  
 3   number of bathrooms                    14620 non-null  float64
 4   living area                            14620 non-null  int64  
 5   lot area                               14620 non-null  int64  
 6   number of floors                       14620 non-null  float64
 7   waterfront present                     14620 non-null  int64  
 8   number of views                        14620 non-null  int64  
 9   condition of the house                 14620 non-null  int64  
 10  grade of the house                     14620 non-null  int64  
 11  Ar

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,14620.0,6762821000.0,6237.574799,6762810000.0,6762815000.0,6762821000.0,6762826000.0,6762832000.0
Date,14620.0,42604.54,67.347991,42491.0,42546.0,42600.0,42662.0,42734.0
number of bedrooms,14620.0,3.379343,0.938719,1.0,3.0,3.0,4.0,33.0
number of bathrooms,14620.0,2.129583,0.769934,0.5,1.75,2.25,2.5,8.0
living area,14620.0,2098.263,928.275721,370.0,1440.0,1930.0,2570.0,13540.0
lot area,14620.0,15093.28,37919.621304,520.0,5010.75,7620.0,10800.0,1074218.0
number of floors,14620.0,1.50236,0.540239,1.0,1.0,1.5,2.0,3.5
waterfront present,14620.0,0.007660739,0.087193,0.0,0.0,0.0,0.0,1.0
number of views,14620.0,0.2331053,0.766259,0.0,0.0,0.0,0.0,4.0
condition of the house,14620.0,3.430506,0.664151,1.0,3.0,3.0,4.0,5.0


In [4]:
# Basic cleaning & feature selection/engineering
# We'll create a working dataframe with commonly useful numeric features.
work_cols = ['Price', 'living area', 'number of bedrooms', 'number of bathrooms', 'lot area', 'number of floors', 'Built Year', 'Renovation Year', 'Postal Code', 'Lattitude', 'Longitude']
# Some datasets have duplicate or inconsistent names; ensure columns exist
for c in work_cols:
    if c not in df.columns:
        print('Missing col:', c)
# Create a copy and drop rows with missing target
data = df.copy()
data = data.dropna(subset=['Price'])
# Simple feature: age (approx)
data['age'] = 2025 - data['Built Year'].astype(float)
# Fill modest missing values with median where numerical
num_cols = ['living area', 'number of bedrooms', 'number of bathrooms', 'lot area', 'number of floors']
for c in num_cols:
    if data[c].isna().any():
        data[c] = data[c].fillna(data[c].median())
data = data[ ['Price'] + num_cols + ['age', 'Postal Code', 'Lattitude', 'Longitude'] ]
data = data.dropna()
data.shape

(14620, 10)

In [5]:
# Train/test split — keep a hold-out test set for final validation
train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)
X_train = train_df.drop(columns=['Price'])
y_train = train_df['Price']
X_test = test_df.drop(columns=['Price'])
y_test = test_df['Price']
print('train/test shapes:', X_train.shape, X_test.shape)

train/test shapes: (11696, 9) (2924, 9)


In [6]:
# Baselines: mean, median, group-wise by Postal Code median price (if group present)
def evaluate_preds(y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = (np.abs((y_true - y_pred) / y_true).replace([np.inf, -np.inf], np.nan).dropna() * 100).mean()
    return dict(RMSE=rmse, MAE=mae, R2=r2, MAPE=mape)
# Mean baseline
mean_pred = y_train.mean()
mean_baseline = evaluate_preds(y_test, pd.Series(mean_pred, index=y_test.index))
# Median baseline
median_pred = y_train.median()
median_baseline = evaluate_preds(y_test, pd.Series(median_pred, index=y_test.index))
# Group-wise median by Postal Code baseline (fallback to global median if unseen)
postal_medians = train_df.groupby('Postal Code')['Price'].median()
postal_pred = X_test['Postal Code'].map(postal_medians).fillna(median_pred)
postal_baseline = evaluate_preds(y_test, postal_pred)
pd.DataFrame({'mean_baseline': mean_baseline, 'median_baseline': median_baseline, 'postal_baseline': postal_baseline})

TypeError: got an unexpected keyword argument 'squared'

In [None]:
# Simple Linear Regression (SLR) using 'living area'
slr_model = LinearRegression()
X_train_slr = X_train[['living area']].values.reshape(-1,1)
X_test_slr = X_test[['living area']].values.reshape(-1,1)
slr_model.fit(X_train_slr, y_train)
y_pred_slr = slr_model.predict(X_test_slr)
slr_metrics = evaluate_preds(y_test, pd.Series(y_pred_slr, index=y_test.index))
slr_coef = slr_model.coef_[0] if hasattr(slr_model, 'coef_') else None
slr_intercept = slr_model.intercept_ if hasattr(slr_model, 'intercept_') else None
print('SLR coef, intercept:', slr_coef, slr_intercept)
slr_metrics

In [None]:
# Multiple Linear Regression (MLR) with standard scaling
mlr_pipe = Pipeline([('scaler', StandardScaler()), ('lr', LinearRegression())])
mlr_pipe.fit(X_train, y_train)
y_pred_mlr = mlr_pipe.predict(X_test)
mlr_metrics = evaluate_preds(y_test, pd.Series(y_pred_mlr, index=y_test.index))
mlr_metrics

In [None]:
# Residual analysis for MLR
resid = y_test - y_pred_mlr
plt.figure(figsize=(6,4))
sns.histplot(resid, kde=True)
plt.title('Residuals distribution (MLR)')
plt.show()
plt.figure(figsize=(6,4))
plt.scatter(y_pred_mlr, resid, alpha=0.4)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.title('Predicted vs Residuals (MLR)')
plt.show()

In [None]:
# Multicollinearity check: VIF (on training features after filling/scaling)
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Prepare numeric design matrix for VIF (no scaler needed for VIF calc but avoid perfect multicollinearity)
X_vif = X_train.copy()
X_vif = X_vif.assign(const=1)
vif_data = []
for i, col in enumerate(X_vif.columns[:-1]):
    try:
        vif = variance_inflation_factor(X_vif.values, i)
    except Exception as e:
        vif = np.nan
    vif_data.append({'feature': col, 'VIF': vif})
pd.DataFrame(vif_data).sort_values('VIF', ascending=False)

In [None]:
# Regularized models with simple hyperparameter search using cross-validation
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(max_iter=5000),
    'ElasticNet': ElasticNet(max_iter=5000)
}
param_grid = {'Ridge': {'alpha': [0.1, 1, 10, 50, 100]},
              'Lasso': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]},
              'ElasticNet': {'alpha': [0.001, 0.01, 0.1, 1], 'l1_ratio': [0.2, 0.5, 0.8]}}
results = {}
for name, model in models.items():
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    gs = GridSearchCV(pipe, param_grid[name], cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
    gs.fit(X_train, y_train)
    best = gs.best_estimator_
    preds = best.predict(X_test)
    results[name] = {'best_params': gs.best_params_, 'metrics': evaluate_preds(y_test, pd.Series(preds, index=y_test.index))}
pd.DataFrame({k: v['metrics'] for k, v in results.items()}).T

In [None]:
# Cross-validation comparison (K-Fold) for final candidate models
kf = KFold(n_splits=5, shuffle=True, random_state=42)
candidates = {
    'MLR': mlr_pipe,
    'Ridge_best': Pipeline([('scaler', StandardScaler()), ('model', Ridge(**results['Ridge']['best_params']))]),
    'Lasso_best': Pipeline([('scaler', StandardScaler()), ('model', Lasso(**results['Lasso']['best_params'], max_iter=5000))]),
    'ElasticNet_best': Pipeline([('scaler', StandardScaler()), ('model', ElasticNet(**results['ElasticNet']['best_params'], max_iter=5000))])
}
cv_table = {}
for name, est in candidates.items():
    scores = -cross_val_score(est, X_train, y_train, cv=kf, scoring='neg_root_mean_squared_error', n_jobs=-1)
    cv_table[name] = {'RMSE_mean': scores.mean(), 'RMSE_std': scores.std()}
pd.DataFrame(cv_table).T

In [None]:
# Champion model selection: choose lowest CV RMSE (example) and evaluate on test set
cv_df = pd.DataFrame(cv_table).T
champion_name = cv_df['RMSE_mean'].idxmin()
print('Champion (by CV RMSE):', champion_name)
champion = candidates[champion_name]
champion.fit(X_train, y_train)
y_pred_champion = champion.predict(X_test)
champion_metrics = evaluate_preds(y_test, pd.Series(y_pred_champion, index=y_test.index))
champion_metrics

In [None]:
# Save artifacts
joblib.dump(champion, 'champion_model.joblib')
pred_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred_champion})
pred_df.to_csv('houseprice_predictions_test.csv', index=False)
with open('test_metrics.json', 'w') as f:
    json.dump(champion_metrics, f, indent=2)
print('Artifacts saved: champion_model.joblib, houseprice_predictions_test.csv, test_metrics.json')

## Short report (to expand in final notebook)
- Summarize EDA and key predictors (living area, bedrooms, bathrooms, age, lot area).
- Residual analysis: check distribution and heteroscedasticity; transform target if large skew.
- VIF: remove or combine highly collinear features.
- Regularization: Ridge/Lasso/ElasticNet helped reduce overfitting and stabilized coefficients; GridSearchCV picks alpha.
- Champion model selection: chosen by cross-validated RMSE and validated on hold-out test set.