# House Price Prediction — Regularised Regression

This notebook implements the full analysis required by the assignment. It is organized into sections: imports, data loading, preprocessing, hyperparameter selection (Ridge & Lasso), feature importance, `alpha` doubling experiment, and retraining when top features are unavailable.

Kernel:  Python 3

## 1) Imports and configuration

Standard imports and plotting configuration.

In [None]:
# Cell: imports
# Core numeric / plotting stack used by every later cell, plus global plot styling.
import warnings
# NOTE(review): this filter silences *every* warning for the rest of the notebook, so any
# warning raised by later cells (e.g. from the model fits) will not be shown — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# Apply seaborn's white-grid style to all figures created below.
sns.set(style='whitegrid')
print('Imports OK')

In [None]:
# Utility imports used later in the notebook
import json
from pprint import pprint
print('json and pprint imported')

## 2) Load data and quick checks

Load `train.csv` (must be in the workspace root) and show basic diagnostics.

In [None]:
# Load the training data; stop immediately with a clear message if the CSV is missing.
p = Path('train.csv')
assert p.exists(), 'train.csv not found in workspace root'
df = pd.read_csv(p)
print('Shape:', df.shape)
# A bare expression is only rendered when it is the LAST statement of a cell, so the
# preview has to be displayed explicitly (`display` is provided by the Jupyter kernel).
display(df.head(3))

# Basic missingness: fraction of missing values per column, worst first.
miss = df.isnull().mean().sort_values(ascending=False)
miss[miss>0].head(10)

## 2.1) Exploratory Data Analysis (EDA)

Quick EDA: target distribution, missingness, duplicates, top numeric correlations, and a simple derived feature (`TotalSF`) if available. Figures are saved to disk for the report.

In [None]:
# EDA: target distribution, missingness, correlations, duplicates, derived feature
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams.update({'figure.max_open_warning': 0})


def _save_and_close(fig, outfile):
    """Tighten the layout, write `fig` to `outfile`, then close it to free memory."""
    fig.tight_layout()
    fig.savefig(outfile, bbox_inches='tight')
    plt.close(fig)


def _hist_to_file(values, colour, title, outfile):
    """Draw a KDE-overlaid histogram of `values` and save it to `outfile`."""
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(values, kde=True, color=colour, ax=ax)
    ax.set_title(title)
    _save_and_close(fig, outfile)


# Target distribution: raw scale and log1p scale.
_hist_to_file(df['SalePrice'], 'C0', 'SalePrice distribution', 'fig_target_hist.png')
_hist_to_file(np.log1p(df['SalePrice']), 'C1', 'log1p(SalePrice) distribution', 'fig_target_log_hist.png')

# Missingness summary and small heatmap (top missing columns)
miss = df.isnull().mean().sort_values(ascending=False)
top_miss = miss[miss>0].head(20)
print('Top missing columns (fraction missing):')
print(top_miss)

if len(top_miss) > 0:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(df[top_miss.index].isnull(), cbar=False, ax=ax)
    ax.set_title('Missingness heatmap (top columns)')
    _save_and_close(fig, 'fig_missing_heatmap.png')

# Duplicates check
dups = df.duplicated().sum()
print('Duplicate rows:', dups)

# Numeric correlations with target (absolute value, strongest first)
num = df.select_dtypes(include=['int64','float64']).drop(columns=['SalePrice'])
corrs = num.corrwith(df['SalePrice']).abs().sort_values(ascending=False)
print('Top numeric correlations with SalePrice:')
print(corrs.head(10))

_top10_corrs = corrs.head(10)
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=_top10_corrs.values, y=_top10_corrs.index, palette='viridis', ax=ax)
ax.set_title('Top numeric features correlated with SalePrice')
_save_and_close(fig, 'fig_top_corrs.png')

# Correlation heatmap for top numeric features (if enough exist)
top_nums = corrs.head(15).index.tolist()
if len(top_nums) > 1:
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(df[top_nums + ['SalePrice']].corr(), annot=False, cmap='coolwarm', ax=ax)
    ax.set_title('Correlation matrix (top numeric features)')
    _save_and_close(fig, 'fig_corr_matrix.png')

# Simple derived feature: TotalSF if 1st/2nd/TotalBsmt present.
# Note: this adds a column to `df` itself, so it is part of the frame later cells read.
derived_cols = ['1stFlrSF','2ndFlrSF','TotalBsmtSF']
if all(c in df.columns for c in derived_cols):
    _total_sf = df[derived_cols[0]].fillna(0)
    for _col in derived_cols[1:]:
        _total_sf = _total_sf + df[_col].fillna(0)
    df['TotalSF'] = _total_sf
    print('Derived feature `TotalSF` created (sum of 1stFlrSF, 2ndFlrSF, TotalBsmtSF)')
    _hist_to_file(df['TotalSF'], 'C2', 'TotalSF distribution', 'fig_totalSF.png')

## 3) Preprocessing pipeline

Median imputation for numerics, most-frequent for categoricals, StandardScaler for numerics, OneHotEncoder for categoricals.

## 3.1) Train/Test Split and Hold-out Evaluation

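We split the data into train and test sets (80/20); the split itself is created in Section 4, after the preprocessor below has been defined. For the hold-out evaluation, model selection and CV are repeated on the train set only, and the final RMSE is reported on the hold-out test set. Note that Section 4 first runs the same CV on the full dataset; those full-data models are the ones used for the feature-importance and follow-up experiments in Sections 5–7.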

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Separate target from features.
# NOTE(review): every non-target column becomes a feature, including any identifier-like
# column (e.g. a row id) if the CSV has one — confirm this is intended.
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

# Columns are routed by dtype. Columns whose dtype matches neither selector are not listed
# in the ColumnTransformer and are therefore left out of the model input (its default
# `remainder` is 'drop').
numeric_cols = list(X.select_dtypes(include=['int64','float64']).columns)
cat_cols = list(X.select_dtypes(include=['object']).columns)
print('Numeric cols:', len(numeric_cols), 'Categorical cols:', len(cat_cols))

# Numerics: median imputation, then standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categoricals: most-frequent imputation, then dense one-hot encoding; categories unseen
# during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols),
])
print('Preprocessor ready')

## 4) Hyperparameter selection (alpha grid) and cross-validation

We use a log-spaced alpha grid and 5-fold CV. For Lasso we use `LassoCV` to get the full path; for Ridge we compute CV RMSE per alpha and pick the best.

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LassoCV, Ridge
from sklearn.pipeline import make_pipeline

# Log-spaced penalty grid and a fixed, shuffled 5-fold splitter shared by both models.
alphas = np.logspace(-3, 3, 50)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Ridge CV (manual grid): mean of the per-fold RMSE for every alpha.
# cross_val_score clones the pipeline for each fold, so preprocessing is refitted
# on the training part of every fold.
ridge_rmse = []
for a in alphas:
    pipe = make_pipeline(preprocessor, Ridge(alpha=a))
    scores = cross_val_score(pipe, X, y, cv=kf, scoring='neg_mean_squared_error')
    ridge_rmse.append(np.sqrt(-scores).mean())

# Pick best ridge alpha from grid
best_idx = int(np.argmin(ridge_rmse))
best_alpha_ridge = float(alphas[best_idx])
ridge_rmse_cv = float(ridge_rmse[best_idx])

# LassoCV (uses internal CV).
# NOTE(review): here the preprocessing step is fitted once on all of X before LassoCV
# forms its internal folds, whereas the Ridge loop above refits it inside every fold,
# so the two CV RMSE figures are not produced under exactly the same protocol.
lasso_pipe = make_pipeline(preprocessor, LassoCV(alphas=alphas, cv=kf, random_state=42, max_iter=10000))
lasso_pipe.fit(X, y)
lasso_model = lasso_pipe.named_steps['lassocv']
best_alpha_lasso = lasso_model.alpha_

# A fitted LassoCV always exposes `mse_path_` (n_alphas x n_folds) with rows aligned to
# `alphas_`, so the CV RMSE is read from it directly. The previous fallback branches were
# unreachable after a successful fit and would have stored a *training* RMSE under the
# name `lasso_rmse_cv`.
mse_path = lasso_model.mse_path_
lasso_rmse_vals = np.sqrt(mse_path).mean(axis=1)
# `alphas_` may be ordered differently from the input grid, so locate the chosen alpha by value.
idx = int(np.argmin(np.abs(lasso_model.alphas_ - lasso_model.alpha_)))
lasso_rmse_cv = float(lasso_rmse_vals[idx])

print('Best Ridge alpha:', best_alpha_ridge)
print('Best Lasso alpha:', best_alpha_lasso)
print('Ridge CV RMSE:', ridge_rmse_cv)
print('Lasso CV RMSE:', lasso_rmse_cv)

In [None]:
# Ensure a hold-out train/test split exists before training on the train set
from sklearn.model_selection import train_test_split

# Rebuild features/target from `df` only when an earlier cell has not defined both.
if not {'X', 'y'}.issubset(globals()):
    y = df['SalePrice']
    X = df.drop(columns=['SalePrice'])

# Deterministic 80/20 split (fixed random_state so reruns give the same partition).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('Train / Test shapes:', X_train.shape, X_test.shape)

In [None]:
# Repeat model selection on train set only, then evaluate on test set
from sklearn.base import clone

# Ridge: 5-fold CV over the alpha grid using training rows only.
# cross_val_score clones the pipeline per fold, so sharing `preprocessor` is safe here.
ridge_rmse_tr = []
for a in alphas:
    pipe = make_pipeline(preprocessor, Ridge(alpha=a))
    scores = cross_val_score(pipe, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    ridge_rmse_tr.append(np.sqrt(-scores).mean())
best_idx_tr = int(np.argmin(ridge_rmse_tr))
best_alpha_ridge_tr = float(alphas[best_idx_tr])

# The fitted hold-out pipelines get their OWN copy of the preprocessor. make_pipeline does
# not copy its steps, so with the shared instance any later `.fit` on `preprocessor`
# (other cells refit it on the full data) would silently overwrite the train-only
# imputation/scaling/encoding state stored inside these pipelines.
ridge_final_tr = make_pipeline(clone(preprocessor), Ridge(alpha=best_alpha_ridge_tr))
ridge_final_tr.fit(X_train, y_train)
ridge_test_preds = ridge_final_tr.predict(X_test)
ridge_test_rmse = float(np.sqrt(np.mean((y_test - ridge_test_preds)**2)))
print('Ridge test RMSE:', ridge_test_rmse)

# LassoCV on train set (alpha chosen by its internal CV), evaluated on the untouched test rows.
lasso_pipe_tr = make_pipeline(clone(preprocessor), LassoCV(alphas=alphas, cv=kf, random_state=42, max_iter=10000))
lasso_pipe_tr.fit(X_train, y_train)
best_alpha_lasso_tr = lasso_pipe_tr.named_steps['lassocv'].alpha_
lasso_test_preds = lasso_pipe_tr.predict(X_test)
lasso_test_rmse = float(np.sqrt(np.mean((y_test - lasso_test_preds)**2)))
print('Lasso test RMSE:', lasso_test_rmse)

## 5) Feature importance

Map fitted coefficients back to original feature names and display the top predictors for Ridge and Lasso.

In [None]:
def get_feature_names_from_preprocessor(pre, numeric_cols, cat_cols):
    """Return the transformed feature names: numeric columns, then one-hot names.

    Parameters
    ----------
    pre : ColumnTransformer-like
        Transformer with a 'cat' entry whose pipeline has an 'onehot' step.
    numeric_cols : list of str
        Numeric input columns; passed through unchanged as feature names.
    cat_cols : list of str
        Accepted for interface compatibility; the categorical input columns are
        read from `pre.transformers` so they always match what `pre` was built with.

    Returns
    -------
    list of str
        Numeric names followed by the encoder's '<column>_<category>' names.
    """
    # Only fit when `pre` has never been fitted. Refitting an already-fitted transformer
    # would mutate every pipeline that holds this same object and could change the
    # encoded columns relative to the coefficients being explained.
    if not hasattr(pre, 'transformers_'):
        pre.fit(X)  # fallback preserved from the original: uses the notebook-level X
    names = list(numeric_cols)
    if 'cat' in pre.named_transformers_:
        cat_pipe = pre.named_transformers_['cat']
        if hasattr(cat_pipe, 'named_steps') and 'onehot' in cat_pipe.named_steps:
            ohe = cat_pipe.named_steps['onehot']
            # Look the input columns up by transformer name rather than by position.
            cat_in = next(cols for name, _, cols in pre.transformers if name == 'cat')
            names.extend(ohe.get_feature_names_out(cat_in))
    return names


def top_features_for_model(pipe, numeric_cols, cat_cols, top_n=10):
    """Return the `top_n` features of a fitted pipeline ranked by |coefficient|.

    Parameters
    ----------
    pipe : fitted Pipeline
        Its first step is expected to be the column preprocessor and its last
        step a linear model exposing `coef_`.
    numeric_cols, cat_cols : list of str
        Input column names forwarded to `get_feature_names_from_preprocessor`.
    top_n : int, default 10
        Number of (name, coefficient) pairs to return.

    Returns
    -------
    list of (str, float)
        Feature name and signed coefficient, largest magnitude first.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of coefficients.
    """
    # pipe must be fitted
    # Use the preprocessor that belongs to THIS pipeline (not a notebook-level global),
    # so the names line up with the model's coefficients for any pipeline passed in.
    pre = pipe.steps[0][1]
    model = pipe.steps[-1][1]
    feat_names = get_feature_names_from_preprocessor(pre, numeric_cols, cat_cols)
    coefs = np.ravel(model.coef_)
    # A length mismatch means names and coefficients are misaligned everywhere,
    # so fail loudly instead of silently dropping the out-of-range entries.
    if len(feat_names) != len(coefs):
        raise ValueError(
            f'feature names ({len(feat_names)}) and coefficients ({len(coefs)}) do not line up'
        )
    idx = np.argsort(np.abs(coefs))[::-1][:top_n]
    return [(feat_names[i], float(coefs[i])) for i in idx]

from sklearn.linear_model import Ridge
# Final Ridge model on the full data at the CV-selected alpha. This fit also leaves the
# shared `preprocessor` fitted on the full X, which the Lasso pipeline below relies on.
ridge_final = make_pipeline(preprocessor, Ridge(alpha=best_alpha_ridge))
ridge_final.fit(X, y)
# `lasso_pipe` was already fitted on (X, y) during hyperparameter selection, and
# `lasso_final` is just another name for that same object. Calling .fit again would
# rerun the full alpha-grid cross-validation on identical data with identical seeds,
# so the fitted pipeline is reused as-is.
lasso_final = lasso_pipe
ridge_top = top_features_for_model(ridge_final, numeric_cols, cat_cols, top_n=10)
lasso_top = top_features_for_model(lasso_final, numeric_cols, cat_cols, top_n=10)
print('Ridge top features:')
for f,v in ridge_top:
    print(f, v)
print('\nLasso top features:')
for f,v in lasso_top:
    print(f, v)

## 6) Doubling alpha experiment

Retrain models with doubled alpha and report training RMSE to show regularisation effect.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Refit each model on the full data with twice its selected penalty.
ridge_double = make_pipeline(preprocessor, Ridge(alpha=2 * best_alpha_ridge))
ridge_double.fit(X, y)
lasso_double = make_pipeline(preprocessor, Lasso(alpha=2 * best_alpha_lasso, max_iter=10000))
lasso_double.fit(X, y)


def rmse_on_train(pipe, X_local=X, y_local=y):
    """Return the RMSE of `pipe` on (X_local, y_local).

    The defaults are bound to the notebook's X and y at the moment this cell runs.
    """
    mse = mean_squared_error(y_local, pipe.predict(X_local))
    return float(np.sqrt(mse))


# In-sample error only: a larger penalty is expected to fit the training data less tightly.
print('Ridge doubled alpha RMSE (train):', rmse_on_train(ridge_double))
print('Lasso doubled alpha RMSE (train):', rmse_on_train(lasso_double))

## 7) Retrain without top-5 Lasso predictors

Drop the top-5 features identified by Lasso and retrain a fallback Lasso model on the reduced dataset.

In [None]:
top5 = [f for f,_ in lasso_top[:5]]
print('Top-5 Lasso features to remove:', top5)

# Map every selected model feature back to the raw column that produced it.
# One-hot names look like '<column>_<category>' and either part may itself contain
# underscores, so splitting on the first '_' is unreliable. Instead, pick the longest
# categorical column name that is a prefix of the feature name.
_cat_source_cols = X.select_dtypes(include=['object']).columns.tolist()
_cols_to_drop = []
for feat in top5:
    if feat in X.columns:
        source = feat
    else:
        matches = [c for c in _cat_source_cols if feat.startswith(c + '_')]
        source = max(matches, key=len) if matches else None
    if source is None:
        # Make an unmapped feature visible instead of silently keeping it in the data.
        print('Warning: could not map feature to a source column:', feat)
    elif source not in _cols_to_drop:
        _cols_to_drop.append(source)
# DataFrame.drop returns a new frame, so X itself is left untouched.
X_reduced = X.drop(columns=_cols_to_drop)

numeric_cols_r = X_reduced.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols_r = X_reduced.select_dtypes(include=['object']).columns.tolist()

# Same preprocessing recipe as the main model, rebuilt for the reduced column set.
numeric_transformer_r = Pipeline([('imputer', SimpleImputer(strategy='median')),('scaler', StandardScaler())])
categorical_transformer_r = Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_r = ColumnTransformer([('num', numeric_transformer_r, numeric_cols_r),('cat', categorical_transformer_r, cat_cols_r)])

from sklearn.linear_model import LassoCV
lasso_r = make_pipeline(preprocessor_r, LassoCV(alphas=alphas, cv=kf, random_state=42, max_iter=10000))
lasso_r.fit(X_reduced, y)

# extract new top features
# Read the names from the preprocessor inside the fitted pipeline; it was fitted by
# lasso_r.fit above, so no separate refit is needed.
_fitted_pre_r = lasso_r.steps[0][1]
if cat_cols_r:
    ohe = _fitted_pre_r.named_transformers_['cat'].named_steps['onehot']
    cat_feats_r = list(ohe.get_feature_names_out(cat_cols_r))
else:
    # No categorical columns left, so there is no fitted encoder to query.
    cat_feats_r = []
feat_names_r = numeric_cols_r + cat_feats_r
coefs_r = lasso_r.named_steps['lassocv'].coef_
# Names and coefficients must line up one-to-one, otherwise the ranking is meaningless.
assert len(feat_names_r) == len(coefs_r), 'feature names and coefficients are misaligned'
idx_r = np.argsort(np.abs(coefs_r))[::-1][:10]
lasso_r_top = [(feat_names_r[i], float(coefs_r[i])) for i in idx_r]
print('Top features after removing top-5 Lasso predictors:')
for f,v in lasso_r_top:
    print(f,v)

In [None]:
# Save summary
# Collect the headline results (selected alphas, CV / hold-out RMSE and feature rankings)
# into one JSON-serialisable mapping; key order is kept as listed.
summary = dict(
    best_alpha_ridge=best_alpha_ridge,
    best_alpha_lasso=float(best_alpha_lasso),
    ridge_rmse_cv=ridge_rmse_cv,
    lasso_rmse_cv=lasso_rmse_cv,
    ridge_top=ridge_top,
    lasso_top=lasso_top,
    top5_lasso_removed=top5,
    lasso_retrained_top=lasso_r_top,
    ridge_test_rmse=ridge_test_rmse,
    lasso_test_rmse=lasso_test_rmse,
)
summary_json = json.dumps(summary, indent=2)
Path('model_summary.json').write_text(summary_json)
print('Saved model_summary.json')