
# 20 Hands-on ML Programs (House-Price Style Demos)
**Focus:** Practical regression workflows like predicting house prices  
**Stack:** Python, scikit-learn, NumPy, pandas, matplotlib (offline-friendly)  
**Last updated:** 2025-09-05 17:42

These 20 small programs showcase end-to-end ML tasks you can adapt for real projects:
- A clean flow in every demo: *inputs → pipeline → model → metrics → sample predictions*
- Concepts: preprocessing, encoding, scaling, model zoo, regularization, CV, grid search, feature selection, robustness, learning curves, persistence.

> **Note:** We use **synthetic but realistic housing-like data** (e.g., area, bedrooms, location score, city, parking) so you can run offline without external downloads.


In [None]:

# Core imports and environment check
# Standard-library modules; every warning is silenced so cell output stays readable.
import sys, warnings, math
warnings.filterwarnings("ignore")

# The whole third-party stack is imported inside one guarded block so that a
# missing package sets SKLEARN_OK = False instead of aborting the notebook.
# Every later cell checks SKLEARN_OK before doing any work.
try:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, learning_curve
    from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, RANSACRegressor
    from sklearn.svm import SVR
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
    from sklearn.impute import SimpleImputer
    from sklearn.feature_selection import SelectKBest, f_regression
    import joblib
    SKLEARN_OK = True
except Exception as e:
    # Any failure above (not only a missing scikit-learn) disables the demos;
    # the caught exception is printed so the user can see which import failed.
    SKLEARN_OK = False
    print("⚠️ scikit-learn or dependencies not available:", e)
    print("Please ensure scikit-learn, numpy, pandas, matplotlib are installed in this environment.")


In [None]:

# Utility functions: synthetic housing-like dataset + evaluation helpers

def generate_housing(n=800, seed=42, with_cats=True, with_noise=True):
    """Create a synthetic, housing-like regression dataset.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    seed : int
        Seed for ``np.random.default_rng``; the same seed yields the same frame.
    with_cats : bool
        When True, ``city`` and ``parking`` are sampled at random; otherwise
        they are constant ('Bengaluru' / 'street').
    with_noise : bool
        When True, Gaussian noise (standard deviation 10) is added to the price.

    Returns
    -------
    pandas.DataFrame
        Columns: area, bedrooms, age, floor, loc_score, distance_center,
        balcony, city, parking, price (price in lakhs).
    """
    rng = np.random.default_rng(seed)

    # NOTE: the order of the random draws below fixes the generated values for
    # a given seed, so it must not be rearranged.
    area = np.clip(rng.normal(1000, 300, n), 350, 3000)            # sqft
    bedrooms = rng.integers(1, 5, n)                               # 1..4
    age = rng.integers(0, 35, n)                                   # years
    floor = rng.integers(0, 15, n)                                 # floor number
    loc_score = rng.integers(1, 6, n)                              # 1..5
    distance_center = np.clip(rng.normal(8, 5, n), 0.5, 25)        # km

    # Categorical columns: sampled, or held constant when with_cats is False
    if with_cats:
        city = rng.choice(['Bengaluru','Mumbai','Delhi','Chennai'], n, p=[0.4,0.25,0.2,0.15])
        parking = rng.choice(['none','street','garage'], n, p=[0.2,0.5,0.3])
    else:
        city = np.array(['Bengaluru']*n)
        parking = np.array(['street']*n)
    # Balcony count 0..2 is drawn in both modes, after the categorical draws
    balcony = rng.integers(0, 3, n)

    # Additive premiums (lakhs) looked up per row
    city_premium = np.vectorize({'Bengaluru': 15, 'Mumbai': 35, 'Delhi': 25, 'Chennai': 18}.get)(city)
    parking_premium = np.vectorize({'none': 0, 'street': 6, 'garage': 12}.get)(parking)

    # Linear part of the price, in lakhs
    linear_part = (
        0.07*area
        + 8.0*bedrooms
        - 0.9*age
        + 2.0*loc_score
        - 1.5*distance_center
        + 1.0*floor
        + 3.0*balcony
    )
    price = linear_part + city_premium + parking_premium

    # Diminishing returns: quadratic penalty on area beyond 1500 sqft
    oversize = np.clip(area - 1500, 0, None)
    price = price - 0.000006*oversize**2

    # Observation noise
    if with_noise:
        price = price + rng.normal(0, 10, n)

    columns = {
        "area": np.round(area, 0),
        "bedrooms": bedrooms,
        "age": age,
        "floor": floor,
        "loc_score": loc_score,
        "distance_center": np.round(distance_center, 2),
        "balcony": balcony,
        "city": city,
        "parking": parking,
        "price": np.round(price, 2),
    }
    return pd.DataFrame(columns)

def eval_regression(y_true, y_pred, label="Model"):
    """Print and return MAE, RMSE and R² for a set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        label: Name printed in front of the metrics line.

    Returns:
        Tuple ``(mae, rmse, r2)``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`, because
    # that keyword is deprecated and no longer accepted by recent scikit-learn
    # releases. For the single-target data used here the value is the same.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{label} -> MAE: {mae:.2f} | RMSE: {rmse:.2f} | R²: {r2:.3f}")
    return mae, rmse, r2


## Dataset Preview

In [None]:

if not SKLEARN_OK:
    print("Install scikit-learn, numpy, pandas to run the demos.")
else:
    # Small 12-row sample, just to show the columns; the first 10 rows are displayed
    df = generate_housing(n=12, seed=7)
    display(df.head(10))


In [None]:

# Shared setup used by the demos below: dataset, train/test split, preprocessors
if SKLEARN_OK:
    # Column groups
    num_features = ["area","bedrooms","age","floor","loc_score","distance_center","balcony"]
    cat_features = ["city","parking"]

    # One 1000-row dataset, split 80/20 with a fixed seed
    data = generate_housing(n=1000, seed=10)
    X = data[num_features + cat_features]
    y = data["price"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=123
    )

    # Numeric branch: median imputation only (scaling is added per model where needed)
    numeric_pipe = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
    ])
    # Categorical branch: fill with the most frequent value, then one-hot encode;
    # categories unseen during fit are ignored at transform time
    categorical_pipe = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])
    # Combined preprocessor: numeric columns first, then the encoded categoricals
    preproc = ColumnTransformer(transformers=[
        ("num", numeric_pipe, num_features),
        ("cat", categorical_pipe, cat_features),
    ])


### 1) Baseline Linear Regression (numeric-only features)

In [None]:

if SKLEARN_OK:
    # Baseline: only the numeric columns, no categorical information
    Xn = X_train[num_features]
    Xt = X_test[num_features]
    pipe = Pipeline(steps=[
        ("impute", SimpleImputer(strategy="median")),
        ("lin", LinearRegression()),
    ])
    # fit() returns the fitted pipeline, so predict() can be chained
    preds = pipe.fit(Xn, y_train).predict(Xt)
    eval_regression(y_test, preds, "LinearRegression (numeric-only)")
    print("Sample preds:", preds[:5].round(2))


### 2) Linear Regression with One-Hot Encoding (full features)

In [None]:

if SKLEARN_OK:
    # Same linear model, now fed all features through the shared preprocessor
    pipe = Pipeline(steps=[
        ("pre", preproc),
        ("lin", LinearRegression()),
    ])
    preds = pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "LinearRegression + OneHot")
    print("Sample preds:", preds[:5].round(2))


### 3) Polynomial Features + Linear Regression (captures simple non-linearities)

In [None]:

if SKLEARN_OK:
    # Columns expanded with degree-2 polynomial and interaction terms
    poly_cols = ["area","distance_center","loc_score","age"]
    poly_prep = ColumnTransformer(transformers=[
        (
            "poly",
            Pipeline(steps=[
                ("impute", SimpleImputer(strategy="median")),
                ("poly", PolynomialFeatures(degree=2, include_bias=False)),
            ]),
            poly_cols,
        ),
        # Remaining numeric columns go through the plain numeric branch
        ("rest_num", numeric_pipe, [col for col in num_features if col not in poly_cols]),
        ("cat", categorical_pipe, cat_features),
    ])
    pipe = Pipeline(steps=[
        ("pre", poly_prep),
        ("lin", LinearRegression()),
    ])
    preds = pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "Poly(2)+LinearRegression")


### 4) Ridge Regression (with StandardScaler)

In [None]:

if SKLEARN_OK:
    # Numeric columns are imputed and standardized before the L2-penalized fit
    ridge_pipe = Pipeline(steps=[
        (
            "pre",
            ColumnTransformer(transformers=[
                (
                    "num",
                    Pipeline(steps=[
                        ("imp", SimpleImputer(strategy="median")),
                        ("sc", StandardScaler()),
                    ]),
                    num_features,
                ),
                ("cat", categorical_pipe, cat_features),
            ]),
        ),
        ("ridge", Ridge(alpha=10.0, random_state=0)),
    ])
    preds = ridge_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "Ridge(alpha=10)")


### 5) Lasso Regression (sparse weights)

In [None]:

if SKLEARN_OK:
    # Scaled numerics + one-hot categoricals feeding an L1-penalized linear model
    lasso_pipe = Pipeline(steps=[
        (
            "pre",
            ColumnTransformer(transformers=[
                (
                    "num",
                    Pipeline(steps=[
                        ("imp", SimpleImputer(strategy="median")),
                        ("sc", StandardScaler()),
                    ]),
                    num_features,
                ),
                ("cat", categorical_pipe, cat_features),
            ]),
        ),
        ("lasso", Lasso(alpha=0.01, random_state=0, max_iter=10000)),
    ])
    preds = lasso_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "Lasso(alpha=0.01)")


### 6) ElasticNet (L1+L2 blend)

In [None]:

if SKLEARN_OK:
    # ElasticNet with an even L1/L2 mix (l1_ratio=0.5) on scaled features
    enet_pipe = Pipeline(steps=[
        (
            "pre",
            ColumnTransformer(transformers=[
                (
                    "num",
                    Pipeline(steps=[
                        ("imp", SimpleImputer(strategy="median")),
                        ("sc", StandardScaler()),
                    ]),
                    num_features,
                ),
                ("cat", categorical_pipe, cat_features),
            ]),
        ),
        ("enet", ElasticNet(alpha=0.02, l1_ratio=0.5, random_state=0, max_iter=10000)),
    ])
    preds = enet_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "ElasticNet(alpha=0.02, l1_ratio=0.5)")


### 7) Decision Tree Regressor

In [None]:

if SKLEARN_OK:
    # Depth-limited tree on the shared (unscaled) preprocessing
    tree_pipe = Pipeline(steps=[
        ("pre", preproc),
        ("tree", DecisionTreeRegressor(max_depth=6, random_state=0)),
    ])
    preds = tree_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "DecisionTree(max_depth=6)")


### 8) Random Forest Regressor (feature importances)

In [None]:

if SKLEARN_OK:
    rf = Pipeline(steps=[
        ("pre", preproc),
        ("rf", RandomForestRegressor(n_estimators=200, random_state=0)),
    ])
    preds = rf.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "RandomForest(200)")

    # Rebuild the output column names: the preprocessor emits the numeric
    # columns first, followed by the one-hot encoded categorical columns.
    fitted_pre = rf.named_steps["pre"]
    ohe = fitted_pre.named_transformers_["cat"].named_steps["onehot"]
    cat_names = list(ohe.get_feature_names_out(cat_features))
    final_feat_names = num_features + cat_names
    importances = rf.named_steps["rf"].feature_importances_

    # Pair names with importances and keep the ten largest
    ranked = sorted(
        zip(final_feat_names, importances),
        key=lambda pair: pair[1],
        reverse=True,
    )
    imp = ranked[:10]
    print("Top 10 feature importances:", imp)


### 9) Gradient Boosting Regressor

In [None]:

if SKLEARN_OK:
    # Gradient boosting with library defaults, seeded for reproducibility
    gbr = Pipeline(steps=[
        ("pre", preproc),
        ("gbr", GradientBoostingRegressor(random_state=0)),
    ])
    preds = gbr.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "GradientBoosting")


### 10) HistGradientBoostingRegressor (fast boosting)

In [None]:

if SKLEARN_OK:
    # Histogram-based boosting on the shared preprocessing
    hgb = Pipeline(steps=[
        ("pre", preproc),
        ("hgb", HistGradientBoostingRegressor(random_state=0)),
    ])
    preds = hgb.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "HistGradientBoosting")


### 11) KNN Regressor (with scaling)

In [None]:

if SKLEARN_OK:
    # KNN is distance-based, so the numeric columns are standardized first
    knn_pipe = Pipeline(steps=[
        (
            "pre",
            ColumnTransformer(transformers=[
                (
                    "num",
                    Pipeline(steps=[
                        ("imp", SimpleImputer(strategy="median")),
                        ("sc", StandardScaler()),
                    ]),
                    num_features,
                ),
                ("cat", categorical_pipe, cat_features),
            ]),
        ),
        ("knn", KNeighborsRegressor(n_neighbors=7)),
    ])
    preds = knn_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "KNN(k=7)")


### 12) Support Vector Regressor (RBF kernel)

In [None]:

if SKLEARN_OK:
    # RBF-kernel SVR on standardized numerics plus one-hot categoricals
    svr_pipe = Pipeline(steps=[
        (
            "pre",
            ColumnTransformer(transformers=[
                (
                    "num",
                    Pipeline(steps=[
                        ("imp", SimpleImputer(strategy="median")),
                        ("sc", StandardScaler()),
                    ]),
                    num_features,
                ),
                ("cat", categorical_pipe, cat_features),
            ]),
        ),
        ("svr", SVR(C=5, epsilon=0.2, kernel="rbf")),
    ])
    preds = svr_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "SVR(RBF)")


### 13) Huber Regressor (robust linear model)

In [None]:

if SKLEARN_OK:
    # Huber-loss linear model on scaled features
    huber_pipe = Pipeline(steps=[
        (
            "pre",
            ColumnTransformer(transformers=[
                (
                    "num",
                    Pipeline(steps=[
                        ("imp", SimpleImputer(strategy="median")),
                        ("sc", StandardScaler()),
                    ]),
                    num_features,
                ),
                ("cat", categorical_pipe, cat_features),
            ]),
        ),
        ("huber", HuberRegressor(epsilon=1.35, max_iter=2000)),
    ])
    preds = huber_pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "HuberRegressor")


### 14) RANSAC Regressor (outlier-resistant)

In [None]:

if SKLEARN_OK:
    # Preprocess ALL features up front (scaled numerics + one-hot categoricals)
    # and run RANSAC on the resulting matrix, to keep the demo simple.
    pre_only = ColumnTransformer([
        ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler())]), num_features),
        ("cat", categorical_pipe, cat_features),
    ])
    Xtr = pre_only.fit_transform(X_train)
    Xte = pre_only.transform(X_test)

    # The wrapped estimator is passed positionally on purpose: its keyword was
    # renamed from `base_estimator` to `estimator` in newer scikit-learn and
    # the old spelling was later removed, so the positional form works on both.
    ransac = RANSACRegressor(LinearRegression(), random_state=0)
    ransac.fit(Xtr, y_train)
    preds = ransac.predict(Xte)
    eval_regression(y_test, preds, "RANSAC(Linear)")


### 15) Handling Missing Values (SimpleImputer)

In [None]:

if SKLEARN_OK:
    # Inject missing values artificially into a copy of the feature frame
    X_miss = X.copy()
    # "age" is generated as an integer column. Cast it to float before writing
    # NaN: assigning NaN into an integer column relies on an implicit dtype
    # upcast, which newer pandas versions deprecate and then reject.
    X_miss["age"] = X_miss["age"].astype("float64")
    missing_idx = X_miss.sample(frac=0.1, random_state=1).index   # 10% of rows
    X_miss.loc[missing_idx, "age"] = np.nan

    Xm_train, Xm_test, ym_train, ym_test = train_test_split(X_miss, y, test_size=0.2, random_state=42)
    pipe = Pipeline([
        ("pre", ColumnTransformer([
            # Median imputation fills the injected gaps in "age"
            ("num", Pipeline([("imp", SimpleImputer(strategy="median"))]), num_features),
            ("cat", categorical_pipe, cat_features),
        ])),
        ("rf", RandomForestRegressor(n_estimators=150, random_state=0))
    ])
    pipe.fit(Xm_train, ym_train)
    preds = pipe.predict(Xm_test)
    eval_regression(ym_test, preds, "RF with Imputer (10% missing age)")


### 16) Feature Selection (SelectKBest + Linear Regression)

In [None]:

if SKLEARN_OK:
    # Impute and encode, keep the 10 columns ranked highest by f_regression,
    # then fit a plain linear model on that subset
    prep = ColumnTransformer(transformers=[
        ("num", Pipeline(steps=[("imp", SimpleImputer(strategy="median"))]), num_features),
        ("cat", categorical_pipe, cat_features),
    ])
    pipe = Pipeline(steps=[
        ("pre", prep),
        ("select", SelectKBest(score_func=f_regression, k=10)),
        ("lin", LinearRegression()),
    ])
    preds = pipe.fit(X_train, y_train).predict(X_test)
    eval_regression(y_test, preds, "SelectKBest(k=10)+Linear")


### 17) Cross-Validation (KFold + RandomForest)

In [None]:

if SKLEARN_OK:
    rf = Pipeline(steps=[
        ("pre", preproc),
        ("rf", RandomForestRegressor(n_estimators=200, random_state=0)),
    ])
    # Shuffled 5-fold split over the full dataset
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    # The scorer returns negated MAE, so the sign is flipped when printing
    scores = cross_val_score(
        rf, X, y,
        cv=kf,
        scoring="neg_mean_absolute_error",
    )
    print("CV MAE (5-fold):", np.round(-scores, 2), " | Mean:", np.round(-scores.mean(), 2))


### 18) Hyperparameter Tuning (GridSearchCV on RandomForest)

In [None]:

if SKLEARN_OK:
    rf = Pipeline(steps=[
        ("pre", preproc),
        ("rf", RandomForestRegressor(random_state=0)),
    ])
    # Keys use the "<step>__<param>" form to reach into the pipeline's "rf" step
    grid = dict(
        rf__n_estimators=[100, 200],
        rf__max_depth=[None, 10, 15],
        rf__min_samples_split=[2, 5],
    )
    gs = GridSearchCV(
        estimator=rf,
        param_grid=grid,
        cv=3,
        scoring="neg_mean_absolute_error",
        n_jobs=None,
    )
    gs.fit(X_train, y_train)
    print("Best params:", gs.best_params_)
    # predict() on the search object uses the refitted best estimator
    preds = gs.predict(X_test)
    eval_regression(y_test, preds, "GridSearchCV-RF (best)")


### 19) Learning Curve (Linear Regression)

In [None]:

if SKLEARN_OK:
    lin = Pipeline(steps=[("pre", preproc), ("lin", LinearRegression())])
    # R² on train and validation folds at 5 training-set sizes (10%..100%)
    train_sizes, train_scores, val_scores = learning_curve(
        lin,
        X,
        y,
        cv=5,
        train_sizes=np.linspace(0.1, 1.0, 5),
        scoring="r2",
        random_state=None,
    )

    # One curve per score set, averaged across the CV folds
    plt.figure()
    plt.plot(train_sizes, np.mean(train_scores, axis=1), marker="o", label="Train R²")
    plt.plot(train_sizes, np.mean(val_scores, axis=1), marker="s", label="CV R²")
    plt.title("Learning Curve: Linear Regression")
    plt.xlabel("Training examples")
    plt.ylabel("R² score")
    plt.legend()
    plt.show()


### 20) Save & Load a Trained Model (joblib)

In [None]:

if SKLEARN_OK:
    from pathlib import Path

    final_model = Pipeline([("pre", preproc), ("gbr", GradientBoostingRegressor(random_state=0))])
    final_model.fit(X_train, y_train)

    # Keep the original /mnt/data location when it exists; otherwise fall back
    # to the current working directory so the demo also runs on machines
    # without that directory.
    out_dir = Path("/mnt/data")
    if not out_dir.is_dir():
        out_dir = Path.cwd()
    model_path = str(out_dir / "house_price_model.joblib")

    joblib.dump(final_model, model_path)
    print(f"Saved model to {model_path}")

    # Load and predict.
    # NOTE: joblib files are pickle-based; only load files you created or trust.
    loaded = joblib.load(model_path)
    sample = X_test.iloc[:3]
    print("Loaded model predictions:", np.round(loaded.predict(sample), 2))



---

## How to Use This Notebook
- Run cells top-to-bottom.
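- Each program is self-contained apart from the shared setup cells (imports, helper functions, and the common train/test split), which must be run first. Each one prints metrics (MAE, RMSE, R²) and sometimes extra info (feature importances, plots, CV scores).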
- Swap `seed` or tweak feature weights inside `generate_housing()` to simulate other cities/markets.

**Next ideas:** try stacking/ensembling, monotonic constraints (for boosting), partial dependence plots, or exporting to ONNX for deployment.
