
# 20 Hands‑On Mini‑Programs — Core Machine Learning Concepts

This notebook gives you **20 tiny, runnable programs** that explain the foundations of ML using **friendly analogies** and **real‑world mini examples**.  
We keep it dependency‑light (pure Python), so you can run everything anywhere.

**What you'll practice:** data → features → labels → models → evaluation → overfitting → good habits.


## Shared Utilities (used by multiple programs)

In [None]:

import random, math, statistics
random.seed(42)

# --- Metrics ---
def accuracy(y_true, y_pred):
    """Return the share of positions where the prediction equals the truth.

    Pairs are formed with ``zip``, so surplus items in the longer sequence
    are ignored. The denominator is ``len(y_true)``, floored at 1 so that
    empty input yields 0.0 instead of dividing by zero.
    """
    hits = 0
    for truth, guess in zip(y_true, y_pred):
        if truth == guess:
            hits += 1
    return hits / max(1, len(y_true))

def mae(y_true, y_pred):
    """Return the mean absolute error between targets and predictions.

    The sum runs over ``zip(y_true, y_pred)`` and is divided by
    ``len(y_true)`` (floored at 1, so empty input gives 0.0).
    """
    total = 0
    for truth, guess in zip(y_true, y_pred):
        total += abs(truth - guess)
    return total / max(1, len(y_true))

def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    The sum runs over ``zip(y_true, y_pred)`` and is divided by
    ``len(y_true)`` (floored at 1, so empty input gives 0.0).
    """
    total = 0
    for truth, guess in zip(y_true, y_pred):
        total += (truth - guess) ** 2
    return total / max(1, len(y_true))

def precision_recall(y_true, y_pred, positive=1):
    """Return ``(precision, recall)`` for the class ``positive``.

    Precision is TP / (TP + FP) and recall is TP / (TP + FN). Either value
    is reported as 0.0 when its denominator is zero (no predicted positives
    or no actual positives, respectively).
    """
    tp = fp = fn = 0
    for truth, guess in zip(y_true, y_pred):
        truth_is_pos = truth == positive
        guess_is_pos = guess == positive
        if truth_is_pos and guess_is_pos:
            tp += 1
        elif guess_is_pos:
            fp += 1
        elif truth_is_pos:
            fn += 1

    predicted_pos = tp + fp
    actual_pos = tp + fn
    precision = tp / predicted_pos if predicted_pos else 0.0
    recall = tp / actual_pos if actual_pos else 0.0
    return precision, recall

# --- Splits ---
def train_test_split(data, test_ratio=0.3, seed=42):
    """Shuffle a copy of ``data`` and cut it into ``(train, test)``.

    A private ``random.Random(seed)`` drives the shuffle, so the split is
    reproducible and the global random state is left alone. The train part
    holds ``int(len(data) * (1 - test_ratio))`` items; the rest is test.
    The input sequence itself is not modified.
    """
    shuffled = data[:]
    random.Random(seed).shuffle(shuffled)
    n_train = int(len(shuffled) * (1 - test_ratio))
    train_part = shuffled[:n_train]
    test_part = shuffled[n_train:]
    return train_part, test_part

# --- Feature scaling (min-max) ---
def fit_minmax(train, feature_names):
    """Learn per-feature ``(min, max)`` pairs for min-max scaling.

    Args:
        train: list of dict rows; ``None`` values are ignored.
        feature_names: keys of the numeric features to scan.

    Returns:
        dict mapping each feature name to ``(mn, mx)`` with ``mx > mn``
        guaranteed, so ``(x - mn) / (mx - mn)`` never divides by zero.

    Raises:
        ValueError: if a feature has no non-``None`` value in ``train``.
    """
    params = {}
    for f in feature_names:
        vals = [row[f] for row in train if row[f] is not None]
        if not vals:
            raise ValueError(
                f"cannot fit min-max for feature {f!r}: no non-missing values"
            )
        mn, mx = min(vals), max(vals)
        if mx <= mn:
            # Constant feature: widen the range by a tiny amount so scaling
            # stays defined.
            mx = mn + 1e-9
            if mx == mn:
                # For large |mn| the absolute epsilon is absorbed by float
                # rounding; fall back to a relative one.
                mx = mn + abs(mn) * 1e-9
        params[f] = (mn, mx)
    return params

def apply_minmax(rows, feature_names, params):
    """Return scaled copies of ``rows`` using fitted min-max parameters.

    Args:
        rows: list of dict rows; they are copied, never modified in place.
        feature_names: keys to rescale; other keys are carried over as-is.
        params: mapping ``feature -> (mn, mx)`` as produced by ``fit_minmax``.

    Returns:
        New list of dicts where each listed feature is ``(x - mn) / (mx - mn)``.
        Missing values (``None``) are passed through unchanged, mirroring
        ``fit_minmax``, which skips them when fitting.
    """
    out = []
    for r in rows:
        nr = r.copy()
        for f in feature_names:
            if r[f] is None:
                # Nothing to scale; leave the gap for a later imputation step.
                continue
            mn, mx = params[f]
            nr[f] = (r[f] - mn) / (mx - mn)
        out.append(nr)
    return out

# --- Simple k-NN (from scratch) ---
def euclidean(a, b, feature_names):
    """Return the Euclidean distance between dict rows ``a`` and ``b``.

    Only the keys listed in ``feature_names`` take part in the distance.
    """
    squared_sum = 0
    for name in feature_names:
        diff = a[name] - b[name]
        squared_sum += diff ** 2
    return math.sqrt(squared_sum)

def knn_classify(train, x, k, feature_names, label_name):
    """Classify ``x`` by majority vote among its ``k`` nearest training rows.

    Args:
        train: list of dict rows containing the features and ``label_name``.
        x: dict row to classify (needs the keys in ``feature_names``).
        k: number of neighbours that vote.
        feature_names: keys used for the Euclidean distance.
        label_name: key holding the class label in ``train`` rows.

    Returns:
        The most frequent label among the ``k`` closest rows. When several
        labels tie, the label of the closest neighbour among them wins, so
        the result does not depend on set/hash ordering.

    Raises:
        ValueError: if no neighbour is available (empty ``train`` or ``k == 0``).
    """
    dists = sorted(
        ((euclidean(row, x, feature_names), row[label_name]) for row in train),
        key=lambda pair: pair[0],
    )
    top = [lab for _, lab in dists[:k]]
    if not top:
        raise ValueError("knn_classify needs at least one neighbour to vote")

    votes = {}
    for lab in top:
        votes[lab] = votes.get(lab, 0) + 1
    best_count = max(votes.values())
    # `top` is ordered nearest-first, so the first label reaching the best
    # count is the deterministic tie-break winner.
    for lab in top:
        if votes[lab] == best_count:
            return lab

def knn_regress(train, x, k, feature_names, target_name):
    """Predict a number for ``x`` as the mean target of its ``k`` nearest rows.

    Distances are Euclidean over ``feature_names``; the targets are read from
    ``target_name`` in the training rows.
    """
    ranked = sorted(
        ((euclidean(row, x, feature_names), row[target_name]) for row in train),
        key=lambda pair: pair[0],
    )
    nearest_targets = [value for _, value in ranked[:k]]
    return sum(nearest_targets) / len(nearest_targets)

# --- Gradient Descent for Univariate Linear Regression with L2 ---
def linreg_univariate_gd(X, y, lr=0.01, epochs=1000, l2=0.0):
    """Fit ``y_hat = w*x + b`` by batch gradient descent.

    Each epoch takes one step along the gradient of the mean squared error;
    the slope gradient additionally carries the ``2*l2*w`` penalty term (the
    intercept is not penalised). Both parameters start at 0.0.

    Args:
        X: sequence of input values.
        y: sequence of targets, paired with ``X``.
        lr: step size applied to both gradients.
        epochs: number of full passes over the data.
        l2: L2 penalty strength on ``w``.

    Returns:
        Tuple ``(w, b)`` after ``epochs`` updates.
    """
    w, b = 0.0, 0.0
    n = len(X)
    for _ in range(epochs):
        # Residuals of the current line, one per (x, y) pair.
        residuals = [(w * x_i + b) - y_i for x_i, y_i in zip(X, y)]
        grad_w = (2 / n) * sum(res * x_i for res, x_i in zip(residuals, X)) + 2 * l2 * w
        grad_b = (2 / n) * sum(residuals)
        w -= lr * grad_w
        b -= lr * grad_b
    return w, b

# --- One-hot encoding for a single categorical feature ---
def fit_one_hot(values):
    """Map each distinct value to a column position.

    Positions follow the sorted order of the distinct values, so the mapping
    is the same regardless of the order in which values were seen.
    """
    categories = sorted(set(values))
    return dict(zip(categories, range(len(categories))))

def transform_one_hot(value, index):
    """Return a 0/1 list with a single 1 at ``index[value]``.

    The list has one slot per entry in ``index``. A ``value`` that is not a
    key of ``index`` raises ``KeyError``.
    """
    encoded = [0 for _ in range(len(index))]
    hot_position = index[value]
    encoded[hot_position] = 1
    return encoded

# --- Simple decision stump (one feature, best threshold by error) ---
def decision_stump_fit(rows, feature, label):
    """Fit a one-feature threshold classifier that minimises training errors.

    Candidate thresholds are the midpoints between consecutive distinct
    feature values. For each candidate both polarities are tried:
    polarity 1 predicts 1 when ``x >= threshold`` (else 0); polarity -1
    predicts the opposite. The first candidate with the fewest errors wins.

    Args:
        rows: list of dict rows.
        feature: key of the numeric feature to split on.
        label: key of the 0/1 label.

    Returns:
        ``{"threshold": ..., "polarity": ...}``. When no split is possible
        (fewer than two distinct feature values) the stump degenerates to a
        constant predictor: the threshold is ``-inf`` and the polarity is
        chosen so the more frequent label is predicted (polarity 1 on ties
        or empty input), which keeps the model usable for prediction.
    """
    arr = sorted((r[feature], r[label]) for r in rows)
    candidates = [
        (a + b) / 2
        for (a, _), (b, _) in zip(arr, arr[1:])
        if a != b
    ]

    if not candidates:
        # Every x satisfies x >= -inf, so polarity alone decides the output.
        errs_always_one = sum(1 for _, y in arr if y != 1)
        errs_always_zero = sum(1 for _, y in arr if y != 0)
        polarity = 1 if errs_always_one <= errs_always_zero else -1
        return {"threshold": float("-inf"), "polarity": polarity}

    best = (float("inf"), None, None)  # error, threshold, polarity
    for thr in candidates:
        for polarity in (1, -1):
            errs = 0
            for x, y in arr:
                pred = 1 if x >= thr else 0
                if polarity == -1:
                    pred = 1 - pred
                errs += pred != y
            # Strict comparison keeps the earliest best candidate.
            if errs < best[0]:
                best = (errs, thr, polarity)
    return {"threshold": best[1], "polarity": best[2]}

def decision_stump_predict(model, x):
    """Apply a fitted stump to a single feature value.

    With polarity 1 the result is 1 when ``x >= threshold`` and 0 otherwise;
    polarity -1 swaps the two outputs.
    """
    at_or_above = x >= model["threshold"]
    if model["polarity"] == -1:
        return 0 if at_or_above else 1
    return 1 if at_or_above else 0

# --- Imputation ---
def mean_impute(rows, feature):
    """Fill missing (``None``) values of ``feature`` with the observed mean.

    The rows are modified in place and the same list is returned for
    convenience.

    Args:
        rows: list of dict rows.
        feature: key whose ``None`` entries should be filled.

    Returns:
        ``rows`` itself. If there is no observed value to average (empty
        input or every entry missing) the rows are returned unchanged
        instead of failing with a division by zero.
    """
    vals = [r[feature] for r in rows if r[feature] is not None]
    if not vals:
        # No information to impute from; leave any gaps as they are.
        return rows
    m = sum(vals) / len(vals)
    for r in rows:
        if r[feature] is None:
            r[feature] = m
    return rows


### Program 1 — Data, Features, Labels (Fruit Ripeness Analogy)

In [None]:

# Each fruit is one dict. Features: weight (grams) and color (intensity, 0-1).
# Label: ripe (1) or not ripe (0).
data = [
    {"weight": 120, "color": 0.2, "ripe": 0},
    {"weight": 150, "color": 0.6, "ripe": 1},
    {"weight": 135, "color": 0.55, "ripe": 1},
    {"weight": 90, "color": 0.1, "ripe": 0},
]
features = ["weight", "color"]

# The label column, pulled out in row order.
labels = []
for fruit in data:
    labels.append(fruit["ripe"])

first_fruit = data[0]
print("Features names:", features)
print(
    "First row features:",
    {name: first_fruit[name] for name in features},
    "Label:",
    first_fruit["ripe"],
)


### Program 2 — Turning Raw Text into Features (Support Ticket Urgency)

In [None]:

tickets = [
    {"msg": "Server down ASAP!!!", "urgent": 1},
    {"msg": "Minor UI bug on profile page", "urgent": 0},
    {"msg": "Payment failing for multiple users!", "urgent": 1},
    {"msg": "Typo in footer", "urgent": 0},
]
urgent_words = {"asap", "urgent", "failing", "down", "error"}


def text_features(s):
    """Turn a raw message into three numeric features.

    ``len`` is the character count, ``bangs`` the number of ``!`` characters,
    and ``urgent_count`` the number of whitespace-separated words that, once
    lowercased and stripped of surrounding ``!``, ``.`` and ``,``, appear in
    ``urgent_words``.
    """
    cleaned = [token.strip("!.,") for token in s.lower().split()]
    urgent_hits = 0
    for token in cleaned:
        if token in urgent_words:
            urgent_hits += 1
    return {
        "len": len(s),
        "bangs": s.count("!"),
        "urgent_count": urgent_hits,
    }


# Pair each ticket's engineered features with its label.
feats = []
for ticket in tickets:
    feats.append({"features": text_features(ticket["msg"]), "label": ticket["urgent"]})
for entry in feats:
    print(entry)


### Program 3 — Classification vs Regression (Spam vs Delivery Time)

In [None]:

# Classification: the label is a category (spam or not).
emails = [
    {"words": 120, "links": 5, "spam": 1},
    {"words": 30, "links": 0, "spam": 0},
]
# Regression: the target is a number (minutes).
deliveries = [
    {"distance_km": 2.0, "time_min": 9.0},
    {"distance_km": 6.0, "time_min": 22.0},
]

spam_labels = [email["spam"] for email in emails]
delivery_times = [trip["time_min"] for trip in deliveries]
print("Classification label (spam):", spam_labels)
print("Regression target (delivery time):", delivery_times)


### Program 4 — Train/Test Split + Baseline (Majority Class)

In [None]:

# Label is 1 only for multiples of 3, so zeros outnumber ones.
dataset = [{"x": i, "label": int(i % 3 == 0)} for i in range(30)]
train, test = train_test_split(dataset, test_ratio=0.3, seed=7)

# Baseline: always predict whichever class holds the majority in train.
positives_in_train = sum(r["label"] for r in train)
majority = int(positives_in_train > len(train) / 2)

y_true = [r["label"] for r in test]
y_pred = [majority for _ in test]
print("Baseline majority prediction:", majority)
print("Test accuracy:", round(accuracy(y_true, y_pred), 3))


### Program 5 — k-NN Classifier (Weather → Play Outside?)

In [None]:

weather = [
    {"temp": 28, "humidity": 60, "play": 1},
    {"temp": 35, "humidity": 80, "play": 0},
    {"temp": 22, "humidity": 50, "play": 1},
    {"temp": 31, "humidity": 65, "play": 0},
    {"temp": 25, "humidity": 55, "play": 1},
]
features = ["temp", "humidity"]
train, test = train_test_split(weather, test_ratio=0.4, seed=1)
k = 3

# Classify every held-out day by a vote among its k nearest training days.
preds = []
for day in test:
    preds.append(knn_classify(train, day, k, features, "play"))

actual = [day["play"] for day in test]
print("True:", actual)
print("Pred:", preds, "Accuracy:", round(accuracy(actual, preds), 3))


### Program 6 — k-NN Regressor (House Price by Size)

In [None]:

# Synthetic prices: 50 + 0.3 * size plus uniform noise in [-5, 5].
houses = []
for s in range(600, 1001, 100):
    noise = random.uniform(-5, 5)
    houses.append({"size": s, "price": 50 + 0.3*s + noise})

train, test = train_test_split(houses, test_ratio=0.33, seed=2)
features = ["size"]
target = "price"
k = 3

y_true = [r[target] for r in test]
y_pred = []
for r in test:
    y_pred.append(knn_regress(train, r, k, features, target))

print("MAE:", round(mae(y_true, y_pred), 2))
for r, p in zip(test, y_pred):
    print(f"size={r['size']}, true={round(r[target],1)}, pred={round(p,1)}")


### Program 7 — Linear Regression via Gradient Descent (Ice‑cream Sales vs Temperature)

In [None]:

# Synthetic relation: sales = 10*temp + 5 + uniform noise in [-10, 10]
temps = list(range(18, 31, 2))
sales = []
for t in temps:
    sales.append(10*t + 5 + random.uniform(-10, 10))

# Fit a line by gradient descent, then score it on the same points.
w, b = linreg_univariate_gd(temps, sales, lr=0.001, epochs=5000)
preds = [w*t + b for t in temps]
print("Learned: sales ≈", round(w, 2), "* temp +", round(b, 2))
print("MAE:", round(mae(sales, preds), 2))


### Program 8 — Feature Scaling Matters (k‑NN Distance)

In [None]:

people = [
    {"age": 20, "income": 20000, "buy": 0},
    {"age": 21, "income": 21000, "buy": 0},
    {"age": 45, "income": 120000, "buy": 1},
    {"age": 46, "income": 125000, "buy": 1},
]
features = ["age", "income"]
train = people[:3]
test = people[3:]

# 1-NN on raw values: income dwarfs age in the distance.
pred_raw = [knn_classify(train, row, 1, features, "buy") for row in test]

# 1-NN after min-max scaling; the scaling parameters come from train only.
params = fit_minmax(train, features)
train_scaled, test_scaled = (
    apply_minmax(part, features, params) for part in (train, test)
)
pred_scaled = [knn_classify(train_scaled, row, 1, features, "buy") for row in test_scaled]

print("Prediction w/o scaling:", pred_raw)
print("Prediction with scaling:", pred_scaled, "(often better because distances are balanced)")


### Program 9 — Overfitting: k=1 vs k=7 (Noisy Labels)

In [None]:

# Build noisy 1D classification: label = 1 if x > 0.5, then flip each label
# with probability 0.15. Labels are derived in a single pass; no throwaway
# random labels are drawn first.
data = []
for _ in range(80):
    x_val = random.random()
    label = 1 if x_val > 0.5 else 0
    if random.random() < 0.15:
        label = 1 - label  # label noise
    data.append({"x": x_val, "y": label})

train, test = train_test_split(data, test_ratio=0.3, seed=3)
feat = ["x"]
y_true = [r["y"] for r in test]
pred_k1 = [knn_classify(train, row, 1, feat, "y") for row in test]  # high variance
pred_k7 = [knn_classify(train, row, 7, feat, "y") for row in test]  # smoother
print("Acc k=1:", round(accuracy(y_true, pred_k1),3), "| Acc k=7:", round(accuracy(y_true, pred_k7),3))


### Program 10 — Cross‑Validation (Pick a Good k for k‑NN)

In [None]:

# Two well-separated clusters: class 0 lives in [0, 0.3]^2, class 1 in [0.7, 1.0]^2.
data = []
for _ in range(20):
    data.extend([
        {"x": random.uniform(0, 0.3), "y": random.uniform(0, 0.3), "c": 0},
        {"x": random.uniform(0.7, 1.0), "y": random.uniform(0.7, 1.0), "c": 1},
    ])

def kfold_indices(n, k=5, seed=123):
    """Split the indices ``0..n-1`` into ``k`` shuffled, near-equal folds.

    Args:
        n: number of samples.
        k: number of folds.
        seed: seed for the private shuffle, so folds are reproducible.

    Returns:
        List of ``k`` index lists that together cover every index exactly
        once. Fold sizes differ by at most one: the first ``n % k`` folds get
        one extra index instead of the whole remainder landing in the last
        fold. When ``k`` divides ``n`` all folds have ``n // k`` indices.
    """
    idx = list(range(n))
    random.Random(seed).shuffle(idx)
    base_size, extra = divmod(n, k)
    folds = []
    start = 0
    for fold_no in range(k):
        size = base_size + (1 if fold_no < extra else 0)
        folds.append(idx[start:start + size])
        start += size
    return folds

def cv_score_k(k_val, n_folds=5):
    """Return the mean cross-validated accuracy of k-NN on the global ``data``.

    Args:
        k_val: number of neighbours used by ``knn_classify``.
        n_folds: number of cross-validation folds (default 5).

    Returns:
        Average accuracy over the folds that contain at least one test row;
        0.0 when there is nothing to evaluate.
    """
    folds = kfold_indices(len(data), k=n_folds)
    feats = ["x", "y"]
    scores = []
    for fold in folds:
        if not fold:
            # An empty fold has nothing to test; scoring it as 0.0 would
            # drag the average down for no reason.
            continue
        test_idx = set(fold)
        train_rows = [row for j, row in enumerate(data) if j not in test_idx]
        test_rows = [row for j, row in enumerate(data) if j in test_idx]
        y_true = [r["c"] for r in test_rows]
        y_pred = [knn_classify(train_rows, r, k_val, feats, "c") for r in test_rows]
        scores.append(accuracy(y_true, y_pred))
    if not scores:
        return 0.0
    return sum(scores) / len(scores)

# Sweep a few odd k values and compare their cross-validated accuracy.
for k in [1,3,5,7,9]:
    print("k=",k,"CV accuracy=", round(cv_score_k(k),3))


### Program 11 — Regularization (L2) Reduces Overfitting (Noisy Line Fit)

In [None]:

# y = 2*x + 1 + noise. Compare GD with and without L2 (same lr and epochs).
# With x in 0..19 the loss curvature is large: a step size of 0.01 makes
# gradient descent diverge (the weights blow up and the MSE overflows), so a
# smaller step with more epochs is used for both runs.
X = list(range(20))
y = [2*x + 1 + random.uniform(-3,3) for x in X]
w1, b1 = linreg_univariate_gd(X, y, lr=0.005, epochs=5000, l2=0.0)
w2, b2 = linreg_univariate_gd(X, y, lr=0.005, epochs=5000, l2=0.1)
pred1 = [w1*x+b1 for x in X]
pred2 = [w2*x+b2 for x in X]
print("No L2:    w=", round(w1,3), "b=", round(b1,3), "MSE=", round(mse(y,pred1),3))
print("With L2:  w=", round(w2,3), "b=", round(b2,3), "MSE=", round(mse(y,pred2),3))


### Program 12 — One‑Hot Encoding (City → Features)

In [None]:

rows = [
    {"city": "Bengaluru", "temp": 28},
    {"city": "Mumbai", "temp": 31},
    {"city": "Chennai", "temp": 33},
    {"city": "Bengaluru", "temp": 29},
]
# Learn the city -> column mapping, then append the numeric temp to each vector.
index = fit_one_hot([r["city"] for r in rows])
encoded = []
for r in rows:
    city_vector = transform_one_hot(r["city"], index)
    encoded.append(city_vector + [r["temp"]])

print("Index:", index)
print("Encoded rows (one‑hot city + temp):")
for vec in encoded:
    print(vec)


### Program 13 — Handling Missing Values (Mean Imputation)

In [None]:

# Two of the five students did not report their study hours.
students = [{"hours": h} for h in (5.0, 6.5, None, 4.0, None)]
print("Before:", students)
# Fill each gap with the mean of the reported hours.
students = mean_impute(students, "hours")
print("After:", students)


### Program 14 — Feature Engineering (Build an Engagement Score)

In [None]:

activity = [
    {"views": 5, "minutes": 10, "replies": 0},
    {"views": 12, "minutes": 45, "replies": 2},
    {"views": 2, "minutes": 3, "replies": 0},
]
# Hand-picked, explainable weights: replies count most per unit, then
# minutes, then views.
for entry in activity:
    score = 0.2 * entry["views"]
    score += 0.7 * entry["minutes"]
    score += 1.5 * entry["replies"]
    entry["engagement"] = score
print(activity)


### Program 15 — Naive Bayes (Tiny Spam Filter from Scratch)

In [None]:

dataset = [
    {"text": "win cash now", "spam": 1},
    {"text": "meeting schedule attached", "spam": 0},
    {"text": "limited offer cash prize", "spam": 1},
    {"text": "please review the report", "spam": 0},
]


def tokenize(s):
    """Split on whitespace, trim '.', ',' and '!' from both ends, lowercase."""
    return [piece.strip(".,!").lower() for piece in s.split()]


# Vocabulary: every distinct token seen in training, in sorted order.
vocab = sorted({w for row in dataset for w in tokenize(row["text"])})

# Training counts: documents per class, and per class the number of
# documents containing each word (presence, not frequency).
class_counts = {0: 0, 1: 0}
word_counts = {label: dict.fromkeys(vocab, 0) for label in (0, 1)}
for row in dataset:
    c = row["spam"]
    class_counts[c] += 1
    for w in set(tokenize(row["text"])):
        word_counts[c][w] += 1


def predict(s):
    """Return 1 (spam) or 0 (ham) for message ``s``.

    Scores each class with a log prior plus, for every vocabulary word, the
    log probability of that word being present or absent in ``s``. Both the
    prior and the word probabilities use add-one (Laplace) smoothing. Words
    outside the vocabulary are ignored. Class 1 wins only when its score is
    strictly higher.
    """
    present = set(tokenize(s))
    n_docs = sum(class_counts.values())
    scores = {}
    for c in (0, 1):
        score = math.log((class_counts[c] + 1) / (n_docs + 2))
        for w in vocab:
            p_word = (word_counts[c][w] + 1) / (class_counts[c] + 2)
            if w in present:
                score += math.log(p_word)
            else:
                score += math.log(1 - p_word)
        scores[c] = score
    return 1 if scores[1] > scores[0] else 0


tests = ["cash prize available", "attach the schedule", "win big offer", "please check"]
for s in tests:
    verdict = "SPAM" if predict(s) == 1 else "HAM"
    print(s, "=>", verdict)


### Program 16 — Decision Stump (One‑Rule Classifier: Hours Studied → Pass)

In [None]:

# Clean rule: studying 6 or more hours means a pass.
rows = [{"hours": h, "pass": 1 if h >= 6 else 0} for h in range(2, 9)]
# add noise: one unexpected pass (4 hours) and one unexpected fail (7 hours)
rows[2]["pass"] = 1
rows[5]["pass"] = 0

model = decision_stump_fit(rows, "hours", "pass")
print("Model:", model)

preds = []
for r in rows:
    preds.append(decision_stump_predict(model, r["hours"]))
actual = [r["pass"] for r in rows]
print("Training accuracy:", round(accuracy(actual, preds), 3))


### Program 17 — Imbalanced Data (Accuracy can mislead; check Precision/Recall)

In [None]:

# Fraud detection: 5 positives followed by 95 negatives (5% positive rate).
y_true = [1] * 5 + [0] * 95
# "Dumb" model: never flags anything.
y_pred = [0 for _ in y_true]

acc = accuracy(y_true, y_pred)
prec, rec = precision_recall(y_true, y_pred, positive=1)
print("Accuracy:", acc, "| Precision:", prec, "| Recall:", rec, "(terrible recall for rare frauds)")


### Program 18 — Data Leakage (Cheating Features)

In [None]:

# A feature that secretly copies the label makes any model look perfect,
# but the model is only reading the answer key.
train = [{"x": i, "label": i % 2} for i in range(10)]

# Bad feature: 'leaky' is a straight copy of the label.
for r in train:
    r["leaky"] = r["label"]


def trivial_classifier(row):
    """Predict by echoing the leaked feature (perfect on train, useless in practice)."""
    return row["leaky"]


pred = list(map(trivial_classifier, train))
train_labels = [r["label"] for r in train]
print("Train accuracy (fake):", accuracy(train_labels, pred), "← data leakage!")


### Program 19 — Data Drift (Compare Feature Means Over Time)

In [None]:

# Feature x during the training period (drawn around 10) versus recently
# (drawn around 12); both use a standard deviation of 2.
train_period = []
for _ in range(200):
    train_period.append({"x": random.gauss(10, 2)})
recent = []
for _ in range(200):
    recent.append({"x": random.gauss(12, 2)})

mean_train = statistics.mean([r["x"] for r in train_period])
mean_recent = statistics.mean([r["x"] for r in recent])
shift = mean_recent - mean_train
print("Mean(train)=", round(mean_train, 2), "Mean(recent)=", round(mean_recent, 2), "Δ=", round(shift, 2))


### Program 20 — End‑to‑End Mini Pipeline (One‑Hot + Scaling + k‑NN)

In [None]:

# Predict if a rider will take a ride again next week based on city (categorical) and trips_this_week (numeric).
data = [
    {"city":"BLR","trips":3,"again":1},
    {"city":"BLR","trips":0,"again":0},
    {"city":"MUM","trips":5,"again":1},
    {"city":"MUM","trips":1,"again":0},
    {"city":"DEL","trips":2,"again":1},
    {"city":"DEL","trips":0,"again":0},
]
# One‑hot city. The category list is taken from all rows so that every city
# has a column; one column name per city, however many cities there are.
index = fit_one_hot([r["city"] for r in data])
city_columns = [f"c{i}" for i in range(len(index))]

def to_features(row):
    """Return a flat feature dict: one-hot city columns, trips, and the label."""
    one_hot = transform_one_hot(row["city"], index)  # encode once per row
    feats = dict(zip(city_columns, one_hot))
    feats["trips"] = row["trips"]
    feats["again"] = row["again"]
    return feats

rows = [to_features(r) for r in data]

# Split first, then learn the scaling from the training rows only, so no
# information about the test rows leaks into preprocessing.
train, test = train_test_split(rows, test_ratio=0.33, seed=5)
params = fit_minmax(train, ["trips"])
train = apply_minmax(train, ["trips"], params)
test = apply_minmax(test, ["trips"], params)

feature_names = city_columns + ["trips"]
preds = [knn_classify(train, r, 3, feature_names, "again") for r in test]
print("True:", [r["again"] for r in test])
print("Pred:", preds, "Accuracy:", round(accuracy([r["again"] for r in test], preds),3))
