# Introduction to Deep Learning    Brief historical trends and the core challenges that pushed deep learning forward. Includes a toy experiment showing when shallow models fail but deeper ones succeed.

In [None]:
    # !pip install torch matplotlib scikit-learn    import torch    import torch.nn as nn    import torch.optim as optim    import matplotlib.pyplot as plt    from sklearn.datasets import make_moons    from sklearn.model_selection import train_test_split    torch.manual_seed(0)    

## Historical Trends in Deep Learning    Parameter counts and compute have grown exponentially. Plot a tiny, stylized curve to visualize the rise.

In [None]:
    import numpy as np    years = np.array([1986, 1998, 2006, 2012, 2018, 2023])    params_millions = np.array([0.01, 1, 10, 60, 1000, 175000])  # stylized, not exact    plt.figure(figsize=(6,4))    plt.semilogy(years, params_millions, marker='o')    plt.title("Model parameter growth (log scale)")    plt.xlabel("Year")    plt.ylabel("Parameters (millions)")    plt.grid(True, which='both')    plt.show()    

## Challenges Motivating Deep Learning    Demonstrate a non-linear dataset where a shallow linear classifier underfits, while a deeper network captures the structure.

In [None]:
    X, y = make_moons(n_samples=800, noise=0.25, random_state=1)    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)    X_train_t = torch.tensor(X_train, dtype=torch.float32)    y_train_t = torch.tensor(y_train, dtype=torch.long)    X_test_t = torch.tensor(X_test, dtype=torch.float32)    y_test_t = torch.tensor(y_test, dtype=torch.long)    def train_model(model, lr=0.1, epochs=60):        opt = optim.SGD(model.parameters(), lr=lr)        crit = nn.CrossEntropyLoss()        for _ in range(epochs):            logits = model(X_train_t)            loss = crit(logits, y_train_t)            opt.zero_grad()            loss.backward()            opt.step()        with torch.no_grad():            acc = (model(X_test_t).argmax(1) == y_test_t).float().mean().item()        return acc    # Shallow: single linear layer    shallow = nn.Linear(2, 2)    shallow_acc = train_model(shallow, lr=0.5)    # Deeper: two hidden layers with ReLU    deep = nn.Sequential(nn.Linear(2, 16), nn.ReLU(), nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 2))    deep_acc = train_model(deep, lr=0.1)    print(f"Shallow linear accuracy: {shallow_acc:.3f}")    print(f"Deep non-linear accuracy: {deep_acc:.3f}")    