# Delta Learning

Delta Learning is mainly concerned with the **Domain Generalization** problem.
That is: 
1. if the training and testing data come from **different domains** (i.e., different distributions),
2. but they contain the **same pattern**,

how can we build a model that generalizes across different domains?

For example, once a child learns to recognize a cat in real life, they can easily recognize cats in different domains without any hints or instructions:
* Emoji cat: 🐱
* Cartoon cat: <img src='https://ctl.s6img.com/society6/img/5uFHiOtud7B5teZ02cp0Mo5O_FY/w_700/prints/~artwork/s6-original-art-uploads/society6/uploads/misc/07c252f2a6f04364ab7d484376e803be/~~/calico-cat1907816-prints.jpg?wait=0&attempt=0' width=10%/>



In [22]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.autograd import Variable


class linearRegression(torch.nn.Module):
    """Single affine layer mapping ``inputSize`` features to ``outputSize`` outputs.

    The layer is stored as ``self.linear`` so callers can inspect its
    ``weight`` and ``bias`` parameters directly.
    """

    def __init__(self, inputSize, outputSize):
        super().__init__()
        self.linear = torch.nn.Linear(inputSize, outputSize)

    def forward(self, x):
        """Apply the affine layer to ``x`` and return the result."""
        return self.linear(x)

    
def trainLR(model, X, y, lr=0.01, epochs=10, batch_size=10,
            regularize=False, l1_lambda=0.1):
    """Fit ``model`` to ``(X, y)`` in place with mini-batch SGD on an MSE loss.

    After every epoch one summary line is printed containing the loss of the
    last mini-batch plus the first weight, first bias and first weight
    gradient of ``model.linear``.

    Args:
        model: A ``torch.nn.Module`` exposing a ``linear`` attribute with
            ``weight`` and ``bias`` parameters (read for the summary line).
        X: NumPy array of inputs; the first axis indexes samples.
        y: NumPy array of targets, sliced along the first axis in lockstep
            with ``X``.
        lr: Learning rate handed to ``torch.optim.SGD``.
        epochs: Number of full passes over the data.
        batch_size: Maximum number of samples per optimisation step; the
            final batch of an epoch may be smaller.
        regularize: When true, add an L1 penalty over all model parameters
            to the loss.
        l1_lambda: Coefficient of the L1 penalty (only used when
            ``regularize`` is true).

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` has no samples.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    N = X.shape[0]
    if N == 0:
        raise ValueError("X must contain at least one sample")

    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # Walk the data in consecutive slices. Clamping the end index lets
        # datasets smaller than ``batch_size`` (and ragged final batches)
        # still be trained on instead of being skipped.
        for s in range(0, N, batch_size):
            e = min(s + batch_size, N)

            # Tensors participate in autograd directly; no Variable wrapper
            # is required.
            inputs = torch.from_numpy(X[s:e])
            labels = torch.from_numpy(y[s:e])

            # Gradients accumulate by default, so reset them before each
            # optimisation step.
            optimizer.zero_grad()

            # Forward pass and data-fit loss.
            outputs = model(inputs)
            loss = loss_func(outputs, labels)

            if regularize:
                for W in model.parameters():
                    loss = loss + l1_lambda * W.norm(1)

            # Backward pass, then parameter update.
            loss.backward()
            optimizer.step()

        # Report the state left behind by the last batch of this epoch.
        weight = model.linear.weight.detach().numpy()[0][0]
        bias = model.linear.bias.detach().numpy()[0]
        grad = model.linear.weight.grad.detach().numpy()[0][0]
        print(f'epoch {epoch}, loss {loss.item():.2f}, weight:{weight:.2f}, bias:{bias:.2f}, grad:{grad:.2f}')
        

In [23]:
# Experiment 1: fit y = 2 * X + 1 on inputs 0..9.
x_values = list(range(10))
# Optional: np.random.shuffle(X) on the flat array to randomize sample order.
X = np.array(x_values, dtype=np.float32).reshape(-1, 1)  # column vector, one row per sample
y = 2 * X + 1
n_feat = X.shape[-1]  # width of the input column(s)
n_out = y.shape[-1]   # width of the target column(s)

# Build a fresh model sized to the data and train it.
model = linearRegression(n_feat, n_out)
trainLR(model, X, y)

print('=== Look at the parameter, the model has already capture the law: y = 2 * X + b ===')


epoch 0, loss 242.57, weight:0.83, bias:0.71, grad:-166.26
epoch 1, loss 41.91, weight:1.52, bias:0.82, grad:-69.10
epoch 2, loss 7.24, weight:1.81, bias:0.87, grad:-28.72
epoch 3, loss 1.25, weight:1.93, bias:0.89, grad:-11.93
epoch 4, loss 0.22, weight:1.98, bias:0.90, grad:-4.95
epoch 5, loss 0.04, weight:2.00, bias:0.90, grad:-2.05
epoch 6, loss 0.01, weight:2.01, bias:0.90, grad:-0.85
epoch 7, loss 0.00, weight:2.01, bias:0.90, grad:-0.35
epoch 8, loss 0.00, weight:2.01, bias:0.90, grad:-0.14
epoch 9, loss 0.00, weight:2.01, bias:0.90, grad:-0.05
=== Look at the parameter, the model has already capture the law: y = 2 * X + b ===


In [24]:
# Experiment 2: same generating function, but the inputs are shifted by 1000.
# The model trained in the previous cell is reused and trained further.
print('=== Now we shift the data distribution, while not change the data generation function y = 2 * X + b ===')

x_values = list(range(10))
# Optional: np.random.shuffle(X) on the flat array to randomize sample order.
X = (np.array(x_values, dtype=np.float32) + 1000).reshape(-1, 1)
y = 2 * X + 1
trainLR(model, X, y)

=== Now we shift the data distribution, while not change the data generation function y = 2 * X + b ===
epoch 0, loss 221.40, weight:-296.92, bias:0.61, grad:29893.29
epoch 1, loss 90158931968.00, weight:6032046.50, bias:6005.88, grad:-603234368.00
epoch 2, loss 36714140208742793216.00, weight:-121724190720.00, bias:-121177896.00, grad:12173022593024.00
epoch 3, loss 14950580188232114174024482816.00, weight:2456344409931776.00, bias:2445320126464.00, grad:-245646600045592576.00
epoch 4, loss 6088115114101711869062580563582386176.00, weight:-49568020475676721152.00, bias:-49345562163544064.00, grad:4957047933552014917632.00
epoch 5, loss inf, weight:1000262303490725912772608.00, bias:995773180684357599232.00, grad:-100031186758620896112410624.00
epoch 6, loss inf, weight:-20184881893226967581579018240.00, bias:-20094294268065837295337472.00, grad:2018588322381599526115013033984.00
epoch 7, loss inf, weight:407322622033045568305146863026176.00, bias:405494573869134977549715111936.00, gra