## ELLA

In [5]:
# Original Repo (8 years ago!): https://github.com/paulruvolo/ELLA

# ELLA.py converted to Python 3, PyTorch

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

class ELLA(nn.Module):
    def __init__(self, d, k, base_learner, base_learner_kwargs={}, mu=1, lam=1, k_init=False):
        super(ELLA, self).__init__()
        self.d = d
        self.k = k
        self.L = nn.Parameter(torch.randn(d, k))
        self.A = torch.zeros((d * k, d * k), dtype=torch.float64)
        self.b = torch.zeros((d * k, 1), dtype=torch.float64)
        self.S = torch.zeros((k, 0), dtype=torch.float64)
        self.T = 0
        self.mu = mu
        self.lam = lam
        self.k_init = k_init

        if base_learner in [nn.Linear, nn.Ridge]:
            self.perf_metric = nn.functional.mse_loss
        elif base_learner == nn.LogisticRegression:
            self.perf_metric = nn.functional.binary_cross_entropy
        else:
            raise Exception("Unsupported Base Learner")

        self.base_learner = base_learner
        self.base_learner_kwargs = base_learner_kwargs

    def forward(self, x):
        return x

    def fit(self, X, y, task_id):
        self.T += 1
        single_task_model = self.base_learner(self.d, 1, bias=False, **self.base_learner_kwargs).double()
        criterion = nn.MSELoss()

        optimizer = optim.SGD(single_task_model.parameters(), lr=0.01)

        X_tensor = Variable(torch.from_numpy(X).double())
        y_tensor = Variable(torch.from_numpy(y).double())

        for epoch in range(100):  # adjust the number of epochs as needed
            optimizer.zero_grad()
            outputs = single_task_model(X_tensor)
            loss = criterion(outputs, y_tensor)
            loss.backward()
            optimizer.step()

        D_t = self.get_hessian(single_task_model, X_tensor, y_tensor)
        D_t_sqrt = torch.matrix_sqrt(D_t)
        theta_t = single_task_model.weight.data.t()

        sparse_encode = nn.functional.lasso(torch.mm(D_t_sqrt, self.L),
                                            torch.mm(D_t_sqrt, theta_t.t()),
                                            alpha=self.mu / (X.shape[0] * 2.0),
                                            fit_intercept=False)
        sparse_coeffs = sparse_encode.coef_

        self.S = torch.cat((self.S, sparse_coeffs.t()))

        self.A += torch.kron(self.S[:, task_id].view(-1, 1), self.S[:, task_id].view(1, -1)) * D_t
        self.b += torch.kron(self.S[:, task_id].t(), theta_t.t() @ D_t).t()
        L_vectorized = torch.inverse(self.A / self.T + self.lam * torch.eye(self.d * self.k, self.d * self.k,
                                                                             dtype=torch.float64)) @ self.b / self.T
        self.L.data = L_vectorized.view(self.k, self.d).t()
        self.revive_dead_components()

    def revive_dead_components(self):
        for i, val in enumerate(torch.sum(self.L, dim=0)):
            if abs(val) < 1e-8:
                self.L[:, i] = torch.randn(self.d, dtype=torch.float64)

    def predict(self, X, task_id):
        if self.base_learner == nn.Linear or self.base_learner == nn.Ridge:
            return X @ self.L @ self.S[:, task_id]
        elif self.base_learner == nn.LogisticRegression:
            return 1. / (1.0 + torch.exp(-X @ self.L @ self.S[:, task_id])) > 0.5

    def score(self, X, y, task_id):
        return self.perf_metric(self.predict(X, task_id), y)

    def get_hessian(self, model, X, y):
        theta_t = model.weight.data.t()
        if self.base_learner == nn.Linear:
            return X.t() @ X / (2.0 * X.shape[0])
        elif self.base_learner == nn.Ridge:
            return X.t() @ X / (2.0 * X.shape[0]) + model.weight_decay * torch.eye(self.d, dtype=torch.float64)
        elif self.base_learner == nn.LogisticRegression:
            preds = 1. / (1.0 + torch.exp(-X @ theta_t.t()))
            base = preds * (1 - preds)
            hessian = (base.view(1, -1) * X).t() @ X / (2.0 * X.shape[0])
            return hessian + torch.eye(self.d, dtype=torch.float64) / (2.0 * model.C)

In [7]:
# ELLA.ipynb

%matplotlib inline
#from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split

def multi_task_train_test_split(Xs,Ys,train_size=0.5):
    Xs_train = []
    Ys_train = []
    Xs_test = []
    Ys_test = []
    for t in range(len(Xs)):
        X_train, X_test, y_train, y_test = train_test_split(Xs[t], np.squeeze(Ys[t]), train_size=train_size)
        Xs_train.append(X_train)
        Xs_test.append(X_test)
        Ys_train.append(y_train)
        Ys_test.append(y_test)
    return Xs_train, Xs_test, Ys_train, Ys_test

#from ELLA import ELLA
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from scipy.linalg import norm
import numpy as np

T = 20
d = 10
n = 100
k = 5
noise_var = .1

model = ELLA(d,k,Ridge,mu=1,lam=10**-5)

S_true = np.random.randn(k,T)
L_true = np.random.randn(d,k)
w_true = L_true.dot(S_true)

# make sure to add a bias term (it is not done automatically)
Xs = [np.hstack((np.random.randn(n,d-1), np.ones((n,1)))) for i in range(T)]
# generate the synthetic labels
Ys = [Xs[i].dot(w_true[:,i]) + noise_var*np.random.randn(n,) for i in range(T)]
# break into train and test sets
Xs_train, Xs_test, Ys_train, Ys_test = multi_task_train_test_split(Xs,Ys,train_size=0.5)

for t in range(T):
    model.fit(Xs_train[t], Ys_train[t], t)
#print "Average explained variance score", np.mean([model.score(Xs_test[t], Ys_test[t], t) for t in range(T)])
print("Average explained variance score", np.mean([model.score(Xs_test[t], Ys_test[t], t) for t in range(T)]))

# Try out a classification problem
Ys_binarized_train = [Ys_train[i] > 0 for i in range(T)]
Ys_binarized_test = [Ys_test[i] > 0 for i in range(T)]

model = ELLA(d,k,LogisticRegression,mu=1,lam=10**-5)
for t in range(T):
    model.fit(Xs_train[t], Ys_binarized_train[t], t)

#print "Average classification accuracy", np.mean([model.score(Xs_test[t], Ys_binarized_test[t], t) for t in range(T)])
print("Average classification accuracy", np.mean([model.score(Xs_test[t], Ys_binarized_test[t], t) for t in range(T)]))


AttributeError: module 'torch.nn' has no attribute 'Ridge'

In [None]:
from scipy.io import loadmat
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

data = loadmat('landminedata.mat')

Xs_lm = []
Ys_lm = []
for t in range(data['feature'].shape[1]):
    X_t = data['feature'][0,t]
    Xs_lm.append(np.hstack((X_t,np.ones((X_t.shape[0],1)))))
    Ys_lm.append(data['label'][0,t] == 1.0)

d = Xs_lm[0].shape[1]
k = 1

Xs_lm_train, Xs_lm_test, Ys_lm_train, Ys_lm_test = multi_task_train_test_split(Xs_lm,Ys_lm,train_size=0.5)
model = ELLA(d,k,LogisticRegression,{'C':10**0},mu=1,lam=10**-5)
for t in range(T):
    model.fit(Xs_lm_train[t], Ys_lm_train[t], t)

print model.S    

print "Average AUC:", np.mean([roc_auc_score(Ys_lm_test[t],
                                             model.predict_logprobs(Xs_lm_test[t], t))
                               for t in range(1)])

## Elastic Weight Consolidation Only

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from copy import deepcopy
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate a regression dataset
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define the model architecture
class RegressionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

# Define the loss function and optimizer
model = RegressionModel(input_dim=10, output_dim=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model using EWC
fisher_matrices = []
for name, param in model.named_parameters():
    fisher_matrices.append(torch.zeros_like(param))

for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(zip(X_train, y_train)):
        inputs, labels = data
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Compute the Fisher information matrix for each parameter
        loss.backward()
        for j, (name, param) in enumerate(model.named_parameters()):
            fisher_matrices[j] += (param.grad.detach() ** 2) / len(X_train)

        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {running_loss/len(X_train)}")

    # Evaluate the model on the test set
    with torch.no_grad():
        test_loss = 0.0
        for i, data in enumerate(zip(X_test, y_test)):
            inputs, labels = data
            inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

        print(f"Test Loss: {test_loss/len(X_test)}")

    # Apply EWC to the model
    for j, (name, param) in enumerate(model.named_parameters()):
        penalty = 0.0
        for k in range(epoch):
            old_param = deepcopy(model).state_dict()[name]
            old_fisher = fisher_matrices[j]
            new_fisher = torch.zeros_like(old_fisher)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            new_fisher += (param.grad.detach() ** 2) / len(X_train)
            penalty += ((old_param - param) ** 2) * (old_fisher + new_fisher)
        param.grad += penalty

    optimizer.step()

# EWC, LwF, and IMM

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate a regression dataset
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define the model architecture
class RegressionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

# Define the loss function and optimizer
model = RegressionModel(input_dim=10, output_dim=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Select the lifelong learning method to use
lifelong_learning_method = "EWC"  # Change this to "LwF" or "IMM" to use a different method

if lifelong_learning_method == "EWC":
    # Train the model using Elastic Weight Consolidation
    for epoch in range(10):
        running_loss = 0.0
        for i, data in enumerate(zip(X_train, y_train)):
            inputs, labels = data
            inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Compute the Fisher information matrix for each parameter
            loss.backward()
            for j, (name, param) in enumerate(model.named_parameters()):
                fisher_matrices[j] += (param.grad.detach() ** 2) / len(X_train)

            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1} - Loss: {running_loss/len(X_train)}")

        # Evaluate the model on the test set
        with torch.no_grad():
            test_loss = 0.0
            for i, data in enumerate(zip(X_test, y_test)):
                inputs, labels = data
                inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
                labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()

            print(f"Test Loss: {test_loss/len(X_test)}")

        # Apply EWC to the model
        for j, (name, param) in enumerate(model.named_parameters()):
            penalty = 0.0
            for k in range(epoch):
                old_param = deepcopy(model).state_dict()[name]
                old_fisher = fisher_matrices[j]
                new_fisher = torch.zeros_like(old_fisher)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                new_fisher += (param.grad.detach() ** 2) / len(X_train)
                penalty += ((old_param - param) ** 2) * (old_fisher + new_fisher)
            param.grad += penalty

        optimizer.step()

elif lifelong_learning_method == "LwF":
    # Train the model using Learning without Forgetting
    old_params = [param.clone() for param in model.parameters()]
    alpha = [1.0 for _ in range(len(old_params))]

    for epoch in range(10):
        running_loss = 0.0
        for i, data in enumerate(zip(X_train, y_train)):
            inputs, labels = data
            inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Add the penalty term to the gradient
            loss.backward()
            for j, (name, param) in enumerate(model.named_parameters()):
                param.grad += (param.data - old_params[j].data) / alpha[j]
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch {epoch+1} - Loss: {running_loss/len(X_train)}")

        # Evaluate the model on the test set
        #...

        # Save the current model parameters for the next epoch
        old_params = [param.clone() for param in model.parameters()]
        alpha = [1.0 for _ in range(len(old_params))]

elif lifelong_learning_method == "IMM":
    # Train the model using Incremental Moment Matching
    old_params = [param.clone() for param in model.parameters()]
    alpha = [1.0 for _ in range(len(old_params))]

    for epoch in range(10):
        running_loss = 0.0
        for i, data in enumerate(zip(X_train, y_train)):
            inputs, labels = data
            inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Add the penalty term to the gradient
            loss.backward()
            for j, (name, param) in enumerate(model.named_parameters()):
                param.grad += (param.data - old_params[j].data) / alpha[j]

            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1} - Loss: {running_loss/len(X_train)}")

        # Evaluate the model on the test set
        with torch.no_grad():
            test_loss = 0.0
            for i, data in enumerate(zip(X_test, y_test)):
                inputs, labels = data
                inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
                labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()

            print(f"

## Combining All Three Into One

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate a regression dataset
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define the model architecture
class RegressionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [None]:
# Define the loss function and optimizer
model = RegressionModel(input_dim=10, output_dim=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Select the lifelong learning method to use
lifelong_learning_method = "EWC"  # Change this to "LwF" or "IMM" to use a different method

In [None]:
if lifelong_learning_method == "LwF" or lifelong_learning_method == "IMM":
    # Train the model using Learning without Forgetting / Incremental Moment Matching
    old_params = [param.clone() for param in model.parameters()]
    alpha = [1.0 for _ in range(len(old_params))]

# Train the model
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(zip(X_train, y_train)):
        inputs, labels = data
        inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
        labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        if lifelong_learning_method == "EWC":
            # Compute the Fisher information matrix for each parameter
            for j, (name, param) in enumerate(model.named_parameters()):
                fisher_matrices[j] += (param.grad.detach() ** 2) / len(X_train)
        elif lifelong_learning_method == "LwF" or lifelong_learning_method == "IMM":
            # Add the penalty term to the gradient
            for j, (name, param) in enumerate(model.named_parameters()):
                param.grad += (param.data - old_params[j].data) / alpha[j]

        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1} - Loss: {running_loss/len(X_train)}")

    # Evaluate the model on the test set
    with torch.no_grad():
        test_loss = 0.0
        for i, data in enumerate(zip(X_test, y_test)):
            inputs, labels = data
            inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

        print(f"Test Loss: {test_loss/len(X_test)}")

    
    if lifelong_learning_method == "EWC":
        # Apply EWC to the model
        for j, (name, param) in enumerate(model.named_parameters()):
            penalty = 0.0
            for k in range(epoch):
                old_param = deepcopy(model).state_dict()[name]
                old_fisher = fisher_matrices[j]
                new_fisher = torch.zeros_like(old_fisher)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                new_fisher += (param.grad.detach() ** 2) / len(X_train)
                penalty += ((old_param - param) ** 2) * (old_fisher + new_fisher)
            param.grad += penalty
        optimizer.step()
    elif lifelong_learning_method == "LwF":
        # Save the current model parameters for the next epoch
        old_params = [param.clone() for param in model.parameters()]
        alpha = [1.0 for _ in range(len(old_params))]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Generate a regression dataset
X, y = make_regression(n_samples=1000, n_features=10, noise=0.1, random_state=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define the model architecture
class RegressionModel(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(RegressionModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

# Define the loss function and optimizer
model = RegressionModel(input_dim=10, output_dim=1)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the penalty term for IMM
penalty = 0.1

# Define the training function
def train_model(model, optimizer, criterion, X_train, y_train, X_test, y_test, penalty=None):
    old_params = [param.clone() for param in model.parameters()]
    alpha = [1.0 for _ in range(len(old_params))]

    for epoch in range(10):
        running_loss = 0.0
        for i, data in enumerate(zip(X_train, y_train)):
            inputs, labels = data
            inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
            labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Add the penalty term to the loss function
            if penalty is not None:
                for j, (name, param) in enumerate(model.named_parameters()):
                    loss += penalty * ((param - old_params[j]) ** 2).sum()

            loss.backward()

            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1} - Loss: {running_loss/len(X_train)}")

        # Evaluate the model on the test set
        with torch.no_grad():
            test_loss = 0.0
            for i, data in enumerate(zip(X_test, y_test)):
                inputs, labels = data
                inputs = torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)
                labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(0)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()

            print(f"Test Loss: {test_loss/len(X_test)}")

        # Save the current model parameters for the next epoch
        old_params = [param.clone() for param in model.parameters()]
        alpha = [1.0 for _ in range(len(old_params))]

# Train the model using LwF
train_model(model, optimizer, criterion, X_train, y_train, X_test, y_test, penalty=None)

# Train the model using IMM
train_model(model, optimizer, criterion, X_train, y_train, X_test, y_test, penalty=penalty)

BingAI

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Code for using Incremental Moment Matching
def imm(model_old, model_new, alpha=0.5):
    for param_old, param_new in zip(model_old.parameters(), model_new.parameters()):
        param_new.data.copy_(alpha * param_old.data + (1 - alpha) * param_new.data)

# Defining the model architecture
class LifelongRegressor(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=100):
        super(LifelongRegressor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Defining the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training the model
for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    # Use Incremental Moment Matching
    imm(model, new_model)

    # Update the old model with the new one
    model.load_state_dict(new_model.state_dict())

In [None]:
# Importing the required libraries
import torch
import torch.nn as nn
import torch.optim as optim

# Defining the model architecture
class LifelongRegressor(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=100):
        super(LifelongRegressor, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Defining the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training the model on the first task
for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Train the model on a new task
new_model = LifelongRegressor(input_size, output_size)
criterion = nn.MSELoss()
optimizer = optim.SGD(new_model.parameters(), lr=0.01)

for epoch in range(num_epochs):
    for i, (inputs, targets) in enumerate(new_train_loader):
        optimizer.zero_grad()
        outputs = new_model(inputs)
        loss = criterion(outputs, targets)

        # Compute the distillation loss
        if epoch > 0:
            old_outputs = model(inputs)
            distillation_loss = criterion(outputs[:, :num_outputs], old_outputs.detach())
            loss += alpha * distillation_loss

        loss.backward()
        optimizer.step()

    # Update the old model with the new one
    model.load_state_dict(new_model.state_dict())

