# General
Let's start with the basics:
Given a function that we know has a compositionally sparse representation S(x), define a dense (fully-connected) neural network F(x). Then the optimal solution of this dense NN would be sparse. 
We will focus on SGD at the moment. 

Davide has shown that when the teacher and student have the same activation function, it can be learned well.

### Goal
Familiarize myself with the concept of learning compsitionally sparse function of FCNN and SNN. Duplicate the preliminary results from Davide that L1 induces sparsity which in some way helps. 

### What do we expect and want to achieve?
- Show that when using L1, the dense model achieves sparsity.

### TODO
- automate to see the results for different values of weight decay 

### Open Questions
- What network structure to use for the teacher? CNN or sparse connected network?
- What happens in teacher and student have different activation function?
- Role of Initialization
- Compute the absolute difference or not? If one whole layer is flipped  and activation function is symmetric, it's still kinda correct, no?

### Findings
- It does seem like L1 norm is better, also the bigger the normalizing value, the smaller the difference between L1 and L2 -> don't see this anymore
- Now it seemsl like L2 is better, test low does get quite low
- Use direct output, no binary output -> binary output doesn't give a lot of information

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

TEST_SET_SIZE = 1024
BATCH_SIZE = 8
PATIENCE = 20  # Early stopping patience
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f7f22f1a990>

## SparseNN Teacher and Student, both have exact same architecture

Define a sparse connected network as the "teacher" that generated a Dataset X, y. 

In [10]:
class SparseCNN_1(nn.Module):
    def __init__(self):
        super(SparseCNN_1, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=3, padding=0, bias=False)
        self.conv2 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2, stride=2, padding=0, bias = False)
        self.conv3 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2, stride=2, padding=0, bias = False)
        
    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension for CNN
        x = torch.tanh(self.conv1(x))
        x = torch.tanh(self.conv2(x))
        x = torch.tanh(self.conv3(x))
        return x

teacher_model = SparseCNN_1()

with torch.no_grad():
    teacher_model.conv1.weight.copy_(torch.tensor([[[2.59, -2.83, 0.87]]]))
    teacher_model.conv2.weight.copy_(torch.tensor([[[-1.38, 1.29]]]))
    teacher_model.conv3.weight.copy_(torch.tensor([[[0.86, -0.84]]]))

print("\nTarget function parameters:")
for param in teacher_model.parameters():
    print(param.data.numpy())


Target function parameters:
[[[ 2.59 -2.83  0.87]]]
[[[-1.38  1.29]]]
[[[ 0.86 -0.84]]]


In [11]:
# Use Teacher CNN to generate a new dataset
X_generated = torch.tensor(np.random.randn(TEST_SET_SIZE, 12), dtype=torch.float32)
y_generated = teacher_model(X_generated).detach()

Define the student with the exact same structure and train it on the training dataset

In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def train_model(model, X_train, y_train, optimizer, loss_fn, l1_lambda=0, batch_size=32):
    best_loss = float('inf')
    patience_counter = 0

    # Create a DataLoader to handle batching and shuffling
    dataset = TensorDataset(X_train, y_train)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(10000):
        epoch_loss = 0.0
        
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            y_pred = model(batch_X)
            loss = loss_fn(y_pred, batch_y)

            # Apply L1 regularization if l1_lambda > 0
            if l1_lambda > 0:
                l1_norm = sum(p.abs().sum() for p in model.parameters())
                loss += l1_norm * l1_lambda

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate batch loss

        # Compute average loss for the epoch
        epoch_loss /= len(dataloader)

        if epoch % 1000 == 0:
            print(f'Epoch {epoch}, Loss: {epoch_loss:.4f}')

        # Early stopping logic
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= 20:
                print(f"Early stopping at epoch {epoch}, best loss: {best_loss:.4f}")
                break

    return model, best_loss


Train with SGD, Student and Teacher identical, no Regularization

In [13]:
student_model = SparseCNN_1()
#student_model.apply(init_weights)

print("\nTarget function parameters:")
for param in teacher_model.parameters():
    print(param.data.numpy())

print("\nStudent function parameters BEFORE training:")
for param in student_model.parameters():
    print(param.data.numpy())

sgd_optim = optim.SGD(student_model.parameters(), lr=0.05, momentum=0.9)
loss_fn = nn.MSELoss()
student_model, final_loss = train_model(student_model, X_train=X_generated, y_train=y_generated, optimizer=sgd_optim, loss_fn=loss_fn)

print("\nStudent function parameters AFTER training:")
for param in student_model.parameters():
    print(param.data.numpy())


Target function parameters:
[[[ 2.59 -2.83  0.87]]]
[[[-1.38  1.29]]]
[[[ 0.86 -0.84]]]

Student function parameters BEFORE training:
[[[ 0.33906782  0.50895905 -0.4235604 ]]]
[[[0.61461455 0.13234162]]]
[[[0.5224168  0.09576386]]]
Epoch 0, Loss: 0.2719
Early stopping at epoch 367, best loss: 0.0000

Student function parameters AFTER training:
[[[-2.5899653  2.8299623 -0.8699878]]]
[[[ 1.3799883 -1.2899888]]]
[[[ 0.860001   -0.84000134]]]


Train with SGD, Student and Teacher identical, L1 Regularization

In [6]:
student_model = SparseCNN_1()
#student_model.apply(init_weights)

print("\nTarget function parameters:")
for param in teacher_model.parameters():
    print(param.data.numpy())

print("\nStudent function parameters BEFORE training:")
for param in student_model.parameters():
    print(param.data.numpy())

sgd_optim = optim.SGD(student_model.parameters(), lr=0.05, momentum=0.9)
loss_fn = nn.MSELoss()
student_model, final_loss = train_model(student_model, X_train=X_generated, y_train=y_generated, optimizer=sgd_optim, loss_fn=loss_fn, l1_lambda=1e-5)

print("\nStudent function parameters AFTER training:")
for param in student_model.parameters():
    print(param.data.numpy())


Target function parameters:
[[[ 2.59 -2.83  0.87]]]
[[[-1.38  1.29]]]
[[[ 0.86 -0.84]]]

Student function parameters BEFORE training:
[[[ 0.27839142 -0.08151665  0.4450711 ]]]
[[[ 0.10451669 -0.33010566]]]
[[[ 0.18024033 -0.32579   ]]]
Epoch 0, Loss: 0.4138
Epoch 100, Loss: 0.0215
Epoch 200, Loss: 0.0050
Epoch 300, Loss: 0.0028
Epoch 400, Loss: 0.0017
Epoch 500, Loss: 0.0012
Epoch 600, Loss: 0.0009
Epoch 700, Loss: 0.0006
Epoch 800, Loss: 0.0005
Epoch 900, Loss: 0.0004
Epoch 1000, Loss: 0.0003
Epoch 1100, Loss: 0.0003
Epoch 1200, Loss: 0.0002
Epoch 1300, Loss: 0.0002
Epoch 1400, Loss: 0.0002
Epoch 1500, Loss: 0.0002
Epoch 1600, Loss: 0.0002
Epoch 1700, Loss: 0.0001
Epoch 1800, Loss: 0.0001
Epoch 1900, Loss: 0.0001
Epoch 2000, Loss: 0.0001
Epoch 2100, Loss: 0.0001
Epoch 2200, Loss: 0.0001
Epoch 2300, Loss: 0.0001
Epoch 2400, Loss: 0.0001
Epoch 2500, Loss: 0.0001
Epoch 2600, Loss: 0.0001
Epoch 2700, Loss: 0.0001
Epoch 2800, Loss: 0.0001
Epoch 2900, Loss: 0.0001
Epoch 3000, Loss: 0.0001


In [7]:
# Compute distance metric: how different are the entries of the teacher and student parameters
def compute_distance_metric(teacher_model, student_model):
    distance = 0.0
    for teacher_param, student_param in zip(teacher_model.parameters(), student_model.parameters()):
        distance += torch.norm(torch.abs(teacher_param) - torch.abs(student_param)).item() #TODO: maybe not do absolute values / make this smarter
    return distance

distance_metric = compute_distance_metric(teacher_model, student_model)
print(f"Distance metric between teacher and student model parameters: {distance_metric:.4f}")


Distance metric between teacher and student model parameters: 0.0141


In [8]:
#Run code 5 times and then average the results of the distance metric
def run_student_model(l1_lambda = 0, l2_lambda = 0, print_params = False):
    student_model = SparseCNN_1()

    # print("\nStudent function parameters BEFORE training:")
    # for param in student_model.parameters():
    #     print(param.data.numpy())

    sgd_optim = optim.SGD(student_model.parameters(), lr=0.05, momentum=0.9, weight_decay=l2_lambda)
    loss_fn = nn.MSELoss()
    student_model, final_loss = train_model(student_model, X_train=X_generated, y_train=y_generated, optimizer=sgd_optim, loss_fn=loss_fn, l1_lambda=l1_lambda)
    
    if print_params == True:
        print("\nTarget function parameters:")
        for param in teacher_model.parameters():
            print(param.data.numpy())
        
        print("\nStudent function parameters AFTER training:")
        for param in student_model.parameters():
            print(param.data.numpy())

    distance_metric = compute_distance_metric(teacher_model=teacher_model, student_model=student_model)
    return distance_metric, final_loss


# distances_l1 = []
# losses = []
# for _ in range(5):
#     distance_metric, final_loss = run_student_model()
#     distances_l1.append(distance_metric)
#     losses.append(round(final_loss,4))

# print("Avg. distances no weight decay", np.mean(distances_l1))
# print("Losses", losses)

distances_l1 = []
distances_l2 = []
for _ in range(5):
    distances_l1.append(run_student_model(l1_lambda=1e-5))
    distances_l2.append(run_student_model(l2_lambda=1e-5))

print("Avg. distances using L1 norm", np.mean(distances_l1))
print("Avg. distances using L2 norm", np.mean(distances_l2))


# distances_l1 = []
# distances_l2 = []
# for _ in range(5):
#     distances_l1.append(run_student_model(l1_lambda=0.1))
#     distances_l2.append(run_student_model(l2_lambda=0.1))

# print("Avg. distances using L1 norm", np.mean(distances_l1))
# print("Avg. distances using L2 norm", np.mean(distances_l2))

Epoch 0, Loss: 0.3855
Epoch 100, Loss: 0.0089
Epoch 200, Loss: 0.0038
Epoch 300, Loss: 0.0022
Epoch 400, Loss: 0.0015
Epoch 500, Loss: 0.0010
Epoch 600, Loss: 0.0008
Epoch 700, Loss: 0.0006
Epoch 800, Loss: 0.0005
Epoch 900, Loss: 0.0004
Epoch 1000, Loss: 0.0003
Epoch 1100, Loss: 0.0003
Epoch 1200, Loss: 0.0002
Epoch 1300, Loss: 0.0002
Epoch 1400, Loss: 0.0002
Epoch 1500, Loss: 0.0002
Epoch 1600, Loss: 0.0001
Epoch 1700, Loss: 0.0001
Epoch 1800, Loss: 0.0001
Epoch 1900, Loss: 0.0001
Epoch 2000, Loss: 0.0001
Epoch 2100, Loss: 0.0001
Epoch 2200, Loss: 0.0001
Epoch 2300, Loss: 0.0001
Epoch 2400, Loss: 0.0001
Epoch 2500, Loss: 0.0001
Epoch 2600, Loss: 0.0001
Epoch 2700, Loss: 0.0001
Epoch 2800, Loss: 0.0001
Epoch 2900, Loss: 0.0001
Epoch 3000, Loss: 0.0001
Epoch 3100, Loss: 0.0001
Epoch 3200, Loss: 0.0001
Epoch 3300, Loss: 0.0001
Epoch 3400, Loss: 0.0001
Epoch 3500, Loss: 0.0001
Epoch 3600, Loss: 0.0001
Epoch 3700, Loss: 0.0001
Epoch 3800, Loss: 0.0001
Epoch 3900, Loss: 0.0001
Epoch 4000, 

## Use a different student architecture than teacher

In [29]:
class Student_sparse(nn.Module):
    def __init__(self):
        super(Student_sparse, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=3, stride=3, padding=0)
        self.conv2 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2, stride=2, padding=0)
        self.conv3 = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=2, stride=2, padding=0)
        
    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension for CNN
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        return x

teacher_model = SparseCNN_1()

In [30]:
X_generated = torch.tensor(np.random.randn(TEST_SET_SIZE, 12), dtype=torch.float32)
y_generated = teacher_model(X_generated).detach()

In [None]:
student_model = Student_sparse()

sgd_optim = optim.SGD(student_model.parameters(), lr=0.05, weight_decay=0.05)
loss_fn = nn.MSELoss()
student_model, final_loss = train_model(student_model, X_train=X_generated, y_train=y_generated, optimizer=sgd_optim, loss_fn=loss_fn) 

print("\nTarget function parameters:")
for param in student_model.parameters():
    print(param.data.numpy())
    
print("\nStudent function parameters AFTER training:")
for param in student_model.parameters():
    print(param.data.numpy())

Epoch 0, Loss: 0.0054
Early stopping at epoch 20, best loss: 0.0054

Target function parameters:
[[[-0.01213434 -0.22194281  0.19934896]]]
[0.0353068]
[[[ 0.47101322 -0.05958339]]]
[-0.423429]
[[[0.47402352 0.09368998]]]
[-0.19214378]

Student function parameters AFTER training:
[[[-0.01213434 -0.22194281  0.19934896]]]
[0.0353068]
[[[ 0.47101322 -0.05958339]]]
[-0.423429]
[[[0.47402352 0.09368998]]]
[-0.19214378]


In [32]:
#Run code 5 times and then average the results of the distance metric
def run_student_model(l1_lambda = 0, l2_lambda = 0, print_params = False):
    student_model = Student_sparse()

    sgd_optim = optim.SGD(student_model.parameters(), lr=0.005, weight_decay=l2_lambda)
    loss_fn = nn.MSELoss()
    student_model, final_loss = train_model(student_model, X_train=X_generated, y_train=y_generated, optimizer=sgd_optim, loss_fn=loss_fn, l1_lambda=l1_lambda)
    
    if print_params == True:
        print("\nTarget function parameters:")
        for param in teacher_model.parameters():
            print(param.data.numpy())
        
        print("\nStudent function parameters AFTER training:")
        for param in student_model.parameters():
            print(param.data.numpy())

    distance_metric = compute_distance_metric(teacher_model=teacher_model, student_model=student_model)
    return distance_metric, final_loss


distances_l1 = []
losses = []
for _ in range(5):
    distance_metric, final_loss = run_student_model()
    distances_l1.append(distance_metric)
    losses.append(round(final_loss,4))

print("Avg. distances no weight decay", np.mean(distances_l1))
print("Losses", losses)

distances_l1 = []
distances_l2 = []
for _ in range(5):
    distances_l1.append(run_student_model(l1_lambda=0.05))
    distances_l2.append(run_student_model(l2_lambda=0.05))

print("Avg. distances using L1 norm", np.mean(distances_l1))
print("Avg. distances using L2 norm", np.mean(distances_l2))


# distances_l1 = []
# distances_l2 = []
# for _ in range(5):
#     distances_l1.append(run_student_model(l1_lambda=0.1))
#     distances_l2.append(run_student_model(l2_lambda=0.1))

# print("Avg. distances using L1 norm", np.mean(distances_l1))
# print("Avg. distances using L2 norm", np.mean(distances_l2))

Epoch 0, Loss: 0.3389
Epoch 100, Loss: 0.0498
Epoch 200, Loss: 0.0111
Epoch 300, Loss: 0.0058
Epoch 400, Loss: 0.0051
Epoch 500, Loss: 0.0050
Epoch 600, Loss: 0.0050
Epoch 700, Loss: 0.0050
Epoch 800, Loss: 0.0050
Epoch 900, Loss: 0.0050
Epoch 0, Loss: 0.6455
Epoch 100, Loss: 0.0203
Epoch 200, Loss: 0.0063
Epoch 300, Loss: 0.0058
Epoch 400, Loss: 0.0057
Epoch 500, Loss: 0.0057
Epoch 600, Loss: 0.0057
Epoch 700, Loss: 0.0057
Epoch 800, Loss: 0.0056
Epoch 900, Loss: 0.0056
Epoch 0, Loss: 0.0054
Early stopping at epoch 20, best loss: 0.0054
Epoch 0, Loss: 0.0054
Early stopping at epoch 20, best loss: 0.0054
Epoch 0, Loss: 0.0050
Epoch 100, Loss: 0.0049
Epoch 200, Loss: 0.0049
Epoch 300, Loss: 0.0049
Epoch 400, Loss: 0.0049
Epoch 500, Loss: 0.0049
Early stopping at epoch 593, best loss: 0.0049
Avg. distances no weight decay 1.8564770758152007
Losses [0.005, 0.0056, 0.0054, 0.0054, 0.0049]
Epoch 0, Loss: 0.1495
Epoch 100, Loss: 0.0732
Epoch 200, Loss: 0.0555
Epoch 300, Loss: 0.0478
Epoch 40

## SparseNN Teacher, DenseNN Student

# Old Stuff

## Sparse NN defines the sparse function

Define "teacher" model that generates a Dataset X, y

QUESTION: why use a CNN? Couldn't I also manually add sparsity?

In [3]:
class SparseCNN_1(nn.Module):
    def __init__(self):
        super(SparseCNN_1, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=4, kernel_size=2, stride=1, padding=0, bias=False)
        self.conv2 = nn.Conv1d(in_channels=4, out_channels=2, kernel_size=2, stride=1, padding=0, bias=False)
        self.fc = nn.Linear(2, 1, bias=False)
        
    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension for CNN
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.tanh(self.fc(x))  # Apply tanh activation for positive/negative output
        return x

# Generate initial training data
X_train = torch.tensor(np.random.randn(TEST_SET_SIZE, 3), dtype=torch.float32)
y_train = (torch.randn(TEST_SET_SIZE) > 0).float().unsqueeze(1)  # Random labels for initial training

# Train the CNN model for one epoch
teacher_model = SparseCNN_1()
optimizer = optim.SGD(teacher_model.parameters(), lr=0.01)
loss_fn = nn.HingeEmbeddingLoss()

best_loss = float('inf')
patience_counter = 0

for epoch in range(50):
    optimizer.zero_grad()
    y_pred = torch.tanh(teacher_model(X_train))
    loss = loss_fn(y_pred, y_train)
    loss.backward()
    optimizer.step()

Dataset

In [4]:
# Use Teacher CNN to generate a new dataset
X_generated = torch.tensor(np.random.randn(TEST_SET_SIZE, 3), dtype=torch.float32)
print(teacher_model(X_generated).detach().numpy())
y_generated = (teacher_model(X_generated).detach().numpy() > 0).astype(int)  # Use trained model to create labels
y_generated = torch.tensor(y_generated, dtype=torch.float32)
print(X_generated)
print(y_generated)

[[-0.16926415]
 [-0.31554717]
 [-0.2561143 ]
 [-0.2923395 ]
 [-0.13538507]
 [-0.31636113]
 [-0.3006882 ]
 [-0.23830342]
 [-0.17360625]
 [-0.06610367]
 [-0.25173333]
 [-0.17179686]
 [-0.12237118]
 [-0.17486657]
 [-0.20299232]
 [-0.3394934 ]
 [-0.27969295]
 [-0.0836275 ]
 [-0.18193364]
 [-0.12299453]
 [-0.03974314]
 [-0.16686611]
 [-0.21119726]
 [-0.3473768 ]
 [-0.3602215 ]
 [-0.35807398]
 [-0.2445311 ]
 [-0.18066874]
 [-0.07693703]
 [-0.23945016]
 [-0.2826171 ]
 [-0.26390702]
 [-0.19646007]
 [-0.18455684]
 [-0.3121959 ]
 [-0.17985696]
 [-0.17452414]
 [-0.13670464]
 [-0.07541537]
 [-0.43158647]
 [-0.10061846]
 [-0.16781142]
 [-0.38297015]
 [-0.03491351]
 [-0.3891763 ]
 [-0.21923736]
 [-0.19632228]
 [-0.20115846]
 [-0.13250977]
 [-0.11153005]
 [-0.37487736]
 [-0.14422806]
 [-0.36020735]
 [-0.10803255]
 [-0.09847488]
 [-0.1610655 ]
 [-0.09989656]
 [-0.20776303]
 [-0.11780811]
 [-0.29866627]
 [-0.21608327]
 [-0.2601084 ]
 [-0.3309865 ]
 [-0.29319105]
 [-0.2750859 ]
 [-0.34698918]
 [-0.13343

Define fully-connected student model

In [5]:
# Training with early stopping
def train_model(model, X_train, y_train, optimizer, loss_fn, l1_lambda = 0):
    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(2000):
        optimizer.zero_grad()
        y_pred = torch.tanh(model(X_train))  # Apply sigmoid for binary classification
        loss = loss_fn(y_pred, y_train)

        # for the case where we have L1 norm
        l1_norm = sum(p.abs().sum() for p in model.parameters())
        loss += l1_norm * l1_lambda #is zero if l1_lambda is 0

        # end of that case
        loss.backward()
        optimizer.step()
        
        if epoch % 100 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item():.4f}')
        
        # Early stopping condition
        if loss.item() < best_loss:
            best_loss = loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"Early stopping at epoch {epoch}, best loss: {best_loss:.4f}")
                break

    # Evaluate model
    X_test = torch.tensor(np.random.randn(10, 3), dtype=torch.float32)
    y_test_pred = (torch.tanh(model(X_test)).detach().numpy() > 0).astype(int)  # Convert to binary output
    print("Binary Output Sequence:", y_test_pred.flatten())

In [6]:
# "Student" Model (fully connected with same number of neurons as teacher)
class DenseNN(nn.Module):
    def __init__(self):
        super(DenseNN, self).__init__()
        self.fc1 = nn.Linear(3, 4, bias=False)  # Fully connected layers without bias
        self.fc2 = nn.Linear(4, 2, bias=False)
        self.fc3 = nn.Linear(2, 1, bias=False)

        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x))  # Sigmoid for binary classification
        return x

# Train the Student Model on the Generated Dataset
student_model = DenseNN()
sgd_optim = optim.SGD(student_model.parameters(), lr=0.01) #, weight_decay=1e-4) #L2 regularization
loss_fn = nn.HingeEmbeddingLoss()  # Binary cross-entropy loss
train_model(student_model, X_train=X_generated, y_train=y_generated, optimizer=sgd_optim, loss_fn=loss_fn)


Epoch 0, Loss: 1.0000
Early stopping at epoch 10, best loss: 1.0000
Binary Output Sequence: [0 0 0 0 1 0 0 0 0 0]


In [53]:
# print the weights
print("Teacher Model Weights:")
for name, param in teacher_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.data}")

print("\nStudent Model Weights:")
for name, param in student_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.data}")

Teacher Model Weights:
conv1.weight: tensor([[[ 0.6242, -0.6786]],

        [[ 0.5425, -0.0254]],

        [[-0.0823,  0.4446]],

        [[-0.0646,  0.4403]]])
conv2.weight: tensor([[[ 0.2979, -0.3002],
         [ 0.1592,  0.0759],
         [ 0.0830,  0.0675],
         [ 0.1038, -0.1544]],

        [[-0.0393,  0.3409],
         [ 0.2412, -0.0113],
         [-0.3310,  0.0182],
         [-0.2393,  0.2901]]])
fc.weight: tensor([[-0.4594, -0.0844]])

Student Model Weights:
fc1.weight: tensor([[-0.1281,  0.1027,  0.5433],
        [ 0.0549,  0.3344,  0.4481],
        [ 0.4661, -0.1994, -0.1291],
        [ 0.2782, -0.1575,  0.2703]])
fc2.weight: tensor([[-0.1092, -0.3391,  0.2035,  0.0767],
        [ 0.2229,  0.4967,  0.3414,  0.4740]])
fc3.weight: tensor([[ 0.0378, -0.6083]])


## Define Sparse Function, then train CNN and Dense NN on this and compare

f(x) = sin(x1) + log(1 + abs(x2)) + x3^2

Models learn: y = 1 if f(x) > 0, else 0 


F(x) = Conv1(x) + ReLU(Conv2(x)) + x^2 

In [None]:
# Generate training data
np.random.seed(42)
torch.manual_seed(42)
X_train = torch.tensor(np.random.randn(TEST_SET_SIZE, 3), dtype=torch.float32)
y_train = torch.sin(X_train[:, 0]) + torch.log(1 + torch.abs(X_train[:, 1])) + X_train[:, 2] ** 2

y_train = (y_train > 0).float().unsqueeze(1)  # Label is 1 if positive, 0 if negative

In [None]:
# Define the sparse function using a CNN
class SparseCNN_2(nn.Module):
    def __init__(self):
        super(SparseCNN_2, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=4, kernel_size=2, stride=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=4, out_channels=2, kernel_size=2, stride=1, padding=0)
        self.conv3 = nn.Conv1d(in_channels=2, out_channels=1, kernel_size=2, stride=1)
        self.fc = nn.Linear(2, 1)  # Fully connected layer to output a single value
        
    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dimension for CNN
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.tanh(self.fc(x))  # Apply tanh activation for positive/negative output
        return x

In [None]:
#Train model with early stopping
def train_model(model):
    optimizer = optim.SGD(model.parameters(), lr=0.01) # Could vary here
    loss_fn = nn.BCELoss()  # Binary cross-entropy loss

    best_loss = float('inf')
    patience_counter = 0

    for epoch in range(500):
        optimizer.zero_grad()
        y_pred = torch.sigmoid(model(X_train))  # Apply sigmoid for binary classification
        loss = loss_fn(y_pred, y_train)
        loss.backward()
        optimizer.step()
        
        if epoch % 20 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item():.4f}')
        
        # Early stopping condition
        if loss.item() < best_loss:
            best_loss = loss.item()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print(f"Early stopping at epoch {epoch}, best loss: {best_loss:.4f}")
                break

    #Evaluate model
    # Generate and print binary output sequence
    X_test = torch.tensor(np.random.randn(10, 3), dtype=torch.float32)
    y_test_pred = (torch.sigmoid(model(X_test)).detach().numpy() > 0).astype(int)  # Convert to binary output
    print("Binary Output Sequence:", y_test_pred.flatten())

In [None]:
# Train the CNN model with early stopping
model = SparseCNN_2()

In [None]:
# print the weights
print("Teacher Model Weights:")
for name, param in teacher_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.data}")

print("\nStudent Model Weights:")
for name, param in student_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.data}")