In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer


In [None]:
class SparseAutoencoder(nn.Module):
    """Single-hidden-layer autoencoder with an L1 sparsity penalty on its code.

    The encoder is ``Linear -> ReLU`` and the decoder is a single ``Linear``
    mapping the code back to the input dimension.

    Args:
        input_dim: Size of the last dimension of the inputs to reconstruct.
        hidden_dim: Number of learned features (width of the code).
        l1_penalty: Weight of the L1 term added to the reconstruction loss.
        l1_reduction: How the L1 term is aggregated. ``"sum"`` (default, the
            original behaviour) adds up every code entry, so the penalty grows
            with the number of samples/tokens while the MSE term, being a mean,
            does not. ``"mean"`` sums over the feature dimension and averages
            over all leading dimensions, which keeps the balance between the
            two terms independent of batch size.

    Raises:
        ValueError: If ``l1_reduction`` is not ``"sum"`` or ``"mean"``.
    """

    _L1_REDUCTIONS = ("sum", "mean")

    def __init__(self, input_dim, hidden_dim, l1_penalty, l1_reduction="sum"):
        super().__init__()
        if l1_reduction not in self._L1_REDUCTIONS:
            raise ValueError(
                f"l1_reduction must be one of {self._L1_REDUCTIONS}, got {l1_reduction!r}"
            )
        self.encoder = nn.Linear(input_dim, hidden_dim)
        self.decoder = nn.Linear(hidden_dim, input_dim)
        self.l1_penalty = l1_penalty
        self.l1_reduction = l1_reduction

    def forward(self, x):
        """Encode ``x`` and reconstruct it.

        Returns:
            A ``(decoded, encoded)`` tuple: the reconstruction and the
            non-negative (post-ReLU) code it was decoded from.
        """
        encoded = torch.relu(self.encoder(x))
        decoded = self.decoder(encoded)
        return decoded, encoded

    def loss_function(self, recon_x, x, encoded):
        """Return reconstruction MSE plus the weighted L1 norm of ``encoded``.

        Args:
            recon_x: Reconstruction produced by ``forward``.
            x: Original input that ``recon_x`` should match.
            encoded: Code produced by ``forward``; its absolute values are
                penalised to encourage sparsity.
        """
        # Functional form avoids building a throwaway nn.MSELoss module per call.
        mse_loss = nn.functional.mse_loss(recon_x, x)
        abs_code = torch.abs(encoded)
        if self.l1_reduction == "mean":
            # Per-sample L1 norm over the feature axis, averaged over the rest.
            l1_term = abs_code.sum(dim=-1).mean()
        else:
            l1_term = torch.sum(abs_code)
        return mse_loss + self.l1_penalty * l1_term


In [None]:
# Pretrained DistilBERT checkpoint used as the source of activations.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# nn.Module.eval() returns the module itself, so loading the weights and
# switching to evaluation mode can be chained into one expression.
model = AutoModel.from_pretrained(model_name).eval()


In [None]:
# Single example sentence whose activations are inspected below.
input_text = "Hello, how are you?"
# Tokenize to PyTorch tensors ('pt') so the result can be unpacked into the model call.
inputs = tokenizer(text=input_text, return_tensors='pt')

# Filled in by the forward hooks: hook name -> detached module output.
activations = dict()

def get_activation(name):
    """Build a forward hook that stores a module's output under ``name``.

    The returned callable has the ``(module, inputs, output)`` signature that
    ``register_forward_hook`` expects. When invoked it writes a detached copy
    of the output into the module-level ``activations`` dict, replacing any
    previous entry stored under the same name.
    """
    def _capture(module, module_inputs, module_output):
        # Detach so the stored tensor carries no autograd history.
        detached = module_output.detach()
        activations[name] = detached
    return _capture

# Register hooks for specific layers.
# DistilBERT's transformer blocks have no `.output` submodule (that is BERT's
# layout), so `layer[0].output` raises AttributeError. The last operation of a
# DistilBERT block is `output_layer_norm`; its output tensor is the block's
# hidden state, which is what we want to capture here.
hook1 = model.embeddings.register_forward_hook(get_activation('embeddings'))
hook2 = model.transformer.layer[0].output_layer_norm.register_forward_hook(
    get_activation('layer_0_output')
)

# Forward pass to get activations. The hooks are removed in `finally` so that a
# failing forward pass cannot leave them attached (they would otherwise pile up
# every time this cell is re-run).
try:
    with torch.no_grad():
        outputs = model(**inputs)
finally:
    hook1.remove()
    hook2.remove()

# Accessing the captured activations
embeddings_activation = activations['embeddings']
layer_0_output_activation = activations['layer_0_output']


In [None]:
# Layer 0 output as input for the Sparse Autoencoder
input_dim = layer_0_output_activation.size(-1)  # Dimension of the activation
hidden_dim = 64  # Number of features in the autoencoder
l1_penalty = 0.001

# Fix the RNG so the autoencoder's random weight initialisation, and therefore
# the whole loss curve, is identical on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

sparse_autoencoder = SparseAutoencoder(input_dim, hidden_dim, l1_penalty)
optimizer = optim.Adam(sparse_autoencoder.parameters(), lr=0.001)
n_epochs = 100

# Training loop for the Sparse Autoencoder using the extracted activation.
# Every epoch is one full-batch gradient step on the same captured activation.
for epoch in range(n_epochs):
    optimizer.zero_grad()
    recon_data, encoded_features = sparse_autoencoder(layer_0_output_activation)
    loss = sparse_autoencoder.loss_function(recon_data, layer_0_output_activation, encoded_features)
    loss.backward()
    optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


In [None]:
def feature_steering(model, feature_vector, feature_index=0, scale=2):
    """Scale one feature of ``feature_vector`` and run ``model`` on the result.

    Args:
        model: Callable applied to the modified feature tensor.
        feature_vector: Tensor whose last dimension indexes features. It is
            cloned first, so the caller's tensor is left untouched.
        feature_index: Position along the last dimension to scale. Defaults to
            the first feature.
        scale: Multiplier applied to the selected feature. Defaults to 2.

    Returns:
        Whatever ``model`` returns for the modified features. The call runs
        under ``torch.no_grad()``.
    """
    with torch.no_grad():
        modified_output = feature_vector.clone()

        # Index the LAST axis so only the chosen feature is scaled. Plain
        # `[0]` would select the first row along the leading (batch) axis and
        # scale every feature in it.
        modified_output[..., feature_index] *= scale

        # Forward pass with modified features
        outputs = model(modified_output)

    return outputs

# `encoded_features` live in the autoencoder's feature space, so they must go
# through the SAE decoder, which maps them back to layer-0 activation space.
# Passing them to the DistilBERT `model` fails: its first positional argument is
# `input_ids`, which must be integer token ids, not float features.
# NOTE(review): to see the effect on DistilBERT's final output, the decoded
# activation would have to be patched back into layer 0 (e.g. via a forward
# hook) and the model re-run on `inputs`.
steered_output = feature_steering(sparse_autoencoder.decoder, encoded_features)
print("Steered Output:", steered_output)