In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first sequence.

    A table of shape ``(1, max_len, d_model)`` is precomputed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(position * freq)`` and odd feature columns hold
    ``cos(position * freq)``, where ``freq = 10000 ** (-2i / d_model)`` for
    the i-th sin/cos pair.

    Args:
        d_model: Size of the feature (last) dimension. May be even or odd.
        max_len: Number of positions to precompute; the longest sequence that
            ``forward`` accepts.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        # (max_len, ceil(d_model / 2)): one angle per position and frequency.
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd column than even columns, so the
        # cosine half must drop the last frequency to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading singleton dimension lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor indexed as ``(batch, seq_len, d_model)``; dimension 1 is
                treated as the position axis.

        Returns:
            Tensor of the same shape as ``x``.

        Raises:
            RuntimeError: If ``x.size(1)`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.size(1)
        available = self.pe.size(1)
        if seq_len > available:
            # Without this check the addition fails with an opaque broadcast
            # error (or silently broadcasts when max_len == 1).
            raise RuntimeError(
                f"sequence length {seq_len} exceeds the maximum supported length {available}"
            )
        return x + self.pe[:, :seq_len]

class TransformerEncoderToVector(nn.Module):
    """Encode a batch of token-id sequences into one value per sequence.

    Pipeline: token embedding (scaled by ``sqrt(d_model)``) -> positional
    encoding -> stacked Transformer encoder layers -> pooling over the
    sequence dimension -> linear projection to a single output feature.

    Args:
        input_dim: Number of distinct token ids (embedding table size).
        d_model: Embedding / model width.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Feed-forward width passed to each encoder layer.
        max_len: Maximum sequence length handed to the positional encoder.
        pooling: How the per-token encoder outputs are reduced to one vector
            per sequence: ``"mean"`` (average over tokens, the default),
            ``"max"`` (element-wise maximum over tokens) or ``"first"``
            (output of the first token).

    Raises:
        ValueError: If ``pooling`` is not one of the supported modes.
    """

    _POOLING_MODES = ("mean", "max", "first")

    def __init__(self, input_dim, d_model, nhead, num_encoder_layers, dim_feedforward,
                 max_len=5000, pooling="mean"):
        super().__init__()
        if pooling not in self._POOLING_MODES:
            raise ValueError(
                f"pooling must be one of {self._POOLING_MODES}, got {pooling!r}"
            )
        self.d_model = d_model
        self.pooling = pooling
        # Sub-modules are created in the same order and under the same
        # attribute names as before, so seeded initialisation and saved
        # state_dict keys are unaffected.
        self.embedding = nn.Embedding(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(d_model, 1)

    def _pool(self, encoded):
        """Reduce ``(batch, seq_len, d_model)`` to ``(batch, d_model)`` per ``self.pooling``."""
        if self.pooling == "mean":
            return encoded.mean(dim=1)
        if self.pooling == "max":
            return encoded.max(dim=1).values
        if self.pooling == "first":
            return encoded[:, 0, :]
        # Only reachable if the attribute was overwritten after construction.
        raise ValueError(
            f"pooling must be one of {self._POOLING_MODES}, got {self.pooling!r}"
        )

    def forward(self, src):
        """Map token ids of shape ``(batch, seq_len)`` to a ``(batch, 1)`` tensor.

        Args:
            src: Integer tensor of token ids, batch first.

        Returns:
            Tensor of shape ``(batch, 1)``: the linear projection of the
            pooled encoder output.
        """
        # A plain Python float avoids allocating a fresh tensor on every call.
        embedded = self.embedding(src) * (self.d_model ** 0.5)
        embedded = self.pos_encoder(embedded)
        encoded = self.transformer_encoder(embedded)
        pooled = self._pool(encoded)
        return self.fc(pooled)

# Demo: push a random batch of token ids through the encoder.
input_dim = 1000            # vocabulary size (number of distinct token ids)
d_model = 512               # embedding / model width
nhead = 8                   # attention heads per layer
num_encoder_layers = 6      # stacked encoder layers
dim_feedforward = 2048      # feed-forward width passed to each encoder layer

model = TransformerEncoderToVector(
    input_dim=input_dim,
    d_model=d_model,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
)

# 32 sequences of 10 token ids each, drawn from [0, input_dim).
src = torch.randint(low=0, high=input_dim, size=(32, 10))

output = model(src)
print(output.shape)  # expected: one value per sequence -> torch.Size([32, 1])



torch.Size([32, 1])


In [3]:
import torch
import torch.nn as nn

class BlockLinear(nn.Module):
    """Linear map computed as a sum of smaller per-block linear layers.

    The last dimension of the input is cut into consecutive chunks of at most
    ``block_size`` features. Each chunk is passed through its own
    ``nn.Linear(chunk_width, out_features)`` and the per-chunk outputs are
    added together. The final chunk is narrower when ``block_size`` does not
    divide ``in_features``.

    Args:
        in_features: Width of the input's last dimension.
        out_features: Width of the output's last dimension.
        block_size: Maximum number of input features handled by one block.

    Raises:
        ValueError: If ``in_features`` or ``block_size`` is not positive.
    """

    def __init__(self, in_features, out_features, block_size):
        super().__init__()
        if in_features <= 0:
            raise ValueError(f"in_features must be positive, got {in_features}")
        if block_size <= 0:
            raise ValueError(f"block_size must be positive, got {block_size}")
        self.in_features = in_features
        self.out_features = out_features
        self.block_size = block_size

        # One small linear layer per chunk, created in input order.
        self.linear_blocks = nn.ModuleList([
            nn.Linear(min(block_size, in_features - start), out_features)
            for start in range(0, in_features, block_size)
        ])

    def forward(self, x):
        """Apply the block-wise linear map along the last dimension of ``x``.

        Args:
            x: Tensor whose last dimension has size ``in_features``; any
                number of leading dimensions is accepted.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``out_features``.

        Raises:
            RuntimeError: If the last dimension of ``x`` is not ``in_features``.
        """
        if x.dim() == 0 or x.shape[-1] != self.in_features:
            # An input that is too narrow but aligned to the block size would
            # otherwise be processed by only some of the blocks and silently
            # yield a wrong result.
            raise RuntimeError(
                f"expected last dimension of size {self.in_features}, "
                f"got input of shape {tuple(x.shape)}"
            )

        # torch.split yields views, one per block, in the same order as
        # self.linear_blocks.
        chunks = torch.split(x, self.block_size, dim=-1)

        # Accumulate directly rather than stacking every partial output into
        # an extra (..., out_features, n_blocks) tensor first.
        output = None
        for layer, chunk in zip(self.linear_blocks, chunks):
            partial = layer(chunk)
            output = partial if output is None else output + partial
        return output

# Example usage
in_features = 100000
out_features = 1000
block_size = 10000  # Block size

# Run on the GPU when one is available, otherwise fall back to the CPU so the
# cell still executes on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Batch size of 32, input features of 100000; allocated directly on the
# target device instead of being created on the CPU and copied over.
x = torch.randn(32, in_features, device=device)
# NOTE: this rebinds the notebook-level name `model`.
model = BlockLinear(in_features, out_features, block_size).to(device)

out = model(x)
print(out.shape)  # Output shape should be (32, out_features)



torch.Size([32, 1000])
