In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [12]:
class myModel(nn.Module):
    
    def __init__(self, input_dims, out_dims):
        super().__init__()
        
        self.layer1 = nn.Linear(input_dims, out_dims)
        
    def forward(self, x):
        
        x = self.layer1(x)
        
        return x
    
model = myModel(16, 128)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

data = torch.rand(16)

model(data)

tensor([-0.4178,  0.0527,  0.2860, -0.2160, -0.9628, -0.0234,  0.1412, -0.2157,
        -0.1364,  0.4179, -0.4966, -0.5282,  0.4678,  0.1319,  0.3803,  0.7978,
        -0.0436, -0.0223, -0.1968,  0.3369,  0.3349, -0.7447, -0.4242, -0.0657,
         0.0318, -0.5602, -0.0867,  0.0194, -0.4563,  0.0027, -0.1443,  0.1436,
         0.7101,  0.4038, -0.1858,  0.2627,  0.2492,  0.1637,  0.3377, -0.6230,
        -0.4291,  0.0948,  0.6636, -0.3977,  0.6548, -0.2269, -0.3907, -0.7490,
        -0.1500,  0.3006, -0.3635,  0.4097, -0.1951, -0.0927,  0.5026,  0.8984,
         0.6316,  0.0168, -0.8660,  0.2549,  0.6663,  0.5140, -0.0759, -0.3557,
        -0.5989, -0.4600, -0.3807,  0.4687,  0.4531,  0.4831, -0.8452,  0.4256,
        -0.4777,  0.4604, -0.0837,  0.3188, -0.5522, -0.7701,  0.2572, -0.1687,
         0.2905,  0.7840,  0.3558,  0.1854, -0.1331, -0.0621,  0.2936, -0.2118,
        -0.4452, -0.0603, -0.2236,  0.3912, -0.2629,  0.4622, -0.0030,  0.2306,
         0.1220, -1.0691,  0.1937, -0.33

In [10]:
class myModel(nn.Module):
    
    def __init__(self, input_dims, out_dims, rank):
        super().__init__()
        
        self.layer1 = nn.Linear(input_dims, out_dims)
        self.A = nn.Linear(input_dims, rank)
        self.B = nn.Linear(rank, out_dims)
        
    def forward(self, x):
        
        AcrossB = self.A @ self.B
        out = self.layer1 + AcrossB
        x = out(x)
        
        return x
    
model = myModel(16, 128, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

data = torch.rand(16)

model(data)

TypeError: unsupported operand type(s) for @: 'Linear' and 'Linear'

# working lora

In [15]:
import torch
import torch.nn as nn

class myModel(nn.Module):
    
    def __init__(self, input_dims, out_dims, rank):
        super().__init__()
        
        self.layer1 = nn.Linear(input_dims, out_dims)
        self.rank = rank

        # Initialize LoRA matrices A and B
        self.A = nn.Parameter(torch.randn(out_dims, rank))
        self.B = nn.Parameter(torch.randn(rank, input_dims))
        
    def forward(self, x):
        # Modify layer1 weights with LoRA adaptation
        modified_W = self.layer1.weight + self.A @ self.B
        x = torch.nn.functional.linear(x, modified_W, self.layer1.bias)
        return x

# Model and optimizer
model = myModel(16, 128, rank=5)  # Assuming rank=5 for LoRA
# optimizer = torch.optim.AdamW([model.A, model.B], lr=1e-3)  # Optimize only A and B

params_to_update = [param for name, param in model.named_parameters() if "layer1" not in name]
optimizer = torch.optim.AdamW(params_to_update, lr=1e-3)

# Sample data
data = torch.rand(1, 16)  # Adjust the shape for batch processing

# Forward pass
output = model(data)

output

tensor([[ 2.0814, -3.3569,  0.2003,  2.6699,  3.0409, -2.9248, -0.8539, -1.1681,
          5.6208,  5.4735,  3.5509, -0.7432, -3.7090, -0.2704, -2.5739,  4.2626,
          1.8779, -3.5988,  1.5255, -0.9325,  0.6537, -0.7779,  2.6920, -6.6480,
         -1.7500,  4.5094, -4.5425,  4.1396, -1.9845, -0.1102, -3.1174,  3.3400,
          2.3653,  1.5188,  1.8876, -0.5312, -5.4253,  2.0907, -1.2952, -0.4010,
          1.8288,  1.3090, -0.3476, -2.7510, -3.4891,  5.9249, -5.0030, -3.8181,
         -1.8146, -1.9316,  2.5059,  0.5311, -9.0898,  2.4114, -0.5630,  0.9388,
         -4.0803, -3.2848,  2.8547,  3.5545, -2.4992,  2.6213,  2.4592, -3.3518,
         -0.6746,  0.3652, -2.0115, -1.1867,  1.3993,  0.1393,  4.0982,  2.1443,
         -0.0917, -2.9917,  1.3907, -3.2344,  0.1646, -2.5243,  2.3510, -1.5404,
         -1.5686, -1.1608,  3.4435,  4.6569,  3.2349,  0.5873, -5.2667,  2.7617,
         -0.1261,  0.0996,  1.7889, -0.4298,  1.3854, -5.4287, -0.0255,  1.9962,
         -7.7961,  1.8806, -

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time



class myModel(nn.Module):
    
    def __init__(self, input_dims, out_dims):
        super().__init__()
        
        self.layer1 = nn.Linear(input_dims, out_dims)
        
    def forward(self, x):
        
        x = self.layer1(x)
        
        return x
    
model = myModel(10000, 100000).to('cuda')
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

data = torch.rand(10000).to('cuda')

start_time = time()

output = model(data)

end_time = time()

total_time = end_time - start_time

print("total time = ", total_time)
output.shape

total time =  0.3035280704498291


torch.Size([100000])

In [6]:
class myModel(nn.Module):
    
    def __init__(self, input_dims, out_dims, rank):
        super().__init__()
        
        self.layer1 = nn.Linear(input_dims, out_dims)
        self.rank = rank

        # Initialize LoRA matrices A and B
        self.A = nn.Parameter(torch.randn(out_dims, rank))
        self.B = nn.Parameter(torch.randn(rank, input_dims))
        
    def forward(self, x):
        # Modify layer1 weights with LoRA adaptation
        modified_W = self.layer1.weight + self.A @ self.B
        x = torch.nn.functional.linear(x, modified_W, self.layer1.bias)
        return x

# Model and optimizer
model = myModel(10000, 100000, rank=5).to('cuda')  # Assuming rank=5 for LoRA
# optimizer = torch.optim.AdamW([model.A, model.B], lr=1e-3)  # Optimize only A and B

params_to_update = [param for name, param in model.named_parameters() if "layer1" not in name]
optimizer = torch.optim.AdamW(params_to_update, lr=1e-3)

# Sample data
data = torch.rand(10000).to('cuda')  # Adjust the shape for batch processing

start_time = time()

# Forward pass
output = model(data)

end_time = time()

total_time = end_time - start_time

print("total time = ", total_time)
output.shape

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.73 GiB (GPU 0; 10.92 GiB total capacity; 7.46 GiB already allocated; 2.82 GiB free; 7.47 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [16]:
import torch
import torch.nn as nn
from time import time

class MyModelLoRA(nn.Module):
    
    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()

        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)

        self.rank = rank

        # Initialize LoRA matrices A and B for each layer
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank)) for _ in range(num_layers)])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, input_dims if i == 0 else hidden_dims)) for i in range(num_layers)])
        
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            modified_W = layer.weight + self.A[i] @ self.B[i]
            x = torch.nn.functional.linear(x, modified_W, layer.bias)
            x = torch.nn.functional.relu(x)

        x = self.output_layer(x)
        return x

# Model and optimizer
model = MyModelLoRA(10000, 1000, 128, rank=5, num_layers=100).to('cuda')

# Freeze original layers
for layer in model.layers:
    for param in layer.parameters():
        param.requires_grad = False

# Only optimize LoRA parameters and output layer
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)

# Sample data
data = torch.rand(64,10000).to('cuda')

start_time = time()

# Forward pass
output = model(data)

end_time = time()

total_time = end_time - start_time

print("total time with LoRA = ", total_time)
print(output.shape)


total time with LoRA =  0.0157167911529541
torch.Size([64, 128])


In [15]:
import torch
import torch.nn as nn
from time import time

class MyModelStandard(nn.Module):
    
    def __init__(self, input_dims, hidden_dims, out_dims, num_layers):
        super().__init__()

        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)
        
    def forward(self, x):
        for layer in self.layers:
            x = torch.relu(layer(x))

        x = self.output_layer(x)
        return x

# Model and optimizer
model = MyModelStandard(10000, 1000, 128, num_layers=100).to('cuda')
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Sample data
data = torch.rand(64, 10000).to('cuda')

start_time = time()

# Forward pass
output = model(data)

end_time = time()

total_time = end_time - start_time

print("total time without LoRA = ", total_time)
print(output.shape)


total time without LoRA =  0.011386871337890625
torch.Size([64, 128])


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelLoRA(nn.Module):
    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)
        self.rank = rank
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank)) for _ in range(num_layers)])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, input_dims if i == 0 else hidden_dims)) for i in range(num_layers)])
        
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            modified_W = layer.weight + self.A[i] @ self.B[i]
            x = F.linear(x, modified_W, layer.bias)
            x = F.relu(x)
        x = self.output_layer(x)
        return x

# Model, optimizer, and loss function
model = MyModelLoRA(10000, 1000, 128, rank=5, num_layers=100).to('cuda')
for layer in model.layers:
    for param in layer.parameters():
        param.requires_grad = False
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(100):  # Number of batches
        # Generate random data and labels
        data = torch.rand(batch_size, 10000).to('cuda')
        target = torch.rand(batch_size, 128).to('cuda')

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/100:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


Epoch 1/5, Loss: nan, Time: 3.10 seconds
Epoch 2/5, Loss: nan, Time: 3.14 seconds
Epoch 3/5, Loss: nan, Time: 3.33 seconds
Epoch 4/5, Loss: nan, Time: 3.28 seconds
Epoch 5/5, Loss: nan, Time: 3.18 seconds
Training completed.


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelStandard(nn.Module):
    def __init__(self, input_dims, hidden_dims, out_dims, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)
        
    def forward(self, x):
        for layer in self.layers:
            x = F.relu(layer(x))
        x = self.output_layer(x)
        return x

# Model, optimizer, and loss function
model = MyModelStandard(10000, 1000, 128, num_layers=300).to('cuda')
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(100):  # Number of batches
        # Generate random data and labels
        data = torch.rand(batch_size, 10000).to('cuda')
        target = torch.rand(batch_size, 128).to('cuda')

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/100:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


Epoch 1/5, Loss: 0.1066, Time: 13.62 seconds
Epoch 2/5, Loss: 0.0837, Time: 12.94 seconds


KeyboardInterrupt: 

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelLoRA(nn.Module):
    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)
        self.rank = rank

        # Initialize LoRA matrices A and B with smaller scale
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank) * 0.01) for _ in range(num_layers)])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, input_dims if i == 0 else hidden_dims) * 0.01) for i in range(num_layers)])
        
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            modified_W = layer.weight + self.A[i] @ self.B[i]
            x = F.linear(x, modified_W, layer.bias)
            x = F.relu(x)
        x = self.output_layer(x)
        return x

# Model, optimizer, and loss function
model = MyModelLoRA(10000, 1000, 128, rank=30, num_layers=300).to('cuda')
for layer in model.layers:
    for param in layer.parameters():
        param.requires_grad = False
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64
grad_clip = 1.0  # Gradient clipping threshold

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(100):  # Number of batches
        # Generate random data and labels
        data = torch.rand(batch_size, 10000).to('cuda')
        target = torch.rand(batch_size, 128).to('cuda')

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        nn.utils.clip_grad_norm_(params_to_optimize, grad_clip)

        optimizer.step()

        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/100:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


Epoch 1/5, Loss: 0.1054, Time: 8.54 seconds
Epoch 2/5, Loss: 0.0836, Time: 8.52 seconds
Epoch 3/5, Loss: 0.0836, Time: 8.71 seconds
Epoch 4/5, Loss: 0.0837, Time: 8.66 seconds
Epoch 5/5, Loss: 0.0837, Time: 8.62 seconds
Training completed.
