In [1]:
# Jupyter Notebook Cell
import torch
import time
import numpy as np
import pandas as pd

# -------------------------------
# Configuration Parameters
# -------------------------------

# Fixed Dimensions
# NOTE: every name assigned in this section except `m` is assigned again by
# the second configuration section further down in this same cell, so the
# later values are the ones the benchmark below actually runs with.
L = 32          # Size of the first Lorta factor; also multiplies the Lora tensor count
h = 32          # Size of the second Lorta factor; must divide d_out
d = 4096        # Used as d_in for Lora and Lorta; also multiplies the Lora tensor count
d_out = 128     # Only used through d_out // h, the size of the third Lorta factor
m = 2           # Not referenced by any code in this cell

# Variable Parameters
r_list = [4, 64]             # Ranks to test
N_list = [10, 100]        # Values of N to test; N scales how many tensors are prepared (not the timing repetitions)

# Transfer settings
n_repetitions = 10  # Number of timed transfers per (r, N) combination

# Jupyter Notebook Cell

import torch
import time
import numpy as np
import pandas as pd

# -------------------------------
# Configuration Parameters
# -------------------------------

# Fixed Dimensions
L = 80          # Size of the first Lorta factor; also multiplies the Lora tensor count
h = 32          # Size of the second Lorta factor; must divide d_out
d = 4096        # Used as d_in for Lora and Lorta; also multiplies the Lora tensor count
d_out = 128     # Only used through d_out // h, the size of the third Lorta factor

# Variable Parameters
r_list = [4, 64]             # Ranks to test
N_list = [1, 10, 100]        # Values of N to test; N scales how many tensors are prepared (not the timing repetitions)

# Transfer settings
n_repetitions = 10  # Number of timed transfers per (r, N) combination

# Output CSV filename
output_csv = 'gpu_transfer_times_enhanced.csv'

# -------------------------------
# Data Preparation Functions
# -------------------------------

def prepare_lora_tensors(N, r):
    """
    Build the single batched CPU tensor used for the Lora transfer benchmark.

    Rather than allocating N * L * d separate matrices, one tensor of
    standard-normal samples with shape (N * L * d, d, r) is created. L and d
    are the module-level configuration values, and d doubles as d_in.

    Args:
        N: Multiplier on the number of stacked matrices.
        r: Rank; the size of the last dimension.

    Returns:
        A torch.Tensor of shape (N * L * d, d, r).
    """
    # NOTE(review): the element count is N * L * d * d * r, which becomes
    # extremely large for the configured values -- confirm this shape is
    # really what the benchmark intends.
    stacked_shape = (N * L * d, d, r)
    return torch.randn(*stacked_shape)

def prepare_lorta_tensors(N, r):
    """
    Build the CPU factor tensors used for the Lorta transfer benchmark.

    Produces N tuples. Each tuple holds four standard-normal tensors shaped
    (L, r), (h, r), (d_out // h, r) and (d, r), in that order, where L, h,
    d_out and d are the module-level configuration values (d is used as d_in).

    Args:
        N: Number of tuples to create.
        r: Rank; the size of the last dimension of every tensor.

    Returns:
        A list of N four-element tuples of torch.Tensor.

    Raises:
        AssertionError: If d_out is not divisible by h.
    """
    assert d_out % h == 0, "d_out must be divisible by h."
    # Row count of each factor, in the order the tensors appear in a tuple.
    factor_rows = (L, h, d_out // h, d)
    return [
        tuple(torch.randn(rows, r) for rows in factor_rows)
        for _ in range(N)
    ]

# -------------------------------
# Warm-up GPU
# -------------------------------

def warm_up_gpu(lora_tensor_sample, lorta_tuples_sample):
    """
    Run one untimed round of host-to-GPU copies before measurements start.

    Args:
        lora_tensor_sample: Tensor copied to the GPU in a single call.
        lorta_tuples_sample: Iterable of tensor tuples; every tensor in every
            tuple is copied to the GPU individually.
    """
    print("Warming up GPU...")
    # The Lora sample goes over as one tensor.
    gpu_copy = lora_tensor_sample.cuda()
    # The Lorta sample is walked tuple by tuple, tensor by tensor.
    every_factor = (
        factor for factors in lorta_tuples_sample for factor in factors
    )
    for factor in every_factor:
        gpu_copy = factor.cuda()
    # Wait for the queued copies before reporting completion.
    torch.cuda.synchronize()
    print("Warm-up completed.\n")

# -------------------------------
# Transfer Timing Functions
# -------------------------------

def transfer_lora(lora_tensor):
    """
    Copy the Lora tensor to the GPU and wait until the copy has finished.

    The GPU copy is only bound to a local name and is not returned.

    Args:
        lora_tensor: CPU tensor to transfer.
    """
    device_copy = lora_tensor.cuda()
    # Synchronize so that a caller timing this function measures the
    # completed transfer rather than just the launch of the copy.
    torch.cuda.synchronize()
    return None

def transfer_lorta(lorta_tuples):
    """
    Copy every tensor of every Lorta tuple to the GPU, then wait for completion.

    The GPU copies are only bound to a local name and are not returned.

    Args:
        lorta_tuples: Iterable of tuples of CPU tensors.
    """
    every_factor = (factor for factors in lorta_tuples for factor in factors)
    for factor in every_factor:
        device_copy = factor.cuda()
    # One synchronization after all copies have been issued.
    torch.cuda.synchronize()

# -------------------------------
# Memory Cleanup Function
# -------------------------------

def cleanup_memory():
    """
    Release cached, currently unused GPU memory held by PyTorch.

    Only memory that is no longer referenced by any tensor can be released,
    so callers must drop their own references to GPU tensors (``del name``)
    before calling this function.

    Returns:
        None.
    """
    # Never `del torch` in here: a `del` statement makes `torch` a local
    # name of this function, so both the `del` and the call below would
    # fail with UnboundLocalError.
    torch.cuda.empty_cache()

# -------------------------------
# Measurement and Results
# -------------------------------

# Initialize a list to store results
results = []

# Iterate over each combination of rank and N
for r in r_list:
    for N in N_list:
        print(f"Testing with r={r} and N={N}...")

        # Prepare tensors. A RuntimeError raised while building them (for
        # example a failed allocation) skips this configuration instead of
        # aborting the whole sweep.
        try:
            lora_tensor = prepare_lora_tensors(N, r)
            lorta_tuples = prepare_lorta_tensors(N, r)
        except RuntimeError as e:
            print(f"  Error in tensor preparation: {e}")
            print(f"  Skipping r={r}, N={N}.\n")
            continue

        # Perform warm-up transfer
        # Use a small subset to avoid long warm-up times
        lora_tensor_sample = lora_tensor[:10] if N * L * d >= 10 else lora_tensor
        lorta_tuples_sample = lorta_tuples[:10] if N >= 10 else lorta_tuples
        warm_up_gpu(lora_tensor_sample, lorta_tuples_sample)

        # Initialize lists to store transfer times
        times_lora = []
        times_lorta = []

        print(f"  Starting transfer measurements for r={r}, N={N}...")

        for rep in range(1, n_repetitions + 1):
            print(f"    Repetition {rep}/{n_repetitions}")

            # Measure Lora transfer.
            # time.perf_counter() is a monotonic, high-resolution clock meant
            # for measuring short intervals; time.time() can jump when the
            # system clock is adjusted.
            start_time = time.perf_counter()
            transfer_lora(lora_tensor)
            transfer_time_lora = time.perf_counter() - start_time
            times_lora.append(transfer_time_lora)
            print(f"      Lora Transfer Time: {transfer_time_lora:.6f} seconds")

            # Measure Lorta transfer
            start_time = time.perf_counter()
            transfer_lorta(lorta_tuples)
            transfer_time_lorta = time.perf_counter() - start_time
            times_lorta.append(transfer_time_lorta)
            print(f"      Lorta Transfer Time: {transfer_time_lorta:.6f} seconds\n")

        # Calculate statistics
        avg_lora = np.mean(times_lora)
        std_lora = np.std(times_lora)
        avg_lorta = np.mean(times_lorta)
        std_lorta = np.std(times_lorta)

        # Store the results in a single row
        results.append({
            'Rank (r)': r,
            'Repetitions (N)': N,
            'Lora_Average_Transfer_Time (s)': avg_lora,
            'Lora_Std_Dev (s)': std_lora,
            'Lorta_Average_Transfer_Time (s)': avg_lorta,
            'Lorta_Std_Dev (s)': std_lorta
        })

        print(f"  Completed measurements for r={r}, N={N}.\n")

        # Memory Cleanup (Not included in timing)
        del lora_tensor
        del lorta_tuples
        torch.cuda.empty_cache()

# -------------------------------
# Display Results
# -------------------------------

# Convert results to a pandas DataFrame for better visualization
results_df = pd.DataFrame(results)

# Display the results
print("Transfer Time Comparison Results:")
display(results_df)

# -------------------------------
# Save Results to CSV
# -------------------------------

# Save the DataFrame to a CSV file
results_df.to_csv(output_csv, index=False)
print(f"\nResults have been saved to '{output_csv}'.")


Testing with r=4 and N=10...


KeyboardInterrupt: 

In [26]:
# Jupyter Notebook Cell

import torch
import time
import numpy as np
import pandas as pd

# -------------------------------
# Configuration Parameters
# -------------------------------

# Fixed Dimensions
L = 80          # Size of the L dimension in the Lora, Lorta and Lotr tensors
h = 32          # Size of the second Lorta factor; must divide d_out
d = 4096        # Used as d_in by Lora and Lorta, and as the d dimension of the Lotr factors
d_out = 128     # Only used through d_out // h, the size of the third Lorta factor
M = 2           # Size of the M dimension in the Lora, Lorta and Lotr tensors

# Variable Parameters
r_list = [4, 64]             # Ranks to test
N_list = [1, 10, 100]        # Values of N (batch dimension of the prepared tensors); shown as "Num. Adapters" in the LaTeX table built later in this file

# Transfer settings
n_repetitions = 100  # Number of timed transfers per (r, N) combination

# Output CSV filename
output_csv = 'gpu_transfer_times_enhanced_with_lotr.csv'

# -------------------------------
# Data Preparation Functions
# -------------------------------

def prepare_lora_tensors(N, r):
    """
    Build the single batched CPU tensor used for the Lora transfer benchmark.

    All Lora weights are packed into one tensor of standard-normal samples
    with shape (2, N, M, L, d, r). M, L and d are the module-level
    configuration values, and d is used as d_in.

    Args:
        N: Size of the second dimension of the tensor.
        r: Rank; the size of the last dimension.

    Returns:
        A torch.Tensor of shape (2, N, M, L, d, r).
    """
    # NOTE(review): the leading 2 presumably stands for the two low-rank
    # factors of each adapter -- confirm.
    lora_shape = (2, N, M, L, d, r)
    return torch.randn(*lora_shape)

def prepare_lorta_tensors(N, r):
    """
    Build the batched CPU factor tensors used for the Lorta transfer benchmark.

    Returns one tuple of five standard-normal tensors, each batched along a
    leading dimension of size N, with shapes
    (N, L, r), (N, h, r), (N, d_out // h, r), (N, d, r) and (N, M, r).
    L, h, d_out, d and M are the module-level configuration values
    (d is used as d_in).

    Args:
        N: Size of the leading dimension of every tensor.
        r: Rank; the size of the last dimension of every tensor.

    Returns:
        A five-element tuple of torch.Tensor.

    Raises:
        AssertionError: If d_out is not divisible by h.
    """
    assert d_out % h == 0, "d_out must be divisible by h."
    # Middle-dimension size of each factor, in output order.
    middle_sizes = (L, h, d_out // h, d, M)
    return tuple(torch.randn(N, size, r) for size in middle_sizes)

def prepare_lotr_tensors(N, r):
    """
    Build the batched CPU tensors used for the Lotr transfer benchmark.

    Returns one tuple of three standard-normal tensors with shapes
    (N, M, L, r, r), (N, M, d, r) and (N, M, d, r), where M, L and d are the
    module-level configuration values.

    Args:
        N: Size of the leading dimension of every tensor.
        r: Rank; the size of the trailing dimension(s).

    Returns:
        A three-element tuple of torch.Tensor.
    """
    core = torch.randn(N, M, L, r, r)       # 5-D tensor
    first_factor = torch.randn(N, M, d, r)  # 4-D tensor
    second_factor = torch.randn(N, M, d, r) # 4-D tensor
    return (core, first_factor, second_factor)

# -------------------------------
# Warm-up GPU
# -------------------------------

def warm_up_gpu(lora_tensor_sample, lorta_tuples_sample, lotr_tuples_sample):
    """
    Run one untimed round of host-to-GPU copies before measurements start.

    Each sample may be a single tensor, a flat tuple/list of tensors, or a
    nested tuple/list of tensors. Every tensor that is found is copied to the
    GPU as a whole, in a single call.

    Args:
        lora_tensor_sample: Lora sample to copy.
        lorta_tuples_sample: Lorta sample to copy.
        lotr_tuples_sample: Lotr sample to copy.
    """
    def _copy_all(sample):
        # Copy a tensor in one call. Iterating a tensor would instead walk
        # its first dimension and issue one small copy per row, which is
        # what plain nested loops do when handed a flat tuple of batched
        # tensors.
        if isinstance(sample, torch.Tensor):
            sample.cuda()
            return
        for item in sample:
            _copy_all(item)

    print("Warming up GPU...")
    _copy_all(lora_tensor_sample)
    _copy_all(lorta_tuples_sample)
    _copy_all(lotr_tuples_sample)
    # Wait for the queued copies before reporting completion.
    torch.cuda.synchronize()
    print("Warm-up completed.\n")

# -------------------------------
# Transfer Timing Functions
# -------------------------------

def transfer_lora(lora_tensor):
    """
    Copy the Lora tensor to the GPU, wait for the copy, and return the GPU tensor.

    Args:
        lora_tensor: CPU tensor to transfer.

    Returns:
        The GPU copy of ``lora_tensor``. The caller owns this tensor and
        should drop its reference (``del``) once it is no longer needed, so
        that the GPU memory can be reused or released.
    """
    # Tensor.cuda() returns a new tensor; it does not move the input in
    # place. Returning the input would hand the CPU tensor back to the caller.
    gpu_tensor = lora_tensor.cuda()
    # Synchronize so that a caller timing this function measures the
    # completed transfer rather than just the launch of the copy.
    torch.cuda.synchronize()
    return gpu_tensor

def transfer_lorta(lorta_tuples):
    """
    Copy each Lorta tensor to the GPU, wait for completion, return the copies.

    Args:
        lorta_tuples: Iterable of CPU tensors.

    Returns:
        A tuple with the GPU copy of every input tensor, in input order.
    """
    on_device = []
    for cpu_tensor in lorta_tuples:
        on_device.append(cpu_tensor.cuda())
    # One synchronization after all copies have been issued.
    torch.cuda.synchronize()
    return tuple(on_device)

def transfer_lotr(lotr_tuples):
    """
    Copy each Lotr tensor to the GPU, wait for completion, return the copies.

    Args:
        lotr_tuples: Iterable of CPU tensors.

    Returns:
        A tuple with the GPU copy of every input tensor, in input order.
    """
    moved = [cpu_tensor.cuda() for cpu_tensor in lotr_tuples]
    # One synchronization after all copies have been issued.
    torch.cuda.synchronize()
    return tuple(moved)

# -------------------------------
# Memory Cleanup Function
# -------------------------------

def cleanup_memory(lora_tensor=None, lorta_tuples=None, lotr_tuples=None):
    """
    Drop this function's references to its arguments and clear the CUDA cache.

    Only the local references are removed. Objects the caller still
    references stay alive, so their GPU memory is not released by this call.

    Args:
        lora_tensor: Optional object whose local reference is dropped.
        lorta_tuples: Optional object whose local reference is dropped.
        lotr_tuples: Optional object whose local reference is dropped.
    """
    # Unbind all three parameters before emptying the cache; unbinding a
    # parameter that is None has no effect.
    del lora_tensor, lorta_tuples, lotr_tuples
    torch.cuda.empty_cache()

# -------------------------------
# Measurement and Results
# -------------------------------

# Initialize a list to store results
results = []

# Iterate over each combination of rank and N
for r in r_list:
    for N in N_list:
        print(f"Testing with r={r} and N={N}...")

        # Prepare tensors. A RuntimeError raised while building them (for
        # example a failed allocation) skips this configuration instead of
        # aborting the whole sweep.
        try:
            lora_tensor = prepare_lora_tensors(N, r)
            lorta_tuples = prepare_lorta_tensors(N, r)
            lotr_tuples = prepare_lotr_tensors(N, r)
        except RuntimeError as e:
            print(f"  Error in tensor preparation: {e}")
            print(f"  Skipping r={r}, N={N}.\n")
            continue

        # Perform warm-up transfer
        # NOTE(review): with the batched layout used here the [:10] slices
        # select everything (the Lora tensor's first dimension has size 2,
        # and the Lorta/Lotr tuples hold 5 and 3 tensors), so the warm-up
        # copies the full data set -- confirm that this is intended.
        lora_tensor_sample = lora_tensor[:10] if (N * L * d) >= 10 else lora_tensor
        lorta_tuples_sample = lorta_tuples[:10] if N >= 10 else lorta_tuples
        lotr_tuples_sample = lotr_tuples[:10] if N >= 10 else lotr_tuples
        warm_up_gpu(lora_tensor_sample, lorta_tuples_sample, lotr_tuples_sample)

        # Initialize lists to store transfer times
        times_lora = []
        times_lorta = []
        times_lotr = []

        print(f"  Starting transfer measurements for r={r}, N={N}...")

        for rep in range(1, n_repetitions + 1):
            print(f"    Repetition {rep}/{n_repetitions}")

            # All three methods follow the same protocol so the numbers are
            # comparable: start the clock, transfer, stop the clock, and
            # only then clean up. time.perf_counter() is a monotonic,
            # high-resolution clock meant for measuring short intervals.

            # Measure Lora transfer
            start_time = time.perf_counter()
            gpu_lora = transfer_lora(lora_tensor)
            transfer_time_lora = time.perf_counter() - start_time
            times_lora.append(transfer_time_lora)
            # Drop our reference first; memory that is still referenced
            # cannot be released by emptying the CUDA cache.
            del gpu_lora
            cleanup_memory()
            print(f"      Lora Transfer Time: {transfer_time_lora:.6f} seconds")

            # Measure Lorta transfer
            start_time = time.perf_counter()
            gpu_lorta = transfer_lorta(lorta_tuples)
            transfer_time_lorta = time.perf_counter() - start_time
            times_lorta.append(transfer_time_lorta)
            del gpu_lorta
            cleanup_memory()
            print(f"      Lorta Transfer Time: {transfer_time_lorta:.6f} seconds")

            # Measure Lotr transfer. The clock is stopped BEFORE cleanup so
            # that the Lotr time does not include the cache clearing.
            start_time = time.perf_counter()
            gpu_lotr = transfer_lotr(lotr_tuples)
            transfer_time_lotr = time.perf_counter() - start_time
            times_lotr.append(transfer_time_lotr)
            del gpu_lotr
            cleanup_memory()
            print(f"      Lotr Transfer Time: {transfer_time_lotr:.6f} seconds\n")

        # Calculate statistics
        avg_lora = np.mean(times_lora)
        std_lora = np.std(times_lora)
        avg_lorta = np.mean(times_lorta)
        std_lorta = np.std(times_lorta)
        avg_lotr = np.mean(times_lotr)
        std_lotr = np.std(times_lotr)

        # Store the results in a single row
        results.append({
            'Rank (r)': r,
            'Repetitions (N)': N,
            'Lora_Average_Transfer_Time (s)': avg_lora,
            'Lora_Std_Dev (s)': std_lora,
            'Lorta_Average_Transfer_Time (s)': avg_lorta,
            'Lorta_Std_Dev (s)': std_lorta,
            'Lotr_Average_Transfer_Time (s)': avg_lotr,
            'Lotr_Std_Dev (s)': std_lotr
        })

        print(f"  Completed measurements for r={r}, N={N}.\n")

        # The CPU tensors are not freed explicitly here; their names are
        # rebound when the next configuration is prepared.

# -------------------------------
# Display Results
# -------------------------------

# Convert results to a pandas DataFrame for better visualization
results_df = pd.DataFrame(results)

# Display the results
print("Transfer Time Comparison Results:")
display(results_df)

# -------------------------------
# Save Results to CSV
# -------------------------------

# Save the DataFrame to a CSV file
results_df.to_csv(output_csv, index=False)
print(f"\nResults have been saved to '{output_csv}'.")


Testing with r=4 and N=1...
Warming up GPU...
Warm-up completed.

  Starting transfer measurements for r=4, N=1...
    Repetition 1/100
      Lora Transfer Time: 0.893523 seconds
      Lorta Transfer Time: 0.000326 seconds
      Lotr Transfer Time: 0.002485 seconds

    Repetition 2/100
      Lora Transfer Time: 0.012233 seconds
      Lorta Transfer Time: 0.000122 seconds
      Lotr Transfer Time: 0.001609 seconds

    Repetition 3/100
      Lora Transfer Time: 0.004706 seconds
      Lorta Transfer Time: 0.000114 seconds
      Lotr Transfer Time: 0.000385 seconds

    Repetition 4/100
      Lora Transfer Time: 0.001386 seconds
      Lorta Transfer Time: 0.000109 seconds
      Lotr Transfer Time: 0.000391 seconds

    Repetition 5/100
      Lora Transfer Time: 0.001451 seconds
      Lorta Transfer Time: 0.000121 seconds
      Lotr Transfer Time: 0.000380 seconds

    Repetition 6/100
      Lora Transfer Time: 0.001403 seconds
      Lorta Transfer Time: 0.000115 seconds
      Lotr Transf

Unnamed: 0,Rank (r),Repetitions (N),Lora_Average_Transfer_Time (s),Lora_Std_Dev (s),Lorta_Average_Transfer_Time (s),Lorta_Std_Dev (s),Lotr_Average_Transfer_Time (s),Lotr_Std_Dev (s)
0,4,1,0.010464,0.088758,0.000115,3.4e-05,0.000415,0.000243
1,4,10,0.012645,7.6e-05,0.001063,1.3e-05,0.003248,4.5e-05
2,4,100,0.144434,0.001754,0.005471,0.000181,0.017121,0.000168
3,64,1,0.022806,0.005421,0.001077,1.4e-05,0.003437,0.00019
4,64,10,0.215736,0.001659,0.008811,5e-06,0.041022,3.9e-05
5,64,100,2.272057,0.030548,0.061495,0.000147,0.375066,0.000195



Results have been saved to 'gpu_transfer_times_enhanced_with_lotr.csv'.


In [27]:
# `df` is another name for results_df (not a copy), so the millisecond
# columns added below also appear on results_df.
df = results_df
# Convert seconds to milliseconds (1s = 1000ms): every "<method>_<stat> (s)"
# column gets a "<method>_<stat> (ms)" companion.
for method in ("Lora", "Lorta", "Lotr"):
    for stat in ("Average_Transfer_Time", "Std_Dev"):
        df[f"{method}_{stat} (ms)"] = df[f"{method}_{stat} (s)"] * 1000

# Function to format mean ± std and apply boldface to the smallest values
import math

def format_mean_std_bold(mean, std, min_value, min_threshold):
    """
    Format ``mean ± std`` as a LaTeX math string, optionally in boldface.

    The number of decimal places is chosen from ``std``: it is shown up to
    its first non-zero digit (never fewer than 0 decimals), and ``mean`` is
    printed with the same number of decimals. When ``std`` is exactly 0 or
    is not finite (NaN or infinity), 2 decimal places are used.

    Args:
        mean: Mean value to print.
        std: Standard deviation to print.
        min_value: Reference value for the boldface decision.
        min_threshold: Tolerance added to ``min_value``.

    Returns:
        ``$\\mathbf{<mean> \\pm <std>}$`` when
        ``mean <= min_value + min_threshold``, otherwise
        ``$<mean> \\pm <std>$``.
    """
    if std == 0 or not math.isfinite(std):
        # log10 is undefined for 0, and floor(log10(...)) raises for
        # NaN/infinity, so fall back to a fixed precision in those cases.
        precision = 2
    else:
        # Position of the first non-zero digit of std; values >= 1 give a
        # non-positive exponent, which is clamped to 0 decimals.
        precision = max(0, -math.floor(math.log10(abs(std))))

    value_text = f"{mean:.{precision}f} \\pm {std:.{precision}f}"

    if mean <= min_value + min_threshold:
        return f"$\\mathbf{{{value_text}}}$"
    return f"${value_text}$"


# Generate LaTeX table with updated formatting.
# The table is collected as a list of string pieces (header, one piece per
# data row, footer) and joined once at the end.
table_parts = ["""
\\begin{table*}[hb!]
\\setlength{\\tabcolsep}{5pt}
\\centering
\\begin{tabular}{l | r | ccc}
\\toprule
 Rank (r) & Num. Adapters & \\multicolumn{3}{c}{Transfer time (ms)} \\\\
 &  & Lora & Lotr & Lorta \\\\
\\midrule
"""]

# Column order of the three methods in the table.
method_order = ("Lora", "Lotr", "Lorta")

# Group by Rank (r)
for rank, group in df.groupby("Rank (r)"):
    for row_position, (_, row) in enumerate(group.iterrows()):
        n = int(row["Repetitions (N)"])

        means = [row[f"{method}_Average_Transfer_Time (ms)"] for method in method_order]
        stds = [row[f"{method}_Std_Dev (ms)"] for method in method_order]

        # Smallest mean across the three methods
        min_value = min(means)

        # Bolding tolerance: the smallest of the three standard deviations
        min_threshold = min(stds)

        # Format transfer times with boldface where necessary
        lora, lotr, lorta = (
            format_mean_std_bold(mean_ms, std_ms, min_value, min_threshold)
            for mean_ms, std_ms in zip(means, stds)
        )

        # Only the first row of a rank group carries the \multirow cell.
        if row_position == 0:
            rank_cell = f"\\multirow{{{len(group)}}}{{*}}{{{rank}}} "
        else:
            rank_cell = ""

        table_parts.append(f"{rank_cell}& {n} & {lora} & {lotr} & {lorta} \\\\\n")

table_parts.append("""
\\bottomrule
\\end{tabular}
\\caption{GPU transfer times (Mean $\\pm$ Std) in milliseconds for Lora, Lotr, and Lorta at different ranks and number of concurrent adapters. Boldface indicates the smallest value (within one standard deviation).}
\\label{tab:gpu-transfer-times}
\\end{table*}
""")

latex_table = "".join(table_parts)

# Display LaTeX code



In [28]:
# Print the assembled LaTeX source so it can be copied out of the notebook.
print(latex_table)


\begin{table*}[hb!]
\setlength{\tabcolsep}{5pt}
\centering
\begin{tabular}{l | r | ccc}
\toprule
 Rank (r) & Num. Adapters & \multicolumn{3}{c}{Transfer time (ms)} \\
 &  & Lora & Lotr & Lorta \\
\midrule
\multirow{3}{*}{4} & 1 & $10 \pm 89$ & $0.4 \pm 0.2$ & $\mathbf{0.12 \pm 0.03}$ \\
& 10 & $12.64 \pm 0.08$ & $3.25 \pm 0.05$ & $\mathbf{1.06 \pm 0.01}$ \\
& 100 & $144 \pm 2$ & $17.1 \pm 0.2$ & $\mathbf{5.5 \pm 0.2}$ \\
\multirow{3}{*}{64} & 1 & $23 \pm 5$ & $3.4 \pm 0.2$ & $\mathbf{1.08 \pm 0.01}$ \\
& 10 & $216 \pm 2$ & $41.02 \pm 0.04$ & $\mathbf{8.811 \pm 0.005}$ \\
& 100 & $2272 \pm 31$ & $375.1 \pm 0.2$ & $\mathbf{61.5 \pm 0.1}$ \\

\bottomrule
\end{tabular}
\caption{GPU transfer times (Mean $\pm$ Std) in milliseconds for Lora, Lotr, and Lorta at different ranks and number of concurrent adapters. Boldface indicates the smallest value (within one standard deviation).}
\label{tab:gpu-transfer-times}
\end{table*}

