In [13]:
import torch
import h5py
from tqdm import tqdm
import matplotlib.pyplot as plt
from accelerate import Accelerator
from torch.utils.data import DataLoader, TensorDataset
from src.qwen import load_qwen
from src.preprocessor import preprocess_all_time_series
from lora.lora_skeleton import LoRALinear, process_sequences

# LoRA

In [6]:
with h5py.File("data/lotka_volterra_data.h5", "r") as f:
    # Access the full dataset
    trajectories = f["trajectories"][:]
    time_points = f["time"][:]

In [7]:
model, tokenizer = load_qwen()
lora_rank = 4

# Actually apply LoRA to the model:
for layer in model.model.layers:
    layer.self_attn.q_proj = LoRALinear(layer.self_attn.q_proj, r=lora_rank)
    layer.self_attn.v_proj = LoRALinear(layer.self_attn.v_proj, r=lora_rank)
# ^These are the parts that will actually be trained!

# Process the data into sequences of text
train_texts, val_texts = preprocess_all_time_series(trajectories)

# ^Each of these is a `list[str]` representing contiguous parts of the time series,
#  in text form (using the LLMTIME scheme).

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

## Model training

In [10]:
# Defines the maximum context length
max_ctx_length = 512

train_input_ids = process_sequences(
    train_texts, tokenizer, max_ctx_length, stride=max_ctx_length // 2
)
val_input_ids = process_sequences(
    val_texts, tokenizer, max_ctx_length, stride=max_ctx_length
)

batch_size = 4
learning_rate = 1e-5

optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=learning_rate
)

train_dataset = TensorDataset(train_input_ids)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


# Prepare components with Accelerator
# Accelerator library automatically moves model to the correct device (CPU or GPU),
# so no need to manually call model.to(device) or batch.to(device)
accelerator = Accelerator()
model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader)

model.train()
steps = 0
while steps < 10000:
    progress_bar = tqdm(train_loader, desc=f"Steps {steps}")
    for (batch,) in progress_bar:
        batch = batch.to(model.device)  # Move the batch to the device
        optimizer.zero_grad()
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        accelerator.backward(loss) # with accelerator, batch is already on correct device
        optimizer.step()
        steps += 1

        progress_bar.set_postfix(loss=loss.item())
        if steps > 10000:
            break


Steps 0: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.838]
Steps 1000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.72]
Steps 2000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.734]
Steps 3000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.732]
Steps 4000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.698]
Steps 5000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.584]
Steps 6000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.591]
Steps 7000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.519]
Steps 8000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.68]
Steps 9000: 100%|██████████| 1000/1000 [04:50<00:00,  3.45it/s, loss=0.628]


## Model evaluation

In [15]:
model.eval()

val_dataset = TensorDataset(val_input_ids)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Prepare components with Accelerator
accelerator = Accelerator()
model, val_loader = accelerator.prepare(model, val_loader)

total_loss = 0
num_batches = 0
for (batch,) in tqdm(val_loader, desc="Evaluating"):
    with torch.no_grad():
        # Forward pass with the model
        outputs = model(batch, labels=batch)

        # Compute the loss
        loss = outputs.loss
        total_loss += loss.item()
        num_batches += 1

val_loss = total_loss / num_batches
print(f"\nValidation Loss: {val_loss:.4f}")

Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 0 has a total capacity of 39.56 GiB of which 904.88 MiB is free. Process 15203 has 38.66 GiB memory in use. Of the allocated memory 37.99 GiB is allocated by PyTorch, and 186.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Compute performance metrics

In [14]:
import torch.nn.functional as F

all_preds = []
all_labels = []
for (batch,) in tqdm(val_loader, desc="Evaluating"):
    with torch.no_grad():
        outputs = model(batch, labels=batch)
        logits = outputs.logits
        all_preds.append(logits)
        all_labels.append(batch)

all_preds = torch.cat(all_preds).detach().cpu()
all_labels = torch.cat(all_labels).detach().cpu()

mae = F.l1_loss(all_preds, all_labels)
mse = F.mse_loss(all_preds, all_labels)

print(f"\nMean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")

Evaluating:  20%|██        | 30/150 [00:04<00:17,  6.84it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.16 GiB. GPU 0 has a total capacity of 39.56 GiB of which 972.88 MiB is free. Process 15203 has 38.60 GiB memory in use. Of the allocated memory 37.93 GiB is allocated by PyTorch, and 173.29 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)