In [1]:
%pip install blimpy
%pip install setigen
%pip install hdf5plugin
%pip install h5py
%pip install h5py
%pip install torchvision
%pip install transformers
%pip install torch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    Blip2Processor,
    Blip2ForConditionalGeneration,    # contains vision_model + q_former + text_model
    T5ForConditionalGeneration,
)

# -----------------------------------------------------------------------------
# 1) Load and freeze backbone
# -----------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# adjust names to exact checkpoint
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-6.7b")
vlm = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-6.7b", torch_dtype=torch.float16
).to(device)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.31it/s]


In [3]:
# freeze everything first
for p in vlm.parameters():
    p.requires_grad = False

# grab the pieces
vision_encoder = vlm.vision_model
q_former       = vlm.qformer          
llm            = vlm.language_model   

# now un-freeze only the Q-Former (and your adapters)
for p in q_former.parameters():
    p.requires_grad = True

# 1) Define the head
class MLPClassifier(nn.Module):
    def __init__(self, d_model: int, hidden_dim: int = 64, num_classes: int = 4):
        super().__init__()
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, z):
        x = F.relu(self.fc1(z))
        return self.fc2(x)

# 2) Instantiate it using Q-Former output dim
q_dim = q_former.config.hidden_size     # typically 768 or similar
classifier = MLPClassifier(d_model=q_dim).to(device)

# 3) Now you can create the optimizer
optimizer = torch.optim.AdamW(
    list(q_former.parameters()) +
    list(classifier.parameters()),
    lr=2e-4
)



In [4]:
def apply_qr_lora(linear: nn.Linear, rank: int = 8):
    """
    Replace a square nn.Linear with a QR-LoRA version:
      W_fixed + sum_i λ_i Q_i R_i^T
    """
    W = linear.weight.data.clone()
    # full QR
    Q, R = torch.linalg.qr(W)
    # keep only first `rank` factors
    Q_r = Q[:, :rank].detach()
    R_r = R[:rank, :].detach()
    linear.weight.requires_grad = False

    # create adapter parameters
    linear.register_buffer("Q_r", Q_r)
    linear.register_buffer("R_r", R_r)
    linear.lambdas = nn.Parameter(torch.zeros(rank, device=linear.weight.device))

    # monkey-patch forward
    def forward_qr(self, x):
        # ΔW = Q_r @ ( lam[:,None] * R_r )
        delta = self.Q_r @ (self.lambdas.unsqueeze(1) * self.R_r)
        return F.linear(x, self.weight + delta, self.bias)
    linear.forward = forward_qr.__get__(linear, linear.__class__)


# apply to every square Linear in Q-Former
for name, module in q_former.named_modules():
    if isinstance(module, nn.Linear) and module.weight.data.shape[0] == module.weight.data.shape[1]:
        apply_qr_lora(module, rank=8)

# -----------------------------------------------------------------------------
# 3) MLP classification head
# -----------------------------------------------------------------------------
class MLPClassifier(nn.Module):
    def __init__(self, d_model: int, hidden_dim: int = 64, num_classes: int = 4):
        super().__init__()
        self.fc1 = nn.Linear(d_model, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, z):
        x = F.relu(self.fc1(z))
        return self.fc2(x)

# assume Q-Former outputs have dimension `q_dim`
q_dim = q_former.config.hidden_size
classifier = MLPClassifier(q_dim).to(device)

# -----------------------------------------------------------------------------
# 4) Forward step
# -----------------------------------------------------------------------------

caption_tokenizer = processor.tokenizer  # shared with BLIP-2
cls_loss_fn     = nn.CrossEntropyLoss()
cap_loss_fn     = nn.CrossEntropyLoss(ignore_index=processor.tokenizer.pad_token_id)
alpha = 0.5  # balance factor

def forward_step(images, class_labels, captions):
    # 1) vision encoder
    vision_outputs = vision_encoder(pixel_values=images).last_hidden_state

    # 2) Q-Former
    q_outputs = q_former(
        inputs_embeds=vision_outputs,
        return_dict=True,
    )
    # we’ll pool via [CLS] token (first query)
    pooled = q_outputs.last_hidden_state[:, 0, :]  # (B, q_dim)

    # 3a) classification
    logits = classifier(pooled)
    loss_cls = cls_loss_fn(logits, class_labels)

    # 3b) caption generation
    # prepend Q-Former embeddings to text_model encoder inputs
    encoder_kwargs = {"encoder_outputs": q_outputs}
    input_ids = caption_tokenizer(captions, return_tensors="pt", padding=True).input_ids.to(device)
    cap_outputs = llm(
        input_ids=input_ids,
        **encoder_kwargs,
        labels=input_ids,
    )
    loss_cap = cap_outputs.loss

    # 4) combined loss
    loss = alpha * loss_cap + (1 - alpha) * loss_cls
    return loss, logits


In [5]:
import os
plugin_dir = "/home/jliang/hdf5_plugins"
os.makedirs(plugin_dir, exist_ok=True)
os.chmod(plugin_dir, 0o755)
os.environ["HDF5_PLUGIN_PATH"] = plugin_dir

In [6]:
# import pandas as pd
# from pathlib import Path
# import hdf5plugin
# import h5py
# from PIL import Image

# import torch
# from torch.utils.data import Dataset, DataLoader, random_split
# from torchvision import transforms

# # -----------------------------------------------------------------------------
# # 1) PyTorch Dataset for SETI spectrograms in .h5 files,
# #    splitting Cadence ID groups into 6-row chunks
# # -----------------------------------------------------------------------------
# class SETICadenceDataset(Dataset):
#     def __init__(
#         self,
#         csv_file: str,
#         h5_root: str = None,
#         transform=None,
#         dataset_key: str = None,
#     ):
#         """
#         Args:
#             csv_file: CSV with 'Cadence ID' and '.h5 path' columns
#             h5_root: base path to prepend to relative .h5 paths (None if CSV has absolutes)
#             transform: torchvision transforms applied to each image
#             dataset_key: HDF5 dataset key to load (defaults to first key)
#         """
#         self.df = pd.read_csv(csv_file)
#         self.h5_root = Path(h5_root) if h5_root else None
#         self.transform = transform or transforms.Compose([
#             transforms.Resize((224, 224)),
#             transforms.ToTensor(),
#         ])
#         self.dataset_key = dataset_key

#         # Organize cadences into non-overlapping chunks of 6
#         self.cadences = []
#         for _, group in self.df.groupby('Cadence ID', sort=False):
#             rows = group.reset_index(drop=True)
#             n_chunks = len(rows) // 6
#             for i in range(n_chunks):
#                 chunk = rows.iloc[i*6:(i+1)*6].reset_index(drop=True)
#                 self.cadences.append(chunk)

#     def __len__(self):
#         return len(self.cadences)

#     def __getitem__(self, idx):
#         chunk = self.cadences[idx]
#         images = []
#         paths = []
#         for _, row in chunk.iterrows():
#             h5_path = Path(row['.h5 path'])
#             if not h5_path.is_absolute() and self.h5_root:
#                 h5_path = self.h5_root / h5_path
#             paths.append(str(h5_path))
#             with h5py.File(h5_path, 'r') as hf:
#                 key = self.dataset_key or next(iter(hf.keys()))
#                 arr = hf[key][()]
#             img = Image.fromarray(arr) if arr.ndim == 3 else Image.fromarray(arr).convert('RGB')
#             images.append(self.transform(img))
#         images = torch.stack(images, dim=0)  # shape: (6, C, H, W)
#         return {
#             'pixel_values': images,
#             'paths': paths,
#         }

# # -----------------------------------------------------------------------------
# # 2) Build train/validation DataLoaders
# # -----------------------------------------------------------------------------
# csv_file = "/home/cgchoza/galaxies/complete_cadences_catalog.csv"
# h5_root = None  
# dataset_key = "data"  

# full_dataset = SETICadenceDataset(csv_file, h5_root, dataset_key=dataset_key)

# # split 80/20
# total = len(full_dataset)
# train_len = int(0.8 * total)
# val_len = total - train_len
# train_ds, val_ds = random_split(full_dataset, [train_len, val_len])

# batch_size = 4  # cadences per batch
# dl_kwargs = dict(num_workers=4, pin_memory=True)
# train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, **dl_kwargs)
# val_dataloader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, **dl_kwargs)



In [None]:
import pandas as pd
from pathlib import Path
import hdf5plugin
import h5py

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from PIL import Image
import numpy as np

# -----------------------------------------------------------------------------
# Dataset initialization
# -----------------------------------------------------------------------------
class SETICadenceDataset(Dataset):
    def __init__(
        self,
        csv_file: str,
        h5_root: str = None,
        dataset_key: str = 'data',
        transform=None,
    ):
        self.df = pd.read_csv(csv_file)
        self.h5_root = Path(h5_root) if h5_root else None
        self.dataset_key = dataset_key
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

        # Group and chunk by Cadence ID every 6 rows
        self.cadences = []
        for _, group in self.df.groupby('Cadence ID', sort=False):
            rows = group.reset_index(drop=True)
            num = len(rows) // 6
            for i in range(num):
                chunk = rows.iloc[i*6:(i+1)*6].reset_index(drop=True)
                self.cadences.append(chunk)

    def __len__(self):
        return len(self.cadences)

    def __getitem__(self, idx):
        chunk = self.cadences[idx]
        images = []
        paths = []
        for _, row in chunk.iterrows():
            h5_path = Path(row['.h5 path'])
            if not h5_path.is_absolute() and self.h5_root:
                h5_path = self.h5_root / h5_path
            paths.append(str(h5_path))

            # load HDF5 dataset via h5py
            with h5py.File(h5_path, 'r') as hf:
                arr = hf[self.dataset_key][()]

            # ensure 2D spectrogram: squeeze singleton dims
            arr = np.array(arr)
            arr = arr.squeeze()
            # if still 1D, reshape manually (user must know dimensions)
            if arr.ndim == 1:
                raise ValueError(f"Spectrogram data is 1D, reshape required: {h5_path}")

            # normalize to [0,255]
            arr = arr.astype(np.float32)
            arr -= arr.min()
            arr /= (arr.max() + 1e-8)
            img_uint8 = (arr * 255).astype(np.uint8)

            # create PIL image
            img = Image.fromarray(img_uint8)
            img = img.convert('RGB')
            images.append(self.transform(img))

        # stack to (6, C, H, W)
        images = torch.stack(images, dim=0)
        return {'pixel_values': images, 'paths': paths}

# -----------------------------------------------------------------------------
# Build train/validation DataLoaders
# -----------------------------------------------------------------------------
csv_file = "/home/cgchoza/galaxies/complete_cadences_catalog.csv"
h5_root = None  # or '/absolute/path/to/h5'

dataset = SETICadenceDataset(csv_file, h5_root, dataset_key='data')

# split 80/20
total = len(dataset)
train_len = int(0.8 * total)
val_len = total - train_len
train_ds, val_ds = random_split(dataset, [train_len, val_len])

batch_size = 4  # cadences per batch
dl_kwargs = dict(num_workers=0, pin_memory=True)
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, **dl_kwargs)
val_dataloader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, **dl_kwargs)




KeyError: "Unable to synchronously open object (object 'features' doesn't exist)"

In [8]:
# training loop:
for epoch in range(10):
    for batch in train_loader:
        imgs, labels, texts = batch["pixel_values"].to(device), batch["labels"].to(device), batch["captions"]
        optimizer.zero_grad()
        loss, _ = forward_step(imgs, labels, texts)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} ▶ loss = {loss.item():.4f}")

NameError: name 'train_loader' is not defined

In [None]:
path = pd.read_csv(csv_file)['.h5 path'].iloc[0]
with h5py.File(path, 'r') as hf:
    def info(name, obj):
        print(f"{name:30s} shape={obj.shape} dtype={obj.dtype}")
    hf.visititems(info)

data                           shape=(32, 1, 536870912) dtype=float32
mask                           shape=(32, 1, 536870912) dtype=uint8


In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from transformers import Blip2Processor, Blip2ForConditionalGeneration

# # -----------------------------------------------------------------------------
# # 1) Setup
# # -----------------------------------------------------------------------------
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Processor + model
# processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-6.7b")
# vlm = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-6.7b").to(device)

# # Freeze every parameter (vision encoder, Q-Former, language decoder)
# for p in vlm.parameters():
#     p.requires_grad = False

# # Grab submodules
# vision_encoder = vlm.vision_model
# q_former       = vlm.qformer

# # Un-freeze only the Q-Former (we’ll inject QR-LoRA adapters here)
# for p in q_former.parameters():
#     p.requires_grad = True

# # -----------------------------------------------------------------------------
# # 2) QR-LoRA adapter definition & injection
# # -----------------------------------------------------------------------------
# def apply_qr_lora(linear: nn.Linear, rank: int = 8):
#     W = linear.weight.data.clone()
#     Q, R = torch.linalg.qr(W)                              # full QR
#     Q_r, R_r = Q[:, :rank].detach(), R[:rank, :].detach()  # truncated
#     linear.weight.requires_grad = False

#     linear.register_buffer("Q_r", Q_r)
#     linear.register_buffer("R_r", R_r)
#     linear.lambdas = nn.Parameter(torch.zeros(rank, device=W.device))

#     def forward_qr(self, x):
#         delta = self.Q_r @ (self.lambdas.unsqueeze(1) * self.R_r)
#         return F.linear(x, self.weight + delta, self.bias)
#     linear.forward = forward_qr.__get__(linear, linear.__class__)

# # Apply to every square Linear in Q-Former
# for module in q_former.modules():
#     if isinstance(module, nn.Linear) and module.weight.size(0) == module.weight.size(1):
#         apply_qr_lora(module, rank=8)

# # -----------------------------------------------------------------------------
# # 3) MLP classification head
# # -----------------------------------------------------------------------------
# class MLPClassifier(nn.Module):
#     def __init__(self, d_model: int, hidden_dim: int = 64, num_classes: int = 4):
#         super().__init__()
#         self.fc1 = nn.Linear(d_model, hidden_dim)
#         self.fc2 = nn.Linear(hidden_dim, num_classes)
#     def forward(self, z):
#         return self.fc2(F.relu(self.fc1(z)))

# # Instantiate
# q_dim = q_former.config.hidden_size
# classifier = MLPClassifier(q_dim).to(device)

# # -----------------------------------------------------------------------------
# # 4) Optimizer & losses
# # -----------------------------------------------------------------------------
# optimizer = torch.optim.AdamW(
#     list(q_former.parameters()) +
#     list(classifier.parameters()),
#     lr=2e-4
# )
# cls_loss_fn = nn.CrossEntropyLoss()
# alpha       = 0.5  # weight between caption & class losses

# # -----------------------------------------------------------------------------
# # 5) Forward step
# # -----------------------------------------------------------------------------
# def forward_step(images, class_labels, captions):
#     # — Classification branch —
#     vision_out = vision_encoder(pixel_values=images).last_hidden_state
#     q_out      = q_former(inputs_embeds=vision_out, return_dict=True)
#     pooled     = q_out.last_hidden_state[:, 0, :]         # use first query as [CLS]
#     logits     = classifier(pooled)
#     loss_cls   = cls_loss_fn(logits, class_labels)

#     # — Caption branch (via BLIP-2) —
#     cap_inputs = processor(text=captions, return_tensors="pt", padding=True).input_ids.to(device)
#     cap_out    = vlm(pixel_values=images, labels=cap_inputs)
#     loss_cap   = cap_out.loss

#     # — Combined loss —
#     loss = alpha * loss_cap + (1 - alpha) * loss_cls
#     return loss, logits

# # -----------------------------------------------------------------------------
# # 6) Training loop skeleton
# # -----------------------------------------------------------------------------
# for epoch in range(10):
#     for batch in train_dataloader:
#         imgs   = batch["pixel_values"].to(device)
#         labels = batch["labels"].to(device)
#         caps   = batch["captions"]

#         optimizer.zero_grad()
#         loss, _ = forward_step(imgs, labels, caps)
#         loss.backward()
#         optimizer.step()

#     print(f"Epoch {epoch} ▶ loss = {loss.item():.4f}")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

NameError: name 'train_dataloader' is not defined

In [None]:
# import os
# from pathlib import Path
# import pandas as pd
# from PIL import Image
# import requests

# import torch
# from torch.utils.data import Dataset, DataLoader
# from torchvision import transforms

# # -----------------------------------------------------------------------------
# # Download the CSV if it doesn't exist
# # -----------------------------------------------------------------------------
# csv_url = "https://raw.githubusercontent.com/PetchMa/ML_GBT_SETI/main/data_archive/L_band_directory.csv"
# csv_file = "data_archive/L_band_directory.csv"  # save it here
# img_dir = "data_archive"                        # base folder with images

# os.makedirs(img_dir, exist_ok=True)  # make sure directory exists

# if not os.path.exists(csv_file):
#     print(f"Downloading CSV from {csv_url} ...")
#     response = requests.get(csv_url)
#     with open(csv_file, "wb") as f:
#         f.write(response.content)
#     print("CSV downloaded successfully.")

# # -----------------------------------------------------------------------------
# # 1) A PyTorch Dataset for your SETI images + labels + captions
# # -----------------------------------------------------------------------------


# class SETIDataset(Dataset):
#     def __init__(self, csv_file: str, img_dir: str, transform=None):
#         self.df_raw = pd.read_csv(csv_file)
#         # Turn columns → rows, with column names in a new “cadence” field
#         df = self.df_raw.melt(
#             var_name="cadence",
#             value_name="filename"
#         ).dropna(subset=["filename"])
#         self.df = df.reset_index(drop=True)
#         self.img_dir = Path(img_dir)
#         self.transform = transform or transforms.Compose([
#             transforms.Resize((224, 224)),
#             transforms.ToTensor(),
#         ])
#         # map cadence → integer label
#         cadences = self.df["cadence"].unique().tolist()
#         self.label2idx = {c: i for i, c in enumerate(cadences)}

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, idx):
#         row = self.df.iloc[idx]
#         img_path = self.img_dir / row["filename"]
#         image = Image.open(img_path).convert("RGB")
#         image = self.transform(image)
#         label = self.label2idx[row["cadence"]]
#         # no captions available in this CSV
#         return image, label



# # -----------------------------------------------------------------------------
# # 2) Instantiate your DataLoader
# # -----------------------------------------------------------------------------
# train_dataset = SETIDataset(csv_file, img_dir)
# train_dataloader = DataLoader(
#     train_dataset,
#     batch_size=8,
#     shuffle=True,
#     num_workers=4,
#     pin_memory=True,
# )

# # -----------------------------------------------------------------------------
# # 3) Now you can plug train_dataloader straight into your training loop:
# # -----------------------------------------------------------------------------
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Placeholder for optimizer and forward_step:
# # optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# # def forward_step(images, labels, captions):
# #     logits = model(images)
# #     loss = criterion(logits, labels)
# #     return loss, logits

# for epoch in range(10):
#     for images, labels, captions in train_dataloader:
#         images = images.to(device)
#         labels = labels.to(device)

#         optimizer.zero_grad()
#         loss, logits = forward_step(images, labels, captions)
#         loss.backward()
#         optimizer.step()
#     print(f"Epoch {epoch} ▶ loss = {loss.item():.4f}")


FileNotFoundError: Caught FileNotFoundError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "<ipython-input-9-2527491319>", line 56, in __getitem__
    image = Image.open(img_path).convert("RGB")
            ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/PIL/Image.py", line 3505, in open
    fp = builtins.open(filename, "rb")
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'data_archive/mnt_blpd7/datax2/dl/GBT_57849_28037_HIP69878_fine.h5'
