In [1]:
"""
Mask2Former + Semi-Supervised Pseudo-Label (Full runnable, 5-epoch quick run)

Data:
- Labeled train images:   X_train_uDRk9z9/images (well1-6)
- Unlabeled images:       X_unlabeled_mtkxUlo/images (well12-14)
- Test images:            X_test_xNbnvIa/images  (well7-11)
- Train labels (CSV):     Y_train_T9NrBYo.csv  (flattened + -1 padding)

Split (avoid leakage by well):
- Train: well1-5
- Val:   well6
- Test:  well7-11 (submission only)

Output:
- submission.csv (one row per patch name, flattened, padded to 160*272 with -1)

Notes:
- Model input uses 224x224 (Mask2Former pretrained backbone-friendly).
- Images are single-channel; we repeat channel to 3.
- Semi-supervised from epoch 1: supervised + lambda_u * unsup_loss
- Pseudo labels are filtered by pixel confidence threshold tau.

Install (in your env):
    pip install -U transformers accelerate
    # if transformers complains about huggingface-hub:
    # pip install "huggingface-hub<1.0,>=0.34.0"
"""

import re
from pathlib import Path
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset

from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation


# =========================
# 0) Paths
# =========================
DATA_ROOT = Path(r"C:\Users\lenovo\Desktop\deep_datachallenge")

TRAIN_IMAGES_DIR = DATA_ROOT / "X_train_uDRk9z9" / "images"
UNLAB_IMAGES_DIR = DATA_ROOT / "X_unlabeled_mtkxUlo" / "images"
TEST_IMAGES_DIR  = DATA_ROOT / "X_test_xNbnvIa" / "images"
Y_TRAIN_CSV      = DATA_ROOT / "Y_train_T9NrBYo.csv"

# Submission target size
TARGET_H, TARGET_W = 160, 272
IGNORE_INDEX = -1
NUM_CLASSES = 3  # classes are 0/1/2

# Model input size
MODEL_H, MODEL_W = 224, 224

# Pretrained checkpoint (semantic)
PRETRAINED = "facebook/mask2former-swin-tiny-ade-semantic"

# =========================
# 1) Hyperparameters (5-epoch quick run)
# =========================
EPOCHS = 5
BATCH_SIZE_L = 2       # labeled batch size
BATCH_SIZE_U = 2       # unlabeled batch size (keep small)
LR = 5e-5
WEIGHT_DECAY = 1e-4

# Semi-supervised schedule (start from epoch 1)
LAMBDA_U_MIN = 0.05
LAMBDA_U_MAX = 0.50
RAMP_EPOCHS = 5

# Pseudo label threshold schedule
PSEUDO_TH_START = 0.95
PSEUDO_TH_END   = 0.85

# Cache pseudo labels and refresh every N epochs
PSEUDO_UPDATE_EVERY = 2

# Unlabeled sampling ratio (0~1): use a subset of unlabeled per epoch for speed
UNLAB_SAMPLE_RATIO = 0.50

# Speed
NUM_WORKERS = 0  # set to 2~4 if your Windows setup is stable


DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
AMP = (DEVICE == "cuda")


# =========================
# 2) Utils
# =========================
def parse_well_id(name: str) -> int:
    m = re.search(r"well_(\d+)_", name)
    return int(m.group(1)) if m else -1


def minmax_normalize(x: np.ndarray) -> np.ndarray:
    x = x.astype(np.float32)
    x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
    x_min = float(x.min())
    x_max = float(x.max())
    if x_max - x_min < 1e-6:
        return np.zeros_like(x, dtype=np.float32)
    return (x - x_min) / (x_max - x_min)


def pad_to_160x272(img: np.ndarray, fill_value: float = 0.0) -> np.ndarray:
    h, w = img.shape
    assert h == TARGET_H, f"Expected height {TARGET_H}, got {h}"
    if w == TARGET_W:
        return img
    if w < TARGET_W:
        out = np.full((TARGET_H, TARGET_W), fill_value, dtype=img.dtype)
        out[:, :w] = img
        return out
    return img[:, :TARGET_W]


def decode_mask_from_csv_row(row_values: np.ndarray) -> np.ndarray:
    valid = row_values[row_values != IGNORE_INDEX]
    assert len(valid) % TARGET_H == 0, f"Valid mask length {len(valid)} not divisible by {TARGET_H}"
    w = len(valid) // TARGET_H
    return valid.reshape(TARGET_H, w).astype(np.int64)


def pad_mask_to_160x272(mask: np.ndarray) -> np.ndarray:
    h, w = mask.shape
    assert h == TARGET_H
    if w == TARGET_W:
        return mask
    out = np.full((TARGET_H, TARGET_W), IGNORE_INDEX, dtype=np.int64)
    out[:, :w] = mask
    return out


def resize_image_torch(img_1hw: torch.Tensor, h: int, w: int) -> torch.Tensor:
    x = img_1hw.unsqueeze(0)  # (1,1,H,W)
    x = F.interpolate(x, size=(h, w), mode="bilinear", align_corners=False)
    return x.squeeze(0)       # (1,h,w)


def resize_mask_torch(mask_hw: torch.Tensor, h: int, w: int) -> torch.Tensor:
    y = mask_hw.unsqueeze(0).unsqueeze(0).float()
    y = F.interpolate(y, size=(h, w), mode="nearest")
    return y.squeeze(0).squeeze(0).long()


def semantic_to_mask2former_targets(
    semantic_mask: torch.Tensor,
    num_classes: int,
    ignore_index: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Convert semantic (H,W) to set-based:
    - class_labels: (N,)
    - mask_labels:  (N,H,W) float 0/1
    Ignore pixels == ignore_index.
    """
    valid = semantic_mask != ignore_index
    if valid.sum() == 0:
        class_labels = torch.tensor([0], dtype=torch.long)
        mask_labels = torch.zeros((1, semantic_mask.shape[0], semantic_mask.shape[1]), dtype=torch.float32)
        return class_labels, mask_labels

    present = torch.unique(semantic_mask[valid]).tolist()
    present = [int(c) for c in present if 0 <= int(c) < num_classes]
    if len(present) == 0:
        class_labels = torch.tensor([0], dtype=torch.long)
        mask_labels = torch.zeros((1, semantic_mask.shape[0], semantic_mask.shape[1]), dtype=torch.float32)
        return class_labels, mask_labels

    masks, classes = [], []
    for c in present:
        m = (semantic_mask == c) & valid
        if m.sum() == 0:
            continue
        masks.append(m.float())
        classes.append(c)

    if len(classes) == 0:
        class_labels = torch.tensor([0], dtype=torch.long)
        mask_labels = torch.zeros((1, semantic_mask.shape[0], semantic_mask.shape[1]), dtype=torch.float32)
        return class_labels, mask_labels

    class_labels = torch.tensor(classes, dtype=torch.long)
    mask_labels = torch.stack(masks, dim=0).float()
    return class_labels, mask_labels


def get_lambda_u(epoch: int) -> float:
    if RAMP_EPOCHS <= 1:
        return LAMBDA_U_MAX
    t = (epoch - 1) / (RAMP_EPOCHS - 1)
    t = max(0.0, min(1.0, t))
    return LAMBDA_U_MIN + (LAMBDA_U_MAX - LAMBDA_U_MIN) * t


def get_pseudo_th(epoch: int) -> float:
    if EPOCHS <= 1:
        return PSEUDO_TH_END
    t = (epoch - 1) / (EPOCHS - 1)
    t = max(0.0, min(1.0, t))
    return PSEUDO_TH_START + (PSEUDO_TH_END - PSEUDO_TH_START) * t


# =========================
# 3) Datasets
# =========================
class LabeledWellDataset(Dataset):
    def __init__(self, images_dir: Path, y_csv_path: Path):
        self.image_paths = sorted(images_dir.glob("*.npy"))
        self.names = [p.stem for p in self.image_paths]
        self.y_df = pd.read_csv(y_csv_path, index_col=0)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx: int) -> Dict:
        name = self.names[idx]
        img = np.load(self.image_paths[idx])  # (160,160) or (160,272)
        raw_w = int(img.shape[1])

        img = minmax_normalize(img)
        img = pad_to_160x272(img, fill_value=0.0)

        img_t = torch.from_numpy(img).unsqueeze(0).float()         # (1,160,272)
        img_t = resize_image_torch(img_t, MODEL_H, MODEL_W)        # (1,224,224)

        row = self.y_df.loc[name].values.astype(np.int64)
        mask = decode_mask_from_csv_row(row)                       # (160,w)
        mask = pad_mask_to_160x272(mask)                           # (160,272)
        mask_t = torch.from_numpy(mask).long()                     # (160,272)
        mask_t = resize_mask_torch(mask_t, MODEL_H, MODEL_W)       # (224,224)

        return {"name": name, "image": img_t, "mask": mask_t, "raw_w": raw_w}


class UnlabeledWellDataset(Dataset):
    def __init__(self, images_dir: Path):
        self.image_paths = sorted(images_dir.glob("*.npy"))
        self.names = [p.stem for p in self.image_paths]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx: int) -> Dict:
        name = self.names[idx]
        img = np.load(self.image_paths[idx])
        raw_w = int(img.shape[1])

        img = minmax_normalize(img)
        img = pad_to_160x272(img, fill_value=0.0)

        img_t = torch.from_numpy(img).unsqueeze(0).float()         # (1,160,272)
        img_t = resize_image_torch(img_t, MODEL_H, MODEL_W)        # (1,224,224)

        return {"name": name, "image": img_t, "raw_w": raw_w, "idx": idx}


# =========================
# 4) Collate
# =========================
def collate_labeled(batch: List[Dict]) -> Dict:
    names = [b["name"] for b in batch]
    raw_ws = torch.tensor([b["raw_w"] for b in batch], dtype=torch.long)

    imgs_1 = torch.stack([b["image"] for b in batch], dim=0)   # (B,1,224,224)
    pixel_values = imgs_1.repeat(1, 3, 1, 1)                   # (B,3,224,224)
    pixel_mask = torch.ones((pixel_values.shape[0], MODEL_H, MODEL_W), dtype=torch.long)

    class_labels_list, mask_labels_list = [], []
    for b in batch:
        cls, msk = semantic_to_mask2former_targets(b["mask"], NUM_CLASSES, IGNORE_INDEX)
        class_labels_list.append(cls)
        mask_labels_list.append(msk)

    return {
        "names": names,
        "raw_ws": raw_ws,
        "pixel_values": pixel_values,
        "pixel_mask": pixel_mask,
        "class_labels": class_labels_list,
        "mask_labels": mask_labels_list,
    }


def collate_unlabeled(batch: List[Dict]) -> Dict:
    names = [b["name"] for b in batch]
    raw_ws = torch.tensor([b["raw_w"] for b in batch], dtype=torch.long)
    idxs = torch.tensor([b["idx"] for b in batch], dtype=torch.long)

    imgs_1 = torch.stack([b["image"] for b in batch], dim=0)
    pixel_values = imgs_1.repeat(1, 3, 1, 1)
    pixel_mask = torch.ones((pixel_values.shape[0], MODEL_H, MODEL_W), dtype=torch.long)

    return {
        "names": names,
        "raw_ws": raw_ws,
        "idxs": idxs,
        "pixel_values": pixel_values,
        "pixel_mask": pixel_mask,
    }


# =========================
# 5) Build model & processor
# =========================
def build_model_and_processor() -> Tuple[Mask2FormerForUniversalSegmentation, AutoImageProcessor]:
    id2label = {0: "class0", 1: "class1", 2: "class2"}
    label2id = {v: k for k, v in id2label.items()}

    processor = AutoImageProcessor.from_pretrained(PRETRAINED)

    model = Mask2FormerForUniversalSegmentation.from_pretrained(
        PRETRAINED,
        ignore_mismatched_sizes=True,
        id2label=id2label,
        label2id=label2id,
        num_labels=NUM_CLASSES,
        use_safetensors=True,  # safer + avoids torch.load restrictions
    )
    return model, processor


# =========================
# 6) Pseudo label from Mask2Former outputs
# =========================
@torch.no_grad()
def outputs_to_semantic_and_conf(outputs) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    From Mask2Former outputs -> semantic label map + confidence map (both HxW, H=W=224).
    Using:
      class_probs: softmax over (num_classes + no-object), drop last
      mask_probs:  sigmoid
      per-pixel per-class score = sum_q class_probs[q,c] * mask_probs[q, y, x]
    """
    class_logits = outputs.class_queries_logits[0]  # (Q, C+1)
    mask_logits  = outputs.masks_queries_logits[0]  # (Q, H, W)

    class_probs = F.softmax(class_logits, dim=-1)[..., :NUM_CLASSES]  # (Q, C)
    mask_probs  = torch.sigmoid(mask_logits)                          # (Q, H, W)

    # score: (C, H, W)
    score = torch.einsum("qc,qhw->chw", class_probs, mask_probs)

    conf, pred = torch.max(score, dim=0)  # (H,W), (H,W)
    return pred.to(torch.int64), conf.to(torch.float32)


@torch.no_grad()
def build_pseudo_cache(
    model: torch.nn.Module,
    unlab_loader: DataLoader,
    tau: float,
) -> Dict[int, torch.Tensor]:
    """
    Create a cache: {unlabeled_idx -> pseudo_mask(H,W) int64}, with low-confidence pixels set to IGNORE_INDEX.
    """
    model.eval()
    cache: Dict[int, torch.Tensor] = {}

    for batch in unlab_loader:
        pixel_values = batch["pixel_values"].to(DEVICE)
        pixel_mask = batch["pixel_mask"].to(DEVICE)
        idxs = batch["idxs"].tolist()

        with torch.cuda.amp.autocast(enabled=AMP):
            outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)

        # process each sample in batch
        for b_i, u_idx in enumerate(idxs):
            # take sample b_i outputs by slicing
            out_i = type(outputs)(
                **{k: (v[b_i:b_i+1] if torch.is_tensor(v) else v) for k, v in outputs.items()}
            )

            pred, conf = outputs_to_semantic_and_conf(out_i)
            pseudo = pred.clone()
            pseudo[conf < tau] = IGNORE_INDEX
            cache[u_idx] = pseudo.cpu()

    return cache


# =========================
# 7) Train / Eval
# =========================
def train_one_epoch(
    model: torch.nn.Module,
    labeled_loader: DataLoader,
    unlabeled_loader: DataLoader,
    pseudo_cache: Dict[int, torch.Tensor],
    optimizer: torch.optim.Optimizer,
    lambda_u: float,
) -> float:
    model.train()
    total_loss = 0.0
    n_seen = 0

    scaler = torch.cuda.amp.GradScaler(enabled=AMP)

    # iterate by min length (simple)
    it_u = iter(unlabeled_loader)
    for batch_l in labeled_loader:
        try:
            batch_u = next(it_u)
        except StopIteration:
            it_u = iter(unlabeled_loader)
            batch_u = next(it_u)

        # ----- labeled -----
        pv_l = batch_l["pixel_values"].to(DEVICE)
        pm_l = batch_l["pixel_mask"].to(DEVICE)
        cls_l = [x.to(DEVICE) for x in batch_l["class_labels"]]
        msk_l = [x.to(DEVICE) for x in batch_l["mask_labels"]]

        # ----- unlabeled -----
        pv_u = batch_u["pixel_values"].to(DEVICE)
        pm_u = batch_u["pixel_mask"].to(DEVICE)
        idxs_u = batch_u["idxs"].tolist()

        # build pseudo targets for this unlabeled batch from cache
        class_labels_u, mask_labels_u = [], []
        for u_idx in idxs_u:
            pseudo = pseudo_cache[u_idx]  # (224,224) on CPU
            pseudo_t = pseudo.to(torch.int64)
            cls, msk = semantic_to_mask2former_targets(pseudo_t, NUM_CLASSES, IGNORE_INDEX)
            class_labels_u.append(cls.to(DEVICE))
            mask_labels_u.append(msk.to(DEVICE))

        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast(enabled=AMP):
            out_l = model(
                pixel_values=pv_l,
                pixel_mask=pm_l,
                class_labels=cls_l,
                mask_labels=msk_l,
            )
            loss_sup = out_l.loss

            out_u = model(
                pixel_values=pv_u,
                pixel_mask=pm_u,
                class_labels=class_labels_u,
                mask_labels=mask_labels_u,
            )
            loss_unsup = out_u.loss

            loss = loss_sup + lambda_u * loss_unsup

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        bs = pv_l.size(0)
        total_loss += float(loss.item()) * bs
        n_seen += bs

    return total_loss / max(1, n_seen)


@torch.no_grad()
def eval_one_epoch(model: torch.nn.Module, val_loader: DataLoader) -> float:
    model.eval()
    total_loss = 0.0
    n_seen = 0

    for batch in val_loader:
        pv = batch["pixel_values"].to(DEVICE)
        pm = batch["pixel_mask"].to(DEVICE)
        cls = [x.to(DEVICE) for x in batch["class_labels"]]
        msk = [x.to(DEVICE) for x in batch["mask_labels"]]

        with torch.cuda.amp.autocast(enabled=AMP):
            out = model(pixel_values=pv, pixel_mask=pm, class_labels=cls, mask_labels=msk)
            loss = out.loss

        bs = pv.size(0)
        total_loss += float(loss.item()) * bs
        n_seen += bs

    return total_loss / max(1, n_seen)


# =========================
# 8) Predict & submission
# =========================
@torch.no_grad()
def predict_and_submit(model: torch.nn.Module, processor: AutoImageProcessor, out_csv_path: Path):
    model.eval()

    test_ds = UnlabeledWellDataset(TEST_IMAGES_DIR)  # same structure, no labels
    test_loader = DataLoader(
        test_ds, batch_size=1, shuffle=False, num_workers=NUM_WORKERS, collate_fn=collate_unlabeled
    )

    preds_dict = {}

    for batch in test_loader:
        name = batch["names"][0]
        raw_w = int(batch["raw_ws"][0].item())

        pv = batch["pixel_values"].to(DEVICE)
        pm = batch["pixel_mask"].to(DEVICE)

        with torch.cuda.amp.autocast(enabled=AMP):
            outputs = model(pixel_values=pv, pixel_mask=pm)

        # semantic map at 224x224
        pred_224, _ = outputs_to_semantic_and_conf(outputs)  # (224,224)

        # upsample back to 160x272
        pred_224_f = pred_224.unsqueeze(0).unsqueeze(0).float()
        pred_160_272 = F.interpolate(pred_224_f, size=(TARGET_H, TARGET_W), mode="nearest").squeeze(0).squeeze(0)
        pred_160_272 = pred_160_272.cpu().numpy().astype(np.int64)

        # crop to original width
        pred = pred_160_272[:, :raw_w]

        # flatten + pad -1 to 160*272
        if raw_w < TARGET_W:
            padded = np.full((TARGET_H * TARGET_W,), IGNORE_INDEX, dtype=np.int64)
            padded[: TARGET_H * raw_w] = pred.flatten()
            preds_dict[name] = padded
        else:
            preds_dict[name] = pred.flatten()

    sub = pd.DataFrame(preds_dict, dtype="int64").T
    sub.to_csv(out_csv_path)
    print(f"[OK] submission saved to: {out_csv_path}")


# =========================
# 9) Main
# =========================
def main():
    print(f"DEVICE: {DEVICE} | AMP: {AMP}")
    print(f"Labeled train dir: {TRAIN_IMAGES_DIR}")
    print(f"Unlabeled dir:     {UNLAB_IMAGES_DIR}")
    print(f"Test dir:          {TEST_IMAGES_DIR}")
    print(f"Pretrained:        {PRETRAINED}")
    print(f"EPOCHS={EPOCHS}, pseudo_update_every={PSEUDO_UPDATE_EVERY}, unlab_ratio={UNLAB_SAMPLE_RATIO}")

    # ----- labeled dataset (well1-6) -----
    labeled_all = LabeledWellDataset(TRAIN_IMAGES_DIR, Y_TRAIN_CSV)

    # split: well6 as val, well1-5 as train
    VAL_WELLS = {6}
    tr_idx, va_idx = [], []
    for i, name in enumerate(labeled_all.names):
        w = parse_well_id(name)
        if w in VAL_WELLS:
            va_idx.append(i)
        else:
            tr_idx.append(i)

    train_ds = Subset(labeled_all, tr_idx)
    val_ds   = Subset(labeled_all, va_idx)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE_L, shuffle=True,
                              num_workers=NUM_WORKERS, collate_fn=collate_labeled, drop_last=True)
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE_L, shuffle=False,
                              num_workers=NUM_WORKERS, collate_fn=collate_labeled)

    # ----- unlabeled dataset (well12-14) -----
    unlab_all = UnlabeledWellDataset(UNLAB_IMAGES_DIR)
    n_unlab = len(unlab_all)
    n_use = max(1, int(n_unlab * UNLAB_SAMPLE_RATIO))

    # sample a fixed subset for this run (fast & reproducible)
    rng = np.random.RandomState(42)
    use_indices = rng.choice(n_unlab, size=n_use, replace=False).tolist()
    unlab_ds = Subset(unlab_all, use_indices)

    unlab_loader = DataLoader(unlab_ds, batch_size=BATCH_SIZE_U, shuffle=True,
                              num_workers=NUM_WORKERS, collate_fn=collate_unlabeled, drop_last=True)

    # ----- model -----
    model, processor = build_model_and_processor()
    model = model.to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

    print(f"Labeled train: {len(train_ds)} | Val: {len(val_ds)} | Unlabeled used: {len(unlab_ds)}")

    best_val = 1e9
    best_path = DATA_ROOT / "best_mask2former_semi.pth"

    pseudo_cache: Dict[int, torch.Tensor] = {}

    for epoch in range(1, EPOCHS + 1):
        lambda_u = get_lambda_u(epoch)
        tau = get_pseudo_th(epoch)
        print(f"\nEpoch {epoch:02d}/{EPOCHS} | lambda_u={lambda_u:.3f} | tau={tau:.2f}")

        # refresh pseudo cache
        if (epoch == 1) or ((epoch - 1) % PSEUDO_UPDATE_EVERY == 0) or (len(pseudo_cache) == 0):
            print("[Info] Building pseudo-label cache...")
            # build cache over current unlabeled subset loader (teacher = current model)
            pseudo_cache = build_pseudo_cache(model, unlab_loader, tau=tau)
            print(f"[Info] Pseudo cache size: {len(pseudo_cache)}")

        tr_loss = train_one_epoch(model, train_loader, unlab_loader, pseudo_cache, optimizer, lambda_u=lambda_u)
        va_loss = eval_one_epoch(model, val_loader)

        print(f"[Loss] train={tr_loss:.4f} | val(well6)={va_loss:.4f}")

        if va_loss < best_val:
            best_val = va_loss
            torch.save(model.state_dict(), best_path)
            print(f"[OK] Best model saved: {best_path}")

    # ----- inference -----
    print("\n[Info] Loading best model and generating submission...")
    state = torch.load(best_path, map_location=DEVICE, weights_only=True)
    model.load_state_dict(state)

    out_csv = DATA_ROOT / "submission.csv"
    predict_and_submit(model, processor, out_csv)


if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm


DEVICE: cuda | AMP: True
Labeled train dir: C:\Users\lenovo\Desktop\deep_datachallenge\X_train_uDRk9z9\images
Unlabeled dir:     C:\Users\lenovo\Desktop\deep_datachallenge\X_unlabeled_mtkxUlo\images
Test dir:          C:\Users\lenovo\Desktop\deep_datachallenge\X_test_xNbnvIa\images
Pretrained:        facebook/mask2former-swin-tiny-ade-semantic
EPOCHS=5, pseudo_update_every=2, unlab_ratio=0.5


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
  image_processor = cls(**image_processor_dict)
Some weights of Mask2FormerForUniversalSegmentation were not initialized from the model checkpoint at facebook/mask2former-swin-tiny-ade-semantic and are newly initialized because the shapes did not match:
- class_predictor.bias: found shape torch.Size([151]) in the checkpoint and torch.Size([4]) in the model instantiated
- class_predictor.weight: found shape torch.Size([151, 256]) in the checkpoint and torch.Size([4, 256]) in the model instantiated
- criterion.empty_weight: found shape torch.Size([151]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream 

Labeled train: 2790 | Val: 1620 | Unlabeled used: 990

Epoch 01/5 | lambda_u=0.050 | tau=0.95
[Info] Building pseudo-label cache...


  with torch.cuda.amp.autocast(enabled=AMP):


[Info] Pseudo cache size: 990


  scaler = torch.cuda.amp.GradScaler(enabled=AMP)
  with torch.cuda.amp.autocast(enabled=AMP):
  with torch.cuda.amp.autocast(enabled=AMP):


[Loss] train=20.0172 | val(well6)=14.3855
[OK] Best model saved: C:\Users\lenovo\Desktop\deep_datachallenge\best_mask2former_semi.pth

Epoch 02/5 | lambda_u=0.163 | tau=0.92
[Loss] train=18.9800 | val(well6)=14.1031
[OK] Best model saved: C:\Users\lenovo\Desktop\deep_datachallenge\best_mask2former_semi.pth

Epoch 03/5 | lambda_u=0.275 | tau=0.90
[Info] Building pseudo-label cache...
[Info] Pseudo cache size: 990
[Loss] train=19.6246 | val(well6)=15.1079

Epoch 04/5 | lambda_u=0.388 | tau=0.88
[Loss] train=20.9259 | val(well6)=14.5028

Epoch 05/5 | lambda_u=0.500 | tau=0.85
[Info] Building pseudo-label cache...
[Info] Pseudo cache size: 990
[Loss] train=21.1578 | val(well6)=14.7531

[Info] Loading best model and generating submission...


  with torch.cuda.amp.autocast(enabled=AMP):


[OK] submission saved to: C:\Users\lenovo\Desktop\deep_datachallenge\submission.csv
