In [None]:
!pip install ultralytics

In [2]:
import cv2
import os
import glob
from tqdm import tqdm

# ================= CONFIGURATION =================
# Point this to the folder containing '01', '02', etc.
# Based on your screenshot, it ends in 'testing_videos'
dataset_root = r"/kaggle/working/cleaned_testing_videos"


# Where to save the output MP4 files
output_folder = "rendered_videos"

# Frame rate (Frames Per Second). 
# 25 or 30 is standard. Lower it (e.g., 10) if the video feels too fast.
fps = 25 
# =================================================

def create_video_from_frames(video_folder_path, output_path, fps):
    # 1. Find all images (jpg, png, jpeg)
    images = []
    for ext in ['*.jpg', '*.jpeg', '*.png']:
        images.extend(glob.glob(os.path.join(video_folder_path, ext)))
    
    if not images:
        print(f"No images found in {video_folder_path}")
        return

    # 2. Sort them numerically! 
    # Standard sort usually fails on "frame_1" vs "frame_10", 
    # but your data has padding (0093), so standard sort usually works.
    # We use a lambda just to be safe.
    try:
        images.sort(key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0]))
    except:
        images.sort() # Fallback if naming convention is different

    # 3. Read the first frame to get dimensions
    frame = cv2.imread(images[0])
    height, width, layers = frame.shape
    size = (width, height)

    # 4. Initialize VideoWriter
    # 'mp4v' is a standard codec for .mp4
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, size)

    # 5. Write frames
    video_name = os.path.basename(output_path)
    for image_path in tqdm(images, desc=f"Rendering {video_name}", unit="frame"):
        img = cv2.imread(image_path)
        out.write(img)

    out.release()
    print(f"Saved: {output_path}")

def main():
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get list of video directories (01, 02, etc.)
    # We assume directories inside 'testing_videos' are the video clips
    video_dirs = [d for d in os.listdir(dataset_root) if os.path.isdir(os.path.join(dataset_root, d))]
    video_dirs.sort()

    print(f"Found {len(video_dirs)} video folders. Starting conversion...")

    for video_dir in video_dirs:
        full_path = os.path.join(dataset_root, video_dir)
        save_path = os.path.join(output_folder, f"video_{video_dir}.mp4")
        
        create_video_from_frames(full_path, save_path, fps)

    print("\nDone! Check the 'rendered_videos' folder.")

if __name__ == "__main__":
    main()

Found 21 video folders. Starting conversion...


Rendering video_01.mp4: 100%|██████████| 499/499 [00:01<00:00, 255.10frame/s]


Saved: rendered_videos/video_01.mp4


Rendering video_02.mp4: 100%|██████████| 1211/1211 [00:04<00:00, 261.50frame/s]


Saved: rendered_videos/video_02.mp4


Rendering video_03.mp4: 100%|██████████| 737/737 [00:02<00:00, 255.51frame/s]


Saved: rendered_videos/video_03.mp4


Rendering video_04.mp4: 100%|██████████| 947/947 [00:03<00:00, 258.81frame/s]


Saved: rendered_videos/video_04.mp4


Rendering video_05.mp4: 100%|██████████| 1007/1007 [00:03<00:00, 265.85frame/s]


Saved: rendered_videos/video_05.mp4


Rendering video_06.mp4: 100%|██████████| 627/627 [00:02<00:00, 268.46frame/s]


Saved: rendered_videos/video_06.mp4


Rendering video_07.mp4: 100%|██████████| 588/588 [00:02<00:00, 262.29frame/s]


Saved: rendered_videos/video_07.mp4


Rendering video_08.mp4: 100%|██████████| 36/36 [00:00<00:00, 239.82frame/s]


Saved: rendered_videos/video_08.mp4


Rendering video_09.mp4: 100%|██████████| 359/359 [00:01<00:00, 268.43frame/s]


Saved: rendered_videos/video_09.mp4


Rendering video_10.mp4: 100%|██████████| 722/722 [00:02<00:00, 257.58frame/s]


Saved: rendered_videos/video_10.mp4


Rendering video_11.mp4: 100%|██████████| 472/472 [00:01<00:00, 266.89frame/s]


Saved: rendered_videos/video_11.mp4


Rendering video_12.mp4: 100%|██████████| 735/735 [00:02<00:00, 259.84frame/s]


Saved: rendered_videos/video_12.mp4


Rendering video_13.mp4: 100%|██████████| 528/528 [00:01<00:00, 265.20frame/s]


Saved: rendered_videos/video_13.mp4


Rendering video_14.mp4: 100%|██████████| 496/496 [00:01<00:00, 270.71frame/s]


Saved: rendered_videos/video_14.mp4


Rendering video_15.mp4: 100%|██████████| 732/732 [00:02<00:00, 264.27frame/s]


Saved: rendered_videos/video_15.mp4


Rendering video_16.mp4: 100%|██████████| 740/740 [00:02<00:00, 269.77frame/s]


Saved: rendered_videos/video_16.mp4


Rendering video_17.mp4: 100%|██████████| 417/417 [00:01<00:00, 262.86frame/s]


Saved: rendered_videos/video_17.mp4


Rendering video_18.mp4: 100%|██████████| 275/275 [00:01<00:00, 253.72frame/s]


Saved: rendered_videos/video_18.mp4


Rendering video_19.mp4: 100%|██████████| 229/229 [00:00<00:00, 249.90frame/s]


Saved: rendered_videos/video_19.mp4


Rendering video_20.mp4: 100%|██████████| 273/273 [00:01<00:00, 259.94frame/s]


Saved: rendered_videos/video_20.mp4


Rendering video_21.mp4: 100%|██████████| 76/76 [00:00<00:00, 255.13frame/s]

Saved: rendered_videos/video_21.mp4

Done! Check the 'rendered_videos' folder.





In [None]:
import torch
import cv2
import os
import glob
import numpy as np
from tqdm import tqdm
import warnings
import logging
import sys

# ================= WARNING SUPPRESSION =================
# 1. Python Warnings
warnings.filterwarnings("ignore")

# 2. YOLO/Torch Hub Logging (The noisy part)
logging.getLogger("utils.general").setLevel(logging.ERROR)
logging.getLogger("models.yolo").setLevel(logging.ERROR)
logging.getLogger("torch.hub").setLevel(logging.ERROR)

# 3. Environment Flags
os.environ["YOLO_VERBOSE"] = "False"
# =======================================================

# ================= CONFIGURATION =================
# Input: Your Cleaned Training Videos
INPUT_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/training_videos'

# Output: The Masked Frames for AutoEncoder Training
OUTPUT_DIR = '/kaggle/working/training_videos_masked_context'

# COCO Classes to KEEP (Context)
# 0: Person
# 1-8: Vehicles (Bicycle, Car, Motorcycle, Airplane, Bus, Train, Truck, Boat)
# 24-28: Accessories (Backpack, Umbrella, Handbag, Tie, Suitcase)
# 32: Sports Ball
KEEP_CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 24, 25, 26, 27, 28, 32]

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

def apply_silent_context_masking():
    print(f"Loading YOLOv5s (Silent Mode) on {DEVICE}...")
    
    # Redirect stdout to suppress the "Fusing layers..." print from YOLO
    sys.stdout = open(os.devnull, 'w')
    try:
        try:
            model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, verbose=False)
        except:
            model = torch.hub.load('/root/.cache/torch/hub/ultralytics_yolov5_master', 'custom', path='yolov5s.pt', source='local', verbose=False)
    finally:
        # Restore stdout
        sys.stdout = sys.__stdout__
        
    model.to(DEVICE).eval()
    model.conf = 0.25

    if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR)

    videos = sorted(os.listdir(INPUT_DIR))
    print(f"Processing {len(videos)} training videos...")

    for vid in tqdm(videos):
        vid_path = os.path.join(INPUT_DIR, vid)
        if not os.path.isdir(vid_path): continue
        
        save_path = os.path.join(OUTPUT_DIR, vid)
        os.makedirs(save_path, exist_ok=True)
        
        frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
        
        for f_path in frames:
            img = cv2.imread(f_path)
            if img is None: continue
            
            # Inference
            results = model(img[..., ::-1], size=640)
            preds = results.xyxy[0].cpu().numpy()
            
            # Create Black Mask
            mask = np.zeros_like(img, dtype=np.uint8)
            
            h, w, _ = img.shape
            padding = 15 
            
            for *xyxy, conf, cls in preds:
                if int(cls) in KEEP_CLASSES:
                    x1, y1, x2, y2 = map(int, xyxy)
                    
                    x1 = max(0, x1 - padding)
                    y1 = max(0, y1 - padding)
                    x2 = min(w, x2 + padding)
                    y2 = min(h, y2 + padding)
                    
                    mask[y1:y2, x1:x2] = img[y1:y2, x1:x2]
            
            cv2.imwrite(os.path.join(save_path, os.path.basename(f_path)), mask)

    print(f"Done! Masked training data saved to: {OUTPUT_DIR}")

if __name__ == "__main__":
    apply_silent_context_masking()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import gc
import torch.backends.cudnn as cudnn

# ================= CONFIGURATION =================
TRAIN_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/training_videos'
MODEL_SAVE_PATH = 'st_ae_fast_256.pth'

IMG_SIZE = 256
BATCH_SIZE = 16      # Increased further (smaller model + RAM cache)
EPOCHS = 20
CLIP_LEN = 16
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

cudnn.benchmark = True
# =================================================

# --- 1. RAM-CACHED DATASET ---
class CachedVideoDataset(Dataset):
    def __init__(self, root_dir, clip_len=16, img_size=256):
        self.clip_len = clip_len
        self.samples = []
        
        # Define transform once
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor()
        ])
        
        print(">>> Pre-loading dataset into RAM (This takes 1-2 mins but makes training 10x faster)...")
        
        video_folders = sorted([f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))])
        
        # Cache for loaded tensors to avoid duplicate reads
        # Dict structure: { 'video_path/frame_01.jpg': tensor_data, ... }
        self.cache = {}
        
        for vid in tqdm(video_folders, desc="Caching Videos"):
            vid_path = os.path.join(root_dir, vid)
            frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
            
            if len(frames) < 2 * clip_len: continue
            
            # Load all frames for this video into RAM immediately
            vid_tensors = []
            for f_path in frames:
                # Load, Resize, ToTensor immediately. Store float32 tensor in RAM.
                img = Image.open(f_path).convert('RGB')
                vid_tensors.append(self.transform(img))
            
            # Create indices
            # Stride 2 to save RAM but keep variety
            for i in range(0, len(vid_tensors) - (2 * clip_len) + 1, 2):
                # We store INDICES (integers), not copies of data
                # (video_index, start_frame_index)
                self.samples.append((vid_tensors, i))
                
        print(f"Cached {len(self.samples)} clips in RAM.")

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        # Retrieve from RAM
        vid_tensors, start_idx = self.samples[idx]
        
        # Slicing a list is fast
        in_frames = vid_tensors[start_idx : start_idx + self.clip_len]
        tgt_frames = vid_tensors[start_idx + self.clip_len : start_idx + (2 * self.clip_len)]
        
        # Stack: (T, C, H, W) -> Permute to (C, T, H, W)
        return (torch.stack(in_frames, dim=0).permute(1, 0, 2, 3), 
                torch.stack(tgt_frames, dim=0).permute(1, 0, 2, 3))

# --- 2. OPTIMIZED ARCHITECTURE (Lighter Bottleneck) ---
class STAutoEncoder_Optimized(nn.Module):
    def __init__(self): 
        super(STAutoEncoder_Optimized, self).__init__()
        
        # Encoder: 32 -> 48 -> 64 -> 128 -> 256 (Capped at 256 to save speed)
        self.conv1 = nn.Conv3d(3, 32, 3, padding=1);   self.bn1 = nn.BatchNorm3d(32);  self.pool1 = nn.MaxPool3d((1, 2, 2))
        self.conv2 = nn.Conv3d(32, 48, 3, padding=1);  self.bn2 = nn.BatchNorm3d(48);  self.pool2 = nn.MaxPool3d((2, 2, 2))
        self.conv3 = nn.Conv3d(48, 64, 3, padding=1);  self.bn3 = nn.BatchNorm3d(64);  self.pool3 = nn.MaxPool3d((2, 2, 2))
        self.conv4 = nn.Conv3d(64, 128, 3, padding=1); self.bn4 = nn.BatchNorm3d(128); self.pool4 = nn.MaxPool3d((2, 2, 2))
        self.conv5 = nn.Conv3d(128, 256, 3, padding=1);self.bn5 = nn.BatchNorm3d(256) 
        self.relu = nn.LeakyReLU(0.1, inplace=True) # Inplace saves memory
        
        # Branch A: Reconstruction (256 -> 3)
        self.rec_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.rbn1 = nn.BatchNorm3d(128)
        self.rec_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.rbn2 = nn.BatchNorm3d(64)
        self.rec_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.rbn3 = nn.BatchNorm3d(48)
        self.rec_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.rbn4 = nn.BatchNorm3d(32)
        self.rec_out = nn.Conv3d(32, 3, 3, padding=1)
        
        # Branch B: Prediction (256 -> 3)
        self.pre_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.pbn1 = nn.BatchNorm3d(128)
        self.pre_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.pbn2 = nn.BatchNorm3d(64)
        self.pre_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.pbn3 = nn.BatchNorm3d(48)
        self.pre_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.pbn4 = nn.BatchNorm3d(32)
        self.pre_out = nn.Conv3d(32, 3, 3, padding=1)
        
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Encoder
        x = self.pool1(self.relu(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu(self.bn3(self.conv3(x))))
        x = self.pool4(self.relu(self.bn4(self.conv4(x))))
        l = self.relu(self.bn5(self.conv5(x)))
        
        # Recon
        r = self.relu(self.rbn1(self.rec_d1(l)))
        r = self.relu(self.rbn2(self.rec_d2(r)))
        r = self.relu(self.rbn3(self.rec_d3(r)))
        r = self.relu(self.rbn4(self.rec_d4(r)))
        r = self.sigmoid(self.rec_out(r))
        
        # Pred
        p = self.relu(self.pbn1(self.pre_d1(l)))
        p = self.relu(self.pbn2(self.pre_d2(p)))
        p = self.relu(self.pbn3(self.pre_d3(p)))
        p = self.relu(self.pbn4(self.pre_d4(p)))
        p = self.sigmoid(self.pre_out(p))
        return r, p

# --- 3. TRAINING LOOP ---
def train_ram_charged():
    torch.cuda.empty_cache(); gc.collect()
    print(f"Training RAM-CHARGED Optimized STAE on {DEVICE}...")
    
    # Init Dataset (Will cache everything now)
    dataset = CachedVideoDataset(TRAIN_DIR, clip_len=CLIP_LEN, img_size=IMG_SIZE)
    
    # Persistent workers = False because data is already in RAM (Main Process)
    # Actually, num_workers=0 is fastest for RAM-resident data to avoid pickling overhead
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    
    model = STAutoEncoder_Optimized()
    if torch.cuda.device_count() > 1: model = nn.DataParallel(model)
    model = model.to(DEVICE)
    
    optimizer = optim.Adam(model.parameters(), lr=2e-4) # Slightly higher LR for speed
    scaler = torch.cuda.amp.GradScaler()
    criterion = nn.MSELoss()
    
    for epoch in range(EPOCHS):
        model.train()
        loop = tqdm(loader, desc=f"Ep {epoch+1}/{EPOCHS}")
        
        for inp, tgt in loop:
            inp, tgt = inp.to(DEVICE), tgt.to(DEVICE)
            optimizer.zero_grad(set_to_none=True) # Slightly faster than zero_grad()
            
            with torch.cuda.amp.autocast():
                rec, pred = model(inp)
                loss = criterion(rec, inp) + criterion(pred, tgt)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            loop.set_postfix(loss=loss.item())
            
    torch.save(model.module.state_dict() if hasattr(model, 'module') else model.state_dict(), MODEL_SAVE_PATH)
    print(f"DONE. Model saved to {MODEL_SAVE_PATH}")

if __name__ == "__main__":
    train_ram_charged()

In [1]:
import os
import glob
import torch
import torch.nn as nn
from torchvision import transforms, models
from PIL import Image
from tqdm import tqdm # Progress bar

# ================= CONFIGURATION =================
# Path to the CORRUPTED testing videos
TEST_DATA_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/testing_videos'

# Path where we will save the CLEANED videos
CLEAN_DATA_DIR = '/kaggle/working/cleaned_testing_videos'

MODEL_PATH = '/kaggle/input/vlg-rot/pytorch/default/1/rotnet_model(1).pth'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

def clean_dataset():
    print(f"Processing on: {DEVICE}")
    
    # 1. Load the Trained RotNet
    model = models.resnet18(pretrained=False) # No need to download weights again
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 2) # Matches our binary training
    
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    model = model.to(DEVICE)
    model.eval()
    
    # Standard transform for the model input
    # Note: We do NOT augment here, just resize/norm
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # 2. Find all images
    # We walk through the directory to keep structure
    image_paths = sorted(glob.glob(os.path.join(TEST_DATA_DIR, '**', '*.jpg'), recursive=True))
    print(f"Found {len(image_paths)} frames to process.")
    
    # 3. Processing Loop
    flip_count = 0
    
    for img_path in tqdm(image_paths, desc="Cleaning"):
        # A. Setup paths
        # Get relative path (e.g., "01/frame_0001.jpg") to maintain structure
        rel_path = os.path.relpath(img_path, TEST_DATA_DIR)
        save_path = os.path.join(CLEAN_DATA_DIR, rel_path)
        
        # Create folder if not exists
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        
        # B. Predict Rotation
        image = Image.open(img_path).convert('RGB')
        input_tensor = preprocess(image).unsqueeze(0).to(DEVICE)
        
        with torch.no_grad():
            outputs = model(input_tensor)
            _, predicted = torch.max(outputs, 1)
            label = predicted.item()
            
        # C. Fix and Save
        # Label 0 = Upright (Keep as is)
        # Label 1 = Flipped (Needs 180 rotation to fix)
        
        if label == 1:
            # It was detected as Upside Down, so we rotate it -180 (or 180) to fix
            fixed_image = image.transpose(Image.FLIP_TOP_BOTTOM) 
            flip_count += 1
        else:
            fixed_image = image
            
        # Save the fixed image
        fixed_image.save(save_path)

    print("-" * 30)
    print("Cleaning Complete!")
    print(f"Total Images: {len(image_paths)}")
    print(f"Images Flipped/Fixed: {flip_count}")
    print(f"Cleaned dataset saved to: {CLEAN_DATA_DIR}")

if __name__ == "__main__":
    clean_dataset()

Processing on: cuda




Found 11706 frames to process.


Cleaning: 100%|██████████| 11706/11706 [02:57<00:00, 66.09it/s]

------------------------------
Cleaning Complete!
Total Images: 11706
Images Flipped/Fixed: 1195
Cleaned dataset saved to: /kaggle/working/cleaned_testing_videos





In [None]:
import torch
import cv2
import os
import glob
import numpy as np
from tqdm import tqdm
import warnings
import logging
import sys

# ================= WARNING SUPPRESSION =================
# 1. Python Warnings
warnings.filterwarnings("ignore")

# 2. YOLO/Torch Hub Logging (The noisy part)
logging.getLogger("utils.general").setLevel(logging.ERROR)
logging.getLogger("models.yolo").setLevel(logging.ERROR)
logging.getLogger("torch.hub").setLevel(logging.ERROR)

# 3. Environment Flags
os.environ["YOLO_VERBOSE"] = "False"
# =======================================================

# ================= CONFIGURATION =================
# Input: Your Cleaned Training Videos
INPUT_DIR = '/kaggle/working/cleaned_testing_videos'

# Output: The Masked Frames for AutoEncoder Training
OUTPUT_DIR = '/kaggle/working/testing_videos_masked_context'

# COCO Classes to KEEP (Context)
# 0: Person
# 1-8: Vehicles (Bicycle, Car, Motorcycle, Airplane, Bus, Train, Truck, Boat)
# 24-28: Accessories (Backpack, Umbrella, Handbag, Tie, Suitcase)
# 32: Sports Ball
KEEP_CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 24, 25, 26, 27, 28, 32]

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

def apply_silent_context_masking():
    print(f"Loading YOLOv5s (Silent Mode) on {DEVICE}...")
    
    # Redirect stdout to suppress the "Fusing layers..." print from YOLO
    sys.stdout = open(os.devnull, 'w')
    try:
        try:
            model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True, verbose=False)
        except:
            model = torch.hub.load('/root/.cache/torch/hub/ultralytics_yolov5_master', 'custom', path='yolov5s.pt', source='local', verbose=False)
    finally:
        # Restore stdout
        sys.stdout = sys.__stdout__
        
    model.to(DEVICE).eval()
    model.conf = 0.25

    if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR)

    videos = sorted(os.listdir(INPUT_DIR))
    print(f"Processing {len(videos)} training videos...")

    for vid in tqdm(videos):
        vid_path = os.path.join(INPUT_DIR, vid)
        if not os.path.isdir(vid_path): continue
        
        save_path = os.path.join(OUTPUT_DIR, vid)
        os.makedirs(save_path, exist_ok=True)
        
        frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
        
        for f_path in frames:
            img = cv2.imread(f_path)
            if img is None: continue
            
            # Inference
            results = model(img[..., ::-1], size=640)
            preds = results.xyxy[0].cpu().numpy()
            
            # Create Black Mask
            mask = np.zeros_like(img, dtype=np.uint8)
            
            h, w, _ = img.shape
            padding = 15 
            
            for *xyxy, conf, cls in preds:
                if int(cls) in KEEP_CLASSES:
                    x1, y1, x2, y2 = map(int, xyxy)
                    
                    x1 = max(0, x1 - padding)
                    y1 = max(0, y1 - padding)
                    x2 = min(w, x2 + padding)
                    y2 = min(h, y2 + padding)
                    
                    mask[y1:y2, x1:x2] = img[y1:y2, x1:x2]
            
            cv2.imwrite(os.path.join(save_path, os.path.basename(f_path)), mask)

    print(f"Done! Masked training data saved to: {OUTPUT_DIR}")

if __name__ == "__main__":
    apply_silent_context_masking()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
from tqdm import tqdm
from scipy.ndimage import gaussian_filter1d

# ================= CONFIGURATION =================
TEST_DIR = '/kaggle/working/testing_videos_masked_context'
MODEL_PATH = 'st_autoencoder_rgb_dual.pth'
TARGET_VIDEO_ID = '20'  # Change this to test different videos ('01', '02', etc.)

CLIP_LEN = 16
IMG_SIZE = 128
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. MODEL ARCHITECTURE (Must match training) ---
class STAutoEncoder_RGB(nn.Module):
    def __init__(self): 
        super(STAutoEncoder_RGB, self).__init__()
        self.conv1 = nn.Conv3d(3, 32, 3, padding=1); self.bn1 = nn.BatchNorm3d(32); self.pool1 = nn.MaxPool3d(2, 2)
        self.conv2 = nn.Conv3d(32, 48, 3, padding=1); self.bn2 = nn.BatchNorm3d(48); self.pool2 = nn.MaxPool3d(2, 2)
        self.conv3 = nn.Conv3d(48, 64, 3, padding=1); self.bn3 = nn.BatchNorm3d(64); self.pool3 = nn.MaxPool3d(2, 2)
        self.conv4 = nn.Conv3d(64, 64, 3, padding=1); self.bn4 = nn.BatchNorm3d(64)
        self.relu = nn.LeakyReLU(0.1)
        
        self.rec_deconv1 = nn.ConvTranspose3d(64, 48, 3, 2, 1, 1); self.rec_bn1 = nn.BatchNorm3d(48)
        self.rec_deconv2 = nn.ConvTranspose3d(48, 32, 3, 2, 1, 1); self.rec_bn2 = nn.BatchNorm3d(32)
        self.rec_deconv3 = nn.ConvTranspose3d(32, 32, 3, 2, 1, 1); self.rec_bn3 = nn.BatchNorm3d(32)
        self.rec_final = nn.Conv3d(32, 3, 3, padding=1)
        
        self.pred_deconv1 = nn.ConvTranspose3d(64, 48, 3, 2, 1, 1); self.pred_bn1 = nn.BatchNorm3d(48)
        self.pred_deconv2 = nn.ConvTranspose3d(48, 32, 3, 2, 1, 1); self.pred_bn2 = nn.BatchNorm3d(32)
        self.pred_deconv3 = nn.ConvTranspose3d(32, 32, 3, 2, 1, 1); self.pred_bn3 = nn.BatchNorm3d(32)
        self.pred_final = nn.Conv3d(32, 3, 3, padding=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h = self.relu(self.bn4(self.conv4(self.pool3(self.relu(self.bn3(self.conv3(self.pool2(self.relu(self.bn2(self.conv2(self.pool1(self.relu(self.bn1(self.conv1(x)))))))))))))))
        r = self.sigmoid(self.rec_final(self.relu(self.rec_bn3(self.rec_deconv3(self.relu(self.rec_bn2(self.rec_deconv2(self.relu(self.rec_bn1(self.rec_deconv1(h)))))))))))
        p = self.sigmoid(self.pred_final(self.relu(self.pred_bn3(self.pred_deconv3(self.relu(self.pred_bn2(self.pred_deconv2(self.relu(self.pred_bn1(self.pred_deconv1(h)))))))))))
        return r, p

# --- 2. VISUALIZATION LOGIC ---
def visualize_dual_branch():
    print(f"Visualizing Dual-Branch Model on Video {TARGET_VIDEO_ID}...")
    
    # Locate Video
    vid_path = os.path.join(TEST_DIR, TARGET_VIDEO_ID)
    if not os.path.exists(vid_path):
        candidates = [d for d in os.listdir(TEST_DIR) if str(int(d)) == str(int(TARGET_VIDEO_ID))]
        if not candidates: print("Video not found."); return
        vid_path = os.path.join(TEST_DIR, candidates[0])
    
    frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
    print(f"Found {len(frames)} frames.")
    
    # Load Model
    model = STAutoEncoder_RGB()
    if not os.path.exists(MODEL_PATH): print("Model file not found!"); return
    
    # Handle DataParallel state dict if saved that way
    state_dict = torch.load(MODEL_PATH, map_location=DEVICE)
    new_state_dict = {k.replace('module.', ''): v for k, v in state_dict.items()}
    model.load_state_dict(new_state_dict)
    
    model.to(DEVICE).eval()
    
    tf = transforms.Compose([transforms.Resize((IMG_SIZE, IMG_SIZE)), transforms.ToTensor()])
    loss_fn = nn.MSELoss(reduction='none')
    
    rec_scores = []
    pred_scores = []
    frame_indices = []
    
    print("Running Inference...")
    with torch.no_grad():
        # Stride 1 for smooth plotting
        for i in tqdm(range(len(frames) - (2 * CLIP_LEN))):
            # Input: Frames [i ... i+16]
            inp_paths = frames[i : i + CLIP_LEN]
            # Target: Frames [i+16 ... i+32] (For Prediction check)
            tgt_paths = frames[i + CLIP_LEN : i + (2 * CLIP_LEN)]
            
            # Load
            inp_vol = torch.stack([tf(Image.open(p).convert('RGB')) for p in inp_paths]).permute(1,0,2,3).unsqueeze(0).to(DEVICE)
            tgt_vol = torch.stack([tf(Image.open(p).convert('RGB')) for p in tgt_paths]).permute(1,0,2,3).unsqueeze(0).to(DEVICE)
            
            # Forward
            rec, pred = model(inp_vol)
            
            # Calculate Errors
            # Reconstruction Error: Compare 'rec' with 'inp'
            l_rec = loss_fn(rec, inp_vol).mean().item()
            
            # Prediction Error: Compare 'pred' with 'tgt'
            l_pred = loss_fn(pred, tgt_vol).mean().item()
            
            rec_scores.append(l_rec)
            pred_scores.append(l_pred)
            frame_indices.append(i + CLIP_LEN) # Plot at the "future" point

    # Normalize scores for plotting
    def normalize(arr):
        arr = np.array(arr)
        return (arr - arr.min()) / (arr.max() - arr.min())

    norm_rec = normalize(rec_scores)
    norm_pred = normalize(pred_scores)
    
    # Combine (Total Anomaly Score)
    # Prediction error usually weighs more for motion anomalies
    total_score = (0.3 * norm_rec) + (0.7 * norm_pred)
    smoothed_score = gaussian_filter1d(total_score, sigma=3)
    
    # Plot
    plt.figure(figsize=(15, 8))
    
    plt.subplot(2, 1, 1)
    plt.plot(frame_indices, norm_rec, label='Reconstruction Error (Static)', color='blue', alpha=0.6)
    plt.plot(frame_indices, norm_pred, label='Prediction Error (Future)', color='red', alpha=0.6)
    plt.title(f"Dual-Branch Error Analysis: Video {TARGET_VIDEO_ID}")
    plt.ylabel("Normalized Error")
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    plt.subplot(2, 1, 2)
    plt.plot(frame_indices, total_score, color='gray', alpha=0.3, label='Raw Combined Score')
    plt.plot(frame_indices, smoothed_score, color='green', linewidth=2, label='Smoothed Anomaly Score')
    plt.xlabel("Frame Number")
    plt.ylabel("Anomaly Score")
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # Anomaly Threshold Line (Arbitrary 0.5 for viz)
    plt.axhline(y=0.5, color='black', linestyle='--')
    
    save_file = f"dual_branch_plot_{TARGET_VIDEO_ID}.png"
    plt.savefig(save_file)
    print(f"Plot saved to {save_file}")
    plt.show()

if __name__ == "__main__":
    visualize_dual_branch()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import torch.multiprocessing

# ================= CONFIGURATION =================
TEST_DIR = '/kaggle/working/testing_videos_masked_context'
MODEL_PATH = 'st_autoencoder_rgb_dual.pth'
OUTPUT_CSV = 'submission_dual_fast.csv'

CLIP_LEN = 16    # Input 16 frames
PRED_LEN = 16    # Predict 16 frames
IMG_SIZE = 128
BATCH_SIZE = 64  # Increased for Dual GPU Speed
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Weights
W_REC = 0.3
W_PRED = 0.7
# =================================================

# --- 1. MODEL ARCHITECTURE ---
class STAutoEncoder_RGB(nn.Module):
    def __init__(self): 
        super(STAutoEncoder_RGB, self).__init__()
        self.conv1 = nn.Conv3d(3, 32, 3, padding=1); self.bn1 = nn.BatchNorm3d(32); self.pool1 = nn.MaxPool3d(2, 2)
        self.conv2 = nn.Conv3d(32, 48, 3, padding=1); self.bn2 = nn.BatchNorm3d(48); self.pool2 = nn.MaxPool3d(2, 2)
        self.conv3 = nn.Conv3d(48, 64, 3, padding=1); self.bn3 = nn.BatchNorm3d(64); self.pool3 = nn.MaxPool3d(2, 2)
        self.conv4 = nn.Conv3d(64, 64, 3, padding=1); self.bn4 = nn.BatchNorm3d(64)
        self.relu = nn.LeakyReLU(0.1)
        
        self.rec_deconv1 = nn.ConvTranspose3d(64, 48, 3, 2, 1, 1); self.rec_bn1 = nn.BatchNorm3d(48)
        self.rec_deconv2 = nn.ConvTranspose3d(48, 32, 3, 2, 1, 1); self.rec_bn2 = nn.BatchNorm3d(32)
        self.rec_deconv3 = nn.ConvTranspose3d(32, 32, 3, 2, 1, 1); self.rec_bn3 = nn.BatchNorm3d(32)
        self.rec_final = nn.Conv3d(32, 3, 3, padding=1)
        
        self.pred_deconv1 = nn.ConvTranspose3d(64, 48, 3, 2, 1, 1); self.pred_bn1 = nn.BatchNorm3d(48)
        self.pred_deconv2 = nn.ConvTranspose3d(48, 32, 3, 2, 1, 1); self.pred_bn2 = nn.BatchNorm3d(32)
        self.pred_deconv3 = nn.ConvTranspose3d(32, 32, 3, 2, 1, 1); self.pred_bn3 = nn.BatchNorm3d(32)
        self.pred_final = nn.Conv3d(32, 3, 3, padding=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        h = self.relu(self.bn4(self.conv4(self.pool3(self.relu(self.bn3(self.conv3(self.pool2(self.relu(self.bn2(self.conv2(self.pool1(self.relu(self.bn1(self.conv1(x)))))))))))))))
        r = self.sigmoid(self.rec_final(self.relu(self.rec_bn3(self.rec_deconv3(self.relu(self.rec_bn2(self.rec_deconv2(self.relu(self.rec_bn1(self.rec_deconv1(h)))))))))))
        p = self.sigmoid(self.pred_final(self.relu(self.pred_bn3(self.pred_deconv3(self.relu(self.pred_bn2(self.pred_deconv2(self.relu(self.pred_bn1(self.pred_deconv1(h)))))))))))
        return r, p

# --- 2. FAST DATASET (Pre-indexing) ---
class FastTestDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.samples = []
        self.transform = transform
        
        print("Indexing all test videos...")
        videos = sorted(os.listdir(root_dir))
        
        for vid in videos:
            vid_path = os.path.join(root_dir, vid)
            if not os.path.isdir(vid_path): continue
            
            frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
            if len(frames) < CLIP_LEN + PRED_LEN: continue
            
            # Create a sample for every valid sliding window
            for i in range(len(frames) - CLIP_LEN - PRED_LEN):
                # We store indices/paths only to save RAM
                input_paths = frames[i : i + CLIP_LEN]
                target_paths = frames[i + CLIP_LEN : i + CLIP_LEN + PRED_LEN]
                
                # ID corresponds to the start of the prediction
                pred_frame = frames[i + CLIP_LEN]
                fid = f"{int(vid) if vid.isdigit() else vid}_{int(os.path.basename(pred_frame).split('_')[-1].split('.')[0])}"
                
                self.samples.append((input_paths, target_paths, fid))
        
        print(f"Indexed {len(self.samples)} samples for inference.")

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        in_paths, tgt_paths, fid = self.samples[idx]
        
        def load_vol(paths):
            vol = [Image.open(p).convert('RGB') for p in paths]
            if self.transform: vol = [self.transform(img) for img in vol]
            return torch.stack(vol, dim=0).permute(1, 0, 2, 3)
            
        return load_vol(in_paths), load_vol(tgt_paths), fid

# --- 3. GENERATION LOOP ---
def generate_fast():
    print(f">>> Initializing FAST Dual-GPU Inference (Batch={BATCH_SIZE})...")
    
    # Load Model
    model = STAutoEncoder_RGB()
    if not os.path.exists(MODEL_PATH):
        print(f"Error: {MODEL_PATH} not found.")
        return
        
    state = torch.load(MODEL_PATH, map_location=DEVICE)
    clean_state = {k.replace('module.', ''): v for k, v in state.items()}
    model.load_state_dict(clean_state)
    
    # Enable Dual GPU
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)
    model.to(DEVICE).eval()
    
    # Dataset & Loader
    tf = transforms.Compose([transforms.Resize((IMG_SIZE, IMG_SIZE)), transforms.ToTensor()])
    dataset = FastTestDataset(TEST_DIR, transform=tf)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
    
    mse_loss = nn.MSELoss(reduction='none')
    results = []
    
    print("Running Inference Loop...")
    with torch.no_grad():
        for inp, tgt, fids in tqdm(loader):
            inp, tgt = inp.to(DEVICE), tgt.to(DEVICE)
            
            rec, pred = model(inp)
            
            # Vectorized Loss Calculation (per batch item)
            # MSE returns (Batch, C, T, H, W). Mean over dimensions (1,2,3,4) gives per-item loss
            loss_rec = ((rec - inp) ** 2).mean(dim=(1,2,3,4))
            loss_pred = ((pred - tgt) ** 2).mean(dim=(1,2,3,4))
            
            # Combine
            scores = (W_REC * loss_rec * 100) + (W_PRED * loss_pred * 100)
            
            # Store
            scores_np = scores.cpu().numpy()
            for fid, score in zip(fids, scores_np):
                results.append({'Id': fid, 'Predicted': float(score)})
    
    # --- POST PROCESSING ---
    print("Normalizing...")
    df = pd.DataFrame(results)
    
    # 1. Fill Missing Frames
    all_frames = []
    for v in sorted(os.listdir(TEST_DIR)):
        vp = os.path.join(TEST_DIR, v)
        if not os.path.isdir(vp): continue
        fs = sorted(glob.glob(os.path.join(vp, '*.jpg')))
        for f in fs:
            fid = f"{int(v) if v.isdigit() else v}_{int(os.path.basename(f).split('_')[-1].split('.')[0])}"
            all_frames.append(fid)
            
    df_full = pd.DataFrame({'Id': all_frames})
    df_final = pd.merge(df_full, df, on='Id', how='left').fillna(0.0)
    
    # 2. Global Normalization (No Smoothing)
    mx = df_final['Predicted'].max()
    mn = df_final['Predicted'].min()
    if mx > mn:
        df_final['Predicted'] = (df_final['Predicted'] - mn) / (mx - mn)
        
    df_final.to_csv(OUTPUT_CSV, index=False)
    print(f"Done! Saved to {OUTPUT_CSV}")

if __name__ == "__main__":
    generate_fast()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from scipy.ndimage import gaussian_filter1d
from tqdm import tqdm
import gc

# ================= CONFIGURATION =================
TEST_DIR = '/kaggle/working/cleaned_testing_videos'
MODEL_PATH = 'st_ae_fast_256.pth' 

TARGET_VIDEO_ID = '09'   
IMG_SIZE = 256
CLIP_LEN = 16
BATCH_SIZE = 8           # Reduced to 8 to prevent OOM
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Submission Weights
W_REC = 0.3
W_PRED = 0.7
# =================================================

# --- 1. MODEL ARCHITECTURE (Must match training) ---
class STAutoEncoder_Optimized(nn.Module):
    def __init__(self): 
        super(STAutoEncoder_Optimized, self).__init__()
        # Encoder
        self.conv1 = nn.Conv3d(3, 32, 3, padding=1);   self.bn1 = nn.BatchNorm3d(32);  self.pool1 = nn.MaxPool3d((1, 2, 2))
        self.conv2 = nn.Conv3d(32, 48, 3, padding=1);  self.bn2 = nn.BatchNorm3d(48);  self.pool2 = nn.MaxPool3d((2, 2, 2))
        self.conv3 = nn.Conv3d(48, 64, 3, padding=1);  self.bn3 = nn.BatchNorm3d(64);  self.pool3 = nn.MaxPool3d((2, 2, 2))
        self.conv4 = nn.Conv3d(64, 128, 3, padding=1); self.bn4 = nn.BatchNorm3d(128); self.pool4 = nn.MaxPool3d((2, 2, 2))
        self.conv5 = nn.Conv3d(128, 256, 3, padding=1);self.bn5 = nn.BatchNorm3d(256) 
        self.relu = nn.LeakyReLU(0.1, inplace=True)
        
        # Recon
        self.rec_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.rbn1 = nn.BatchNorm3d(128)
        self.rec_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.rbn2 = nn.BatchNorm3d(64)
        self.rec_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.rbn3 = nn.BatchNorm3d(48)
        self.rec_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.rbn4 = nn.BatchNorm3d(32)
        self.rec_out = nn.Conv3d(32, 3, 3, padding=1)
        
        # Pred
        self.pre_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.pbn1 = nn.BatchNorm3d(128)
        self.pre_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.pbn2 = nn.BatchNorm3d(64)
        self.pre_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.pbn3 = nn.BatchNorm3d(48)
        self.pre_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.pbn4 = nn.BatchNorm3d(32)
        self.pre_out = nn.Conv3d(32, 3, 3, padding=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool1(self.relu(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu(self.bn3(self.conv3(x))))
        x = self.pool4(self.relu(self.bn4(self.conv4(x))))
        l = self.relu(self.bn5(self.conv5(x)))
        
        r = self.sigmoid(self.rec_out(self.relu(self.rbn4(self.rec_d4(self.relu(self.rbn3(self.rec_d3(self.relu(self.rbn2(self.rec_d2(self.relu(self.rbn1(self.rec_d1(l))))))))))))))
        p = self.sigmoid(self.pre_out(self.relu(self.pbn4(self.pre_d4(self.relu(self.pbn3(self.pre_d3(self.relu(self.pbn2(self.pre_d2(self.relu(self.pbn1(self.pre_d1(l))))))))))))))
        return r, p

# --- 2. STANDARD DATASET (On-Demand Loading) ---
class SafeVideoDataset(Dataset):
    def __init__(self, vid_id, root_dir, clip_len=16, img_size=256):
        self.clip_len = clip_len
        
        # Locate Video
        vid_path = os.path.join(root_dir, vid_id)
        if not os.path.exists(vid_path):
            candidates = [d for d in os.listdir(root_dir) if str(int(d)) == str(int(vid_id))]
            if candidates: vid_path = os.path.join(root_dir, candidates[0])
            else: raise ValueError("Video not found")
            
        self.frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
        print(f"Found {len(self.frames)} frames in {vid_path}")
        
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor()
        ])
        
        # Valid start indices
        self.valid_indices = []
        if len(self.frames) >= 2 * clip_len:
            self.valid_indices = list(range(len(self.frames) - (2 * clip_len)))

    def __len__(self): return len(self.valid_indices)

    def __getitem__(self, idx):
        start = self.valid_indices[idx]
        
        # Load Input [t ... t+16]
        in_vol = []
        for i in range(start, start + self.clip_len):
            img = Image.open(self.frames[i]).convert('RGB')
            in_vol.append(self.transform(img))
            
        # Load Target [t+16 ... t+32]
        tgt_vol = []
        for i in range(start + self.clip_len, start + (2 * self.clip_len)):
            img = Image.open(self.frames[i]).convert('RGB')
            tgt_vol.append(self.transform(img))
            
        return (torch.stack(in_vol, dim=0).permute(1, 0, 2, 3), 
                torch.stack(tgt_vol, dim=0).permute(1, 0, 2, 3),
                start + self.clip_len) # Frame Index for plotting

# --- 3. VISUALIZATION LOGIC ---
def visualize_safe():
    torch.cuda.empty_cache(); gc.collect()
    print(f"Running Safe Visualization for Video {TARGET_VIDEO_ID}...")
    
    # 1. Load Model
    model = STAutoEncoder_Optimized()
    if not os.path.exists(MODEL_PATH): print(f"Error: {MODEL_PATH} missing."); return
    
    # Clean state dict (remove 'module.' if present)
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    
    model.to(DEVICE).eval()
    
    # 2. Setup Data
    ds = SafeVideoDataset(TARGET_VIDEO_ID, TEST_DIR, CLIP_LEN, IMG_SIZE)
    # num_workers=2 is safe, 0 is safest
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
    
    raw_scores = []
    frame_indices = []
    
    print("Running Inference (Safe Mode)...")
    with torch.no_grad():
        for inp, tgt, idxs in tqdm(loader):
            inp, tgt = inp.to(DEVICE), tgt.to(DEVICE)
            
            rec, pred = model(inp)
            
            # Per-sample loss
            loss_rec = ((rec - inp)**2).mean(dim=(1,2,3,4)).cpu().numpy()
            loss_pred = ((pred - tgt)**2).mean(dim=(1,2,3,4)).cpu().numpy()
            
            # Combine
            batch_scores = (W_REC * loss_rec) + (W_PRED * loss_pred)
            
            raw_scores.extend(batch_scores)
            frame_indices.extend(idxs.numpy())

    # --- 4. SUBMISSION LOGIC ---
    print("Calculating Final Scores...")
    scores = np.array(raw_scores)
    
    # Normalize (Min-Max)
    if scores.max() > scores.min():
        scores = (scores - scores.min()) / (scores.max() - scores.min())
        
    
    # --- 5. PLOT ---
    plt.figure(figsize=(15, 6))
    plt.plot(frame_indices, scores, color='gray', alpha=0.3, label='Raw Score')
    plt.plot(frame_indices, scores, color='red', linewidth=2, label='Smoothed Score (Final)')
    
    plt.title(f"Anomaly Detection: Video {TARGET_VIDEO_ID}", fontsize=14)
    plt.xlabel("Frame Number")
    plt.ylabel("Anomaly Probability")
    plt.axhline(y=0.5, color='black', linestyle='--', alpha=0.5)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

if __name__ == "__main__":
    visualize_safe()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from scipy.ndimage import gaussian_filter1d
import gc

# ================= CONFIGURATION =================
TEST_DIR = '/kaggle/working/cleaned_testing_videos'
MODEL_PATH = 'st_ae_fast_256.pth'
OUTPUT_CSV = 'submission.csv'

IMG_SIZE = 256
CLIP_LEN = 16
BATCH_SIZE = 32          # Safe batch size for Dual GPU (16 per card)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Submission Weights
W_REC = 0.3
W_PRED = 0.7
# =================================================

# --- 1. MODEL ARCHITECTURE ---
class STAutoEncoder_Optimized(nn.Module):
    def __init__(self): 
        super(STAutoEncoder_Optimized, self).__init__()
        # Encoder
        self.conv1 = nn.Conv3d(3, 32, 3, padding=1);   self.bn1 = nn.BatchNorm3d(32);  self.pool1 = nn.MaxPool3d((1, 2, 2))
        self.conv2 = nn.Conv3d(32, 48, 3, padding=1);  self.bn2 = nn.BatchNorm3d(48);  self.pool2 = nn.MaxPool3d((2, 2, 2))
        self.conv3 = nn.Conv3d(48, 64, 3, padding=1);  self.bn3 = nn.BatchNorm3d(64);  self.pool3 = nn.MaxPool3d((2, 2, 2))
        self.conv4 = nn.Conv3d(64, 128, 3, padding=1); self.bn4 = nn.BatchNorm3d(128); self.pool4 = nn.MaxPool3d((2, 2, 2))
        self.conv5 = nn.Conv3d(128, 256, 3, padding=1);self.bn5 = nn.BatchNorm3d(256) 
        self.relu = nn.LeakyReLU(0.1, inplace=True)
        
        # Recon
        self.rec_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.rbn1 = nn.BatchNorm3d(128)
        self.rec_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.rbn2 = nn.BatchNorm3d(64)
        self.rec_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.rbn3 = nn.BatchNorm3d(48)
        self.rec_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.rbn4 = nn.BatchNorm3d(32)
        self.rec_out = nn.Conv3d(32, 3, 3, padding=1)
        
        # Pred
        self.pre_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.pbn1 = nn.BatchNorm3d(128)
        self.pre_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.pbn2 = nn.BatchNorm3d(64)
        self.pre_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.pbn3 = nn.BatchNorm3d(48)
        self.pre_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.pbn4 = nn.BatchNorm3d(32)
        self.pre_out = nn.Conv3d(32, 3, 3, padding=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.pool1(self.relu(self.bn1(self.conv1(x))))
        x = self.pool2(self.relu(self.bn2(self.conv2(x))))
        x = self.pool3(self.relu(self.bn3(self.conv3(x))))
        x = self.pool4(self.relu(self.bn4(self.conv4(x))))
        l = self.relu(self.bn5(self.conv5(x)))
        
        r = self.sigmoid(self.rec_out(self.relu(self.rbn4(self.rec_d4(self.relu(self.rbn3(self.rec_d3(self.relu(self.rbn2(self.rec_d2(self.relu(self.rbn1(self.rec_d1(l))))))))))))))
        p = self.sigmoid(self.pre_out(self.relu(self.pbn4(self.pre_d4(self.relu(self.pbn3(self.pre_d3(self.relu(self.pbn2(self.pre_d2(self.relu(self.pbn1(self.pre_d1(l))))))))))))))
        return r, p

# --- 2. PRE-INDEXED DATASET (Efficient) ---
class InferenceDataset(Dataset):
    def __init__(self, root_dir, clip_len=16, img_size=256):
        self.clip_len = clip_len
        self.samples = [] # Stores (path_list_input, path_list_target, frame_id)
        
        print("Indexing Test Videos...")
        videos = sorted(os.listdir(root_dir))
        
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor()
        ])
        
        for vid in videos:
            vid_path = os.path.join(root_dir, vid)
            if not os.path.isdir(vid_path): continue
            
            frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
            if len(frames) < 2 * clip_len: continue
            
            # Create sliding window samples
            for i in range(len(frames) - (2 * clip_len)):
                in_paths = frames[i : i + clip_len]
                tgt_paths = frames[i + clip_len : i + (2 * clip_len)]
                
                # ID for submission (frame at the end of the input clip)
                # Usually we predict for frame t+16, so let's use that ID
                target_frame = frames[i + clip_len]
                fid = f"{int(vid) if vid.isdigit() else vid}_{int(os.path.basename(target_frame).split('_')[-1].split('.')[0])}"
                
                self.samples.append((in_paths, tgt_paths, fid))

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        in_p, tgt_p, fid = self.samples[idx]
        
        vol_in = [self.transform(Image.open(p).convert('RGB')) for p in in_p]
        vol_tgt = [self.transform(Image.open(p).convert('RGB')) for p in tgt_p]
        
        return (torch.stack(vol_in, dim=0).permute(1, 0, 2, 3), 
                torch.stack(vol_tgt, dim=0).permute(1, 0, 2, 3),
                fid)

# --- 3. GENERATION LOOP ---
def generate_submission():
    torch.cuda.empty_cache(); gc.collect()
    print(">>> Initializing Dual-GPU Submission Generation...")
    
    # Load Model
    model = STAutoEncoder_Optimized()
    if not os.path.exists(MODEL_PATH): print("Model not found!"); return
    
    # Handle DataParallel loading
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    
    # Dual GPU Activation
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        model = nn.DataParallel(model)
    model.to(DEVICE).eval()
    
    # Dataset
    dataset = InferenceDataset(TEST_DIR, CLIP_LEN, IMG_SIZE)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
    
    results = []
    
    print("Running Inference...")
    with torch.no_grad():
        for inp, tgt, fids in tqdm(loader):
            inp, tgt = inp.to(DEVICE), tgt.to(DEVICE)
            
            rec, pred = model(inp)
            
            # Vectorized Loss Calculation
            loss_rec = ((rec - inp)**2).mean(dim=(1,2,3,4))
            loss_pred = ((pred - tgt)**2).mean(dim=(1,2,3,4))
            
            # Weighted Sum
            scores = (W_REC * loss_rec) + (W_PRED * loss_pred)
            
            # Move to CPU
            scores_np = scores.cpu().numpy()
            
            for fid, s in zip(fids, scores_np):
                results.append({'Id': fid, 'Predicted': float(s)})

    # --- POST PROCESSING ---
    print("Processing Scores...")
    df = pd.DataFrame(results)
    
    # 1. Fill Missing Frames
    all_frames = []
    for v in sorted(os.listdir(TEST_DIR)):
        vp = os.path.join(TEST_DIR, v)
        if not os.path.isdir(vp): continue
        fs = sorted(glob.glob(os.path.join(vp, '*.jpg')))
        for f in fs:
            fid = f"{int(v) if v.isdigit() else v}_{int(os.path.basename(f).split('_')[-1].split('.')[0])}"
            all_frames.append(fid)
            
    df_full = pd.DataFrame({'Id': all_frames})
    df_final = pd.merge(df_full, df, on='Id', how='left').fillna(0.0)
    
    # 2. Extract Video IDs for grouping
    df_final['VideoID'] = df_final['Id'].apply(lambda x: x.split('_')[0])
    
    # 3. Per-Video Normalization & Smoothing
    final_preds = []
    
    for vid, group in df_final.groupby('VideoID', sort=False):
        raw = group['Predicted'].values
        
        # Min-Max Normalize this video
        if raw.max() > raw.min():
            raw = (raw - raw.min()) / (raw.max() - raw.min())
            
        # Gaussian Smooth
        smooth = gaussian_filter1d(raw, sigma=2.0)
        final_preds.extend(smooth)
    
    # 4. Final Global Normalization (Just in case smoothing pushed boundaries)
    final_arr = np.array(final_preds)
    if final_arr.max() > final_arr.min():
        final_arr = (final_arr - final_arr.min()) / (final_arr.max() - final_arr.min())
        
    df_final['Predicted'] = final_arr
    
    # Save
    sub_df = df_final[['Id', 'Predicted']]
    sub_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Success! Saved to {OUTPUT_CSV}")
    print(sub_df.head())

if __name__ == "__main__":
    generate_submission()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from scipy.ndimage import gaussian_filter1d
import gc

# ================= CONFIGURATION =================
TEST_DIR = '/kaggle/working/cleaned_testing_videos'
MODEL_PATH = '/kaggle/input/vlg-256/pytorch/default/1/st_ae_fast_256.pth'
OUTPUT_CSV = 'submission_corrected.csv'

IMG_SIZE = 256
CLIP_LEN = 16
BATCH_SIZE = 32
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Submission Weights
W_REC = 0.3
W_PRED = 0.7
# =================================================

# --- 1. MODEL ARCHITECTURE ---
class STAutoEncoder_Optimized(nn.Module):
    def __init__(self): 
        super(STAutoEncoder_Optimized, self).__init__()
        # Encoder
        self.conv1 = nn.Conv3d(3, 32, 3, padding=1);   self.bn1 = nn.BatchNorm3d(32);  self.pool1 = nn.MaxPool3d((1, 2, 2))
        self.conv2 = nn.Conv3d(32, 48, 3, padding=1);  self.bn2 = nn.BatchNorm3d(48);  self.pool2 = nn.MaxPool3d((2, 2, 2))
        self.conv3 = nn.Conv3d(48, 64, 3, padding=1);  self.bn3 = nn.BatchNorm3d(64);  self.pool3 = nn.MaxPool3d((2, 2, 2))
        self.conv4 = nn.Conv3d(64, 128, 3, padding=1); self.bn4 = nn.BatchNorm3d(128); self.pool4 = nn.MaxPool3d((2, 2, 2))
        self.conv5 = nn.Conv3d(128, 256, 3, padding=1);self.bn5 = nn.BatchNorm3d(256) 
        self.relu = nn.LeakyReLU(0.1, inplace=True)
        # Recon
        self.rec_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.rbn1 = nn.BatchNorm3d(128)
        self.rec_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.rbn2 = nn.BatchNorm3d(64)
        self.rec_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.rbn3 = nn.BatchNorm3d(48)
        self.rec_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.rbn4 = nn.BatchNorm3d(32)
        self.rec_out = nn.Conv3d(32, 3, 3, padding=1)
        # Pred
        self.pre_d1 = nn.ConvTranspose3d(256, 128, 3, (2,2,2), 1, (1,1,1)); self.pbn1 = nn.BatchNorm3d(128)
        self.pre_d2 = nn.ConvTranspose3d(128, 64, 3, (2,2,2), 1, (1,1,1));  self.pbn2 = nn.BatchNorm3d(64)
        self.pre_d3 = nn.ConvTranspose3d(64, 48, 3, (2,2,2), 1, (1,1,1));   self.pbn3 = nn.BatchNorm3d(48)
        self.pre_d4 = nn.ConvTranspose3d(48, 32, 3, (1,2,2), 1, (0,1,1));   self.pbn4 = nn.BatchNorm3d(32)
        self.pre_out = nn.Conv3d(32, 3, 3, padding=1)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        l = self.relu(self.bn5(self.conv5(self.pool4(self.relu(self.bn4(self.conv4(self.pool3(self.relu(self.bn3(self.conv3(self.pool2(self.relu(self.bn2(self.conv2(self.pool1(self.relu(self.bn1(self.conv1(x)))))))))))))))))))
        r = self.sigmoid(self.rec_out(self.relu(self.rbn4(self.rec_d4(self.relu(self.rbn3(self.rec_d3(self.relu(self.rbn2(self.rec_d2(self.relu(self.rbn1(self.rec_d1(l))))))))))))))
        p = self.sigmoid(self.pre_out(self.relu(self.pbn4(self.pre_d4(self.relu(self.pbn3(self.pre_d3(self.relu(self.pbn2(self.pre_d2(self.relu(self.pbn1(self.pre_d1(l))))))))))))))
        return r, p

# --- 2. DATASET ---
class InferenceDataset(Dataset):
    def __init__(self, root_dir, clip_len=16, img_size=256):
        self.clip_len = clip_len
        self.samples = []
        
        print("Indexing Test Videos...")
        videos = sorted(os.listdir(root_dir))
        self.transform = transforms.Compose([transforms.Resize((img_size, img_size)), transforms.ToTensor()])
        
        for vid in videos:
            vid_path = os.path.join(root_dir, vid)
            if not os.path.isdir(vid_path): continue
            frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
            if len(frames) < 2 * clip_len: continue
            
            for i in range(len(frames) - (2 * clip_len)):
                in_paths = frames[i : i + clip_len]
                tgt_paths = frames[i + clip_len : i + (2 * clip_len)]
                target_frame = frames[i + clip_len]
                fid = f"{int(vid) if vid.isdigit() else vid}_{int(os.path.basename(target_frame).split('_')[-1].split('.')[0])}"
                self.samples.append((in_paths, tgt_paths, fid))

    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        in_p, tgt_p, fid = self.samples[idx]
        vol_in = [self.transform(Image.open(p).convert('RGB')) for p in in_p]
        vol_tgt = [self.transform(Image.open(p).convert('RGB')) for p in tgt_p]
        return (torch.stack(vol_in, dim=0).permute(1, 0, 2, 3), 
                torch.stack(vol_tgt, dim=0).permute(1, 0, 2, 3),
                fid)

# --- 3. EXECUTION ---
def generate_submission():
    torch.cuda.empty_cache(); gc.collect()
    print(">>> Initializing Corrected Submission Generator...")
    
    # Load Model
    model = STAutoEncoder_Optimized()
    if not os.path.exists(MODEL_PATH): print("Model not found!"); return
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    if torch.cuda.device_count() > 1: model = nn.DataParallel(model)
    model.to(DEVICE).eval()
    
    # Dataset
    dataset = InferenceDataset(TEST_DIR, CLIP_LEN, IMG_SIZE)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
    
    results = []
    
    print("Running Inference...")
    with torch.no_grad():
        for inp, tgt, fids in tqdm(loader):
            inp, tgt = inp.to(DEVICE), tgt.to(DEVICE)
            rec, pred = model(inp)
            loss_rec = ((rec - inp)**2).mean(dim=(1,2,3,4))
            loss_pred = ((pred - tgt)**2).mean(dim=(1,2,3,4))
            scores = (W_REC * loss_rec) + (W_PRED * loss_pred)
            scores_np = scores.cpu().numpy()
            
            for fid, s in zip(fids, scores_np):
                results.append({'Id': fid, 'Predicted': float(s)})

    # --- 4. CORRECTED NORMALIZATION LOGIC ---
    print("Processing Scores (Normalize BEFORE filling zeros)...")
    
    # Convert actual predictions to DataFrame
    df_pred = pd.DataFrame(results)
    
    # 1. NORMALIZE NOW (While we only have valid scores)
    # This ensures min_score -> 0.0 and max_score -> 1.0
    # Crucially, this ignores the missing frames (which will be 0 later)
    mx = df_pred['Predicted'].max()
    mn = df_pred['Predicted'].min()
    print(f"Raw Score Range: {mn:.6f} to {mx:.6f}")
    
    if mx > mn:
        df_pred['Predicted'] = (df_pred['Predicted'] - mn) / (mx - mn)
    else:
        df_pred['Predicted'] = 0.0
        

    # 3. FILL GAPS
    # Now we bring in the missing frames and set them to 0.0
    all_frames = []
    for v in sorted(os.listdir(TEST_DIR)):
        vp = os.path.join(TEST_DIR, v)
        if not os.path.isdir(vp): continue
        fs = sorted(glob.glob(os.path.join(vp, '*.jpg')))
        for f in fs:
            fid = f"{int(v) if v.isdigit() else v}_{int(os.path.basename(f).split('_')[-1].split('.')[0])}"
            all_frames.append(fid)
            
    df_full = pd.DataFrame({'Id': all_frames})
    
    # Merge: existing scores stay, missing frames become NaN
    df_final = pd.merge(df_full, df_pred[['Id', 'Predicted']], on='Id', how='left')
    
    # Fill NaN with 0.0 (True Normal)
    df_final['Predicted'] = df_final['Predicted'].fillna(0.0)
    
    # 4. Global Re-Normalization (Safety Check)
    # Just to ensure smoothing didn't push anything above 1.0 or below 0.0
    final_vals = df_final['Predicted'].values
    final_vals = np.clip(final_vals, 0.0, 1.0)
    df_final['Predicted'] = final_vals
    
    # Save
    sub_df = df_final[['Id', 'Predicted']]
    sub_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Success! Saved to {OUTPUT_CSV}")

if __name__ == "__main__":
    generate_submission()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ================= CONFIGURATION =================
INPUT_CSV = '/kaggle/working/submission_videowise_raw.csv'  # Your current best file
OUTPUT_CSV = 'submission_aggressive.csv'

# AGGRESSIVENESS (Gain)
# 10 = Strong, 20 = Very Strong (Almost Binary), 5 = Gentle
GAIN = 10.0  

# CENTER POINT
# If your anomaly scores usually peak at 0.6, set this slightly lower (e.g., 0.4)
# to make sure those 0.6s get pushed up to 1.0.
CENTER = 0.35
# =================================================

def sigmoid(x, gain=10, center=0.5):
    # Standard Logistic Function with Gain and Center control
    return 1 / (1 + np.exp(-gain * (x - center)))

def polarize_submission():
    print(f"Reading {INPUT_CSV}...")
    df = pd.read_csv(INPUT_CSV)
    
    raw = df['Predicted'].values
    
    # 1. Normalize to [0, 1] first (Critical for Sigmoid to work)
    if raw.max() > raw.min():
        raw = (raw - raw.min()) / (raw.max() - raw.min())
    
    # 2. Apply Aggressive Sigmoid
    # This pushes values away from the center
    aggressive = sigmoid(raw, gain=GAIN, center=CENTER)
    
    # 3. Re-Normalize (Just to ensure strict 0-1 range)
    aggressive = (aggressive - aggressive.min()) / (aggressive.max() - aggressive.min())
    
    df['Predicted'] = aggressive
    
    # --- VISUALIZATION (To verify polarization) ---
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    plt.hist(raw, bins=50, color='gray', alpha=0.7)
    plt.title("Original Scores (Soft)")
    plt.xlabel("Score")
    
    plt.subplot(1, 2, 2)
    plt.hist(aggressive, bins=50, color='red', alpha=0.7)
    plt.title(f"Aggressive Scores (Gain={GAIN})")
    plt.xlabel("Score")
    
    plt.tight_layout()
    plt.show()
    
    # --- SAVE ---
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved aggressive scores to {OUTPUT_CSV}")
    print(df.describe())

if __name__ == "__main__":
    polarize_submission()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ================= CONFIGURATION =================
INPUT_CSV = '/kaggle/input/vlg-0-61/submission_smart_fill(1).csv'  
OUTPUT_CSV = 'submission_videowise_raw.csv'
# =================================================

def fix_videowise_raw():
    print(f"Reading {INPUT_CSV}...")
    df = pd.read_csv(INPUT_CSV)
    
    # Extract Video ID
    df['VideoID'] = df['Id'].apply(lambda x: x.split('_')[0])
    
    final_dfs = []
    
    print("Applying Per-Video Normalization (NO SMOOTHING)...")
    
    # Process each video independently
    for vid, group in df.groupby('VideoID', sort=False):
        group = group.copy()
        raw_vals = group['Predicted'].values
        
        # 1. IDENTIFY VALID FRAMES (Ignore the 0.0 fillers)
        # Real model output is never exactly 0.0, so 0.0 means "missing frame"
        mask = raw_vals > 1e-9
        
        if mask.sum() > 0:
            valid_scores = raw_vals[mask]
            
            # 2. CALCULATE MIN/MAX FOR THIS SPECIFIC VIDEO
            mn, mx = valid_scores.min(), valid_scores.max()
            
            # 3. NORMALIZE (Local Range -> [0, 1])
            if mx > mn:
                normalized_valid = (valid_scores - mn) / (mx - mn)
            else:
                normalized_valid = np.zeros_like(valid_scores)
            
            # Apply back to the valid frames only
            raw_vals[mask] = normalized_valid
            
            # 4. SAFETY: Force gaps to stay 0.0 (No bleeding)
            raw_vals[~mask] = 0.0
            
            group['Predicted'] = np.clip(raw_vals, 0.0, 1.0)
            
        final_dfs.append(group)
        
    # Reassemble
    df_final = pd.concat(final_dfs)
    
    # --- PLOT CHECK ---
    plt.figure(figsize=(15, 6))
    
    # Plot Video 02 (Running)
    v02 = df_final[df_final['VideoID'].astype(str).str.contains('02')].sort_values('Id')
    if not v02.empty:
        x02 = v02['Id'].apply(lambda x: int(x.split('_')[1]))
        plt.plot(x02, v02['Predicted'], label='Video 02', color='blue', alpha=0.8)
        
    # Plot Video 05 (Bag Throw)
    v05 = df_final[df_final['VideoID'].astype(str).str.contains('05')].sort_values('Id')
    if not v05.empty:
        x05 = v05['Id'].apply(lambda x: int(x.split('_')[1]))
        plt.plot(x05, v05['Predicted'], label='Video 05', color='red', alpha=0.8)

    plt.title("Video-Wise Normalization (Raw Peaks)")
    plt.xlabel("Frame")
    plt.ylabel("Anomaly Score")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Save
    out = df_final[['Id', 'Predicted']]
    out.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved RAW Video-Wise scores to {OUTPUT_CSV}")

if __name__ == "__main__":
    fix_videowise_raw()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# ================= CONFIGURATION =================
CSV_FILE = '/kaggle/working/submission_aggressive.csv'  # File to check
TARGET_VIDEO = '20'                     # Video ID to plot
# =================================================

def visualize_csv():
    if not os.path.exists(CSV_FILE):
        print(f"Error: {CSV_FILE} not found.")
        return

    print(f"Reading {CSV_FILE}...")
    df = pd.read_csv(CSV_FILE)
    
    # Filter for Target Video
    # Matches '02_xxx' or '2_xxx' style IDs
    df['VideoID'] = df['Id'].apply(lambda x: x.split('_')[0])
    vid_data = df[df['VideoID'].astype(int) == int(TARGET_VIDEO)].copy()
    
    if vid_data.empty:
        print(f"No data found for Video {TARGET_VIDEO}")
        return
        
    # Extract Frame Numbers
    vid_data['Frame'] = vid_data['Id'].apply(lambda x: int(x.split('_')[1]))
    vid_data = vid_data.sort_values('Frame')
    
    # Plot
    plt.figure(figsize=(15, 6))
    plt.plot(vid_data['Frame'], vid_data['Predicted'], color='red', linewidth=2, label='Anomaly Score')
    
    plt.title(f"Aggressive Anomaly Score: Video {TARGET_VIDEO}", fontsize=14)
    plt.xlabel("Frame Number")
    plt.ylabel("Score (0 = Normal, 1 = Anomaly)")
    plt.ylim(-0.1, 1.1) # Keep Y-axis fixed to see 0-1 clearly
    plt.axhline(y=0.5, color='black', linestyle='--', alpha=0.3)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

if __name__ == "__main__":
    visualize_csv()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# ================= CONFIGURATION =================
TRAIN_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/training_videos'
SAVE_PATH = 'multiscale_unet_conditional.pth'

IMG_SIZE = 256
CLIP_LEN = 4     # 4 frames input
BATCH_SIZE = 16  # 8 per GPU
EPOCHS = 50
LR_G = 2e-4
LR_D = 2e-5

# [cite_start]Loss Weights [cite: 2419, 2782]
LAMBDA_INT = 2.0
LAMBDA_GD = 1.0
LAMBDA_ADV = 0.05
LAMBDA_FLOW = 2.0 

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. ARCHITECTURE COMPONENTS (Generator) ---
# [Unchanged from previous robust implementation]

class AsymmetricConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(AsymmetricConv, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(kernel_size//2, 0))
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(1, kernel_size), padding=(0, kernel_size//2))
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        return self.relu(self.bn(self.conv2(self.relu(self.conv1(x)))))

class ResidualSkipConnection(nn.Module):
    def __init__(self, channels):
        super(ResidualSkipConnection, self).__init__()
        self.block = nn.Sequential(
            AsymmetricConv(channels, channels),
            AsymmetricConv(channels, channels)
        )
        self.shortcut = nn.Conv2d(channels, channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(self.block(x) + self.shortcut(x))

class ShortcutInceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ShortcutInceptionModule, self).__init__()
        w_6 = out_channels // 6
        w_3 = out_channels // 3
        w_2 = out_channels - (w_6 + w_3)

        self.branch1 = AsymmetricConv(in_channels, w_6)
        self.branch2 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3))
        self.branch3 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3), AsymmetricConv(w_3, w_2))
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        concat = torch.cat([self.branch1(x), self.branch2(x), self.branch3(x)], dim=1)
        return self.relu(concat + self.shortcut(x))

class MultiScaleUNet(nn.Module):
    def __init__(self, in_channels=12, out_channels=3):
        super(MultiScaleUNet, self).__init__()
        # Encoder
        self.sim1 = ShortcutInceptionModule(in_channels, 96); self.pool1 = nn.MaxPool2d(2)
        self.sim2 = ShortcutInceptionModule(96, 192);         self.pool2 = nn.MaxPool2d(2)
        self.sim3 = ShortcutInceptionModule(192, 384);        self.pool3 = nn.MaxPool2d(2)
        self.sim4 = ShortcutInceptionModule(384, 768)

        # Skip Connections
        self.rsc1 = nn.Sequential(*[ResidualSkipConnection(96) for _ in range(4)])
        self.rsc2 = nn.Sequential(*[ResidualSkipConnection(192) for _ in range(3)])
        self.rsc3 = nn.Sequential(*[ResidualSkipConnection(384) for _ in range(2)])

        # Decoder
        self.sim5 = ShortcutInceptionModule(768, 384);   self.up1 = nn.ConvTranspose2d(384, 384, 2, 2)
        self.sim6 = ShortcutInceptionModule(768, 192);   self.up2 = nn.ConvTranspose2d(192, 192, 2, 2)
        self.sim7 = ShortcutInceptionModule(384, 96);    self.up3 = nn.ConvTranspose2d(96, 96, 2, 2)
        self.sim8 = ShortcutInceptionModule(192, 96)
        self.final = nn.Conv2d(96, out_channels, 3, padding=1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        e1 = self.sim1(x);        p1 = self.pool1(e1)
        e2 = self.sim2(p1);       p2 = self.pool2(e2)
        e3 = self.sim3(p2);       p3 = self.pool3(e3)
        e4 = self.sim4(p3)

        d1 = self.sim5(e4);       u1 = self.up1(d1)
        cat1 = torch.cat([u1, self.rsc3(e3)], dim=1)

        d2 = self.sim6(cat1);     u2 = self.up2(d2)
        cat2 = torch.cat([u2, self.rsc2(e2)], dim=1)

        d3 = self.sim7(cat2);     u3 = self.up3(d3)
        cat3 = torch.cat([u3, self.rsc1(e1)], dim=1)

        d4 = self.sim8(cat3)
        return self.tanh(self.final(d4))

# --- 2. CONDITIONAL PATCH DISCRIMINATOR (FIXED) ---
class ConditionalPatchDiscriminator(nn.Module):
    def __init__(self, in_channels=6): # 3 (Current) + 3 (Past Condition)
        super(ConditionalPatchDiscriminator, self).__init__()
        
        def disc_block(in_f, out_f, bn=True):
            block = [nn.Conv2d(in_f, out_f, 4, stride=2, padding=1), nn.LeakyReLU(0.2, inplace=True)]
            if bn: block.append(nn.BatchNorm2d(out_f))
            return block

        self.model = nn.Sequential(
            *disc_block(in_channels, 64, bn=False), # 128x128
            *disc_block(64, 128),                   # 64x64
            *disc_block(128, 256),                  # 32x32
            nn.Conv2d(256, 1, 4, padding=1)         # 32x32 (PatchGAN Map)
        )

    def forward(self, img_A, img_B):
        # Concatenate condition (Last Frame) and target (Current Frame)
        img_input = torch.cat((img_A, img_B), 1)
        return self.model(img_input)

# --- 3. LOSSES ---
def gradient_loss(gen_frames, gt_frames):
    def gradient(x):
        h_x = x.size()[-2]
        w_x = x.size()[-1]
        x_h = torch.abs(x[:, :, 1:, :] - x[:, :, :h_x-1, :])
        x_w = torch.abs(x[:, :, :, 1:] - x[:, :, :, :w_x-1])
        return x_h, x_w
    gen_h, gen_w = gradient(gen_frames)
    gt_h, gt_w = gradient(gt_frames)
    return torch.mean(torch.abs(gen_h - gt_h)) + torch.mean(torch.abs(gen_w - gt_w))

def flow_loss(gen_frames, gt_frames, prev_frames):
    flow_gen = torch.abs(gen_frames - prev_frames)
    flow_gt = torch.abs(gt_frames - prev_frames)
    return torch.mean(torch.abs(flow_gen - flow_gt)) # L1 Loss for robustness

# --- 4. DATASET ---
class AvenueTrainDataset(Dataset):
    def __init__(self, root_dir, clip_len=4, img_size=256):
        self.clip_len = clip_len
        self.samples = []
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        
        videos = sorted(os.listdir(root_dir))
        for vid in videos:
            path = os.path.join(root_dir, vid)
            if not os.path.isdir(path): continue
            frames = sorted(glob.glob(os.path.join(path, '*.jpg')))
            if len(frames) < clip_len + 1: continue
            
            for i in range(len(frames) - clip_len):
                self.samples.append(frames[i : i + clip_len + 1])

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        paths = self.samples[idx]
        imgs = [self.transform(Image.open(p).convert('RGB')) for p in paths]
        
        input_seq = torch.cat(imgs[:-1], dim=0) # 12 channels
        target_frame = imgs[-1]                 # 3 channels (t+1)
        last_input_frame = imgs[-2]             # 3 channels (t) - For Conditioning
        
        return input_seq, target_frame, last_input_frame

# --- 5. TRAINING LOOP (CONDITIONAL GAN) ---
def train():
    print(f"Initializing Conditional Multi-scale U-Net Training on {DEVICE}...")
    
    # Init Models
    generator = MultiScaleUNet().to(DEVICE)
    # Discriminator takes 6 channels: 3 (Condition/Last Frame) + 3 (Target/Generated)
    discriminator = ConditionalPatchDiscriminator(in_channels=6).to(DEVICE)
    
    if torch.cuda.device_count() > 1:
        print(f"Using {torch.cuda.device_count()} GPUs!")
        generator = nn.DataParallel(generator)
        discriminator = nn.DataParallel(discriminator)
        
    opt_g = optim.Adam(generator.parameters(), lr=LR_G)
    opt_d = optim.Adam(discriminator.parameters(), lr=LR_D)
    
    criterion_gan = nn.MSELoss() # LSGAN is more stable than BCE
    criterion_pixel = nn.MSELoss()
    
    dataset = AvenueTrainDataset(TRAIN_DIR, CLIP_LEN, IMG_SIZE)
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=12, pin_memory=True)
    
    try:
        for epoch in range(EPOCHS):
            generator.train(); discriminator.train()
            pbar = tqdm(loader, desc=f"Ep {epoch+1}/{EPOCHS}")
            
            for inputs, targets, last_frames in pbar:
                inputs = inputs.to(DEVICE)
                targets = targets.to(DEVICE)
                last_frames = last_frames.to(DEVICE) # Condition for D
                
                # ==========================
                #  Train Discriminator (D)
                # ==========================
                opt_d.zero_grad()
                
                # Real: D(LastFrame, RealTarget) -> 1
                real_out = discriminator(last_frames, targets)
                loss_real = criterion_gan(real_out, torch.ones_like(real_out))
                
                # Fake: D(LastFrame, FakeTarget) -> 0
                fake_frame = generator(inputs)
                fake_out = discriminator(last_frames, fake_frame.detach()) # Detach G
                loss_fake = criterion_gan(fake_out, torch.zeros_like(fake_out))
                
                loss_d = 0.5 * (loss_real + loss_fake)
                loss_d.backward()
                opt_d.step()
                
                # ==========================
                #  Train Generator (G)
                # ==========================
                opt_g.zero_grad()
                
                # 1. Adversarial Loss: D(LastFrame, FakeTarget) -> 1
                fake_out_g = discriminator(last_frames, fake_frame)
                l_adv = criterion_gan(fake_out_g, torch.ones_like(fake_out_g))
                
                # 2. Pixel Intensity Loss
                l_int = criterion_pixel(fake_frame, targets)
                
                # 3. Gradient Loss
                l_gd = gradient_loss(fake_frame, targets)
                
                # 4. Flow Loss (Temporal Consistency)
                l_flow = flow_loss(fake_frame, targets, last_frames)
                
                # Total Loss
                loss_g = (LAMBDA_INT * l_int) + \
                         (LAMBDA_GD * l_gd) + \
                         (LAMBDA_ADV * l_adv) + \
                         (LAMBDA_FLOW * l_flow)
                         
                loss_g.backward()
                opt_g.step()
                
                pbar.set_postfix({
                    'D_loss': f"{loss_d.item():.4f}",
                    'G_Adv': f"{l_adv.item():.4f}",
                    'G_Int': f"{l_int.item():.4f}",
                    'G_Flow': f"{l_flow.item():.4f}"
                })
            
            torch.save(generator.module.state_dict(), f"unet_conditional_ep{epoch}.pth")
            
    except KeyboardInterrupt:
        print("\nTraining Interrupted! Saving checkpoint...")
        state = generator.module.state_dict() if hasattr(generator, 'module') else generator.state_dict()
        torch.save(state, 'INTERRUPTED_conditional.pth')
        print("Saved safely.")

if __name__ == "__main__":
    train()

In [None]:
import torch
import matplotlib.pyplot as plt
import os

# ================= CONFIGURATION =================
# Check the latest saved epoch file in your directory
LATEST_MODEL = '/kaggle/input/vlg-testmodel/pytorch/default/1/unet_conditional_ep1.pth' 
TEST_DIR = '/kaggle/working/cleaned_testing_videos'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

def inspect():
    # Load Model
    model = MultiScaleUNet().to(DEVICE)
    
    if os.path.exists(LATEST_MODEL):
        print(f"Loading {LATEST_MODEL}...")
        st = torch.load(LATEST_MODEL, map_location=DEVICE)
        # Fix DataParallel keys if needed
        if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
        model.load_state_dict(st)
    else:
        print("Model file not found yet!")
        return

    model.eval()
    
    # Get 1 Sample
    ds = AvenueTrainDataset(TEST_DIR, clip_len=4)
    inputs, target, _ = ds[1229] # Random index
    
    # Run Inference
    with torch.no_grad():
        inputs = inputs.unsqueeze(0).to(DEVICE) # Add batch dim
        pred = model(inputs).squeeze(0).cpu()
    
    # De-normalize [-1, 1] -> [0, 1] for plotting
    target = (target * 0.5) + 0.5
    pred = (pred * 0.5) + 0.5
    
    # Plot
    plt.figure(figsize=(10, 5))
    
    plt.subplot(1, 2, 1)
    plt.imshow(target.permute(1, 2, 0).clip(0, 1))
    plt.title("Ground Truth (Next Frame)")
    plt.axis('off')
    
    plt.subplot(1, 2, 2)
    plt.imshow(pred.permute(1, 2, 0).clip(0, 1))
    plt.title(f"Prediction (Epoch {LATEST_MODEL.split('_ep')[1].split('.')[0]})")
    plt.axis('off')
    
    plt.show()

if __name__ == "__main__":
    inspect()

In [None]:
import os
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# ================= CONFIGURATION =================
# Update this to match your actual file name (e.g., ep1.pth is Epoch 2 if 0-indexed)
MODEL_PATH = '/kaggle/input/vlg-testmodel/pytorch/default/1/unet_conditional_ep1.pth' 
TEST_DIR = '/kaggle/working/denoised_frames'
TARGET_VIDEO = '09' 

IMG_SIZE = 256
CLIP_LEN = 4
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. MODEL ARCHITECTURE (Must be defined to load weights) ---
class AsymmetricConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(AsymmetricConv, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(kernel_size//2, 0))
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(1, kernel_size), padding=(0, kernel_size//2))
        self.bn = nn.BatchNorm2d(out_channels)
    def forward(self, x): return self.relu(self.bn(self.conv2(self.relu(self.conv1(x)))))

class ResidualSkipConnection(nn.Module):
    def __init__(self, channels):
        super(ResidualSkipConnection, self).__init__()
        self.block = nn.Sequential(AsymmetricConv(channels, channels), AsymmetricConv(channels, channels))
        self.shortcut = nn.Conv2d(channels, channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x): return self.relu(self.block(x) + self.shortcut(x))

class ShortcutInceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ShortcutInceptionModule, self).__init__()
        w_6 = out_channels // 6; w_3 = out_channels // 3; w_2 = out_channels - (w_6 + w_3)
        self.branch1 = AsymmetricConv(in_channels, w_6)
        self.branch2 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3))
        self.branch3 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3), AsymmetricConv(w_3, w_2))
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        return self.relu(torch.cat([self.branch1(x), self.branch2(x), self.branch3(x)], dim=1) + self.shortcut(x))

class MultiScaleUNet(nn.Module):
    def __init__(self, in_channels=12, out_channels=3):
        super(MultiScaleUNet, self).__init__()
        self.sim1 = ShortcutInceptionModule(in_channels, 96); self.pool1 = nn.MaxPool2d(2)
        self.sim2 = ShortcutInceptionModule(96, 192);         self.pool2 = nn.MaxPool2d(2)
        self.sim3 = ShortcutInceptionModule(192, 384);        self.pool3 = nn.MaxPool2d(2)
        self.sim4 = ShortcutInceptionModule(384, 768)
        self.rsc1 = nn.Sequential(*[ResidualSkipConnection(96) for _ in range(4)])
        self.rsc2 = nn.Sequential(*[ResidualSkipConnection(192) for _ in range(3)])
        self.rsc3 = nn.Sequential(*[ResidualSkipConnection(384) for _ in range(2)])
        self.sim5 = ShortcutInceptionModule(768, 384);   self.up1 = nn.ConvTranspose2d(384, 384, 2, 2)
        self.sim6 = ShortcutInceptionModule(768, 192);   self.up2 = nn.ConvTranspose2d(192, 192, 2, 2)
        self.sim7 = ShortcutInceptionModule(384, 96);    self.up3 = nn.ConvTranspose2d(96, 96, 2, 2)
        self.sim8 = ShortcutInceptionModule(192, 96)
        self.final = nn.Conv2d(96, out_channels, 3, padding=1)
        self.tanh = nn.Tanh()
    def forward(self, x):
        e1 = self.sim1(x); p1 = self.pool1(e1)
        e2 = self.sim2(p1); p2 = self.pool2(e2)
        e3 = self.sim3(p2); p3 = self.pool3(e3)
        e4 = self.sim4(p3)
        d1 = self.sim5(e4); u1 = self.up1(d1)
        d2 = self.sim6(torch.cat([u1, self.rsc3(e3)], dim=1)); u2 = self.up2(d2)
        d3 = self.sim7(torch.cat([u2, self.rsc2(e2)], dim=1)); u3 = self.up3(d3)
        d4 = self.sim8(torch.cat([u3, self.rsc1(e1)], dim=1))
        return self.tanh(self.final(d4))

# --- 2. SINGLE VIDEO DATASET ---
class SingleVideoDataset(Dataset):
    def __init__(self, vid_id, root_dir, clip_len=4, img_size=256):
        self.clip_len = clip_len
        self.samples = []
        
        # Locate Video
        vid_path = os.path.join(root_dir, vid_id)
        if not os.path.exists(vid_path):
            # Try finding folder with different zero-padding (e.g. '2' vs '02')
            candidates = [d for d in os.listdir(root_dir) if str(int(d)) == str(int(vid_id))]
            if candidates: vid_path = os.path.join(root_dir, candidates[0])
            else: raise ValueError(f"Video {vid_id} not found in {root_dir}")
            
        print(f"Loading frames from {vid_path}...")
        self.frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
        
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        
        # Create sliding windows
        if len(self.frames) >= clip_len + 1:
            for i in range(len(self.frames) - clip_len):
                self.samples.append(i)

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        # Input: [t, t+1, t+2, t+3]
        in_paths = self.frames[idx : idx + self.clip_len]
        # Target: [t+4]
        tgt_path = self.frames[idx + self.clip_len]
        
        imgs = [self.transform(Image.open(p).convert('RGB')) for p in in_paths]
        input_seq = torch.cat(imgs, dim=0) # (12, H, W)
        target = self.transform(Image.open(tgt_path).convert('RGB'))
        
        return input_seq, target, idx + self.clip_len

# --- 3. PLOT LOGIC ---
def visualize():
    print(f"Generating Anomaly Graph for Video {TARGET_VIDEO} using {MODEL_PATH}...")
    
    # Load Model
    model = MultiScaleUNet().to(DEVICE)
    if not os.path.exists(MODEL_PATH):
        print(f"Error: {MODEL_PATH} not found. Please train first or check the path.")
        return
        
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    # Handle DataParallel dict keys if needed
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    model.eval()
    
    # Data
    ds = SingleVideoDataset(TARGET_VIDEO, TEST_DIR, CLIP_LEN, IMG_SIZE)
    loader = DataLoader(ds, batch_size=16, shuffle=False, num_workers=2)
    
    frame_indices = []
    errors = []
    
    # Inference
    with torch.no_grad():
        for inputs, targets, idxs in tqdm(loader):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            preds = model(inputs)
            
            # Calculate Reconstruction Error (MSE) per frame
            # (Batch, C, H, W) -> Mean over (C, H, W) -> (Batch)
            mse = ((preds - targets)**2).mean(dim=(1,2,3)).cpu().numpy()
            
            errors.extend(mse)
            frame_indices.extend(idxs.numpy())
            
    # Process Scores
    scores = np.array(errors)
    
    # Normalize [0, 1] for this video (Crucial for AP!)
    mn, mx = scores.min(), scores.max()
    if mx > mn:
        scores_norm = (scores - mn) / (mx - mn)
    else:
        scores_norm = scores
        
    # Plot
    plt.figure(figsize=(15, 6))
    plt.plot(frame_indices, scores_norm, color='red', linewidth=2, label='Anomaly Score (Normalized MSE)')
    
    plt.title(f"Anomaly Score Profile: Video {TARGET_VIDEO} (Epoch 2 Model)", fontsize=16)
    plt.xlabel("Frame Number")
    plt.ylabel("Anomaly Score (0=Normal, 1=Anomaly)")
    plt.axhline(y=0.5, color='black', linestyle='--', alpha=0.3)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

if __name__ == "__main__":
    visualize()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import numpy as np
import cv2
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# ================= CONFIGURATION =================
MODEL_PATH = '/kaggle/input/vlg-testmodel/pytorch/default/1/unet_conditional_ep1.pth' # Use your trained model
TEST_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/testing_videos'
TARGET_VIDEO = '02'  # The video with the running man
IMG_SIZE = 256
CLIP_LEN = 4
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# DENOISING PARAMS
BLUR_KERNEL = (5, 5)
NOISE_THRESH = 0.05
# =================================================

# --- ARCHITECTURE (Hidden for brevity, same as before) ---
class AsymmetricConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(AsymmetricConv, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(kernel_size//2, 0))
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(1, kernel_size), padding=(0, kernel_size//2))
        self.bn = nn.BatchNorm2d(out_channels)
    def forward(self, x): return self.relu(self.bn(self.conv2(self.relu(self.conv1(x)))))

class ResidualSkipConnection(nn.Module):
    def __init__(self, channels):
        super(ResidualSkipConnection, self).__init__()
        self.block = nn.Sequential(AsymmetricConv(channels, channels), AsymmetricConv(channels, channels))
        self.shortcut = nn.Conv2d(channels, channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x): return self.relu(self.block(x) + self.shortcut(x))

class ShortcutInceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ShortcutInceptionModule, self).__init__()
        w_6 = out_channels // 6; w_3 = out_channels // 3; w_2 = out_channels - (w_6 + w_3)
        self.branch1 = AsymmetricConv(in_channels, w_6)
        self.branch2 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3))
        self.branch3 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3), AsymmetricConv(w_3, w_2))
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        return self.relu(torch.cat([self.branch1(x), self.branch2(x), self.branch3(x)], dim=1) + self.shortcut(x))

class MultiScaleUNet(nn.Module):
    def __init__(self, in_channels=12, out_channels=3):
        super(MultiScaleUNet, self).__init__()
        self.sim1 = ShortcutInceptionModule(in_channels, 96); self.pool1 = nn.MaxPool2d(2)
        self.sim2 = ShortcutInceptionModule(96, 192);         self.pool2 = nn.MaxPool2d(2)
        self.sim3 = ShortcutInceptionModule(192, 384);        self.pool3 = nn.MaxPool2d(2)
        self.sim4 = ShortcutInceptionModule(384, 768)
        self.rsc1 = nn.Sequential(*[ResidualSkipConnection(96) for _ in range(4)])
        self.rsc2 = nn.Sequential(*[ResidualSkipConnection(192) for _ in range(3)])
        self.rsc3 = nn.Sequential(*[ResidualSkipConnection(384) for _ in range(2)])
        self.sim5 = ShortcutInceptionModule(768, 384);   self.up1 = nn.ConvTranspose2d(384, 384, 2, 2)
        self.sim6 = ShortcutInceptionModule(768, 192);   self.up2 = nn.ConvTranspose2d(192, 192, 2, 2)
        self.sim7 = ShortcutInceptionModule(384, 96);    self.up3 = nn.ConvTranspose2d(96, 96, 2, 2)
        self.sim8 = ShortcutInceptionModule(192, 96)
        self.final = nn.Conv2d(96, out_channels, 3, padding=1)
        self.tanh = nn.Tanh()
    def forward(self, x):
        e1 = self.sim1(x); p1 = self.pool1(e1)
        e2 = self.sim2(p1); p2 = self.pool2(e2)
        e3 = self.sim3(p2); p3 = self.pool3(e3)
        e4 = self.sim4(p3)
        d1 = self.sim5(e4); u1 = self.up1(d1)
        d2 = self.sim6(torch.cat([u1, self.rsc3(e3)], dim=1)); u2 = self.up2(d2)
        d3 = self.sim7(torch.cat([u2, self.rsc2(e2)], dim=1)); u3 = self.up3(d3)
        d4 = self.sim8(torch.cat([u3, self.rsc1(e1)], dim=1))
        return self.tanh(self.final(d4))

# --- DATASET ---
class SingleVideoDataset(Dataset):
    def __init__(self, vid_id, root_dir, clip_len=4, img_size=256):
        self.clip_len = clip_len
        self.samples = []
        vid_path = os.path.join(root_dir, vid_id)
        if not os.path.exists(vid_path):
            candidates = [d for d in os.listdir(root_dir) if str(int(d)) == str(int(vid_id))]
            vid_path = os.path.join(root_dir, candidates[0])
        self.frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        if len(self.frames) >= clip_len + 1:
            for i in range(len(self.frames) - clip_len):
                self.samples.append(i)

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        in_paths = self.frames[idx : idx + self.clip_len]
        tgt_path = self.frames[idx + self.clip_len]
        imgs = [self.transform(Image.open(p).convert('RGB')) for p in in_paths]
        input_seq = torch.cat(imgs, dim=0)
        target = self.transform(Image.open(tgt_path).convert('RGB'))
        return input_seq, target, idx + self.clip_len

def show_denoising_effect():
    print(f"Analyzing Video {TARGET_VIDEO}...")
    
    model = MultiScaleUNet().to(DEVICE)
    if not os.path.exists(MODEL_PATH): print("Model not found!"); return
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    model.eval()
    
    ds = SingleVideoDataset(TARGET_VIDEO, TEST_DIR, CLIP_LEN, IMG_SIZE)
    loader = DataLoader(ds, batch_size=16, shuffle=False)
    
    raw_scores = []
    denoised_scores = []
    frame_indices = []
    
    # We will capture one "interesting" frame to visualize
    sample_vis = None
    max_score = 0
    
    with torch.no_grad():
        for inputs, targets, idxs in tqdm(loader):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            preds = model(inputs)
            
            # 1. Calc Squared Error Map (Batch, C, H, W)
            diff = (preds - targets) ** 2
            # Mean over channels -> (Batch, H, W)
            diff_map = diff.mean(dim=1).cpu().numpy()
            
            for i in range(len(diff_map)):
                err_map = diff_map[i]
                
                # --- RAW SCORE ---
                raw_s = np.mean(err_map)
                raw_scores.append(raw_s)
                
                # --- DENOISING ---
                # Blur
                blur = cv2.GaussianBlur(err_map, BLUR_KERNEL, 0)
                # Threshold
                clean = blur.copy()
                clean[clean < NOISE_THRESH] = 0.0
                
                denoised_s = np.mean(clean)
                denoised_scores.append(denoised_s)
                frame_indices.append(idxs[i].item())
                
                # Capture the frame with the highest denoised error (The Anomaly)
                if denoised_s > max_score:
                    max_score = denoised_s
                    # Denormalize target for display
                    tgt_disp = (targets[i].permute(1,2,0).cpu().numpy() * 0.5) + 0.5
                    sample_vis = (tgt_disp, err_map, clean, idxs[i].item())

    # --- PLOTTING ---
    plt.figure(figsize=(15, 10))
    
    # 1. Timeline Comparison
    plt.subplot(2, 1, 1)
    # Normalize for fair visual comparison on graph
    r_norm = (raw_scores - np.min(raw_scores)) / (np.max(raw_scores) - np.min(raw_scores))
    d_norm = (denoised_scores - np.min(denoised_scores)) / (np.max(denoised_scores) - np.min(denoised_scores))
    
    plt.plot(frame_indices, r_norm, color='gray', alpha=0.5, label='Raw Error (Noisy)')
    plt.plot(frame_indices, d_norm, color='red', linewidth=2, label='Denoised Error (Clean)')
    plt.title(f"Video {TARGET_VIDEO}: Raw vs. Denoised Anomaly Score")
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # 2. Frame Visualization
    if sample_vis:
        img, raw_map, clean_map, fid = sample_vis
        
        plt.subplot(2, 3, 4)
        plt.imshow(img.clip(0, 1))
        plt.title(f"Frame {fid} (Real)")
        plt.axis('off')
        
        plt.subplot(2, 3, 5)
        plt.imshow(raw_map, cmap='jet', vmin=0, vmax=0.1) # vmin/vmax to see noise
        plt.title("Raw Error Map (Note the static)")
        plt.axis('off')
        
        plt.subplot(2, 3, 6)
        plt.imshow(clean_map, cmap='jet', vmin=0, vmax=0.1)
        plt.title("Denoised Map (Signal Isolated)")
        plt.axis('off')
        
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    show_denoising_effect()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# ================= CONFIGURATION =================
MODEL_PATH = '/kaggle/input/vlg-testmodel/pytorch/default/1/unet_conditional_ep1.pth' # Use your trained model
TEST_DIR = '/kaggle/working/cleaned_testing_videos'
TARGET_VIDEO = '09' 
IMG_SIZE = 256
CLIP_LEN = 4
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. SSIM FUNCTION ---
def gaussian_window(size, sigma):
    coords = torch.arange(size, dtype=torch.float)
    coords -= size // 2
    g = torch.exp(-(coords ** 2) / (2 * sigma ** 2))
    g /= g.sum()
    return g.unsqueeze(1)

def ssim_map_calculation(img1, img2, window_size=11):
    # Returns the full (H, W) SSIM map, not just the mean
    channel = img1.size(1)
    _1D_window = gaussian_window(window_size, 1.5).to(img1.device)
    _1D_window = _1D_window.expand(channel, 1, window_size, 1).contiguous()
    window = _1D_window.matmul(_1D_window.transpose(2, 3))

    mu1 = F.conv2d(img1, window, padding=window_size//2, groups=channel)
    mu2 = F.conv2d(img2, window, padding=window_size//2, groups=channel)

    mu1_sq = mu1.pow(2); mu2_sq = mu2.pow(2); mu1_mu2 = mu1 * mu2

    sigma1_sq = F.conv2d(img1*img1, window, padding=window_size//2, groups=channel) - mu1_sq
    sigma2_sq = F.conv2d(img2*img2, window, padding=window_size//2, groups=channel) - mu2_sq
    sigma12 = F.conv2d(img1*img2, window, padding=window_size//2, groups=channel) - mu1_mu2

    C1 = 0.01**2; C2 = 0.03**2

    # Map of similarity (1=Same, 0=Different)
    ssim_map = ((2*mu1_mu2 + C1)*(2*sigma12 + C2)) / ((mu1_sq + mu2_sq + C1)*(sigma1_sq + sigma2_sq + C2))
    
    # We return 1 - SSIM because we want High Value = Anomaly
    return 1.0 - ssim_map.mean(dim=1, keepdim=True) 

# --- 2. ARCHITECTURE (Must match training) ---
class AsymmetricConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(AsymmetricConv, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(kernel_size//2, 0))
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(1, kernel_size), padding=(0, kernel_size//2))
        self.bn = nn.BatchNorm2d(out_channels)
    def forward(self, x): return self.relu(self.bn(self.conv2(self.relu(self.conv1(x)))))

class ResidualSkipConnection(nn.Module):
    def __init__(self, channels):
        super(ResidualSkipConnection, self).__init__()
        self.block = nn.Sequential(AsymmetricConv(channels, channels), AsymmetricConv(channels, channels))
        self.shortcut = nn.Conv2d(channels, channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x): return self.relu(self.block(x) + self.shortcut(x))

class ShortcutInceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ShortcutInceptionModule, self).__init__()
        w_6 = out_channels // 6; w_3 = out_channels // 3; w_2 = out_channels - (w_6 + w_3)
        self.branch1 = AsymmetricConv(in_channels, w_6)
        self.branch2 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3))
        self.branch3 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3), AsymmetricConv(w_3, w_2))
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        return self.relu(torch.cat([self.branch1(x), self.branch2(x), self.branch3(x)], dim=1) + self.shortcut(x))

class MultiScaleUNet(nn.Module):
    def __init__(self, in_channels=12, out_channels=3):
        super(MultiScaleUNet, self).__init__()
        self.sim1 = ShortcutInceptionModule(in_channels, 96); self.pool1 = nn.MaxPool2d(2)
        self.sim2 = ShortcutInceptionModule(96, 192);         self.pool2 = nn.MaxPool2d(2)
        self.sim3 = ShortcutInceptionModule(192, 384);        self.pool3 = nn.MaxPool2d(2)
        self.sim4 = ShortcutInceptionModule(384, 768)
        self.rsc1 = nn.Sequential(*[ResidualSkipConnection(96) for _ in range(4)])
        self.rsc2 = nn.Sequential(*[ResidualSkipConnection(192) for _ in range(3)])
        self.rsc3 = nn.Sequential(*[ResidualSkipConnection(384) for _ in range(2)])
        self.sim5 = ShortcutInceptionModule(768, 384);   self.up1 = nn.ConvTranspose2d(384, 384, 2, 2)
        self.sim6 = ShortcutInceptionModule(768, 192);   self.up2 = nn.ConvTranspose2d(192, 192, 2, 2)
        self.sim7 = ShortcutInceptionModule(384, 96);    self.up3 = nn.ConvTranspose2d(96, 96, 2, 2)
        self.sim8 = ShortcutInceptionModule(192, 96)
        self.final = nn.Conv2d(96, out_channels, 3, padding=1)
        self.tanh = nn.Tanh()
    def forward(self, x):
        e1 = self.sim1(x); p1 = self.pool1(e1)
        e2 = self.sim2(p1); p2 = self.pool2(e2)
        e3 = self.sim3(p2); p3 = self.pool3(e3)
        e4 = self.sim4(p3)
        d1 = self.sim5(e4); u1 = self.up1(d1)
        d2 = self.sim6(torch.cat([u1, self.rsc3(e3)], dim=1)); u2 = self.up2(d2)
        d3 = self.sim7(torch.cat([u2, self.rsc2(e2)], dim=1)); u3 = self.up3(d3)
        d4 = self.sim8(torch.cat([u3, self.rsc1(e1)], dim=1))
        return self.tanh(self.final(d4))

# --- 3. DATASET ---
class SingleVideoDataset(Dataset):
    def __init__(self, vid_id, root_dir, clip_len=4, img_size=256):
        self.clip_len = clip_len
        vid_path = os.path.join(root_dir, vid_id)
        if not os.path.exists(vid_path):
            candidates = [d for d in os.listdir(root_dir) if str(int(d)) == str(int(vid_id))]
            vid_path = os.path.join(root_dir, candidates[0])
        self.frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        self.samples = []
        if len(self.frames) >= clip_len + 1:
            for i in range(len(self.frames) - clip_len):
                self.samples.append(i)
    def __len__(self): return len(self.samples)
    def __getitem__(self, idx):
        in_paths = self.frames[idx : idx + self.clip_len]
        tgt_path = self.frames[idx + self.clip_len]
        imgs = [self.transform(Image.open(p).convert('RGB')) for p in in_paths]
        return torch.cat(imgs, dim=0), self.transform(Image.open(tgt_path).convert('RGB')), idx + self.clip_len

# --- 4. VISUALIZE ---
def visualize_ssim():
    print(f"Analyzing SSIM vs MSE on Video {TARGET_VIDEO}...")
    
    model = MultiScaleUNet().to(DEVICE)
    if not os.path.exists(MODEL_PATH): print("Model not found!"); return
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    model.eval()
    
    ds = SingleVideoDataset(TARGET_VIDEO, TEST_DIR, CLIP_LEN, IMG_SIZE)
    loader = DataLoader(ds, batch_size=16, shuffle=False)
    
    mse_scores = []
    ssim_scores = []
    frame_indices = []
    
    # Capture Sample for Display (Max Anomaly)
    vis_sample = None
    max_ssim = 0
    
    with torch.no_grad():
        for inputs, targets, idxs in tqdm(loader):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            preds = model(inputs)
            
            # --- MSE Calculation ---
            diff_mse = (preds - targets) ** 2
            mse_map = diff_mse.mean(dim=1) # (B, H, W)
            mse_vals = mse_map.mean(dim=(1,2)).cpu().numpy()
            mse_scores.extend(mse_vals)
            
            # --- SSIM Calculation ---
            # Shift to [0, 1]
            p_01 = (preds * 0.5) + 0.5
            t_01 = (targets * 0.5) + 0.5
            
            # Get Anomaly Map (1 - SSIM)
            # Returns (B, 1, H, W)
            anomaly_map_ssim = ssim_map_calculation(p_01, t_01)
            ssim_vals = anomaly_map_ssim.mean(dim=(1,2,3)).cpu().numpy()
            ssim_scores.extend(ssim_vals)
            
            frame_indices.extend(idxs.numpy())
            
            # Capture interesting frame
            batch_max = ssim_vals.max()
            if batch_max > max_ssim:
                max_ssim = batch_max
                idx_max = ssim_vals.argmax()
                
                # Store numpy arrays for plotting
                img_real = t_01[idx_max].permute(1,2,0).cpu().numpy()
                map_mse = mse_map[idx_max].cpu().numpy()
                map_ssim = anomaly_map_ssim[idx_max].squeeze(0).cpu().numpy()
                fid = idxs[idx_max].item()
                vis_sample = (img_real, map_mse, map_ssim, fid)

    # --- PLOTTING ---
    plt.figure(figsize=(15, 12))
    
    # 1. GRAPH COMPARISON
    plt.subplot(3, 1, 1)
    # Normalize to compare shapes
    mse_norm = (mse_scores - np.min(mse_scores)) / (np.max(mse_scores) - np.min(mse_scores))
    ssim_norm = (ssim_scores - np.min(ssim_scores)) / (np.max(ssim_scores) - np.min(ssim_scores))
    
    plt.plot(frame_indices, mse_norm, color='gray', alpha=0.5, label='MSE (Pixel Error)')
    plt.plot(frame_indices, ssim_norm, color='blue', linewidth=2, label='SSIM (Structural Error)')
    plt.title(f"Graph Comparison: Does SSIM reduce the noise floor?")
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # 2. IMAGE COMPARISON
    if vis_sample:
        img, m_mse, m_ssim, fid = vis_sample
        
        plt.subplot(3, 3, 4)
        plt.imshow(img.clip(0, 1))
        plt.title(f"Real Frame {fid}")
        plt.axis('off')
        
        plt.subplot(3, 3, 5)
        plt.imshow(m_mse, cmap='jet')
        plt.title("MSE Map (Pixel Difference)\nNote: Background might be noisy")
        plt.axis('off')
        
        plt.subplot(3, 3, 6)
        plt.imshow(m_ssim, cmap='jet')
        plt.title("SSIM Map (Structural Difference)\nIdeally: Dark Background, Bright Object")
        plt.axis('off')
        
    plt.tight_layout()
    plt.show()

if __name__ == "__main__":
    visualize_ssim()

In [None]:
import os
import glob
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from PIL import Image

# ================= CONFIGURATION =================
TEST_DIR = '/kaggle/working/cleaned_testing_videos'
DENOISED_DIR = 'denoised_frames'

# We train on the test video itself (Self-Supervision)
# 1000 steps is enough to learn the "noise pattern" to remove
TRAIN_STEPS = 1000 
BATCH_SIZE = 8
IMG_SIZE = 256
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. LIGHTWEIGHT DENOISER (DnCNN Style) ---
class SimpleDenoiser(nn.Module):
    def __init__(self):
        super(SimpleDenoiser, self).__init__()
        # Input: 3 Channels (RGB)
        # Output: 3 Channels (Clean RGB)
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.ReLU(True),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(True),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(True),
            nn.Conv2d(64, 3, 3, padding=1)
        )
    
    def forward(self, x):
        # Residual Learning: Predict the NOISE, subtract from Input
        noise_pred = self.encoder(x)
        return x - noise_pred

# --- 2. DATASET (Yields Pairs: Frame T and Frame T+1) ---
class VideoPairDataset(Dataset):
    def __init__(self, root_dir, img_size=256):
        self.pairs = []
        self.img_size = img_size
        
        videos = sorted(os.listdir(root_dir))
        for vid in videos:
            vid_path = os.path.join(root_dir, vid)
            if not os.path.isdir(vid_path): continue
            frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
            
            # Create T, T+1 pairs
            for i in range(len(frames) - 1):
                self.pairs.append((frames[i], frames[i+1]))
                
    def __len__(self): return len(self.pairs)
    
    def __getitem__(self, idx):
        p1, p2 = self.pairs[idx]
        
        # Load and Resize
        img1 = cv2.imread(p1); img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)
        img2 = cv2.imread(p2); img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)
        
        img1 = cv2.resize(img1, (self.img_size, self.img_size))
        img2 = cv2.resize(img2, (self.img_size, self.img_size))
        
        # Normalize [0, 1] -> Tensor (C, H, W)
        t1 = torch.from_numpy(img1 / 255.0).float().permute(2, 0, 1)
        t2 = torch.from_numpy(img2 / 255.0).float().permute(2, 0, 1)
        
        return t1, t2, p1 # Return p1 path to know ID later

# --- 3. OPTICAL FLOW ALIGNMENT (The "Temporal" Part) ---
def align_frames(curr, next_frame):
    # curr, next_frame: (B, C, H, W) Torch Tensors
    # We use CV2 Optical Flow on CPU (simpler than FlowNet)
    
    aligned_batch = []
    
    curr_np = curr.permute(0, 2, 3, 1).cpu().numpy() # (B, H, W, C)
    next_np = next_frame.permute(0, 2, 3, 1).cpu().numpy()
    
    for i in range(len(curr_np)):
        prev_gray = cv2.cvtColor((curr_np[i]*255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
        next_gray = cv2.cvtColor((next_np[i]*255).astype(np.uint8), cv2.COLOR_RGB2GRAY)
        
        # Calculate Flow (Next -> Prev)
        flow = cv2.calcOpticalFlowFarneback(next_gray, prev_gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)
        
        # Warp Next to match Prev
        h, w = flow.shape[:2]
        flow_map = np.column_stack((np.repeat(np.arange(h), w), np.tile(np.arange(w), h)))
        vec = flow.reshape(-1, 2)
        indices = flow_map + vec
        
        map_x = indices[:, 1].reshape(h, w).astype(np.float32)
        map_y = indices[:, 0].reshape(h, w).astype(np.float32)
        
        warped = cv2.remap((next_np[i]*255).astype(np.uint8), map_x, map_y, cv2.INTER_LINEAR)
        aligned_batch.append(warped / 255.0)
        
    # Stack back to Tensor
    aligned = torch.tensor(np.array(aligned_batch)).float().permute(0, 3, 1, 2).to(curr.device)
    return aligned

# --- 4. MAIN PIPELINE ---
def train_and_denoise():
    print("Initializing Unsupervised Denoiser (Noise2Noise Strategy)...")
    
    os.makedirs(DENOISED_DIR, exist_ok=True)
    
    model = SimpleDenoiser().to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    
    ds = VideoPairDataset(TEST_DIR, IMG_SIZE)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
    
    # --- PHASE 1: SELF-SUPERVISED TRAINING ---
    print(f"Training on Test Videos for {TRAIN_STEPS} steps...")
    model.train()
    
    step = 0
    # Infinite loop wrapper
    while step < TRAIN_STEPS:
        for img_curr, img_next, _ in loader:
            if step >= TRAIN_STEPS: break
            
            img_curr = img_curr.to(DEVICE)
            img_next = img_next.to(DEVICE)
            
            # 1. Align Next Frame to Current Frame
            # (If we don't align, the model learns motion blur)
            with torch.no_grad():
                img_next_aligned = align_frames(img_curr, img_next)
            
            # 2. Denoise Current Frame
            # Input: Noisy Current
            # Target: Aligned Noisy Next (Noise is independent!)
            cleaned_curr = model(img_curr)
            
            # 3. Loss
            loss = criterion(cleaned_curr, img_next_aligned)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if step % 100 == 0:
                print(f"Step {step}/{TRAIN_STEPS} | Loss: {loss.item():.6f}")
            step += 1

    # --- PHASE 2: INFERENCE (SAVE CLEAN FRAMES) ---
    print("Denoising all frames...")
    model.eval()
    
    # Reload dataset sequentially
    full_loader = DataLoader(ds, batch_size=1, shuffle=False)
    
    with torch.no_grad():
        for img_curr, _, path in tqdm(full_loader):
            img_curr = img_curr.to(DEVICE)
            cleaned = model(img_curr)
            
            # Save
            # Structure: denoised_frames/02/frame_001.jpg
            orig_path = path[0]
            vid_id = os.path.basename(os.path.dirname(orig_path))
            f_name = os.path.basename(orig_path)
            
            save_folder = os.path.join(DENOISED_DIR, vid_id)
            os.makedirs(save_folder, exist_ok=True)
            
            # Convert back to Image
            c_np = cleaned.squeeze(0).permute(1, 2, 0).cpu().numpy()
            c_np = np.clip(c_np * 255, 0, 255).astype(np.uint8)
            
            out_p = os.path.join(save_folder, f_name)
            Image.fromarray(c_np).save(out_p)
            
    print(f"Done! Clean frames saved to {DENOISED_DIR}")
    print("Now run your Anomaly Detector on THIS new directory.")

if __name__ == "__main__":
    train_and_denoise()

In [None]:
import sys
import os
import torch
import cv2
import numpy as np
from tqdm import tqdm

# ================= CONFIGURATION =================
# 1. WHERE YOUR PYTHON FILES ARE (The dataset you just made)
# Example: '/kaggle/input/my-tap-code'
CODE_DATASET_DIR = '/kaggle/input/tap-denoise2' 

# 2. PATH TO WEIGHTS (nafnet_srgb.pth)
MODEL_WEIGHTS = '/kaggle/input/nafnetrgb/pytorch/default/1/nafnet_rgb.pth'

# 3. VIDEO SETTINGS
INPUT_VIDEO = '/kaggle/working/rendered_videos/video_02.mp4' # Check this path
OUTPUT_VIDEO = 'cleaned_output.mp4'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. SETUP IMPORTS ---
# Add the dataset directory to Python's search path
if CODE_DATASET_DIR not in sys.path:
    sys.path.append(CODE_DATASET_DIR)

print(f"Added {CODE_DATASET_DIR} to system path.")

# Try importing TAP. 
# Since you brought files "out" of the folder, they might be at the root.
try:
    # Try direct import first (if files are flat)
    from network_tap import TAP
    print("✅ Imported TAP from network_tap.py")
except ImportError:
    try:
        # Try package import (if you kept the folder structure 'models/network_tap')
        from models.network_tap import TAP
        print("✅ Imported TAP from models.network_tap")
    except ImportError as e:
        print("❌ CRITICAL ERROR: Could not import TAP.")
        print(f"Python sees these files in {CODE_DATASET_DIR}:")
        print(os.listdir(CODE_DATASET_DIR))
        print(f"\nError details: {e}")
        # Stop execution if we can't load the model code
        sys.exit()

# --- 2. INFERENCE LOOP ---
def clean_video():
    print(f"Loading weights from {MODEL_WEIGHTS}...")
    
    # Initialize TAP Model
    # sRGB config: in_nc=3, nc=64, frames=5
    model = TAP(in_nc=3, nc=64, frames=5, pixel_loss_type='L1')
    
    # Load Weights
    if not os.path.exists(MODEL_WEIGHTS):
        print("❌ Error: Weights file not found.")
        return

    checkpoint = torch.load(MODEL_WEIGHTS, map_location=DEVICE)
    if 'params' in checkpoint: 
        checkpoint = checkpoint['params']
    
    model.load_state_dict(checkpoint, strict=False)
    model.to(DEVICE)
    model.eval()

    # Open Video
    cap = cv2.VideoCapture(INPUT_VIDEO)
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    if width == 0:
        print("❌ Error: Could not read video. Check INPUT_VIDEO path.")
        return

    print(f"Processing {INPUT_VIDEO}")
    print(f"Dimensions: {width}x{height}, Frames: {total_frames}")

    # Video Writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (width, height))
    
    # Buffer for Sliding Window [t-2, t-1, t, t+1, t+2]
    # We read all frames into RAM for fastest sliding window access 
    # (Avenue videos are small enough for this)
    all_frames = []
    
    print("Reading video into memory...")
    while True:
        ret, frame = cap.read()
        if not ret: break
        # BGR -> RGB -> Normalize 0-1
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = frame.astype(np.float32) / 255.0
        all_frames.append(frame)
    cap.release()
    
    # Convert to Tensor (1, T, C, H, W)
    video_tensor = torch.tensor(np.array(all_frames)).permute(0, 3, 1, 2).unsqueeze(0)
    T = video_tensor.shape[1]
    
    print("Running Denoising...")
    
    with torch.no_grad():
        for i in tqdm(range(T)):
            # 
            # Construct indices for window of 5 around 'i'
            indices = []
            for offset in range(-2, 3):
                # Clamp index to [0, T-1] to handle edges
                idx = np.clip(i + offset, 0, T - 1)
                indices.append(idx)
            
            # Extract Window (1, 5, C, H, W)
            input_window = video_tensor[:, indices, :, :, :].to(DEVICE)
            
            # Inference (Returns Center Frame)
            clean_tensor = model(input_window) 
            
            # Save Output
            clean_frame = clean_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy()
            clean_frame = np.clip(clean_frame * 255, 0, 255).astype(np.uint8)
            clean_frame = cv2.cvtColor(clean_frame, cv2.COLOR_RGB2BGR)
            
            out.write(clean_frame)
            
    out.release()
    print(f"✅ Success! Clean video saved to: {OUTPUT_VIDEO}")

if __name__ == "__main__":
    clean_video()

In [None]:
import cv2
import numpy as np
import os
from tqdm import tqdm

# ================= CONFIGURATION =================
INPUT_VIDEO = '/kaggle/working/rendered_videos/video_02.mp4'
OUTPUT_VIDEO = 'video_02_cleaned_NLM.mp4'

# STRENGTH PARAMETERS
# h: Denoising strength. Higher = smoother but blurrier. 
# 3.0 is conservative, 6.0 is strong. Start with 4.0.
H_LUMA = 4.0 
TEMPORAL_WINDOW = 5  # How many frames to look at (odd number)
SEARCH_WINDOW = 21   # How far to look for matching blocks (pixels)
BLOCK_SIZE = 7       # Size of blocks to compare
# =================================================

def clean_video_nlm():
    if not os.path.exists(INPUT_VIDEO):
        print(f"Error: Video not found at {INPUT_VIDEO}")
        return

    cap = cv2.VideoCapture(INPUT_VIDEO)
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    print(f"Cleaning Video: {width}x{height} @ {fps} FPS ({total_frames} frames)")
    
    # Setup Output
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (width, height))
    
    # Buffer: We need to hold 'TEMPORAL_WINDOW' frames in memory
    # Sliding window logic: [t-2, t-1, t, t+1, t+2] to clean 't'
    frames_buffer = []
    
    # Read ALL frames into RAM first (Avenue videos are small, ~20MB)
    # This makes the multi-frame logic much simpler
    all_frames_gray = []
    all_frames_color = []
    
    print("Reading video to memory...")
    while True:
        ret, frame = cap.read()
        if not ret: break
        all_frames_color.append(frame)
        # NLM works best on Grayscale for the map, but we apply to color
        # Actually OpenCV's function handles color, but let's stick to the color version
    cap.release()
    
    print(f"Applying Multi-Frame Non-Local Means (Strength {H_LUMA})...")
    
    # Pad video with border frames to handle start/end
    pad = TEMPORAL_WINDOW // 2
    padded_frames = [all_frames_color[0]]*pad + all_frames_color + [all_frames_color[-1]]*pad
    
    for i in tqdm(range(len(all_frames_color))):
        # Extract the window of frames
        # Index in padded array is i + pad
        # We want window centered there
        center_idx = i + pad
        start = center_idx - pad
        end = center_idx + pad + 1
        
        current_window = padded_frames[start:end]
        
        # The function expects:
        # imgs: list of frames
        # imgToDenoiseIndex: index of the frame in the list to clean (middle one)
        # temporalWindowSize: number of frames
        
        clean = cv2.fastNlMeansDenoisingMulti(
            srcImgs=current_window,
            imgToDenoiseIndex=pad, # The middle frame
            temporalWindowSize=TEMPORAL_WINDOW,
            h=H_LUMA,
            templateWindowSize=BLOCK_SIZE,
            searchWindowSize=SEARCH_WINDOW
        )
        
        out.write(clean)
        
    out.release()
    print(f"✅ Cleaned video saved as: {OUTPUT_VIDEO}")

if __name__ == "__main__":
    clean_video_nlm()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import numpy as np
import cv2
from PIL import Image
from torchvision import transforms

# ================= CONFIGURATION =================
TRAIN_VIDEO_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/training_videos'
OUTPUT_DIR = 'cleaned_check'
TARGET_VIDEO = '02'  # The specific video ID to test
MODEL_PATH = '/kaggle/input/vlg-testmodel/pytorch/default/1/unet_conditional_ep1.pth' 

IMG_SIZE = 256
CLIP_LEN = 4
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. MODEL ARCHITECTURE (Must match training) ---
class AsymmetricConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(AsymmetricConv, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(kernel_size//2, 0))
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(1, kernel_size), padding=(0, kernel_size//2))
        self.bn = nn.BatchNorm2d(out_channels)
    def forward(self, x): return self.relu(self.bn(self.conv2(self.relu(self.conv1(x)))))

class ResidualSkipConnection(nn.Module):
    def __init__(self, channels):
        super(ResidualSkipConnection, self).__init__()
        self.block = nn.Sequential(AsymmetricConv(channels, channels), AsymmetricConv(channels, channels))
        self.shortcut = nn.Conv2d(channels, channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x): return self.relu(self.block(x) + self.shortcut(x))

class ShortcutInceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ShortcutInceptionModule, self).__init__()
        w_6 = out_channels // 6; w_3 = out_channels // 3; w_2 = out_channels - (w_6 + w_3)
        self.branch1 = AsymmetricConv(in_channels, w_6)
        self.branch2 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3))
        self.branch3 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3), AsymmetricConv(w_3, w_2))
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        return self.relu(torch.cat([self.branch1(x), self.branch2(x), self.branch3(x)], dim=1) + self.shortcut(x))

class MultiScaleUNet(nn.Module):
    def __init__(self, in_channels=12, out_channels=3):
        super(MultiScaleUNet, self).__init__()
        self.sim1 = ShortcutInceptionModule(in_channels, 96); self.pool1 = nn.MaxPool2d(2)
        self.sim2 = ShortcutInceptionModule(96, 192);         self.pool2 = nn.MaxPool2d(2)
        self.sim3 = ShortcutInceptionModule(192, 384);        self.pool3 = nn.MaxPool2d(2)
        self.sim4 = ShortcutInceptionModule(384, 768)
        self.rsc1 = nn.Sequential(*[ResidualSkipConnection(96) for _ in range(4)])
        self.rsc2 = nn.Sequential(*[ResidualSkipConnection(192) for _ in range(3)])
        self.rsc3 = nn.Sequential(*[ResidualSkipConnection(384) for _ in range(2)])
        self.sim5 = ShortcutInceptionModule(768, 384);   self.up1 = nn.ConvTranspose2d(384, 384, 2, 2)
        self.sim6 = ShortcutInceptionModule(768, 192);   self.up2 = nn.ConvTranspose2d(192, 192, 2, 2)
        self.sim7 = ShortcutInceptionModule(384, 96);    self.up3 = nn.ConvTranspose2d(96, 96, 2, 2)
        self.sim8 = ShortcutInceptionModule(192, 96)
        self.final = nn.Conv2d(96, out_channels, 3, padding=1)
        self.tanh = nn.Tanh()
    def forward(self, x):
        e1 = self.sim1(x); p1 = self.pool1(e1)
        e2 = self.sim2(p1); p2 = self.pool2(e2)
        e3 = self.sim3(p2); p3 = self.pool3(e3)
        e4 = self.sim4(p3)
        d1 = self.sim5(e4); u1 = self.up1(d1)
        d2 = self.sim6(torch.cat([u1, self.rsc3(e3)], dim=1)); u2 = self.up2(d2)
        d3 = self.sim7(torch.cat([u2, self.rsc2(e2)], dim=1)); u3 = self.up3(d3)
        d4 = self.sim8(torch.cat([u3, self.rsc1(e1)], dim=1))
        return self.tanh(self.final(d4))

# --- 2. SINGLE VIDEO LOGIC ---
def clean_single_video():
    print(f"Loading Model from {MODEL_PATH}...")
    model = MultiScaleUNet().to(DEVICE)
    if not os.path.exists(MODEL_PATH):
        print("❌ Model not found! Train it first.")
        return
        
    st = torch.load(MODEL_PATH, map_location=DEVICE)
    if 'module.' in list(st.keys())[0]: st = {k.replace('module.', ''): v for k, v in st.items()}
    model.load_state_dict(st)
    model.eval()
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Locate target video
    vid_path = os.path.join(TRAIN_VIDEO_DIR, TARGET_VIDEO)
    if not os.path.exists(vid_path):
        # Fallback for folder naming inconsistencies
        candidates = [d for d in os.listdir(TRAIN_VIDEO_DIR) if str(int(d)) == str(int(TARGET_VIDEO))]
        if candidates: vid_path = os.path.join(TRAIN_VIDEO_DIR, candidates[0])
        else: print(f"❌ Video {TARGET_VIDEO} not found."); return

    frames = sorted(glob.glob(os.path.join(vid_path, '*.jpg')))
    if len(frames) < CLIP_LEN + 1:
        print("Video too short."); return
        
    print(f"Cleaning Video {TARGET_VIDEO} ({len(frames)} frames)...")

    # Setup Video Writer
    save_path = os.path.join(OUTPUT_DIR, f"train_{TARGET_VIDEO}_cleaned.mp4")
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(save_path, fourcc, 25.0, (IMG_SIZE, IMG_SIZE))
    
    # Transforms
    tf = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    
    # Load all frames to RAM
    loaded_frames = []
    for f in frames:
        img = Image.open(f).convert('RGB')
        loaded_frames.append(tf(img))
    
    vid_tensor = torch.stack(loaded_frames)
    
    with torch.no_grad():
        # Iterate through the sequence
        for i in range(len(loaded_frames) - CLIP_LEN):
            # Input: [i, i+1, i+2, i+3] -> Predicts i+4
            input_seq = vid_tensor[i : i+CLIP_LEN]
            input_seq = input_seq.view(-1, IMG_SIZE, IMG_SIZE).unsqueeze(0).to(DEVICE)
            
            pred_frame = model(input_seq)
            
            # Post-process
            img_out = pred_frame.squeeze(0).permute(1, 2, 0).cpu().numpy()
            img_out = (img_out * 0.5) + 0.5
            img_out = np.clip(img_out * 255, 0, 255).astype(np.uint8)
            img_out = cv2.cvtColor(img_out, cv2.COLOR_RGB2BGR)
            
            out.write(img_out)
            
    out.release()
    print(f"✅ Cleaned video saved to: {save_path}")

if __name__ == "__main__":
    clean_single_video()

In [None]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# ================= CONFIGURATION =================
# We train on the TEST videos to adapt to their specific noise patterns
TRAIN_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/testing_videos'
SAVE_NAME = 'resnet_denoiser.pth'

IMG_SIZE = 256
BATCH_SIZE = 16
EPOCHS = 15  # Fast training
LR = 1e-4
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. LIGHTWEIGHT RESNET AUTOENCODER ---
class ResBlock(nn.Module):
    def __init__(self, channels):
        super(ResBlock, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels)
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(x + self.conv(x)) # Residual Connection

class ResNetDenoiser(nn.Module):
    def __init__(self):
        super(ResNetDenoiser, self).__init__()
        
        # Encoder (Downsampling)
        self.enc1 = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2)) # 128
        self.enc2 = nn.Sequential(nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2)) # 64
        self.enc3 = nn.Sequential(nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2)) # 32
        
        # Bottleneck (Clean Structure Learning)
        # 3 ResBlocks to refine features without losing resolution
        self.bottleneck = nn.Sequential(
            ResBlock(128),
            ResBlock(128),
            ResBlock(128)
        )
        
        # Decoder (Upsampling)
        self.dec3 = nn.Sequential(nn.ConvTranspose2d(128, 64, 2, 2), nn.ReLU())
        self.dec2 = nn.Sequential(nn.ConvTranspose2d(64, 32, 2, 2), nn.ReLU())
        self.dec1 = nn.Sequential(nn.ConvTranspose2d(32, 16, 2, 2), nn.ReLU())
        
        self.final = nn.Conv2d(16, 3, 3, padding=1)
        self.tanh = nn.Tanh() # Output -1 to 1

    def forward(self, x):
        # No Long Skip Connections! This kills the noise.
        e1 = self.enc1(x)
        e2 = self.enc2(e1)
        e3 = self.enc3(e2)
        
        b = self.bottleneck(e3)
        
        d3 = self.dec3(b)
        d2 = self.dec2(d3)
        d1 = self.dec1(d2)
        
        return self.tanh(self.final(d1))

# --- 2. DATASET (Single Frame Reconstruction) ---
class FrameDataset(Dataset):
    def __init__(self, root_dir, img_size=256):
        self.frames = []
        videos = sorted(os.listdir(root_dir))
        for vid in videos:
            vid_path = os.path.join(root_dir, vid)
            if not os.path.isdir(vid_path): continue
            self.frames.extend(glob.glob(os.path.join(vid_path, '*.jpg')))
            
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

    def __len__(self): return len(self.frames)
    def __getitem__(self, idx):
        img = Image.open(self.frames[idx]).convert('RGB')
        return self.transform(img)

# --- 3. TRAINING ---
def train_denoiser():
    print("Initializing ResNet Denoiser...")
    model = ResNetDenoiser().to(DEVICE)
    opt = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.L1Loss() # L1 is better for Sharpness than MSE
    
    ds = FrameDataset(TRAIN_DIR, IMG_SIZE)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    
    print(f"Training on {len(ds)} frames from Test Set...")
    
    for epoch in range(EPOCHS):
        model.train()
        loop = tqdm(loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        epoch_loss = 0
        
        for img in loop:
            img = img.to(DEVICE)
            
            # Reconstruction Task: Input = Img, Target = Img
            # The bottleneck naturally filters high-frequency noise
            recon = model(img)
            
            loss = criterion(recon, img)
            
            opt.zero_grad()
            loss.backward()
            opt.step()
            
            epoch_loss += loss.item()
            loop.set_postfix(loss=loss.item())
            
        # Save usually at the end
    
    torch.save(model.state_dict(), SAVE_NAME)
    print(f"Denoiser saved to {SAVE_NAME}")

if __name__ == "__main__":
    train_denoiser()

In [5]:
import os
import glob
import torch
import torch.nn as nn
import numpy as np
import cv2
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# ================= CONFIGURATION =================
# 1. CORRECT INPUT PATH (Point to the Avenue Dataset Test Video 02)
INPUT_VIDEO_DIR = '/kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/testing_videos/02'

# 2. OUTPUT FILE
OUTPUT_VIDEO = 'video_02_resnet_clean.mp4'

# 3. MODEL WEIGHTS
MODEL_PATH = 'resnet_denoiser.pth'

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. RESNET CLASSES (Must match training) ---
class ResBlock(nn.Module):
    def __init__(self, channels):
        super(ResBlock, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.BatchNorm2d(channels)
        )
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(x + self.conv(x))

class ResNetDenoiser(nn.Module):
    def __init__(self):
        super(ResNetDenoiser, self).__init__()
        self.enc1 = nn.Sequential(nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2))
        self.enc2 = nn.Sequential(nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2))
        self.enc3 = nn.Sequential(nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2))
        
        self.bottleneck = nn.Sequential(ResBlock(128), ResBlock(128), ResBlock(128))
        
        self.dec3 = nn.Sequential(nn.ConvTranspose2d(128, 64, 2, 2), nn.ReLU())
        self.dec2 = nn.Sequential(nn.ConvTranspose2d(64, 32, 2, 2), nn.ReLU())
        self.dec1 = nn.Sequential(nn.ConvTranspose2d(32, 16, 2, 2), nn.ReLU())
        
        self.final = nn.Conv2d(16, 3, 3, padding=1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        e1 = self.enc1(x); e2 = self.enc2(e1); e3 = self.enc3(e2)
        b = self.bottleneck(e3)
        d3 = self.dec3(b); d2 = self.dec2(d3); d1 = self.dec1(d2)
        return self.tanh(self.final(d1))

# --- 2. CLEANING FUNCTION ---
def clean_and_save_video():
    print(f"Checking for frames in: {INPUT_VIDEO_DIR}")
    
    # Robust path finding (handle if folder is named '02' or '2')
    if not os.path.exists(INPUT_VIDEO_DIR):
        print(f"❌ Error: Path not found. Check if '/kaggle/input/...' path is correct.")
        return

    frames = sorted(glob.glob(os.path.join(INPUT_VIDEO_DIR, '*.jpg')))
    
    if len(frames) == 0:
        print("❌ Error: No .jpg frames found in directory!")
        print("Files found:", os.listdir(INPUT_VIDEO_DIR)[:5]) # Debug print
        return

    print(f"✅ Found {len(frames)} frames. Loading Model...")

    model = ResNetDenoiser().to(DEVICE)
    if not os.path.exists(MODEL_PATH):
        print(f"❌ Error: {MODEL_PATH} not found. Did you train it?")
        return
        
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    model.eval()
    
    # Setup Video Writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, 25.0, (256, 256))
    
    tf = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    
    print("Processing...")
    with torch.no_grad():
        for f in tqdm(frames):
            img = Image.open(f).convert('RGB')
            inp = tf(img).unsqueeze(0).to(DEVICE)
            
            clean = model(inp)
            
            # Post-process
            clean_np = clean.squeeze(0).permute(1, 2, 0).cpu().numpy()
            clean_np = (clean_np * 0.5) + 0.5
            clean_np = np.clip(clean_np * 255, 0, 255).astype(np.uint8)
            clean_np = cv2.cvtColor(clean_np, cv2.COLOR_RGB2BGR)
            
            out.write(clean_np)
            
    out.release()
    print(f"✅ Success! Saved: {OUTPUT_VIDEO}")

if __name__ == "__main__":
    clean_and_save_video()

Checking for frames in: /kaggle/input/pixel-play-26/Avenue_Corrupted-20251221T112159Z-3-001/Avenue_Corrupted/Dataset/testing_videos/02
✅ Found 1211 frames. Loading Model...
Processing...


100%|██████████| 1211/1211 [00:12<00:00, 95.39it/s]

✅ Success! Saved: video_02_resnet_clean.mp4





In [10]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import cv2
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# ================= CONFIGURATION =================
INPUT_DIR = '/kaggle/working/cleaned_testing_videos/02'
OUTPUT_VIDEO = 'unet_interpolation_sharp_02.mp4'
MODEL_SAVE_PATH = 'unet_interpolator.pth'

IMG_SIZE = 256
BATCH_SIZE = 16
EPOCHS = 40        # Interpolation learns faster
LR = 0.0002
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# =================================================

# --- 1. ARCHITECTURE (Same U-Net, Modified Input) ---
class AsymmetricConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3):
        super(AsymmetricConv, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=(kernel_size, 1), padding=(kernel_size//2, 0))
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=(1, kernel_size), padding=(0, kernel_size//2))
        self.bn = nn.BatchNorm2d(out_channels)
    def forward(self, x): return self.relu(self.bn(self.conv2(self.relu(self.conv1(x)))))

class ResidualSkipConnection(nn.Module):
    def __init__(self, channels):
        super(ResidualSkipConnection, self).__init__()
        self.block = nn.Sequential(AsymmetricConv(channels, channels), AsymmetricConv(channels, channels))
        self.shortcut = nn.Conv2d(channels, channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x): return self.relu(self.block(x) + self.shortcut(x))

class ShortcutInceptionModule(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(ShortcutInceptionModule, self).__init__()
        w_6 = out_channels // 6; w_3 = out_channels // 3; w_2 = out_channels - (w_6 + w_3)
        self.branch1 = AsymmetricConv(in_channels, w_6)
        self.branch2 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3))
        self.branch3 = nn.Sequential(AsymmetricConv(in_channels, w_6), AsymmetricConv(w_6, w_3), AsymmetricConv(w_3, w_2))
        self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        return self.relu(torch.cat([self.branch1(x), self.branch2(x), self.branch3(x)], dim=1) + self.shortcut(x))

class MultiScaleUNet(nn.Module):
    def __init__(self, in_channels=6, out_channels=3): # Input: 6 (Prev+Next), Output: 3 (Current)
        super(MultiScaleUNet, self).__init__()
        self.sim1 = ShortcutInceptionModule(in_channels, 96); self.pool1 = nn.MaxPool2d(2)
        self.sim2 = ShortcutInceptionModule(96, 192);         self.pool2 = nn.MaxPool2d(2)
        self.sim3 = ShortcutInceptionModule(192, 384);        self.pool3 = nn.MaxPool2d(2)
        self.sim4 = ShortcutInceptionModule(384, 768)
        self.rsc1 = nn.Sequential(*[ResidualSkipConnection(96) for _ in range(4)])
        self.rsc2 = nn.Sequential(*[ResidualSkipConnection(192) for _ in range(3)])
        self.rsc3 = nn.Sequential(*[ResidualSkipConnection(384) for _ in range(2)])
        self.sim5 = ShortcutInceptionModule(768, 384);   self.up1 = nn.ConvTranspose2d(384, 384, 2, 2)
        self.sim6 = ShortcutInceptionModule(768, 192);   self.up2 = nn.ConvTranspose2d(192, 192, 2, 2)
        self.sim7 = ShortcutInceptionModule(384, 96);    self.up3 = nn.ConvTranspose2d(96, 96, 2, 2)
        self.sim8 = ShortcutInceptionModule(192, 96)
        self.final = nn.Conv2d(96, out_channels, 3, padding=1)
        self.tanh = nn.Tanh()
    def forward(self, x):
        e1 = self.sim1(x); p1 = self.pool1(e1)
        e2 = self.sim2(p1); p2 = self.pool2(e2)
        e3 = self.sim3(p2); p3 = self.pool3(e3)
        e4 = self.sim4(p3)
        d1 = self.sim5(e4); u1 = self.up1(d1)
        d2 = self.sim6(torch.cat([u1, self.rsc3(e3)], dim=1)); u2 = self.up2(d2)
        d3 = self.sim7(torch.cat([u2, self.rsc2(e2)], dim=1)); u3 = self.up3(d3)
        d4 = self.sim8(torch.cat([u3, self.rsc1(e1)], dim=1))
        return self.tanh(self.final(d4))

# --- 2. DATASET (The Sandwich) ---
class InterpolationDataset(Dataset):
    def __init__(self, root_dir, img_size=256):
        if not os.path.exists(root_dir):
            parent = os.path.dirname(root_dir)
            if os.path.exists(parent):
                possible = [d for d in os.listdir(parent) if '02' in d]
                if possible: root_dir = os.path.join(parent, possible[0])

        self.frames = sorted(glob.glob(os.path.join(root_dir, '*.jpg')))
        print(f"Found {len(self.frames)} frames.")
        
        self.transform = transforms.Compose([
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        
        # Pre-load to RAM
        self.loaded_frames = []
        for f in tqdm(self.frames, desc="Loading"):
            img = Image.open(f).convert('RGB')
            self.loaded_frames.append(self.transform(img))
            
    def __len__(self):
        # We need neighbors, so we skip first and last frame
        return len(self.frames) - 2

    def __getitem__(self, idx):
        # We want to predict frame 'i' using 'i-1' and 'i+1'
        # Dataset idx 0 corresponds to frame 1 (middle of 0,1,2)
        center_idx = idx + 1
        
        prev_frame = self.loaded_frames[center_idx - 1]
        next_frame = self.loaded_frames[center_idx + 1]
        target_frame = self.loaded_frames[center_idx]
        
        # Stack inputs: (6, H, W)
        input_stack = torch.cat([prev_frame, next_frame], dim=0)
        
        return input_stack, target_frame

# --- 3. EXECUTION ---
def run_interpolation_denoising():
    ds = InterpolationDataset(INPUT_DIR, IMG_SIZE)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    
    # Input: 6 channels (Prev RGB + Next RGB)
    model = MultiScaleUNet(in_channels=6, out_channels=3).to(DEVICE)
    
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        
    optimizer = optim.Adam(model.parameters(), lr=LR)
    criterion = nn.L1Loss() # L1 is sharper than MSE
    
    print(f"Starting Interpolation Training ({EPOCHS} Epochs)...")
    
    try:
        for epoch in range(EPOCHS):
            model.train()
            loop = tqdm(loader, desc=f"Epoch {epoch+1}", leave=False)
            epoch_loss = 0
            
            for inputs, targets in loop:
                inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
                
                preds = model(inputs)
                loss = criterion(preds, targets)
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                epoch_loss += loss.item()
                loop.set_postfix(loss=loss.item())
            
            # Uncomment to print loss if needed
            # print(f"Epoch {epoch+1} Loss: {epoch_loss/len(loader):.5f}")
            
    except KeyboardInterrupt:
        print("\n🛑 Interrupted! Generating video now...")

    # --- SAVE ---
    state_dict = model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict()
    torch.save(state_dict, MODEL_SAVE_PATH)
    
    # --- GENERATE & SHARPEN ---
    print("Generating & Sharpening Video...")
    model.eval()
    
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, 25.0, (IMG_SIZE, IMG_SIZE))
    
    seq_loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)
    
    with torch.no_grad():
        for inputs, _ in tqdm(seq_loader, desc="Rendering"):
            inputs = inputs.to(DEVICE)
            preds = model(inputs)
            
            for i in range(preds.size(0)):
                img = preds[i].permute(1, 2, 0).cpu().numpy()
                img = (img * 0.5) + 0.5
                img = np.clip(img * 255, 0, 255).astype(np.uint8)
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                
                # --- SHARPENING TRICK ---
                # 1. Unsharp Mask: enhance edges
                gaussian = cv2.GaussianBlur(img, (0, 0), 2.0)
                sharp = cv2.addWeighted(img, 1.5, gaussian, -0.5, 0)
                
                # 2. Detail Enhance (Optional, subtle boost)
                # sharp = cv2.detailEnhance(sharp, sigma_s=10, sigma_r=0.15)
                
                out.write(sharp)
                
    out.release()
    print(f"✅ DONE! Sharp & Clean video saved to: {OUTPUT_VIDEO}")

if __name__ == "__main__":
    run_interpolation_denoising()

Found 1211 frames.


Loading: 100%|██████████| 1211/1211 [00:05<00:00, 236.50it/s]


Starting Interpolation Training (40 Epochs)...


                                                                     


🛑 Interrupted! Generating video now...
Generating & Sharpening Video...


Rendering: 100%|██████████| 76/76 [01:04<00:00,  1.18it/s]

✅ DONE! Sharp & Clean video saved to: unet_interpolation_sharp_02.mp4





In [6]:
import os
import shutil
import sys

# 1. SETUP WRITABLE DIRECTORIES
WORK_DIR = "/kaggle/working"
MODELS_DIR = os.path.join(WORK_DIR, "models")

# Clean slate: Remove existing models dir if it exists to avoid conflicts
if os.path.exists(MODELS_DIR):
    shutil.rmtree(MODELS_DIR)
os.makedirs(MODELS_DIR)

# 2. FIND AND COPY ALL PYTHON FILES
print("🔍 Hunting for Python files in /kaggle/input...")

py_files_found = []

for root, dirs, files in os.walk("/kaggle/input/tap-denoise2"):
    for file in files:
        if file.endswith(".py"):
            source_path = os.path.join(root, file)
            
            # Copy to /kaggle/working/models/ (Satisfies 'from models import x')
            dest_path_models = os.path.join(MODELS_DIR, file)
            shutil.copy2(source_path, dest_path_models)
            
            # ALSO Copy to /kaggle/working/ (Satisfies direct imports if needed)
            dest_path_root = os.path.join(WORK_DIR, file)
            shutil.copy2(source_path, dest_path_root)
            
            py_files_found.append(file)

# 3. CREATE __init__.py
# This makes Python treat the directory as a package
with open(os.path.join(MODELS_DIR, "__init__.py"), "w") as f:
    f.write("")

print(f"✅ Success! Moved {len(py_files_found)} files to {MODELS_DIR}")
print(f"Files: {py_files_found}")

# 4. FIX SYSTEM PATH
# Now Python will look in /kaggle/working first
sys.path.insert(0, WORK_DIR)

🔍 Hunting for Python files in /kaggle/input...
✅ Success! Moved 18 files to /kaggle/working/models
Files: ['option_finetune_crvd.py', 'main_test_tap_crvd_outdoor.py', 'option_pretrain.py', 'module_pcd_alignment.py', 'network_isp.py', 'main_test_tap_crvd_indoor.py', 'pretrain_im_dataset.py', 'main_finetune_crvd_tap.py', 'network_nafnet.py', 'module_dcn.py', 'basicblocks.py', 'option_finetune_rgb.py', 'utils_image.py', 'utils_network.py', 'finetune_vid_dataset.py', 'utils_logger.py', 'main_finetune_rgb_tapt.py', 'main_finetune_rgb_tap.py']


ModuleNotFoundError: No module named 'models'

In [8]:
import torch
import cv2
import numpy as np
import os
import sys
from tqdm import tqdm

# --- 1. SETUP PATHS (Kaggle Uploads are usually in /kaggle/input) ---
# We add the directory containing network_nafnet.py to python path
sys.path.append("/kaggle/working") 

# Try to import
try:
    from network_nafnet import Baseline
except ImportError:
    # Fallback: if you uploaded raw files, they might be in current dir
    sys.path.append(os.getcwd())
    from network_nafnet import Baseline

# --- 2. CONFIGURATION (Your fixed values) ---
WIDTH = 64
ENC_BLKS = [2, 2, 4]
DEC_BLKS = [2, 2, 2]
MIDDLE_BLK = 6
DW_EXPAND = 2
DEVICE = torch.device('cuda') # We use the free GPU!

# --- 3. PATHS ---
# UPDATE THESE TO MATCH YOUR KAGGLE RIGHT SIDEBAR
INPUT_VIDEO = "/kaggle/working/rendered_videos/video_02.mp4"

WEIGHTS = "/kaggle/input/nafnetrgb/pytorch/default/1/nafnet_rgb.pth"
OUTPUT_VIDEO = "cleaned_02_gpu.mp4"

def run_gpu_inference():
    print(f"🚀 powering up GPU: {torch.cuda.get_device_name(0)}")
    
    model = Baseline(
        img_channel=3, width=WIDTH, middle_blk_num=MIDDLE_BLK,
        enc_blk_nums=ENC_BLKS, dec_blk_nums=DEC_BLKS, dw_expand=DW_EXPAND
    )
    
    # Load weights
    ckpt = torch.load(WEIGHTS, map_location=DEVICE)
    if 'params' in ckpt: ckpt = ckpt['params']
    elif 'state_dict' in ckpt: ckpt = ckpt['state_dict']
    model.load_state_dict(ckpt, strict=True)
    model.to(DEVICE).eval()
    
    # Process
    cap = cv2.VideoCapture(INPUT_VIDEO)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    out = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    
    print(f"Cleaning {total} frames...")
    
    with torch.no_grad():
        for _ in tqdm(range(total)):
            ret, frame = cap.read()
            if not ret: break
            
            # GPU Processing
            img = torch.from_numpy(frame).permute(2,0,1).float().div(255.0).unsqueeze(0).to(DEVICE)
            
            # Since video is BGR, and model expects RGB usually, we flip
            # But NAFNet might be agnostic. Let's stick to standard BGR->RGB flow if trained on RGB.
            img = img.flip(1) # BGR to RGB
            
            output = model(img)
            
            output = output.squeeze(0).permute(1,2,0).cpu().numpy()
            output = output[:, :, ::-1] # Flip back RGB to BGR
            output = np.clip(output * 255, 0, 255).astype(np.uint8)
            
            out.write(output)
            
    cap.release()
    out.release()
    print("✅ Done! Download your video from the Output tab.")

if __name__ == "__main__":
    run_gpu_inference()

🚀 powering up GPU: Tesla T4
Cleaning 1211 frames...


100%|██████████| 1211/1211 [05:42<00:00,  3.53it/s]

✅ Done! Download your video from the Output tab.





In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import cv2
import numpy as np
import os
import sys
from tqdm import tqdm

# --- CONFIGURATION ---
INPUT_VIDEO = "/kaggle/working/rendered_videos/video_02.mp4"       # <--- CHECK PATH
WEIGHTS_PATH = "/kaggle/input/nafnetrgb/pytorch/default/1/nafnet_rgb.pth" # <--- CHECK PATH
OUTPUT_VIDEO = "/kaggle/working/final_temporal_clean.mp4"

# NAFNet Config (From your weights)
WIDTH = 64
ENC_BLKS = [2, 2, 4]
DEC_BLKS = [2, 2, 2]
MIDDLE_BLK = 6
DW_EXPAND = 2

# Temporal Strength (0.0 = No history, 0.5 = Strong smoothing)
# 0.3 is a good balance to kill flicker without ghosting
TEMPORAL_BLEND = 0.3 

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- MODEL DEFINITION (NAFNet Baseline) ---
class LayerNormFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, weight, bias, eps):
        ctx.eps = eps
        N, C, H, W = x.size()
        mu = x.mean(1, keepdim=True)
        var = (x - mu).pow(2).mean(1, keepdim=True)
        y = (x - mu) / (var + eps).sqrt()
        ctx.save_for_backward(y, var, weight)
        y = weight.view(1, C, 1, 1) * y + bias.view(1, C, 1, 1)
        return y
    @staticmethod
    def backward(ctx, grad_output):
        eps = ctx.eps
        N, C, H, W = grad_output.size()
        y, var, weight = ctx.saved_variables
        g = grad_output * weight.view(1, C, 1, 1)
        mean_g = g.mean(dim=1, keepdim=True)
        mean_gy = (g * y).mean(dim=1, keepdim=True)
        gx = 1. / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g)
        return gx, (grad_output * y).sum(dim=3).sum(dim=2).sum(dim=0), grad_output.sum(dim=3).sum(dim=2).sum(dim=0), None

class LayerNorm2d(nn.Module):
    def __init__(self, channels, eps=1e-6):
        super(LayerNorm2d, self).__init__()
        self.register_parameter('weight', nn.Parameter(torch.ones(channels)))
        self.register_parameter('bias', nn.Parameter(torch.zeros(channels)))
        self.eps = eps
    def forward(self, x):
        return LayerNormFunction.apply(x, self.weight, self.bias, self.eps)

class BaselineBlock(nn.Module):
    def __init__(self, c, DW_Expand=1, FFN_Expand=2, drop_out_rate=0.):
        super().__init__()
        dw_channel = c * DW_Expand
        self.conv1 = nn.Conv2d(in_channels=c, out_channels=dw_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
        self.conv2 = nn.Conv2d(in_channels=dw_channel, out_channels=dw_channel, kernel_size=3, padding=1, stride=1, groups=dw_channel, bias=True)
        self.conv3 = nn.Conv2d(in_channels=dw_channel, out_channels=c, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels=dw_channel, out_channels=dw_channel // 2, kernel_size=1, padding=0, stride=1, groups=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=dw_channel // 2, out_channels=dw_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True),
            nn.Sigmoid()
        )
        self.gelu = nn.GELU()
        ffn_channel = FFN_Expand * c
        self.conv4 = nn.Conv2d(in_channels=c, out_channels=ffn_channel, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
        self.conv5 = nn.Conv2d(in_channels=ffn_channel, out_channels=c, kernel_size=1, padding=0, stride=1, groups=1, bias=True)
        self.norm1 = LayerNorm2d(c)
        self.norm2 = LayerNorm2d(c)
        self.dropout1 = nn.Dropout(drop_out_rate) if drop_out_rate > 0. else nn.Identity()
        self.dropout2 = nn.Dropout(drop_out_rate) if drop_out_rate > 0. else nn.Identity()
        self.beta = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
        self.gamma = nn.Parameter(torch.zeros((1, c, 1, 1)), requires_grad=True)
    def forward(self, inp):
        x = inp
        x = self.norm1(x)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.gelu(x)
        x = x * self.se(x)
        x = self.conv3(x)
        x = self.dropout1(x)
        y = inp + x * self.beta
        x = self.conv4(self.norm2(y))
        x = self.gelu(x)
        x = self.conv5(x)
        x = self.dropout2(x)
        return y + x * self.gamma

class Baseline(nn.Module):
    def __init__(self, img_channel=3, width=16, middle_blk_num=1, enc_blk_nums=[], dec_blk_nums=[], dw_expand=1, ffn_expand=2):
        super().__init__()
        self.intro = nn.Conv2d(in_channels=img_channel, out_channels=width, kernel_size=3, padding=1, stride=1, groups=1, bias=True)
        self.ending = nn.Conv2d(in_channels=width, out_channels=img_channel, kernel_size=3, padding=1, stride=1, groups=1, bias=True)
        self.encoders = nn.ModuleList()
        self.decoders = nn.ModuleList()
        self.middle_blks = nn.ModuleList()
        self.ups = nn.ModuleList()
        self.downs = nn.ModuleList()
        chan = width
        for num in enc_blk_nums:
            self.encoders.append(nn.Sequential(*[BaselineBlock(chan, dw_expand, ffn_expand) for _ in range(num)]))
            self.downs.append(nn.Conv2d(chan, 2*chan, 2, 2))
            chan = chan * 2
        self.middle_blks = nn.Sequential(*[BaselineBlock(chan, dw_expand, ffn_expand) for _ in range(middle_blk_num)])
        for num in dec_blk_nums:
            self.ups.append(nn.Sequential(nn.Conv2d(chan, chan * 2, 1, bias=False), nn.PixelShuffle(2)))
            chan = chan // 2
            self.decoders.append(nn.Sequential(*[BaselineBlock(chan, dw_expand, ffn_expand) for _ in range(num)]))
        self.padder_size = 2 ** len(self.encoders)
    def forward(self, inp):
        B, C, H, W = inp.shape
        inp = self.check_image_size(inp)
        x = self.intro(inp)
        encs = []
        for encoder, down in zip(self.encoders, self.downs):
            x = encoder(x)
            encs.append(x)
            x = down(x)
        x = self.middle_blks(x)
        for decoder, up, enc_skip in zip(self.decoders, self.ups, encs[::-1]):
            x = up(x)
            x = x + enc_skip
            x = decoder(x)
        x = self.ending(x)
        x = x + inp
        return x[:, :, :H, :W]
    def check_image_size(self, x):
        _, _, h, w = x.size()
        mod_pad_h = (self.padder_size - h % self.padder_size) % self.padder_size
        mod_pad_w = (self.padder_size - w % self.padder_size) % self.padder_size
        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h))
        return x

# --- EXECUTION ---
def run_temporal_denoising():
    print("--- Motion-Compensated NAFNet Cleaning ---")
    
    # 1. Load Model
    model = Baseline(
        img_channel=3, width=WIDTH, middle_blk_num=MIDDLE_BLK,
        enc_blk_nums=ENC_BLKS, dec_blk_nums=DEC_BLKS, dw_expand=DW_EXPAND
    )
    
    if not os.path.exists(WEIGHTS_PATH):
        print("❌ Error: Weights not found.")
        return
        
    st = torch.load(WEIGHTS_PATH, map_location=DEVICE)
    if 'params' in st: st = st['params']
    elif 'state_dict' in st: st = st['state_dict']
    
    model.load_state_dict(st, strict=True)
    model.to(DEVICE).eval()
    print("✅ Model Loaded")
    
    # 2. Video IO
    cap = cv2.VideoCapture(INPUT_VIDEO)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    
    out = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    
    # 3. Processing Loop
    prev_clean = None
    prev_gray = None
    
    print(f"Processing {total} frames...")
    
    with torch.no_grad():
        for i in tqdm(range(total)):
            ret, frame = cap.read()
            if not ret: break
            
            # A. Spatial Denoise (NAFNet)
            img_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img_tensor = torch.from_numpy(img_rgb.astype(np.float32) / 255.0)
            img_tensor = img_tensor.permute(2, 0, 1).unsqueeze(0).to(DEVICE)
            
            clean_tensor = model(img_tensor)
            
            # To numpy (H, W, 3)
            curr_clean = clean_tensor.squeeze(0).permute(1, 2, 0).cpu().numpy()
            curr_clean = np.clip(curr_clean * 255, 0, 255).astype(np.uint8)
            curr_clean_bgr = cv2.cvtColor(curr_clean, cv2.COLOR_RGB2BGR)
            
            # B. Temporal Fusion (Optical Flow)
            curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            
            if prev_clean is not None:
                # 1. Calculate Flow (Where did pixels move?)
                flow = cv2.calcOpticalFlowFarneback(
                    prev_gray, curr_gray, None, 
                    pyr_scale=0.5, levels=3, winsize=15, 
                    iterations=3, poly_n=5, poly_sigma=1.2, flags=0
                )
                
                # 2. Warp Previous Clean Frame to Current Position
                # (Create grid for remapping)
                h, w = flow.shape[:2]
                flow_map = np.column_stack((np.repeat(np.arange(h), w), np.tile(np.arange(w), h)))
                flow_map = flow_map.reshape(h, w, 2).astype(np.float32)
                
                # Add flow to grid (Remember: flow is dx, dy)
                map_x = flow_map[..., 1] + flow[..., 0]
                map_y = flow_map[..., 0] + flow[..., 1]
                
                warped_prev = cv2.remap(prev_clean, map_x, map_y, cv2.INTER_LINEAR)
                
                # 3. Blend (Remove Flicker)
                # If flow error is high (occlusion), trust current frame more
                # For simplicity, we use fixed alpha which works well for grain
                final_frame = cv2.addWeighted(curr_clean_bgr, 1.0 - TEMPORAL_BLEND, warped_prev, TEMPORAL_BLEND, 0)
            else:
                final_frame = curr_clean_bgr
            
            # Save for next iter
            prev_clean = final_frame
            prev_gray = curr_gray
            
            out.write(final_frame)
            
    cap.release()
    out.release()
    print(f"✅ Finished! Saved to {OUTPUT_VIDEO}")

if __name__ == "__main__":
    run_temporal_denoising()

--- Motion-Compensated NAFNet Cleaning ---
✅ Model Loaded
Processing 1211 frames...


100%|██████████| 1211/1211 [06:58<00:00,  2.89it/s]

✅ Finished! Saved to /kaggle/working/final_temporal_clean.mp4





In [10]:
# 1. Clone the Repo
!git clone https://github.com/m-tassano/fastdvdnet.git
%cd fastdvdnet

# 2. Install Dependencies (Kaggle usually has these, but just in case)
!pip install -q torch torchvision opencv-python

# 3. Download Pre-trained Weights (Model.pth)
# We use the 'denoising' weights trained on high noise
!mkdir -p model
!wget -O model/model.pth https://github.com/m-tassano/fastdvdnet/raw/master/model.pth

Cloning into 'fastdvdnet'...
remote: Enumerating objects: 145, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 145 (delta 21), reused 12 (delta 9), pack-reused 109 (from 1)[K
Receiving objects: 100% (145/145), 34.97 MiB | 42.38 MiB/s, done.
Resolving deltas: 100% (71/71), done.
/kaggle/working/fastdvdnet
--2025-12-30 18:45:05--  https://github.com/m-tassano/fastdvdnet/raw/master/model.pth
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/m-tassano/fastdvdnet/master/model.pth [following]
--2025-12-30 18:45:06--  https://raw.githubusercontent.com/m-tassano/fastdvdnet/master/model.pth
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.git

In [19]:
import os
import cv2
import shutil
import glob
import re

# ================= CONFIGURATION =================
# Path to your messy frames (e.g. 1.jpg, 10.jpg...)
INPUT_FRAMES_DIR = "/kaggle/working/cleaned_testing_videos/02"  

NOISE_SIGMA = 50 
MAX_FRAMES = 2000
OUTPUT_VIDEO_PATH = "/kaggle/working/cleaned_fastdvdnet_ordered.mp4"
# =================================================

def natural_sort_key(s):
    """Sorts string with embedded numbers correctly (1, 2, ... 10) instead of (1, 10, 2)"""
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]

def prepare_ordered_input(src_dir, dest_dir):
    """Copies frames to a new folder and renames them 00001.jpg, 00002.jpg..."""
    if os.path.exists(dest_dir): shutil.rmtree(dest_dir)
    os.makedirs(dest_dir)
    
    # Get all images
    files = glob.glob(os.path.join(src_dir, "*"))
    files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    # CRITICAL: Sort them naturally (1, 2, 3...)
    files.sort(key=lambda f: natural_sort_key(os.path.basename(f)))
    
    print(f"🧹 Re-ordering {len(files)} frames...")
    print(f"   First 3: {[os.path.basename(f) for f in files[:3]]}")
    
    for i, file_path in enumerate(files):
        # Rename to 00001.jpg, 00002.jpg, etc.
        ext = os.path.splitext(file_path)[1]
        new_name = f"{i:05d}{ext}" 
        shutil.copy2(file_path, os.path.join(dest_dir, new_name))
        
    return dest_dir

def install_dependencies():
    print("📦 Installing dependencies (tensorboardX)...")
    os.system("pip install tensorboardX")

def patch_fastdvdnet_code():
    utils_path = "fastdvdnet/utils.py"
    if not os.path.exists(utils_path): return
    with open(utils_path, "r") as f: code = f.read()
    new_code = code.replace("from skimage.measure.simple_metrics import compare_psnr", "from skimage.metrics import peak_signal_noise_ratio as compare_psnr")
    with open(utils_path, "w") as f: f.write(new_code)
    print("✅ Patch applied.")

def run_pipeline():
    # 1. SETUP
    install_dependencies()
    if not os.path.exists("fastdvdnet"):
        print("📥 Cloning FastDVDnet...")
        os.system("git clone https://github.com/m-tassano/fastdvdnet.git")
    patch_fastdvdnet_code()
    
    if not os.path.exists("fastdvdnet/model/model.pth"):
        print("⬇️ Downloading weights...")
        os.makedirs("fastdvdnet/model", exist_ok=True)
        os.system("wget -O fastdvdnet/model/model.pth https://github.com/m-tassano/fastdvdnet/raw/master/model.pth")

    # 2. PREPARE INPUT (THE FIX)
    # We copy frames to a 'sorted_input' folder first
    sorted_input_dir = "/kaggle/working/temp_sorted_input"
    prepare_ordered_input(INPUT_FRAMES_DIR, sorted_input_dir)

    # 3. RUN INFERENCE
    print(f"🚀 Running FastDVDnet...")
    temp_out = "/kaggle/working/temp_results"
    if os.path.exists(temp_out): shutil.rmtree(temp_out)
    os.makedirs(temp_out)

    cmd = (
        f"cd fastdvdnet && "
        f"python test_fastdvdnet.py "
        f"--test_path {sorted_input_dir} " # Use the SORTED folder
        f"--noise_sigma {NOISE_SIGMA} "
        f"--save_path {temp_out} "
        f"--max_num_fr_per_seq {MAX_FRAMES}"
    )
    
    if os.system(cmd) != 0:
        print("❌ FastDVDnet failed.")
        return

    # 4. STITCH VIDEO
    print("🎬 Stitching frames...")
    
    # Find output (FastDVDnet creates a subdir named 'temp_sorted_input')
    processed_dir = os.path.join(temp_out, "temp_sorted_input")
    if not os.path.exists(processed_dir): processed_dir = temp_out
        
    frames = sorted(glob.glob(os.path.join(processed_dir, "*")))
    # Since we renamed inputs to 00001.jpg, the output is guaranteed to sort correctly now!
    
    if not frames:
        print("❌ No output frames found.")
        return
        
    sample = cv2.imread(frames[0])
    h, w, _ = sample.shape
    out = cv2.VideoWriter(OUTPUT_VIDEO_PATH, cv2.VideoWriter_fourcc(*'mp4v'), 25.0, (w, h))
    
    for f in frames:
        out.write(cv2.imread(f))
    out.release()
    
    # Cleanup
    shutil.rmtree(temp_out)
    shutil.rmtree(sorted_input_dir)
    print(f"✅ SUCCESS! Correctly ordered video: {OUTPUT_VIDEO_PATH}")

if __name__ == "__main__":
    run_pipeline()

📦 Installing dependencies (tensorboardX)...
✅ Patch applied.
🧹 Re-ordering 1211 frames...
   First 3: ['frame_00000.jpg', 'frame_00001.jpg', 'frame_00002.jpg']
🚀 Running FastDVDnet...


INFO:testlog:Finished denoising /kaggle/working/temp_sorted_input
INFO:testlog:	Denoised 1211 frames in 119.965s, loaded seq in 7.591s
INFO:testlog:	PSNR noisy 14.1514dB, PSNR result 30.0900dB



### Testing FastDVDnet model ###
> Parameters:
	model_file: ./model.pth
	test_path: /kaggle/working/temp_sorted_input
	suffix: 
	max_num_fr_per_seq: 2000
	noise_sigma: 0.19607843137254902
	dont_save_results: False
	save_noisy: False
	no_gpu: False
	save_path: /kaggle/working/temp_results
	gray: False
	cuda: True


Loading models ...
	Open sequence in folder:  /kaggle/working/temp_sorted_input
🎬 Stitching frames...


AttributeError: 'NoneType' object has no attribute 'shape'

In [20]:
import cv2
import glob
import os
import re

# ================= CONFIGURATION =================
# Path where FastDVDnet dumped the results
# Based on your logs, it should be here:
RESULTS_DIR = "/kaggle/working/temp_results/temp_sorted_input" 
OUTPUT_VIDEO = "/kaggle/working/cleaned_fastdvdnet_final.mp4"
# =================================================

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]

def repair_video():
    print(f"🕵️ Looking for frames in: {RESULTS_DIR}")
    
    if not os.path.exists(RESULTS_DIR):
        print(f"❌ Error: Folder not found. Did the previous step delete it?")
        # Fallback check
        alt_path = "/kaggle/working/temp_results"
        print(f"   Checking fallback: {alt_path}")
        if os.path.exists(alt_path):
            files = glob.glob(os.path.join(alt_path, "*.png"))
            if len(files) > 0:
                print("   Found frames in root temp_results!")
                process_stitching(alt_path)
                return
        return

    process_stitching(RESULTS_DIR)

def process_stitching(folder_path):
    # 1. Strict Filtering (Only PNG/JPG)
    # This prevents the error you just got
    search_path = os.path.join(folder_path, "*")
    all_files = glob.glob(search_path)
    
    frames = [f for f in all_files if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    # 2. Sort Correctly
    frames.sort(key=lambda f: natural_sort_key(os.path.basename(f)))
    
    if not frames:
        print("❌ No image files found to stitch.")
        return

    print(f"✅ Found {len(frames)} valid frames.")
    print(f"   First: {os.path.basename(frames[0])}")
    print(f"   Last:  {os.path.basename(frames[-1])}")

    # 3. Initialize Video Writer
    first_frame = cv2.imread(frames[0])
    if first_frame is None:
        print(f"❌ Critical: Could not read first frame: {frames[0]}")
        return
        
    height, width, _ = first_frame.shape
    fps = 25.0
    
    print(f"🎬 Stitching {width}x{height} video...")
    
    out = cv2.VideoWriter(OUTPUT_VIDEO, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))
    
    count = 0
    for f_path in frames:
        img = cv2.imread(f_path)
        if img is None:
            print(f"⚠️ Warning: Skipping unreadable file: {f_path}")
            continue
        out.write(img)
        count += 1
        
    out.release()
    print(f"🎉 SUCCESS! Video saved to: {OUTPUT_VIDEO}")

if __name__ == "__main__":
    repair_video()

🕵️ Looking for frames in: /kaggle/working/temp_results/temp_sorted_input
❌ Error: Folder not found. Did the previous step delete it?
   Checking fallback: /kaggle/working/temp_results
   Found frames in root temp_results!
✅ Found 1211 valid frames.
   First: n50_FastDVDnet_0.png
   Last:  n50_FastDVDnet_1210.png
🎬 Stitching 640x360 video...
🎉 SUCCESS! Video saved to: /kaggle/working/cleaned_fastdvdnet_final.mp4
