In [8]:
import os
import json
import torch
import numpy as np
from PIL import Image

class NeRFSyntheticDatasetAllSplits:
    """Eagerly loaded NeRF-synthetic scene covering every requested split.

    All frames listed in ``transforms_<split>.json`` are decoded once, resized
    to ``img_wh`` and kept resident on ``device`` as two tensors stored in
    ``self.data``:

    * ``'images'``: float32, shape ``(N, H, W, 4)`` (RGBA scaled to ``[0, 1]``)
    * ``'poses'``:  float32, one ``transform_matrix`` per frame, stacked on dim 0

    Args:
        root_dir: Scene directory containing the ``transforms_*.json`` files
            and the images they reference.
        splits: Split names to concatenate, in order.
        img_wh: Target ``(width, height)`` handed to ``PIL.Image.resize``.
        device: Torch device on which the tensors are stored.
    """

    def __init__(self, root_dir, splits=('train', 'val', 'test'), img_wh=(800, 800), device='cuda'):
        self.root_dir = root_dir
        self.splits = splits
        self.img_wh = img_wh
        self.device = device
        self.data = self._load_all_data()

    def _load_all_data(self):
        """Read the metadata of every split and decode all frames.

        Returns:
            dict with keys ``'images'`` and ``'poses'`` (see class docstring).
            When no frames are listed both tensors have a leading dimension
            of 0 instead of ``torch.stack`` failing on an empty list.
        """
        all_frames = []
        for split in self.splits:
            json_path = os.path.join(self.root_dir, f'transforms_{split}.json')
            with open(json_path, 'r') as f:
                meta = json.load(f)
            all_frames.extend(meta['frames'])

        width, height = self.img_wh
        # Allocate the destination once on the target device and copy frames
        # into it one by one. Moving each frame to the device and stacking
        # afterwards keeps two full copies alive at the peak.
        images = torch.empty(
            (len(all_frames), height, width, 4),
            dtype=torch.float32,
            device=self.device,
        )
        poses = []
        for slot, frame in enumerate(all_frames):
            img_path = os.path.join(self.root_dir, frame['file_path'] + '.png')
            # The context manager releases the file handle right away;
            # convert() returns an independent, fully decoded image.
            with Image.open(img_path) as src:
                rgba = src.convert('RGBA').resize(self.img_wh, Image.LANCZOS)
            images[slot] = torch.from_numpy(np.array(rgba) / 255.0).float()
            poses.append(torch.from_numpy(np.array(frame['transform_matrix'])).float())

        if poses:
            poses = torch.stack(poses).to(self.device)
        else:
            # assumes 4x4 camera matrices, as in the recorded sample output
            poses = torch.empty((0, 4, 4), dtype=torch.float32, device=self.device)
        return {'images': images, 'poses': poses}

    def __len__(self):
        """Number of loaded frames."""
        return self.data['images'].shape[0]

    def __getitem__(self, idx):
        """Return the ``(image, pose)`` pair stored at ``idx``."""
        return self.data['images'][idx], self.data['poses'][idx]

# Smoke test: load every split of the "ship" scene and inspect one sample.
# The device is chosen explicitly so the cell also runs on a CPU-only kernel
# instead of failing on the class default of 'cuda'.
dataset = NeRFSyntheticDatasetAllSplits(
    root_dir='/kaggle/input/nerf-synthetic-dataset/nerf_synthetic/ship',
    device='cuda' if torch.cuda.is_available() else 'cpu',
)
print(f"Loaded total {len(dataset)} images and poses")
img, pose = dataset[0]
print(f"Sample image shape: {img.shape}, Sample pose shape: {pose.shape}")

Loaded total 400 images and poses
Sample image shape: torch.Size([800, 800, 4]), Sample pose shape: torch.Size([4, 4])


In [32]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from PIL import Image
import os, json
import numpy as np

class NeRFSyntheticDatasetLazy:
    """Map-style NeRF-synthetic dataset that decodes images on access.

    Only the frame metadata from ``transforms_<split>.json`` is read at
    construction time; each image is opened, converted to RGBA and resized
    when it is indexed.

    Args:
        root_dir: Scene directory holding the ``transforms_*.json`` files.
        splits: Split names whose frame lists are concatenated in order.
        img_wh: Target size passed to ``PIL.Image.resize``.
    """

    def __init__(self, root_dir, splits=('train','val','test'), img_wh=(800,800)):
        self.root_dir = root_dir
        self.img_wh = img_wh
        collected = []
        for split_name in splits:
            meta_file = os.path.join(root_dir, f'transforms_{split_name}.json')
            with open(meta_file, 'r') as handle:
                collected += json.load(handle)['frames']
        self.frames = collected

    def __len__(self):
        """Total number of frames across the requested splits."""
        return len(self.frames)

    def __getitem__(self, idx):
        """Decode frame ``idx`` and return ``(image, pose)`` as float32 tensors.

        No device transfer happens here; the image holds RGBA values divided
        by 255 and the pose is the frame's ``transform_matrix``.
        """
        record = self.frames[idx]
        png_path = os.path.join(self.root_dir, record['file_path'] + '.png')
        rgba = Image.open(png_path).convert('RGBA')
        rgba = rgba.resize(self.img_wh, Image.LANCZOS)
        pixels = torch.from_numpy(np.array(rgba) / 255.0).float()
        camera = torch.from_numpy(np.array(record['transform_matrix'])).float()
        return pixels, camera

def sample_rays(img, num_rays=1024):
    """Draw ``num_rays`` random pixel colours from one image.

    Despite the name, no ray origins or directions are produced: the function
    returns RGB values only, sampled uniformly with replacement.

    Args:
        img: Tensor of shape ``(H, W, C)``; only the first three channels
            are used.
        num_rays: Number of pixels to sample.

    Returns:
        Tensor of shape ``(num_rays, 3)``.
    """
    height, width, _ = img.shape
    pixel_count = height * width
    # Drop any channel beyond RGB (e.g. alpha), then flatten the pixel grid.
    flat_rgb = img[..., :3].reshape(pixel_count, 3)
    picks = torch.randint(0, pixel_count, (num_rays,))
    return flat_rgb[picks]

class SimpleNeRFPlusPlus(nn.Module):
    """Plain fully connected network: ``D`` ReLU layers plus a linear head.

    Args:
        D: Number of ``Linear`` + ``ReLU`` pairs before the output layer.
        W: Width of every hidden layer.
        input_ch: Size of the input feature vector.
        output_ch: Size of the output vector.
    """

    def __init__(self,D=8,W=128,input_ch=3,output_ch=3):
        super().__init__()
        # Input projection first, then D-1 equally sized hidden layers.
        stack = [nn.Linear(input_ch, W), nn.ReLU()]
        for _ in range(D - 1):
            stack.extend((nn.Linear(W, W), nn.ReLU()))
        # The output head is created last so layer order stays unchanged.
        self.mlp = nn.Sequential(*stack, nn.Linear(W, output_ch))

    def forward(self,x):
        """Apply the MLP to ``x`` (last dimension must equal ``input_ch``)."""
        prediction = self.mlp(x)
        return prediction

# Step 2 demo: one pass over the scene, fitting the MLP to reproduce sampled
# pixel colours. Input and target are the same RGB values and the camera pose
# is not used, so this exercises the data/optimiser plumbing only -- it is
# not volumetric rendering.

# Pick the device once; hard-coded .cuda() calls fail on CPU-only kernels.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = NeRFSyntheticDatasetLazy('/kaggle/input/nerf-synthetic-dataset/nerf_synthetic/ship')
dataloader = DataLoader(dataset,batch_size=1,shuffle=True)

model = SimpleNeRFPlusPlus().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
criterion = nn.MSELoss()

for i, (img, pose) in enumerate(dataloader):
    # batch_size is 1, so img[0] is the single (H, W, 4) image of this batch.
    rays = sample_rays(img[0], num_rays=1024).to(device)
    target = rays.clone()
    optimizer.zero_grad()
    output = model(rays)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    if i % 50 == 0:
        print(f"Step {i}, Loss: {loss.item()}")

print("Step 2 completed: NeRF++ RGB demo")

Step 0, Loss: 0.02725367248058319
Step 50, Loss: 0.002450698520988226
Step 100, Loss: 0.0006320072570815682
Step 150, Loss: 0.0006806027959100902
Step 200, Loss: 0.0004027260874863714
Step 250, Loss: 0.0006119100144132972
Step 300, Loss: 0.0006149326218292117
Step 350, Loss: 0.0008332583820447326
Step 2 completed: NeRF++ RGB demo


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from diffusers import ControlNetModel, UNet2DConditionModel, DDIMScheduler, StableDiffusionControlNetPipeline
from transformers import CLIPTextModel, CLIPTokenizer

class NeRFControlDataset(Dataset):
    """Pairs each conditioning image with the text prompt at the same index.

    Args:
        nerf_images: Indexable collection of images.
        prompts: Indexable collection of prompts, aligned with ``nerf_images``.
    """

    def __init__(self, nerf_images, prompts):
        self.images, self.prompts = nerf_images, prompts

    def __len__(self):
        """Dataset size, defined by the number of images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, prompt)`` for position ``idx``."""
        image = self.images[idx]
        prompt = self.prompts[idx]
        return image, prompt

# Step 3: fine-tune a ControlNet against a frozen Stable Diffusion backbone,
# using random tensors as placeholder conditioning images.
#
# Fixes relative to the previous version of this cell:
#   * StableDiffusionControlNetPipeline requires vae, safety_checker and
#     feature_extractor; they are now supplied.
#   * ControlNetModel.forward takes (sample, timestep, encoder_hidden_states,
#     controlnet_cond) and returns residuals for the UNet. It has neither an
#     `input_ids` argument nor a `.sample` output, so the training step now
#     follows the usual latent noise-prediction objective.
from diffusers import AutoencoderKL  # encodes RGB images into SD latents

SD_BASE = "/kaggle/input/sd-offline/stable-diffusion-2-1-base"
CONTROLNET_PATH = "/kaggle/input/sd-offline/sd-controlnet-canny"

controlnet = ControlNetModel.from_pretrained(CONTROLNET_PATH, local_files_only=True).to("cuda")
unet = UNet2DConditionModel.from_pretrained(f"{SD_BASE}/unet", local_files_only=True).to("cuda")
# assumes the offline snapshot uses the standard diffusers layout with a
# vae/ sub-folder -- TODO confirm
vae = AutoencoderKL.from_pretrained(f"{SD_BASE}/vae", local_files_only=True).to("cuda")
text_encoder = CLIPTextModel.from_pretrained(f"{SD_BASE}/text_encoder", local_files_only=True).to("cuda")
tokenizer = CLIPTokenizer.from_pretrained(f"{SD_BASE}/tokenizer", local_files_only=True)
scheduler = DDIMScheduler.from_pretrained(f"{SD_BASE}/scheduler", local_files_only=True)

# A ControlNet must share the UNet's text-conditioning width. If the
# checkpoint was trained for a different base model, cross-attention fails
# with an opaque shape error, so start from a copy of the UNet encoder.
if controlnet.config.cross_attention_dim != unet.config.cross_attention_dim:
    print(
        "ControlNet checkpoint does not match the UNet "
        f"(cross_attention_dim {controlnet.config.cross_attention_dim} vs "
        f"{unet.config.cross_attention_dim}); initialising a ControlNet from the UNet instead"
    )
    controlnet = ControlNetModel.from_unet(unet).to("cuda")

# Only the ControlNet is trained; the rest of the model stays frozen.
for frozen in (vae, unet, text_encoder):
    frozen.requires_grad_(False)
    frozen.eval()
controlnet.train()

pipe = StableDiffusionControlNetPipeline(
    vae=vae,
    unet=unet,
    controlnet=controlnet,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    scheduler=scheduler,
    safety_checker=None,
    feature_extractor=None,
    requires_safety_checker=False,
).to("cuda")

# Placeholder data: six random 3x256x256 tensors in [0, 1).
nerf_images = [torch.rand(3, 256, 256) for _ in range(6)]
prompts = ["red factory machine", "conveyor belt", "robotic arm", "assembly line", "metal press", "forklift"]

dataset = NeRFControlDataset(nerf_images, prompts)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

optimizer = torch.optim.Adam(controlnet.parameters(), lr=1e-4)
for i, (img, prompt) in enumerate(dataloader):
    img = img.to("cuda")
    # Pad to the tokenizer's full context length so every batch has the
    # same sequence length.
    input_ids = tokenizer(
        list(prompt),
        return_tensors="pt",
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
    ).input_ids.to("cuda")

    with torch.no_grad():
        encoder_hidden_states = text_encoder(input_ids)[0]
        # The VAE expects pixels in [-1, 1]; the dataset provides [0, 1].
        latents = vae.encode(img * 2.0 - 1.0).latent_dist.sample()
        latents = latents * vae.config.scaling_factor

    noise = torch.randn_like(latents)
    timesteps = torch.randint(
        0,
        scheduler.config.num_train_timesteps,
        (latents.shape[0],),
        device=latents.device,
    ).long()
    noisy_latents = scheduler.add_noise(latents, noise, timesteps)

    optimizer.zero_grad()
    # The ControlNet sees the conditioning image at pixel resolution and
    # returns residuals that are added inside the frozen UNet.
    down_block_res_samples, mid_block_res_sample = controlnet(
        noisy_latents,
        timesteps,
        encoder_hidden_states=encoder_hidden_states,
        controlnet_cond=img,
        return_dict=False,
    )
    model_pred = unet(
        noisy_latents,
        timesteps,
        encoder_hidden_states=encoder_hidden_states,
        down_block_additional_residuals=down_block_res_samples,
        mid_block_additional_residual=mid_block_res_sample,
    ).sample

    # The regression target depends on how the base model was trained.
    if scheduler.config.prediction_type == "v_prediction":
        target = scheduler.get_velocity(latents, noise, timesteps)
    else:
        target = noise
    loss = torch.nn.functional.mse_loss(model_pred.float(), target.float())
    loss.backward()
    optimizer.step()
    print(f"Step {i}, Loss: {loss.item()}")

print("Step 3 completed: Offline NeRF++ → ControlNet → Stable Diffusion pipeline")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from diffusers import ControlNetModel, UNet2DConditionModel, DDIMScheduler, StableDiffusionControlNetPipeline
from transformers import CLIPTextModel, CLIPTokenizer
from PIL import Image
import os

class NeRFControlDataset(Dataset):
    """Pairs each conditioning image with the text prompt at the same index.

    Args:
        nerf_images: Indexable collection of images.
        prompts: Indexable collection of prompts, aligned with ``nerf_images``.
    """

    def __init__(self, nerf_images, prompts):
        self.images, self.prompts = nerf_images, prompts

    def __len__(self):
        """Dataset size, defined by the number of images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, prompt)`` for position ``idx``."""
        image = self.images[idx]
        prompt = self.prompts[idx]
        return image, prompt

# Steps 4 & 5: fine-tune the ControlNet on rendered NeRF frames, then export
# the frames as PNG files.
#
# Fixes relative to the previous version of this cell:
#   * torch.from_numpy() was called on a torch.Tensor, which raises TypeError.
#   * StableDiffusionControlNetPipeline requires vae, safety_checker and
#     feature_extractor; they are now supplied.
#   * ControlNetModel.forward takes (sample, timestep, encoder_hidden_states,
#     controlnet_cond) and returns residuals for the UNet. It has neither an
#     `input_ids` argument nor a `.sample` output, so the training step now
#     follows the usual latent noise-prediction objective.
#   * The PNG export rounds instead of truncating when converting to uint8.
from diffusers import AutoencoderKL  # encodes RGB images into SD latents

SD_BASE = "/kaggle/input/sd-offline/stable-diffusion-2-1-base"
CONTROLNET_PATH = "/kaggle/input/sd-offline/sd-controlnet-canny"

controlnet = ControlNetModel.from_pretrained(CONTROLNET_PATH, local_files_only=True).to("cuda")
unet = UNet2DConditionModel.from_pretrained(f"{SD_BASE}/unet", local_files_only=True).to("cuda")
# assumes the offline snapshot uses the standard diffusers layout with a
# vae/ sub-folder -- TODO confirm
vae = AutoencoderKL.from_pretrained(f"{SD_BASE}/vae", local_files_only=True).to("cuda")
text_encoder = CLIPTextModel.from_pretrained(f"{SD_BASE}/text_encoder", local_files_only=True).to("cuda")
tokenizer = CLIPTokenizer.from_pretrained(f"{SD_BASE}/tokenizer", local_files_only=True)
scheduler = DDIMScheduler.from_pretrained(f"{SD_BASE}/scheduler", local_files_only=True)

# A ControlNet must share the UNet's text-conditioning width. If the
# checkpoint was trained for a different base model, cross-attention fails
# with an opaque shape error, so start from a copy of the UNet encoder.
if controlnet.config.cross_attention_dim != unet.config.cross_attention_dim:
    print(
        "ControlNet checkpoint does not match the UNet "
        f"(cross_attention_dim {controlnet.config.cross_attention_dim} vs "
        f"{unet.config.cross_attention_dim}); initialising a ControlNet from the UNet instead"
    )
    controlnet = ControlNetModel.from_unet(unet).to("cuda")

# Only the ControlNet is trained; the rest of the model stays frozen.
for frozen in (vae, unet, text_encoder):
    frozen.requires_grad_(False)
    frozen.eval()
controlnet.train()

pipe = StableDiffusionControlNetPipeline(
    vae=vae,
    unet=unet,
    controlnet=controlnet,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    scheduler=scheduler,
    safety_checker=None,
    feature_extractor=None,
    requires_safety_checker=False,
).to("cuda")

# Load every PNG in the folder as a float CHW tensor in [0, 1].
# NOTE(review): frames are not resized. Batching needs identical sizes and
# the VAE/UNet presumably need sides divisible by 64 -- confirm against the
# renders actually written to this folder.
nerf_image_folder = "/kaggle/working/nerf_outputs"
nerf_images = []
prompts = []
for fname in sorted(os.listdir(nerf_image_folder)):
    if fname.endswith(".png"):
        img_path = os.path.join(nerf_image_folder, fname)
        with Image.open(img_path) as src:
            rgb = np.array(src.convert("RGB"))  # HWC, uint8
        img_tensor = torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0
        nerf_images.append(img_tensor)
        prompts.append("photorealistic factory scene")  

dataset = NeRFControlDataset(nerf_images, prompts)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

optimizer = torch.optim.Adam(controlnet.parameters(), lr=1e-4)
for i, (img, prompt) in enumerate(dataloader):
    img = img.to("cuda")
    # Pad to the tokenizer's full context length so every batch has the
    # same sequence length.
    input_ids = tokenizer(
        list(prompt),
        return_tensors="pt",
        padding="max_length",
        max_length=tokenizer.model_max_length,
        truncation=True,
    ).input_ids.to("cuda")

    with torch.no_grad():
        encoder_hidden_states = text_encoder(input_ids)[0]
        # The VAE expects pixels in [-1, 1]; the loader provides [0, 1].
        latents = vae.encode(img * 2.0 - 1.0).latent_dist.sample()
        latents = latents * vae.config.scaling_factor

    noise = torch.randn_like(latents)
    timesteps = torch.randint(
        0,
        scheduler.config.num_train_timesteps,
        (latents.shape[0],),
        device=latents.device,
    ).long()
    noisy_latents = scheduler.add_noise(latents, noise, timesteps)

    optimizer.zero_grad()
    # The render itself is used as the conditioning image; the ControlNet
    # returns residuals that are added inside the frozen UNet.
    down_block_res_samples, mid_block_res_sample = controlnet(
        noisy_latents,
        timesteps,
        encoder_hidden_states=encoder_hidden_states,
        controlnet_cond=img,
        return_dict=False,
    )
    model_pred = unet(
        noisy_latents,
        timesteps,
        encoder_hidden_states=encoder_hidden_states,
        down_block_additional_residuals=down_block_res_samples,
        mid_block_additional_residual=mid_block_res_sample,
    ).sample

    # The regression target depends on how the base model was trained.
    if scheduler.config.prediction_type == "v_prediction":
        target = scheduler.get_velocity(latents, noise, timesteps)
    else:
        target = noise
    loss = torch.nn.functional.mse_loss(model_pred.float(), target.float())
    loss.backward()
    optimizer.step()
    print(f"Step {i}, Loss: {loss.item()}")

# NOTE(review): the exported files are the loaded input frames; nothing
# produced by `pipe` is written out. Confirm whether generated images were
# meant to be exported here.
output_folder = "/kaggle/working/synthetic_dataset"
os.makedirs(output_folder, exist_ok=True)
for idx, img in enumerate(nerf_images):
    # Round before the uint8 cast: k / 255 * 255 can land just below k in
    # float32, and a bare .byte() would then truncate it to k - 1.
    out_img = (img * 255).round().clamp(0, 255).byte().permute(1,2,0).numpy()
    Image.fromarray(out_img).save(os.path.join(output_folder, f"synthetic_{idx:03d}.png"))

print("Step 4 & 5 completed: Text-guided scene generation + dataset export done")

In [None]:
import json
import os
from PIL import Image

# Step 6: write a COCO-style annotations.json for the exported PNG files.
# Every image gets one placeholder annotation whose box covers the full frame.
synthetic_folder = "/kaggle/working/synthetic_dataset"
annotations_file = "/kaggle/working/synthetic_dataset/annotations.json"

coco_dict = {
    "images": [],
    "annotations": [],
    "categories": [{"id": 1, "name": "factory_object"}]
}

# Filter before numbering. annotations.json is written into this same folder,
# so numbering the raw directory listing would shift every image id by one
# as soon as the cell is run a second time.
png_names = [name for name in sorted(os.listdir(synthetic_folder)) if name.endswith(".png")]

annotation_id = 0
for idx, fname in enumerate(png_names):
    img_path = os.path.join(synthetic_folder, fname)
    # Only the header is needed for the size; close the handle immediately.
    with Image.open(img_path) as img:
        width, height = img.size
    coco_dict["images"].append({
        "id": idx,
        "file_name": fname,
        "width": width,
        "height": height
    })
    # Dummy annotation (full image bbox)
    coco_dict["annotations"].append({
        "id": annotation_id,
        "image_id": idx,
        "category_id": 1,
        "bbox": [0, 0, width, height],
        "area": width*height,
        "iscrowd": 0
    })
    annotation_id += 1

with open(annotations_file, "w") as f:
    json.dump(coco_dict, f)

print(f"Step 6 completed: COCO-format synthetic dataset exported with {len(coco_dict['images'])} images")

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
import os

# Step 7: run every exported PNG through an ImageNet-pretrained ResNet-18 and
# report the mean of the predicted class indices.
# NOTE(review): no ground-truth labels are involved, so the reported number is
# not an accuracy; it only shows that the images load and pass through the
# network.
synthetic_folder = "/kaggle/working/synthetic_dataset"
annotations_file = "/kaggle/working/synthetic_dataset/annotations.json"  # not read in this cell

device = "cuda" if torch.cuda.is_available() else "cpu"
# `pretrained=True` is deprecated in newer torchvision releases; use the
# weights enum when it exists and fall back to the legacy flag otherwise.
if hasattr(models, "ResNet18_Weights"):
    resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
else:
    resnet = models.resnet18(pretrained=True)
resnet = resnet.to(device).eval()
preprocess = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# Despite its name this list collects predicted class indices.
accuracies = []
for fname in sorted(os.listdir(synthetic_folder)):
    if not fname.endswith(".png"):
        continue
    img_path = os.path.join(synthetic_folder, fname)
    # convert() returns a decoded copy, so the file can be closed right away.
    with Image.open(img_path) as src:
        img = src.convert("RGB")
    input_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        output = resnet(input_tensor)
    pred = torch.argmax(output, dim=1)
    accuracies.append(pred.item())

# Fail with a clear message instead of a ZeroDivisionError on an empty folder.
if not accuracies:
    raise RuntimeError(f"No .png images found in {synthetic_folder}; nothing to validate")

avg_pred = sum(accuracies)/len(accuracies)
print(f"Step 7 completed: Dataset validated on pre-trained ResNet, average prediction index: {avg_pred}")