# import 

In [None]:
import random
import numpy as np
import torch

def set_seed(seed):
    """Seed every random number generator used in this notebook.

    The same value is handed to Python's ``random`` module, NumPy's global
    generator and PyTorch (``manual_seed``, ``cuda.manual_seed`` and
    ``cuda.manual_seed_all``).

    Args:
        seed: Seed value forwarded unchanged to each library.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)


# Seed once, up front, so the following cells start from a known RNG state.
seed_value = 33
set_seed(seed_value)

In [None]:
import sys
sys.path.append('./')


from model import *
from zero_shot_test import *
from data import *

import yaml

# Load the experiment configuration. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open.
cfg_path = 'config_depth.yaml'
with open(cfg_path) as cfg_file:
    cfg = yaml.safe_load(cfg_file)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

# reprogram

## train data load

In [None]:
from data import *
# Dataset for the 'alltrain' tag; it is split into training and validation
# subsets in a later cell.
root = './dataset/depth/sun_d/SUNRGBD'
train_val_dataset = SunRgbDDataset(root, tag='alltrain')

In [None]:
from torch.utils.data import random_split, Subset


# 90/10 random split. The validation size is derived from the training size
# so the two always add up to the full dataset length.
dataset_size = len(train_val_dataset)
val_size = dataset_size - int(0.9 * dataset_size)
train_size = dataset_size - val_size

split_lengths = [train_size, val_size]
train_dataset, val_dataset = random_split(train_val_dataset, split_lengths)

# num_samples = 100
# random_indices = random.sample(range(len(train_dataset)), num_samples)

# train_subset = Subset(train_dataset, random_indices)


In [None]:
# Options shared by the training and validation loaders.
loader_kwargs = dict(
    batch_size=cfg['batch_size'],
    drop_last=False,
    pin_memory=True,
    sampler=None,
)

# Training batches are reshuffled every epoch.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle=True,
    **loader_kwargs,
)

# Validation keeps a fixed order. The training loop builds its loss from a
# per-batch source-by-target similarity matrix, so reshuffling would change the
# batch composition and make the validation loss (used for early stopping and
# best-checkpoint selection) fluctuate between epochs for reasons unrelated to
# the model.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle=False,
    **loader_kwargs,
)

In [None]:
# Sanity check: show the tensor shapes (and the target tensor itself) of the
# first training batch, if there is one.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    source, target = first_batch
    print(source.shape, target.shape)
    print(target)

## model load

In [None]:
import torch
# Restore the base model object from disk.
# NOTE(review): torch.load unpickles arbitrary objects - only load checkpoint
# files that come from a trusted source.
base_model_path = './trained_model/reprogram_base.pth'
model = torch.load(base_model_path)

In [None]:

# Collect the parameters handed to the optimizer: those whose dotted name has
# 'depth' as its second component.
trainable_params = []

for name, param in model.named_parameters():
    name_parts = name.split('.')
    # Names without a second component are skipped explicitly; a blanket
    # `except` here would also hide unrelated errors and KeyboardInterrupt.
    if len(name_parts) > 1 and name_parts[1] == 'depth':
        print(f"Parameter: {name}, Requires Grad: {param.requires_grad}")
        trainable_params.append(param)

In [None]:
# Release cached GPU memory that PyTorch's allocator is holding but not using.
torch.cuda.empty_cache()


## train

In [None]:
# Move the model to the device chosen when the config was loaded (GPU when
# available, otherwise CPU) instead of requiring CUDA unconditionally; the
# training loop sends its batches to that same `device`.
model.to(device)

In [None]:
# Hyper-parameters of the run. They live on `cfg` so that they are written
# out together with the training log.
cfg['train_params'] = {
    'optimizer': 'AdamW',
    'init_lr': 0.1,
    'weight_decay': 0.2,
    'scheduler': 'cosw',
    'temperature': 1.0,
    'T_max': 60,
}

In [None]:
import torch.optim as optim
# AdamW over the selected parameters with a cosine-annealed learning rate.
opt_cfg = cfg['train_params']
optimizer = optim.AdamW(
    trainable_params,
    lr=opt_cfg['init_lr'],
    weight_decay=opt_cfg['weight_decay'],
)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=opt_cfg['T_max'],
)
# Alternative scheduler, kept for reference:
# scheduler =  torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.01, patience=10, verbose=True)

In [None]:

import os

# Where the checkpoint with the lowest validation loss is written.
save_best_path = './trained_model/depth/sunrgbd_best.pth'
# torch.save does not create missing directories, so make sure the target
# folder exists before training starts rather than failing after an epoch.
os.makedirs(os.path.dirname(save_best_path), exist_ok=True)

In [None]:
from utils import *

num_epochs = 1000
early_stop = 0
min_loss = np.inf
print(save_best_path)
print('with final logit')
temperature = cfg['train_params']['temperature']
log = []

# Constant scale applied to the similarity logits (1 / temperature). It is
# loop-invariant and never given to the optimizer, so it is computed once here
# rather than rebuilt per batch; this also guarantees it exists for the
# validation pass even if the training loader yields no batch.
logit_scale = 1.0 / temperature


def _batch_loss(source, target):
    """Return the loss tensor for one (source, target) batch.

    Runs the model on the batch, divides each source and target embedding by
    its norm along dim 1, and passes the scaled source-by-target similarity
    matrix to ``cosine_similarity_loss``.
    """
    target_inputs = {'depth': target.to(device)}
    outputs = model(target_inputs, source.to(device))

    source_features = outputs['source_' + cfg['source_type']]
    target_features = outputs['depth']

    # normalized features
    source_features = source_features / source_features.norm(dim=1, keepdim=True)
    target_features = target_features / target_features.norm(dim=1, keepdim=True)

    # cosine similarity as logits
    logits = logit_scale * source_features @ target_features.t()
    return cosine_similarity_loss(logits)


for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    # Learning rate in effect for this epoch (read before the scheduler steps).
    curr_lr = float(optimizer.param_groups[0]['lr'])

    for source, target in train_dataloader:
        loss = _batch_loss(source, target)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # ---- validation pass (no gradients needed) ----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for source, target in val_dataloader:
            total_val_loss += _batch_loss(source, target).item()

    train_loss = total_loss / len(train_dataloader)
    val_loss = total_val_loss / len(val_dataloader)

    epoch_summary = f"Epoch {epoch + 1}/{num_epochs}, Curr_LR: {curr_lr}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}"
    print(epoch_summary)
    # Stored as a one-element list, as before, so the log keeps its format.
    log.append([epoch_summary])

    # Keep the checkpoint with the lowest validation loss; count the epochs
    # that pass without an improvement.
    if val_loss < min_loss:
        min_loss = val_loss
        early_stop = 0
        torch.save(model, save_best_path)
        print('saved best')
    else:
        early_stop += 1

    # Stop once more than 10 consecutive epochs brought no improvement.
    if early_stop > 10:
        print("Early stopping triggered")
        break

    if cfg['train_params']['scheduler'] == 'cosw':
        scheduler.step()
    elif cfg['train_params']['scheduler'] == 'plateau':
        print("?")
        scheduler.step(val_loss)





In [None]:
import os

model_num = 2
log_path = f'./trained_model/log/{model_num}.txt'
# open() does not create missing directories.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

with open(log_path, 'w') as f:
    # One line per logged epoch. write() is the call for a single string;
    # writelines() would iterate over it character by character.
    for entry in log:
        f.write(str(entry) + '\n')

    # Append the configuration (including the training hyper-parameters) so
    # the log documents the settings that produced it.
    f.write(str(cfg))

# test
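Zero-shot classification needs the BPE vocabulary file. Download it from https://github.com/facebookresearch/ImageBind/blob/main/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz and make it available at `./ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz`, which is the path the cells below pass as `bpe_path`.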

In [None]:
import torch

# Load the trained model object and put it in evaluation mode.
trained_model_path = "./trained_model/reprogram_trained.pth"
model = torch.load(trained_model_path)
model.eval()

## text_template

In [None]:
# Prompt templates for zero-shot classification. The "{}" placeholder in each
# template is filled with a class name via str.format.
text_templates = [
    "a bad photo of a {}.",
    "a photo of many {}.",
    "a sculpture of a {}.",
    "a photo of the hard to see {}.",
    "a low resolution photo of the {}.",
    "a rendering of a {}.",
    "graffiti of a {}.",
    "a bad photo of the {}.",
    "a cropped photo of the {}.",
    "a tattoo of a {}.",
    "the embroidered {}.",
    "a photo of a hard to see {}.",
    "a bright photo of a {}.",
    "a photo of a clean {}.",
    "a photo of a dirty {}.",
    "a dark photo of the {}.",
    "a drawing of a {}.",
    "a photo of my {}.",
    "the plastic {}.",
    "a photo of the cool {}.",
    "a close-up photo of a {}.",
    "a black and white photo of the {}.",
    "a painting of the {}.",
    "a painting of a {}.",
    "a pixelated photo of the {}.",
    "a sculpture of the {}.",
    "a bright photo of the {}.",
    "a cropped photo of a {}.",
    "a plastic {}.",
    "a photo of the dirty {}.",
    "a jpeg corrupted photo of a {}.",
    "a blurry photo of the {}.",
    "a photo of the {}.",
    "a good photo of the {}.",
    "a rendering of the {}.",
    "a {} in a video game.",
    "a photo of one {}.",
    "a doodle of a {}.",
    "a close-up photo of the {}.",
    "a photo of a {}.",
    "the origami {}.",
    "the {} in a video game.",
    "a sketch of a {}.",
    "a doodle of the {}.",
    "a origami {}.",
    "a low resolution photo of a {}.",
    "the toy {}.",
    "a rendition of the {}.",
    "a photo of the clean {}.",
    "a photo of a large {}.",
    "a rendition of a {}.",
    "a photo of a nice {}.",
    "a photo of a weird {}.",
    "a blurry photo of a {}.",
    "a cartoon {}.",
    "art of a {}.",
    "a sketch of the {}.",
    "a embroidered {}.",
    "a pixelated photo of a {}.",
    "itap of the {}.",
    "a jpeg corrupted photo of the {}.",
    "a good photo of a {}.",
    "a plushie {}.",
    "a photo of the nice {}.",
    "a photo of the small {}.",
    "a photo of the weird {}.",
    "the cartoon {}.",
    "art of the {}.",
    "a drawing of the {}.",
    "a photo of the large {}.",
    "a black and white photo of a {}.",
    "the plushie {}.",
    "a dark photo of a {}.",
    "itap of a {}.",
    "graffiti of the {}.",
    "a toy {}.",
    "itap of my {}.",
    "a photo of a cool {}.",
    "a photo of a small {}.",
    "a tattoo of the {}.",
]


## Sun RGBD

In [None]:
from data import *
# Dataset for the 'alltest' tag, served in its stored order.
root = './dataset/depth/sun_d/SUNRGBD'
test_dataset = SunRgbDDataset(root, tag='alltest')

test_loader_options = {
    'batch_size': cfg['batch_size'],
    'shuffle': False,
    'drop_last': False,
    'pin_memory': True,
    'sampler': None,
}
test_dataloader = torch.utils.data.DataLoader(test_dataset, **test_loader_options)


In [None]:
from preprocessing.sun_rgb_d import *
# Ground-truth classes plus the three lists get_data returns for 'alltest'
# (presumably depth paths, image paths and labels - confirm in
# preprocessing.sun_rgb_d).
root = './dataset/depth/sun_d/SUNRGBD'
gt_classes = get_gt(root)

test_split = get_data(root, gt_classes, 'alltest')
test_depth_path, test_image_path, test_target_list = test_split


In [None]:
bpe_path = './ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz'
# Place the model on the same `device` that is passed to the classifier
# builder, instead of hard-coding CUDA for the model only.
zeroshot_weights = zeroshot_classifier(model.to(device), gt_classes, text_templates, bpe_path, device)

In [None]:
# Label tensor for the test samples, placed on the shared `device` rather than
# a hard-coded 'cuda:0' so it matches where the features are computed.
test_target = get_target(test_target_list, gt_classes)
test_target = test_target.to(device)

In [None]:

# Embed the test set batch by batch; only the second element of each batch is
# fed to the model, and no gradients are tracked.
all_emb = []
for _, depth_batch in tqdm(test_dataloader):
    with torch.no_grad():
        batch_out = model.forward({'depth': depth_batch.to(device)})
    all_emb.append(batch_out['depth'])

# Join the per-batch outputs and scale every embedding to unit norm.
test_depth_features = torch.cat(all_emb)
feature_norms = test_depth_features.norm(dim=-1, keepdim=True)
depth_features_norm = test_depth_features / feature_norms


In [None]:
# Scaled similarity between every depth embedding and every class weight,
# scored against the labels.
logits = (100. * depth_features_norm) @ zeroshot_weights.T
scores = top1_top5_acc(logits, test_target)
top1, top5 = scores
print(top1, top5)

## NYU-D

In [None]:
from preprocessing.preprocessing_utils import *
from preprocessing.nyu_d import *

# Input files: the labelled data file and the train/test split file.
data_file_path = './dataset/depth/nyu_d/nyu_depth_v2_labeled.mat'
split_file_path = './dataset/depth/nyu_d/splits.mat'

train_idx, test_idx = get_train_n_test_idx(split_file_path)

# Only the test indices are loaded.
test_data = get_data(data_file_path, test_idx)
test_depth, test_image, test_gt_list = test_data

# The distinct 'sceneTypes' names form the set of classes.
scene_names = read_depth_name(data_file_path, 'sceneTypes')
gt_classes = set(scene_names)

image_preprocess = get_img_preprocess()
depth_preprocess = get_depth_preprocess()

test_image_input = get_preprocessed_img_data(test_image, image_preprocess)

In [None]:

# Run the depth preprocessing over the test depth data; the result is what
# gets sliced into batches and fed to the model under the 'depth' key below.
test_disparity_input = get_preprocessed_depth_data(test_depth, depth_preprocess)

In [None]:
bpe_path = './ImageBind/imagebind/bpe/bpe_simple_vocab_16e6.txt.gz'

zeroshot_weights = zeroshot_classifier(model, gt_classes, text_templates, bpe_path, device)
zeroshot_weights = zeroshot_weights.to(torch.float32)
test_target = get_target(data_file_path, test_idx, gt_classes)
# Follow the shared `device` instead of a hard-coded 'cuda:0'.
test_target = test_target.to(device)

# Number of samples pushed through the model per forward pass.
nyu_chunk_size = 10

with torch.no_grad():
    all_emb = []
    # Bound the loop by the tensor that is actually sliced (the depth input),
    # not by the separately built image tensor.
    for i in range(0, len(test_disparity_input), nyu_chunk_size):
        depth_chunk = test_disparity_input[i:i + nyu_chunk_size]
        tmp_depth_features = model.forward({'depth': depth_chunk.to(device)})
        all_emb.append(tmp_depth_features['depth'])

# Join the chunks and scale every embedding to unit norm.
test_depth_features = torch.concat(all_emb)
depth_features_norm = test_depth_features / test_depth_features.norm(dim=-1, keepdim=True)

In [None]:

# Scaled similarity between every depth embedding and every class weight,
# scored against the labels.
logits = (100. * depth_features_norm) @ zeroshot_weights.T
scores = top1_top5_acc(logits, test_target)
top1, top5 = scores
print(top1, top5)

# imagebind

In [None]:
# import numpy as np
import torch
from tqdm import tqdm
from pkg_resources import packaging

import sys
sys.path.append('./ImageBind/')

from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

import os

In [None]:
# import model
# Pick the compute device, then build the pretrained ImageBind model in
# evaluation mode and move it there.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(device)

model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)


## Sun RGBD

In [None]:
from data import *
# Dataset for the 'alltest' tag, served in its stored order.
root = './dataset/depth/sun_d/SUNRGBD'
test_dataset = SunRgbDDataset(root, tag='alltest')

test_loader_options = {
    'batch_size': cfg['batch_size'],
    'shuffle': False,
    'drop_last': False,
    'pin_memory': True,
    'sampler': None,
}
test_dataloader = torch.utils.data.DataLoader(test_dataset, **test_loader_options)


In [None]:
from preprocessing.sun_rgb_d import *
# Ground-truth classes plus the three lists get_data returns for 'alltest'
# (presumably depth paths, image paths and labels - confirm in
# preprocessing.sun_rgb_d).
root = './dataset/depth/sun_d/SUNRGBD'
gt_classes = get_gt(root)

test_split = get_data(root, gt_classes, 'alltest')
test_depth_path, test_image_path, test_target_list = test_split


In [None]:
# Build one text embedding per class by averaging over all prompt templates.
zeroshot_weights = []
for classname in tqdm(list(gt_classes)):
    text_list = [template.format(classname) for template in text_templates]

    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

        # Normalise each prompt embedding, average over the prompts, then
        # normalise the averaged class embedding again.
        embeddings[ModalityType.TEXT] /= embeddings[ModalityType.TEXT].norm(dim=-1, keepdim=True)
        embeddings[ModalityType.TEXT] = embeddings[ModalityType.TEXT].mean(dim=0)
        embeddings[ModalityType.TEXT] /= embeddings[ModalityType.TEXT].norm()
        zeroshot_weights.append(embeddings[ModalityType.TEXT])

# Classes are stacked along dim 1. The result follows the shared `device`
# rather than calling .cuda(), which fails on CPU-only machines.
zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)

In [None]:
# Labels for the SUN RGB-D test split. They must be built from the label list
# returned by get_data(root, gt_classes, 'alltest') in this section; the
# (data_file_path, test_idx) arguments belong to the NYU-D variant of
# get_target and would yield the wrong labels (or fail) here.
test_target = get_target(test_target_list, gt_classes)
# Follow the shared `device` instead of a hard-coded 'cuda:0'.
test_target = test_target.to(device)

In [None]:
# get depth embeddings

# depths = depths[test_idxs]
# Embed the test set batch by batch; only the second element of each batch is
# fed to the model (as the DEPTH modality), and no gradients are tracked.
all_emb = []
for _, depth_batch in tqdm(test_dataloader):
    with torch.no_grad():
        batch_out = model({ModalityType.DEPTH: depth_batch.to(device)})
    all_emb.append(batch_out[ModalityType.DEPTH])

# Join the per-batch outputs and scale every embedding to unit norm in place.
vid_emb = torch.cat(all_emb)
vid_emb /= vid_emb.norm(dim=-1, keepdim=True)


In [None]:
# zero shot prediction
def accuracy(output, target, topk=(1,)):
    """Count correct top-k predictions for every k in ``topk``.

    Args:
        output: 2-D score tensor with one row per sample and one column per
            class.
        target: 1-D tensor holding the true class index of each sample.
        topk: Iterable of k values to evaluate.

    Returns:
        A list of floats, one per k, giving the NUMBER (not the fraction) of
        samples whose target is among the k highest-scoring classes.
    """
    # Never request more entries than there are classes: topk() raises in that
    # case. A k above the class count then simply covers the full ranking.
    maxk = min(max(topk), output.size(1))
    # pred has shape (maxk, num_samples): row r holds each sample's r-th best class.
    pred = output.topk(maxk, dim=1, largest=True, sorted=True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].float().sum().item() for k in topk]

# Scaled similarity between every depth embedding and every class weight.
logits = (100. * vid_emb) @ zeroshot_weights

# accuracy() returns hit counts; divide by the sample count for percentages.
acc1, acc5 = accuracy(logits, test_target.to(device), topk=(1, 5))
n = float(test_target.size(0))

top1 = (acc1 / n) * 100
top5 = (acc5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

## NYU-D

In [None]:
from preprocessing.preprocessing_utils import *
from preprocessing.nyu_d import *

# Input files: the labelled data file and the train/test split file.
data_file_path = './dataset/depth/nyu_d/nyu_depth_v2_labeled.mat'
split_file_path = './dataset/depth/nyu_d/splits.mat'

train_idx, test_idx = get_train_n_test_idx(split_file_path)

# Only the test indices are loaded.
test_data = get_data(data_file_path, test_idx)
test_depth, test_image, test_gt_list = test_data

# The distinct 'sceneTypes' names form the set of classes.
scene_names = read_depth_name(data_file_path, 'sceneTypes')
gt_classes = set(scene_names)

image_preprocess = get_img_preprocess()
depth_preprocess = get_depth_preprocess()

test_image_input = get_preprocessed_img_data(test_image, image_preprocess)

In [None]:

# Run the depth preprocessing over the test depth data; the result is what
# gets sliced into batches and fed to the model as the DEPTH modality below.
test_disparity_input = get_preprocessed_depth_data(test_depth, depth_preprocess)

In [None]:
# Build one text embedding per class by averaging over all prompt templates.
zeroshot_weights = []
for classname in tqdm(list(gt_classes)):
    text_list = [template.format(classname) for template in text_templates]

    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
    }

    with torch.no_grad():
        embeddings = model(inputs)

        # Normalise each prompt embedding, average over the prompts, then
        # normalise the averaged class embedding again.
        embeddings[ModalityType.TEXT] /= embeddings[ModalityType.TEXT].norm(dim=-1, keepdim=True)
        embeddings[ModalityType.TEXT] = embeddings[ModalityType.TEXT].mean(dim=0)
        embeddings[ModalityType.TEXT] /= embeddings[ModalityType.TEXT].norm()
        zeroshot_weights.append(embeddings[ModalityType.TEXT])

# Classes are stacked along dim 1. The result follows the shared `device`
# rather than calling .cuda(), which fails on CPU-only machines.
zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)

In [None]:
# get depth embeddings

# depths = depths[test_idxs]
# Embed the preprocessed depth input in slices of 10 samples, without
# tracking gradients.
all_emb = []
chunk = 10
for start in range(0, len(test_disparity_input), chunk):
    depth_batch = test_disparity_input[start:start + chunk]
    with torch.no_grad():
        batch_out = model({ModalityType.DEPTH: depth_batch.to(device)})
    all_emb.append(batch_out[ModalityType.DEPTH])

# Join the slices and scale every embedding to unit norm in place.
vid_emb = torch.cat(all_emb)
vid_emb /= vid_emb.norm(dim=-1, keepdim=True)


In [None]:
# Label tensor for the NYU-D test indices, placed on the shared `device`
# rather than a hard-coded 'cuda:0'.
test_target = get_target(data_file_path, test_idx, gt_classes)
test_target = test_target.to(device)

In [None]:
# zero shot prediction
def accuracy(output, target, topk=(1,)):
    """Count correct top-k predictions for every k in ``topk``.

    Args:
        output: 2-D score tensor with one row per sample and one column per
            class.
        target: 1-D tensor holding the true class index of each sample.
        topk: Iterable of k values to evaluate.

    Returns:
        A list of floats, one per k, giving the NUMBER (not the fraction) of
        samples whose target is among the k highest-scoring classes.
    """
    # Never request more entries than there are classes: topk() raises in that
    # case. A k above the class count then simply covers the full ranking.
    maxk = min(max(topk), output.size(1))
    # pred has shape (maxk, num_samples): row r holds each sample's r-th best class.
    pred = output.topk(maxk, dim=1, largest=True, sorted=True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [correct[:k].float().sum().item() for k in topk]

# Scaled similarity between every depth embedding and every class weight.
logits = (100. * vid_emb) @ zeroshot_weights

# accuracy() returns hit counts; divide by the sample count for percentages.
acc1, acc5 = accuracy(logits, test_target.to(device), topk=(1, 5))
n = float(test_target.size(0))

top1 = (acc1 / n) * 100
top5 = (acc5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")