In [1]:
import os
import torch
import dct_manip as dm
import numpy as np
import torchvision.transforms as transforms
import utils.custom_transforms as ctrans
from utils.pipeline_utils import load_model_and_report
import utils.configs as configs
import argparse
from PIL import Image
os.chdir("/workspace/")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
imgpath = "dataset/mvtec_ad_jpeg/bottle/train/good/000.JPEG"
png_path = "dataset/mvtec_ad/bottle/train/good/000.PNG"

In [3]:
from data.dataset import MvtecAd

In [4]:
datapath = "dataset/mvtec_ad"

In [5]:
jpeg_dataset = MvtecAd(datadir = datapath,
                        target = "bottle",
                        is_train = True,
                        resize = 224,
                        image_mode = 'dct',
                        image_format = 'jpeg',)

In [7]:
jpeg_dataset[0][0].shape

AttributeError: 'tuple' object has no attribute '노멛'

In [6]:
dim, quant, Y, cbcr = dm.read_coefficients(imgpath) # dimension, quantization, Y, cbcr coefficients
Y = torch.clamp(Y * quant[0], min=-2**10, max=2**10-8) # recover quantized coefficients (clamp to -1024~1023 (values should be within -1024~1016 with quant table of all ones))

In [7]:
if cbcr is not None: # if colored
    cbcr = torch.clamp(cbcr * quant[1:3].unsqueeze(1).unsqueeze(1), min=-2**10, max=2**10-8) # recover quantized coefficient
else: # if black and white
    _, h, w, kh, kw = Y.shape
    cbcr = torch.zeros((2,h//2,w//2,kh,kw), dtype=Y.dtype, device=Y.device) # fill it with zeroes
coeffs = (Y, cbcr)


In [8]:
transform = transforms.Compose([
                                ctrans.Resize_DCT(28), # 28 = 224x224, 48 = 384x384
                                ctrans.ToRange(val_min=-1, val_max=1, orig_min=-1024, orig_max=1016, dtype=torch.float32),
                                ])
coeffs = transform(coeffs)

In [9]:
coeffs[0].shape, coeffs[1].shape

(torch.Size([1, 28, 28, 8, 8]), torch.Size([2, 14, 14, 8, 8]))

In [10]:
original_patch_embedd = PatchEmbedding()


In [11]:
original_patch_embedd(torch.randn(1,3,224,224)).shape

torch.Size([1, 196, 768])

In [16]:
from models.decoder import *
from models.plainvit import *
from easydict import EasyDict

In [17]:
patchembed = PatchEmbedding_DCT_Group()

In [19]:
patchembed(batch_coeffs)[0].shape, patchembed(batch_coeffs)[1].shape

(torch.Size([1, 196, 768]), torch.Size([1, 14, 14, 384]))

In [14]:
temp_y = batch_coeffs[0]
rearranged_y = patchembed.rearrange_Y(temp_y)
subblocked_y = apply_subblock(rearranged_y, patchembed.conv_Y, combine = patchembed.combine_Y)
print("temp_y.shape", temp_y.shape)
print("rearranged_y.shape", rearranged_y.shape)
print("subblocked_y.shape", subblocked_y.shape)

NameError: name 'batch_coeffs' is not defined

In [None]:
temp_cbcr = batch_coeffs[1]
rearranged_cbcr = patchembed.rearrange_C(temp_cbcr)
subblocked_cbcr = apply_subblock(rearranged_cbcr, patchembed.conv_C, combine = patchembed.combine_C)
print("temp_cbcr.shape", temp_y.shape)
print("rearranged_cbcr.shape", rearranged_cbcr.shape)
print("subblocked_cbcr.shape", subblocked_cbcr.shape)

temp_cbcr.shape torch.Size([1, 1, 28, 28, 8, 8])
rearranged_cbcr.shape torch.Size([1, 2, 14, 14, 8, 8])
subblocked_cbcr.shape torch.Size([1, 2, 14, 14, 8, 8])


In [None]:
patchembed.collapser(subblocked_y).shape, patchembed.collapser(subblocked_cbcr).shape

(torch.Size([1, 14, 14, 256]), torch.Size([1, 14, 14, 128]))

In [None]:
temp_ycbcr = torch.cat((patchembed.collapser(subblocked_y), patchembed.collapser(subblocked_cbcr)), dim=3)
temp_ycbcr.shape

torch.Size([1, 14, 14, 384])

In [None]:
project_output = patchembed.projection(temp_ycbcr)
project_output.shape 

torch.Size([1, 196, 768])

In [10]:
def parse_args():
    """
    Parse arguments
    """
    parser=argparse.ArgumentParser()

    # DDP config
    parser.add_argument('--port', type=int, default=13932, help='Port for pytorch distributed dataparallel')

    # model config
    parser.add_argument('--model_arch', type=str, default='vits', help='Model architecture (vitti, vits, vitb, vitl, swinv2)')
    parser.add_argument('--no_subblock', action='store_true', help='If set, disable subblock conversion')
    parser.add_argument("--embed_type", type=int, default=1, help='Embedding layer type. (1: grouped, 2: separate, 3: concatenate). Default 1')
    parser.add_argument("--domain", type=str, default="dct", help="(DCT/RGB) Choose domain type")

    # data config
    parser.add_argument("--datapath", type=str, default='./dataset/', help='Path to folder containing the .tar files')
    parser.add_argument("--image_size", type=int, default=224, help='Image size for training')
    parser.add_argument("--object", type=str, default='bottle', help='Object to train on')
    parser.add_argument('--num_gpus', type=int, default=-1, help='number of GPUs to use. If not set, automatically use all available GPUs')
    parser.add_argument('--num_cpus', type=int, default=1, help='number of total available cpu threads')

    # pipeline config
    parser.add_argument('--train', action='store_true', help='Train new model')
    parser.add_argument('--eval', action='store_true', help='Evaluate model loaded from ``savepath`` ')
    parser.add_argument('--benchmark', type=int, default=0, help='If set, benchmark for the set iterations')
    parser.add_argument('--savepath', type=str, default='./weights/jpeganovit.pth', help='Save path for model. Also saves checkpoint at this path')
    parser.add_argument('--loadpath', type=str, default='', help='Load path for model. Used during evaluation. If empty, copy savepath')
    parser.add_argument('--load_ckpt', type=str, default='', help='If set, load checkpoint from this path')
    parser.add_argument('--deterministic', action='store_true', help='If set, use deterministic mode')
    parser.add_argument('--verbose', type=int, default=0, help='(0/1/2) 0: no output, 1: output per epoch, 2: output per iteration')

    # override default config
    parser.add_argument("--epochs", type=int, default=-1, help="Override the number of epochs")
    parser.add_argument("--batch", type=int, default=-1, help="Override the size of batch (overall batch size)")
    parser.add_argument("--lr", type=float, default=-1, help='Override the learning rate')
    parser.add_argument("--wd", type=float, default=-1, help='Override the weight decay strength')
    parser.add_argument('--drop', type=float, default=-1, help='Override dropout probability')
    parser.add_argument('--warmup_steps', type=int, default=-1, help='Override warmup steps')
    parser.add_argument('--ops_list', type=str, default='', help='Override augmentation list')
    parser.add_argument('--num_ops', type=int, default=-1, help='Override number of operations')
    parser.add_argument('--ops_magnitude', type=int, default=-1, help='Override augmentation magnitude')
    parser.add_argument("--amp", type=int, default=-1, help="(True:1/False:0) Override automatic mixed precision")
    parser.add_argument("--ampdtype", type=str, default='', help="Override amp dtype casting")
    parser.add_argument('--seed', type=int, default=-1, help='Override random seed')
    parser.add_argument('--use_msrsync', action='store_true', help='If set, use msrsync instead of .tar')

    args=parser.parse_args(args=[])  
    return args

In [11]:
args = parse_args()
cfg = configs.generate_config(
        modelarch = args.model_arch.lower(),
        domain = args.domain,
        modelver=args.embed_type,
        subblock=True if not args.no_subblock else False,
        epochs=None if args.epochs < 0 else args.epochs, # need to add
        batchsize=None if args.batch < 0 else args.batch, # need to change order
        lr=None if args.lr < 0 else args.lr,
        wd=None if args.wd < 0 else args.wd,
        drop=None if args.drop < 0 else args.drop,
        warmup_steps=None if args.warmup_steps < 0 else args.warmup_steps, # need to add
        auglist=None if args.ops_list == '' else args.ops_list.split(","),
        num_ops=None if args.num_ops < 0 else args.num_ops, # need to add
        ops_magnitude=None if args.ops_magnitude < 0 else args.ops_magnitude, # need to add
        seed=None if args.seed < 0 else args.seed, # need to add
        amp=None if args.amp < 0 else args.amp,
        ampdtype=None if args.ampdtype == '' else args.ampdtype,
        use_msrsync=args.use_msrsync,
    )

In [12]:
vitmodel = ViT(
                in_channels= 3,
                patch_size= cfg.MODEL.PATCHSIZE,
                emb_size= cfg.MODEL.EMBEDSIZE,
                depth= cfg.MODEL.DEPTH,
                n_classes= 1000,
                drop_p=cfg.TRAIN.DROP,
                device=cfg.RANK,
                dtype=torch.float32,
                num_heads=cfg.MODEL.HEADS,
                head_size=cfg.MODEL.HEADSIZE,
                pixel_space=cfg.MODEL.DOMAIN,
                ver=cfg.MODEL.VERSION,
                use_subblock=cfg.MODEL.SUBBLOCK,
                )
decoder = Decoder(
                image_size=args.image_size,
                patch_size=cfg.MODEL.PATCHSIZE,
                emb_size=cfg.MODEL.EMBEDSIZE,
                )

In [15]:
# change it to batch size 1
batch_coeffs = (coeffs[0].unsqueeze(0), coeffs[1].unsqueeze(0))

In [21]:
map_location = {'cuda:%d' % 0: 'cuda:%d' % cfg.RANK}
vitmodel = vitmodel.to(cfg.RANK)
vitmodel_state_dict = torch.load("weights/imgnetDCTViTS_ep90_76.5.pth", map_location=map_location)

In [22]:
vitmodel.load_state_dict(vitmodel_state_dict)

<All keys matched successfully>

In [23]:
vitmodel.patchembed(batch_coeffs[0].to("cuda:0"), batch_coeffs[1].to("cuda:0")).shape

torch.Size([1, 196, 384])

In [24]:
vit_output = vitmodel(batch_coeffs[0].to("cuda:0"), batch_coeffs[1].to("cuda:0"))

In [25]:
full_model = nn.Sequential(vitmodel, decoder)
full_model.to("cuda:0")

Sequential(
  (0): ViT(
    (patchembed): PatchEmbedding_DCT_Group(
      (rearrange_Y): Rearrange('b c (h pdh) (w pdw) p1 p2 -> b c h w (pdh p1) (pdw p2)', pdh=2, pdw=2)
      (rearrange_C): Rearrange('b c (h pdh) (w pdw) p1 p2 -> b c h w (pdh p1) (pdw p2)', pdh=1, pdw=1)
      (collapser): Rearrange('b c h w i j -> b h w (c i j)')
      (projection): Sequential(
        (0): Linear(in_features=384, out_features=384, bias=True)
        (1): SinCosEmbedding()
        (2): Rearrange('b h w e -> b (h w) e')
      )
    )
    (encoder): TransformerEncoder(
      (0): TransformerEncoderBlock(
        (0): ResidualAdd(
          (fn): Sequential(
            (eb_lrnorm1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
            (eb_mha): MultiHeadAttention(
              (qkv): Linear(in_features=384, out_features=1152, bias=True)
              (att_drop): Dropout(p=0, inplace=False)
              (projection): Linear(in_features=384, out_features=384, bias=True)
            )
    

In [26]:
cuda_batch_coeffs = (batch_coeffs[0].to("cuda:0"), batch_coeffs[1].to("cuda:0"))

In [27]:
full_model.forward(cuda_batch_coeffs).shape



torch.Size([1, 3, 224, 224])

In [28]:
batch_coeffs[0].shape, batch_coeffs[1].shape

(torch.Size([1, 1, 28, 28, 8, 8]), torch.Size([1, 2, 14, 14, 8, 8]))

In [29]:
transpose_output = vit_output.transpose(1,2)
transpose_output.shape

torch.Size([1, 384, 196])

In [30]:
reshape_output = transpose_output.reshape(vit_output.shape[0], -1, 14, 14)
reshape_output.shape

torch.Size([1, 384, 14, 14])

In [31]:
dec_block1_output = decoder.dec_block1(reshape_output)
dec_block2_output = decoder.dec_block2(dec_block1_output)
dec_block3_output = decoder.dec_block3(dec_block2_output)
dec_block4_output = decoder.dec_block4(dec_block3_output)
dec_block5_output = decoder.dec_block5(dec_block4_output)
print(dec_block1_output.shape)
print(dec_block2_output.shape)
print(dec_block3_output.shape)
print(dec_block4_output.shape)
print(dec_block5_output.shape)


torch.Size([1, 256, 16, 16])
torch.Size([1, 128, 18, 18])
torch.Size([1, 64, 20, 20])
torch.Size([1, 32, 39, 39])
torch.Size([1, 16, 77, 77])


In [33]:
up_output = decoder.up(dec_block5_output)
up_output.shape

torch.Size([1, 16, 224, 224])

In [48]:
decoder.up 

UpsamplingBilinear2d(size=(224, 224), mode=bilinear)

In [34]:
output_output = decoder.output(up_output)
output_output.shape

torch.Size([1, 3, 224, 224])

In [None]:
custom_decoder_up = 1, 14, 14, 384

In [35]:
from utils.custom_transforms import ycbcr_to_rgb

In [40]:
len(coeffs)

2

In [41]:
coeffs[0].shape

torch.Size([1, 28, 28, 8, 8])

In [43]:
ycbcr_rgb = ycbcr_to_rgb()

In [45]:
ycbcr_rgb(coeffs

torch.Size([3, 224, 224])

In [47]:
import torch.nn as nn
from einops.layers.torch import Rearrange, Reduce
# can be reshaped to something suitable for transposed convolution.
DeconvY = nn.ModuleList([nn.ConvTranspose2d(64, 64, kernel_size=(8, 8)) for _ in range(8)])
DeconvC = nn.ModuleList([nn.ConvTranspose2d(64, 64, kernel_size=(8, 8)) for _ in range(8)])

def backward(ycbcr, patch_embbed_size: int= 28):
    # Reshape the input to the shape before the projection
    b, _, e = ycbcr.shape
    y_size, cbcr_size = patch_embbed_size, patch_embbed_size/2
    h = int((b / e) ** 0.5)
    w = h
    ycbcr = Rearrange(ycbcr, "b (h w) e -> b h w e", h=h, w=w)

    # Determine the original channel counts for Y and CbCr
    y_shape, cbcr_shape = (1, y_size, y_size, 8, 8), (2, cbcr_size, cbcr_size, 8, 8)
    _, _, _, cy = y_shape
    _, _, _, cc = cbcr_shape

    yout = ycbcr[:, :, :, :cy]
    cout = ycbcr[:, :, :, cy:cy+cc]

    # Upsample the Y channel using DeconvY
    y = []
    for i, deconv in enumerate(DeconvY):
        y.append(deconv(yout[:, :, :, i]))
    y = torch.cat(y, dim=1)

    # Upsample the CbCr channel using DeconvC
    cbcr = []
    for i, deconv in enumerate(DeconvC):
        cbcr.append(deconv(cout[:, :, :, i]))
    cbcr = torch.cat(cbcr, dim=1)

    # Assuming the output size from deconvolution isn't the same as the desired
    # (1, 28, 28, 8, 8) and (2, 14, 14, 8, 8), we will reshape them.
    y = y.view(b, 1, 28, 28, 8, 8)
    cbcr = cbcr.view(b, 2, 14, 14, 8, 8)

    return y, cbcr

In [50]:
ycbcr_patch = vitmodel.patchembed(batch_coeffs[0].to("cuda:0"), batch_coeffs[1].to("cuda:0"))

In [51]:
ycbcr_patch.shape

torch.Size([1, 196, 384])

In [56]:
batch_coeffs[0].shape

torch.Size([1, 1, 28, 28, 8, 8])

In [55]:
vitmodel.patchembed.rearrange_Y(batch_coeffs[0].to("cuda:0")).shape

torch.Size([1, 1, 14, 14, 16, 16])

In [59]:
from utils.custom_transforms import ycbcr_to_rgb

In [36]:
ycbcr_to_rgb(coeffs)

TypeError: ycbcr_to_rgb.__init__() takes 1 positional argument but 2 were given

In [64]:
tempa, tempb = coeffs

In [66]:
tempa.shape, tempb.shape

(torch.Size([1, 28, 28, 8, 8]), torch.Size([2, 14, 14, 8, 8]))