## Vision Encoder Decoder Models

In [1]:
from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel

### Random Initialization of VisionEncoderDecoderModel from model configurations. 

In [2]:
config_encoder = ViTConfig()
config_decoder = BertConfig()

In [3]:
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = VisionEncoderDecoderModel(config=config)

### Initializng VisionEncoderDecoderModel from a pretrained encoder and a pretrained decoder

In [4]:
from transformers import VisionEncoderDecoderModel

In [5]:
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "microsoft/swin-base-patch4-window7-224-in22k", "bert-base-uncased"
)

Downloading:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/417M [00:00<?, ?B/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of the model checkpoint at microsoft/swin-base-patch4-window7-224-in22k were not used when initializing SwinModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing SwinModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SwinModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.3.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.7.crossattention.self.key.bias', 'bert.encod

### Loading an existing VisionEncoderDecoderModel checkpoint and perform inference

In [6]:
import requests
from PIL import Image

from transformers import GPT2TokenizerFast, ViTFeatureExtractor, VisionEncoderDecoderModel

In [7]:

# load a fine-tuned image captioning model and corresponding tokenizer and feature extractor
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

Downloading:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/937M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/120 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/228 [00:00<?, ?B/s]

In [8]:
# let's perform inference on an image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

In [11]:
image.show()

Opening in existing browser session.


In [9]:
# autoregressively generate caption (uses greedy decoding by default)
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [10]:
print(generated_text)

a cat laying on a blanket next to a cat laying on a bed 


### Loading a PyTorch checkpoint into TFVisionEncoderDecoderModel

TFVisionEncoderDecoderModel.from_pretrained() currently doesn’t support initializing the model from a PyTorch checkpoint. Passing from_pt=True to this method will throw an exception. If there are only PyTorch checkpoints for a particular vision encoder-decoder model, a workaround is:

In [12]:
# from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel

# _model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# _model.encoder.save_pretrained("./encoder")
# _model.decoder.save_pretrained("./decoder")

# model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
#     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
# )
# # This is only for copying some specific attributes of this particular model.
# model.config = _model.config

### Training

In [14]:
from transformers import ViTFeatureExtractor, BertTokenizer, VisionEncoderDecoderModel
from datasets import load_dataset

In [15]:
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google/vit-base-patch16-224-in21k", "bert-base-uncased"
)

Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.3.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.7.crossattention.self.key.bias', 'bert.encod

In [16]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [17]:
dataset = load_dataset("huggingface/cats-image")
image = dataset["test"]["image"][0]
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

Downloading builder script:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

No config specified, defaulting to: cats-image/image


Downloading and preparing dataset cats-image/image to /home/jmwolf/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/173k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset cats-image downloaded and prepared to /home/jmwolf/.cache/huggingface/datasets/huggingface___cats-image/image/1.9.0/68fbc793fb10cd165e490867f5d61fa366086ea40c73e549a020103dcb4f597e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
labels = tokenizer(
    "an image of two cats chilling on a couch",
    return_tensors="pt",
).input_ids

In [19]:
loss = model(pixel_values=pixel_values, labels=labels).loss

In [23]:
url = '../model_dev/Data_CMC_COADEL_224_1/val/Mitosis/10003.jpg'

image = Image.open(url)#requests.get(url, stream=True).raw)
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [25]:
print(generated_text)

...................


In [29]:
dataset = load_dataset('../model_dev/Data_CMC_COADEL_224_1/train/Mitosis/')

Resolving data files:   0%|          | 0/10695 [00:00<?, ?it/s]

Using custom data configuration Mitosis-c51512a8e5587f44


Downloading and preparing dataset imagefolder/Mitosis to /home/jmwolf/.cache/huggingface/datasets/imagefolder/Mitosis-c51512a8e5587f44/0.0.0/e872d3ec27c6c200a8881a4af52930df7eca3372b19aa4d0f5db74a2fded8141...
               

Downloading data files #0:   0%|          | 0/669 [00:00<?, ?obj/s]

 

Downloading data files #5:   0%|          | 0/669 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/669 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/669 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/669 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/669 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/669 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/668 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /home/jmwolf/.cache/huggingface/datasets/imagefolder/Mitosis-c51512a8e5587f44/0.0.0/e872d3ec27c6c200a8881a4af52930df7eca3372b19aa4d0f5db74a2fded8141. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [45]:
image = dataset["train"]["image"][0]
#pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

In [46]:
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

In [47]:
labels = tokenizer(
    "Oblong mitotic figure with a single nucleus",
    return_tensors="pt",
).input_ids

In [48]:
loss = model(pixel_values=pixel_values, labels=labels).loss

In [49]:
print(loss)

tensor(13.0428, grad_fn=<NllLossBackward0>)


In [43]:
url = '../model_dev/Data_CMC_COADEL_224_1/val/Mitosis/10003.jpg'

image = Image.open(url)#requests.get(url, stream=True).raw)
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [44]:
print(generated_text)

...................


### Training

In [None]:
from torchvision.transforms import (CenterCrop, 
                                    Compose, 
                                    Normalize, 
                                    RandomHorizontalFlip,
                                    RandomResizedCrop, 
                                    Resize, 
                                    ToTensor)

normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
_train_transforms = Compose(
        [
            RandomResizedCrop(feature_extractor.size),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ]
    )

_val_transforms = Compose(
        [
            Resize(feature_extractor.size),
            CenterCrop(feature_extractor.size),
            ToTensor(),
            normalize,
        ]
    )

def train_transforms(examples):
    examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['img']]
    return examples

def val_transforms(examples):
    examples['pixel_values'] = [_val_transforms(image.convert("RGB")) for image in examples['img']]
    return examples

In [50]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import requests
from PIL import Image
import torch

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# load image from the IAM dataset
url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# training
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

pixel_values = processor(image, return_tensors="pt").pixel_values
text = "hello world"
labels = processor.tokenizer(text, return_tensors="pt").input_ids
outputs = model(pixel_values=pixel_values, labels=labels)
loss = outputs.loss

# inference (generation)
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

Downloading:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
print(generated_text)

industry, " Mr. Brown commented icily. " Let us have a


In [65]:
url = '../model_dev/Data_CMC_COADEL_224_1/val/Mitosis/10040.jpg'
image = Image.open(url)#requests.get(url, stream=True).raw).convert("RGB")

# training
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

pixel_values = processor(image, return_tensors="pt").pixel_values
text = "Oblong mitotic figure with a single nucleus"
labels = processor.tokenizer(text, return_tensors="pt").input_ids
outputs = model(pixel_values=pixel_values, labels=labels)
loss = outputs.loss

# inference (generation)
generated_ids = model.generate(pixel_values)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]



In [106]:
import torch
import torch.nn.functional as F
import torch.nn as nn

from PIL import Image

import os
import json
import numpy as np
from matplotlib.colors import LinearSegmentedColormap
#from GPUtil import showUtilization as gpu_usage

import torchvision
from torchvision import models
from torchvision import transforms
from torchvision.models import resnet50
import torchvision.models as models

file_path='../model_dev/vgg16_ft.pth'

model = models.vgg16(pretrained=False)

#torch.save(vgg16_ft.state_dict(), 'vgg16_ft.pth')

num_features = model.classifier[6].in_features
features = list(model.classifier.children())[:-1]
features.extend([nn.Linear(num_features, 2)])
model.classifier = nn.Sequential(*features)

#model = torch.load(file_path)
model.load_state_dict(torch.load(file_path))
#model.eval()

num_features = model.classifier[6]#.in_features
features = list(model.classifier.children())[:-1]
#features.extend([nn.Linear(num_features, 5)])
newmodel = model
newmodel.classifier = nn.Sequential(*features)

num_features = newmodel.classifier[5]#.in_features
features = list(newmodel.classifier.children())[:-1]
#features.extend([nn.Linear(num_features, 5)])
newmodel = newmodel
newmodel.classifier = nn.Sequential(*features)

num_features = model.classifier[4]#.in_features
features = list(model.classifier.children())[:-1]
#features.extend([nn.Linear(num_features, 5)])
newmodel = newmodel
newmodel.classifier = nn.Sequential(*features)

newmodel.eval()



VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [66]:
print(generated_text)

and they


In [126]:
from transformers import ViTFeatureExtractor, BertTokenizer, VisionEncoderDecoderModel
from datasets import load_dataset

feature_extractor = ViTFeatureExtractor('trns_model.pt')#.from_pretrained("google/vit-base-patch16-224-in21k")
#feature_extractor = newmodel
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#model = VisionEncoderDecoderModel(feature_extractor, tokenizer)#.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
model = VisionEncoderDecoderModel(#.from_encoder_decoder_pretrained(
    "trns_model.pt", "bert-base-uncased")
#     "google/vit-base-patch16-224-in21k", "bert-base-uncased")

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

dataset = load_dataset('../model_dev/Data_CMC_COADEL_224_1/train/Mitosis/')
image = dataset["train"]["image"][10]
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values
text = dataset["train"]["text"][0]
labels = tokenizer(
    text,#"Mitotic figure that is round in shape",
    return_tensors="pt",
).input_ids

# the forward function automatically creates the correct decoder_input_ids
loss = model(pixel_values=pixel_values, labels=labels).loss

ValueError: Config: trns_model.pt has to be of type <class 'transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder.VisionEncoderDecoderConfig'>

In [124]:
print(pixel_values)
generated_ids = model.generate(pixel_values)
print(generated_ids)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
generated_text

tensor([[[[ 0.5686,  0.5843,  0.6078,  ...,  0.8353,  0.8118,  0.7490],
          [ 0.6000,  0.5137,  0.5529,  ...,  0.8196,  0.7882,  0.7412],
          [ 0.5922,  0.5294,  0.6000,  ...,  0.7882,  0.7647,  0.7333],
          ...,
          [ 0.7255,  0.7020,  0.7020,  ...,  0.7020,  0.6941,  0.6941],
          [ 0.7569,  0.7255,  0.7255,  ...,  0.6863,  0.6863,  0.6784],
          [ 0.8353,  0.7647,  0.7490,  ...,  0.6863,  0.6706,  0.6627]],

         [[-0.1922, -0.1608, -0.1059,  ...,  0.2157,  0.1529,  0.0824],
          [-0.1294, -0.2000, -0.1451,  ...,  0.2078,  0.1686,  0.0980],
          [-0.0902, -0.1373, -0.0667,  ...,  0.2078,  0.1843,  0.1451],
          ...,
          [ 0.3961,  0.4039,  0.4431,  ..., -0.0275, -0.0353, -0.0353],
          [ 0.4353,  0.4118,  0.4431,  ..., -0.0039, -0.0039, -0.0118],
          [ 0.5137,  0.4510,  0.4588,  ...,  0.0275,  0.0275,  0.0196]],

         [[ 0.0667,  0.0902,  0.1529,  ...,  0.3882,  0.3412,  0.2627],
          [ 0.1137,  0.0431,  

' like TV ob ob ob ob ob ob ob ob ob ob ob ob ob ob ob ob ob ob'

In [75]:
dataset["train"]["text"][0]

'Square mitotic figure with multiple nuclei'

In [77]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

In [78]:
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

In [79]:
from tokenizers.pre_tokenizers import Whitespace
bert_tokenizer.pre_tokenizer = Whitespace()

In [88]:

from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor('trns_modep.pt')#.from_pretrained('trns_model.pt')
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds


predict_step('../model_dev/Data_CMC_COADEL_224_1/train/Mitosis/10003.jpg') # ['a woman in a hospital bed with a woman in a hospital bed']


IsADirectoryError: [Errno 21] Is a directory: '.'