In [2]:
from transformers import AutoFeatureExtractor, AutoTokenizer
import intel_extension_for_pytorch as ipex
encoder_checkpoint = "google/vit-base-patch16-224-in21k"
decoder_checkpoint = "gpt2"

feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
tokenizer.pad_token = tokenizer.eos_token




In [3]:
import pandas as pd
df = pd.read_csv('Fashion Dataset v2 - Fashion Dataset v2.csv.csv')

2023-12-04 23:04:43,682 - numexpr.utils - INFO - Note: detected 224 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2023-12-04 23:04:43,683 - numexpr.utils - INFO - Note: NumExpr detected 224 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-04 23:04:43,683 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


In [4]:
df

Unnamed: 0,img,description
0,http://assets.myntassets.com/assets/images/170...,Black printed Kurta with Palazzos with dupatta...
1,http://assets.myntassets.com/assets/images/165...,Orange solid Kurta with Palazzos with dupatta<...
2,http://assets.myntassets.com/assets/images/163...,Navy blue embroidered Kurta with Trousers with...
3,http://assets.myntassets.com/assets/images/147...,Red printed kurta with trouser and dupatta<br>...
4,http://assets.myntassets.com/assets/images/110...,"Black and green printed straight kurta, has a ..."
...,...,...
14209,http://assets.myntassets.com/assets/images/154...,Blue solid front-open sweatshirt has a mock co...
14210,http://assets.myntassets.com/assets/images/164...,"Green printed sweatshirt has a hooded, 2 pock..."
14211,http://assets.myntassets.com/assets/images/163...,"Pink solid sweatshirt has a mock collar, 2 ka..."
14212,http://assets.myntassets.com/assets/images/163...,"Blue solid sweatshirt has a round neck, long s..."


In [8]:
from PIL import Image
import requests

max_length = 128
sample = df.iloc[0]

# sample image
image = Image.open(requests.get(sample['img'],stream=True).raw)
# sample caption
caption = sample['description']

# apply feature extractor on the sample image
inputs = feature_extractor(images=image, return_tensors='pt')
# apply tokenizer
outputs = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )

In [9]:
print(inputs)

{'pixel_values': tensor([[[[0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5216, 0.5216],
          [0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5137, 0.5216],
          [0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5137, 0.5137],
          ...,
          [0.5137, 0.5137, 0.5137,  ..., 0.4196, 0.4196, 0.4196],
          [0.5137, 0.5137, 0.5059,  ..., 0.4275, 0.4275, 0.4275],
          [0.5059, 0.5059, 0.5137,  ..., 0.4275, 0.4275, 0.4275]],

         [[0.3020, 0.3020, 0.3020,  ..., 0.5059, 0.5137, 0.5137],
          [0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5137, 0.5137],
          [0.3020, 0.3020, 0.3020,  ..., 0.5216, 0.5216, 0.5216],
          ...,
          [0.5137, 0.5137, 0.5137,  ..., 0.4196, 0.4196, 0.4196],
          [0.5137, 0.5137, 0.5059,  ..., 0.4275, 0.4275, 0.4275],
          [0.5059, 0.5059, 0.5137,  ..., 0.4275, 0.4275, 0.4275]],

         [[0.3176, 0.3176, 0.3176,  ..., 0.5451, 0.5529, 0.5529],
          [0.3176, 0.3176, 0.3176,  ..., 0.5529, 0.5529, 0.5529],
          [0.3176, 0.3176

In [10]:
print(outputs)

{'input_ids': tensor([[ 9915, 10398, 20642,    64,   351,  3175,  8101,   418,   351, 32597,
         25014,  1279,  1671,    29,  1279,  1671,    29,  1279,    65,    29,
         20642,    64,  1486,    25,   220,  7359,    65,    29,  1279,   377,
            29,  1279,  4528,    29, 48021, 32702,    82, 10398,  7359,  4528,
            29,  1279,  4528,    29,  1052,   668,  7344,  5485,  7359,  4528,
            29,  1279,  4528,    29, 23603,  3918,  7359,  4528,    29,  1279,
          4528,    29, 41621, 19908,    11,   220,  1115,    12, 24385,  3218,
         27409,  7359,  4528,    29,  1279,  4528,    29,   327,  1604,  4129,
           351, 50017, 16869,  7359,  4528,    29,  1279,  4528,    29,   569,
          2304,   577, 26842,   261,  4572, 37982,  9664,  7359,  4528,    29,
          7359,   377,    29,  1279,  1671,    29,  1279,    65,    29,  3175,
          8101,   418,  1486,    25,   220,  7359,    65,    29,  1279,   377,
            29,  1279,  4528,    29, 3

In [11]:
from torch.utils.data import Dataset

class LoadDataset(Dataset):
    def __init__(self, df):
        self.images = df['img'].values
        self.captions = df['description'].values
    
    def __getitem__(self, idx):
        inputs = dict()

        # load the image and apply feature_extractor
        image_path = str(self.images[idx])
        image =  Image.open(requests.get(image_path,stream=True).raw)
        image = feature_extractor(images=image, return_tensors='pt')

        # load the caption and apply tokenizer
        caption = self.captions[idx]
        labels = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )['input_ids'][0]
        
        # store the inputs(pixel_values) and labels(input_ids) in the dict we created
        inputs['pixel_values'] = image['pixel_values'].squeeze()   
        inputs['labels'] = labels
        return inputs
    
    def __len__(self):
        return len(self.images)

In [12]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

In [13]:
train_ds = LoadDataset(train_df)
test_ds = LoadDataset(test_df)

In [14]:
from transformers import VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint, 
    decoder_checkpoint
)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.7.crossattention.q_attn.bias', 'h.4.crossattention.c_attn.bias', 'h.11.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.weight', 'h.8.ln_cross_attn.weight', 'h.5.ln_cross_attn.bias', 'h.5.ln_cross_attn.weight', 'h.2.crossattention.q_attn.bias', 'h.11.crossattention.c_proj.bias', 'h.8.crossattention.c_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.ln_cross_attn.bias', 'h.2.ln_cross_attn.bias', 'h.3.crossattention.c_attn.bias', 'h.1.crossattention.q_attn.bias', 'h.0.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.q_attn.bias', 'h.6.crossattention.c_attn.weight', 'h.9.crossattention.q_attn.weight', 'h.4.crossattention.q_attn.weight', 'h.2.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.4.ln_cross_attn.bias', 'h.10.crossattention.c_proj.bias', 'h.1.crossattention.c_attn.weight', 'h.3.ln_cross_attn.weight', 'h.8.ln_cros

In [15]:
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [19]:
DEVICE

device(type='xpu')

In [16]:
# set number of beams for beam search to 4
num_beams = 4
model.config.num_beams = num_beams

In [18]:
import torch

DEVICE = torch.device("xpu" if torch.xpu.is_available() else "cpu")

In [None]:
from transformers import Seq2SeqTrainingArguments

# batch size
bs = 64
model.to(DEVICE)
training_args = Seq2SeqTrainingArguments(
    output_dir="image-caption-generator", # name of the directory to store training outputs
    evaluation_strategy="epoch",          # evaluate after each epoch
    per_device_train_batch_size=bs,       # batch size during training
    per_device_eval_batch_size=bs,        # batch size during evaluation
    learning_rate=5e-5,
    weight_decay=0.01,                    # weight decay parameter for AdamW optimizer
    num_train_epochs=5,                   # number of epochs to train
    save_strategy='epoch',                # save checkpoints after each epoch
    report_to='none',                     # prevent reporting to wandb, mlflow...
)

from transformers import Seq2SeqTrainer, default_data_collator

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    data_collator=default_data_collator,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=training_args,
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [1]:
pip install -U "huggingface_hub[cli]"

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
!huggingface-cli login

/bin/bash: line 1: huggingface-cli: command not found


In [7]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("models/")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# unconditional image captioning
inputs = processor(raw_image, return_tensors="pt")

out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))




woman sitting on the beach with her dog and a cell phone


In [3]:
model.save_pretrained('models')

In [4]:
'# Load model directly
# Load model directly


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/310k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium",padding_side='left')
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

# Let's chat for 5 lines
for step in range(5):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


In [17]:
model.save_pretrained('chatbot_model')

In [20]:


inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

outputs

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -8.5411, -13.9883, -15.7658,  ..., -13.2058, -13.2467,  -5.0981],
         [ -4.4049, -13.8693, -13.4804,  ...,  -7.9285,  -8.8395,   1.6477],
         [ -4.1507, -12.0031, -12.1445,  ...,  -4.6811,  -7.0245,   1.5745],
         [ -4.4140, -14.9646, -16.1205,  ...,  -9.1661,  -6.1686,   6.2558],
         [ -6.9450, -14.3312, -14.9969,  ...,  -8.4141,  -9.4990,  -0.5340],
         [ -2.9911, -14.5041, -15.1018,  ..., -10.1135,  -5.7408,   9.5571]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 0.3018, -0.0568,  0.1022,  ..., -0.4782, -0.0028,  0.0466],
          [-0.4885, -0.1077,  0.0994,  ..., -1.0764, -0.1525, -0.1001],
          [ 0.4674,  0.0774,  0.0579,  ...,  0.5732,  0.0300, -0.2596],
          [-0.5246,  0.0600, -0.2596,  ...,  0.3716, -0.4224, -0.2629],
          [ 0.5747,  0.2123,  0.2429,  ...,  0.4444,  0.2238, -0.3618],
          [-0.1256,  0.0784, -0.0045,  ...,  0.4143, -0.1365, -0.11

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


'The Departed'