In [2]:
from transformers import AutoTokenizer, AutoFeatureExtractor
from transformers import Seq2SeqTrainingArguments, default_data_collator
# The importable package name uses underscores; a hyphenated module path
# (as in the pip distribution name) is a SyntaxError in an import statement.
from intel_extension_for_transformers.transformers.pipeline import pipeline
from intel_extension_for_transformers.transformers.trainer import Seq2SeqTrainer

# Pretrained checkpoints: a ViT image encoder and a GPT-2 text decoder.
encoder_checkpoint = "google/vit-base-patch16-224-in21k"
decoder_checkpoint = "gpt2"

# Image preprocessor for the encoder and tokenizer for the decoder.
feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
# Reuse the end-of-sequence token as the padding token so that captions can
# be padded to a fixed length with padding='max_length' further down.
tokenizer.pad_token = tokenizer.eos_token




In [3]:
import pandas as pd

# Load the catalogue CSV; the preview below shows an image URL column
# ('img') and a free-text 'description' column.
csv_path = 'Fashion Dataset v2 - Fashion Dataset v2.csv.csv'
df = pd.read_csv(csv_path)

2023-12-04 23:04:43,682 - numexpr.utils - INFO - Note: detected 224 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2023-12-04 23:04:43,683 - numexpr.utils - INFO - Note: NumExpr detected 224 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2023-12-04 23:04:43,683 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


In [4]:
# Display the loaded frame as the cell output (the rendering below is
# truncated to the first and last rows).
df

Unnamed: 0,img,description
0,http://assets.myntassets.com/assets/images/170...,Black printed Kurta with Palazzos with dupatta...
1,http://assets.myntassets.com/assets/images/165...,Orange solid Kurta with Palazzos with dupatta<...
2,http://assets.myntassets.com/assets/images/163...,Navy blue embroidered Kurta with Trousers with...
3,http://assets.myntassets.com/assets/images/147...,Red printed kurta with trouser and dupatta<br>...
4,http://assets.myntassets.com/assets/images/110...,"Black and green printed straight kurta, has a ..."
...,...,...
14209,http://assets.myntassets.com/assets/images/154...,Blue solid front-open sweatshirt has a mock co...
14210,http://assets.myntassets.com/assets/images/164...,"Green printed sweatshirt has a hooded, 2 pock..."
14211,http://assets.myntassets.com/assets/images/163...,"Pink solid sweatshirt has a mock collar, 2 ka..."
14212,http://assets.myntassets.com/assets/images/163...,"Blue solid sweatshirt has a round neck, long s..."


In [8]:
from io import BytesIO

from PIL import Image
import requests

# Fixed caption length (in tokens) used for padding/truncation.
max_length = 128
sample = df.iloc[0]

# sample image: download with a timeout and fail loudly on HTTP errors
# instead of handing an error page (or a hung socket) to PIL.
response = requests.get(sample['img'], timeout=30)
response.raise_for_status()
# Decode from the fully downloaded bytes and force 3 channels, so that
# grayscale / palette / RGBA files do not break the feature extractor.
image = Image.open(BytesIO(response.content)).convert('RGB')
# sample caption
caption = sample['description']

# apply feature extractor on the sample image
inputs = feature_extractor(images=image, return_tensors='pt')
# apply tokenizer: pad/truncate the caption to exactly max_length tokens
outputs = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )

In [9]:
print(inputs)

{'pixel_values': tensor([[[[0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5216, 0.5216],
          [0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5137, 0.5216],
          [0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5137, 0.5137],
          ...,
          [0.5137, 0.5137, 0.5137,  ..., 0.4196, 0.4196, 0.4196],
          [0.5137, 0.5137, 0.5059,  ..., 0.4275, 0.4275, 0.4275],
          [0.5059, 0.5059, 0.5137,  ..., 0.4275, 0.4275, 0.4275]],

         [[0.3020, 0.3020, 0.3020,  ..., 0.5059, 0.5137, 0.5137],
          [0.3020, 0.3020, 0.3020,  ..., 0.5137, 0.5137, 0.5137],
          [0.3020, 0.3020, 0.3020,  ..., 0.5216, 0.5216, 0.5216],
          ...,
          [0.5137, 0.5137, 0.5137,  ..., 0.4196, 0.4196, 0.4196],
          [0.5137, 0.5137, 0.5059,  ..., 0.4275, 0.4275, 0.4275],
          [0.5059, 0.5059, 0.5137,  ..., 0.4275, 0.4275, 0.4275]],

         [[0.3176, 0.3176, 0.3176,  ..., 0.5451, 0.5529, 0.5529],
          [0.3176, 0.3176, 0.3176,  ..., 0.5529, 0.5529, 0.5529],
          [0.3176, 0.3176

In [10]:
print(outputs)

{'input_ids': tensor([[ 9915, 10398, 20642,    64,   351,  3175,  8101,   418,   351, 32597,
         25014,  1279,  1671,    29,  1279,  1671,    29,  1279,    65,    29,
         20642,    64,  1486,    25,   220,  7359,    65,    29,  1279,   377,
            29,  1279,  4528,    29, 48021, 32702,    82, 10398,  7359,  4528,
            29,  1279,  4528,    29,  1052,   668,  7344,  5485,  7359,  4528,
            29,  1279,  4528,    29, 23603,  3918,  7359,  4528,    29,  1279,
          4528,    29, 41621, 19908,    11,   220,  1115,    12, 24385,  3218,
         27409,  7359,  4528,    29,  1279,  4528,    29,   327,  1604,  4129,
           351, 50017, 16869,  7359,  4528,    29,  1279,  4528,    29,   569,
          2304,   577, 26842,   261,  4572, 37982,  9664,  7359,  4528,    29,
          7359,   377,    29,  1279,  1671,    29,  1279,    65,    29,  3175,
          8101,   418,  1486,    25,   220,  7359,    65,    29,  1279,   377,
            29,  1279,  4528,    29, 3

In [11]:
from torch.utils.data import Dataset

class LoadDataset(Dataset):
    """Map-style dataset turning (image URL, caption) rows into model inputs.

    Each item is a dict with two entries:

    * ``pixel_values`` - result of the module-level ``feature_extractor`` for
      the downloaded image, with the leading batch axis removed.
    * ``labels`` - caption token ids from the module-level ``tokenizer``,
      padded/truncated to the module-level ``max_length``.

    NOTE(review): padding positions are kept in ``labels`` as regular token
    ids (the pad token is the eos token), so they presumably contribute to
    the training loss - confirm this is intended.
    """

    # Seconds to wait on the image host before giving up on a download.
    REQUEST_TIMEOUT = 30

    def __init__(self, df):
        """Keep the ``img`` and ``description`` columns of ``df`` as arrays."""
        self.images = df['img'].values
        self.captions = df['description'].values

    def __getitem__(self, idx):
        """Download, preprocess and tokenize the sample at position ``idx``.

        Raises:
            requests.HTTPError: if the image URL answers with an error status.
            requests.Timeout: if the host does not answer within the timeout.
        """
        from io import BytesIO

        # load the image: bounded wait, explicit HTTP status check, and decode
        # from the complete payload rather than the undecoded raw stream
        image_path = str(self.images[idx])
        response = requests.get(image_path, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        # force 3 channels so grayscale / palette / RGBA images are accepted
        image = Image.open(BytesIO(response.content)).convert('RGB')
        pixel_values = feature_extractor(images=image, return_tensors='pt')['pixel_values']

        # load the caption and apply tokenizer
        caption = self.captions[idx]
        labels = tokenizer(
            caption,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )['input_ids'][0]

        # drop only the batch axis added by return_tensors='pt'
        return {
            'pixel_values': pixel_values.squeeze(0),
            'labels': labels,
        }

    def __len__(self):
        """Return the number of (image, caption) pairs."""
        return len(self.images)

In [12]:
from sklearn.model_selection import train_test_split

# Shuffle the rows and hold out 20% of them for evaluation; the fixed seed
# keeps the split identical across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [13]:
# Wrap both splits in the dataset class, train first, then test.
train_ds, test_ds = (LoadDataset(split) for split in (train_df, test_df))

In [14]:
from transformers import VisionEncoderDecoderModel

# Combine the pretrained image encoder and text decoder into one model.
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint,
    decoder_checkpoint,
)

# Training passes only `labels`; the model then derives the decoder inputs by
# shifting the labels one position to the right, which requires both the
# decoder start token and the padding token to be set on the config.
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.7.crossattention.q_attn.bias', 'h.4.crossattention.c_attn.bias', 'h.11.crossattention.q_attn.weight', 'h.1.crossattention.q_attn.weight', 'h.8.ln_cross_attn.weight', 'h.5.ln_cross_attn.bias', 'h.5.ln_cross_attn.weight', 'h.2.crossattention.q_attn.bias', 'h.11.crossattention.c_proj.bias', 'h.8.crossattention.c_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.ln_cross_attn.bias', 'h.2.ln_cross_attn.bias', 'h.3.crossattention.c_attn.bias', 'h.1.crossattention.q_attn.bias', 'h.0.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.q_attn.bias', 'h.6.crossattention.c_attn.weight', 'h.9.crossattention.q_attn.weight', 'h.4.crossattention.q_attn.weight', 'h.2.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.4.ln_cross_attn.bias', 'h.10.crossattention.c_proj.bias', 'h.1.crossattention.c_attn.weight', 'h.3.ln_cross_attn.weight', 'h.8.ln_cros

In [18]:
import torch

# Use the "xpu" device when this torch build exposes it and reports it as
# available. Builds without a `torch.xpu` namespace fall back to the CPU
# instead of raising AttributeError.
_xpu_backend = getattr(torch, "xpu", None)
DEVICE = torch.device(
    "xpu" if _xpu_backend is not None and _xpu_backend.is_available() else "cpu"
)

In [None]:


# Per-device batch size, shared by training and evaluation.
bs = 64

# Training configuration: evaluate and checkpoint once per epoch, with no
# external experiment tracking.
training_args = Seq2SeqTrainingArguments(
    output_dir="image-caption-generator",  # directory for training outputs
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    report_to='none',
)

# Move the model to the selected device before handing it to the trainer.
model.to(DEVICE)

# The feature extractor is passed in the `tokenizer` slot of the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=default_data_collator,
    tokenizer=feature_extractor,
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [3]:
# Persist everything the inference pipeline loads from this directory:
# model weights/config, the caption tokenizer, and the image preprocessor.
model.save_pretrained('image-caption-gen')
tokenizer.save_pretrained('image-caption-gen')
# Without the preprocessor config in the same folder, loading the
# "image-to-text" pipeline from 'image-caption-gen/' cannot build its image
# preprocessing step.
feature_extractor.save_pretrained('image-caption-gen')

In [None]:
import time  # required by the timing calls below

import torch
import pandas as pd
from tqdm.notebook import tqdm

# NOTE(review): this reads "FashionV2.csv", whereas the training cells load a
# differently named CSV - confirm both point at the intended data.
df = pd.read_csv("FashionV2.csv")

column = df['img'].values
time_per_step = []  # wall-clock seconds spent on each pipeline call
captions = []       # pipeline output for each processed image

# The overall timer deliberately includes pipeline construction.
complete_start_time = time.time()
text_classifier = pipeline(
    task="image-to-text",
    model="image-caption-gen/",
    framework="pt",
    device=torch.device("cpu"),
)
# Caption the first 1000 image URLs, recording the latency of every call.
for data in tqdm(column[:1000]):
    step_start = time.time()
    a = text_classifier(data)
    time_per_step.append(time.time() - step_start)
    captions.append(a)

complete_end = time.time()
complete_time_taken = complete_end - complete_start_time