In [1]:
# Point the Hugging Face datasets cache at scratch storage.
# A `!export ...` line only sets the variable inside a short-lived subshell, so the
# kernel process never sees it. Set it on the kernel's own environment instead, and
# do so before any library that reads the variable is imported.
import os
os.environ["HF_DATASETS_CACHE"] = "/scratch/eecs595f23_class_root/eecs595f23_class/zjf/huggingface_cache"
import torch
from PIL import Image
from dataset import SimpleDataset, make_dataset
from glob import glob
import os.path as osp


In [2]:
# Root folder of the action-effect images; every entry directly beneath it is
# treated as one class, and the entry's basename is used as the class label.
IMAGE_ROOT_DIR = "/home/zjf/repos/proj_595/data/action_effect_image_rs"
effect_dir_list = glob(osp.join(IMAGE_ROOT_DIR, '*'))
labels = list(map(osp.basename, effect_dir_list))

# NOTE(review): glob() returns entries in filesystem order, so the index given to
# each label can differ between machines — confirm it matches the label indices
# produced by make_dataset.

# Two-way lookup between label names and their (stringified) positional index.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}


In [3]:
from transformers import AutoImageProcessor
# Identifier of the pretrained ViT checkpoint; reused later when the classifier is loaded.
checkpoint = "google/vit-base-patch16-224-in21k"
# Image processor paired with the checkpoint; later cells read its
# image_mean / image_std / size attributes to build the torchvision pipeline.
image_processor = AutoImageProcessor.from_pretrained(checkpoint)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
# Normalise with the channel statistics stored on the checkpoint's image processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# Crop size: the processor's "shortest_edge" value when it defines one,
# otherwise an explicit (height, width) pair.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])

# Pipeline order: random resized crop -> tensor conversion -> normalisation.
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

def transforms(examples):
    """Convert a batch's images into model-ready tensors, in place.

    Every item under ``examples["image"]`` is converted to RGB and passed through
    the module-level ``_transforms`` pipeline. The results are stored under
    ``"pixel_values"`` and the ``"image"`` entry is removed.

    Args:
        examples: mapping with an ``"image"`` entry holding an iterable of objects
            that expose ``convert("RGB")``.

    Returns:
        The same ``examples`` mapping, mutated.
    """
    pixel_values = []
    for img in examples["image"]:
        rgb_img = img.convert("RGB")
        pixel_values.append(_transforms(rgb_img))
    examples["pixel_values"] = pixel_values
    # The raw images are no longer needed once tensors exist.
    del examples["image"]
    return examples

# Build the train / validation datasets with the project loader, handing it the
# torchvision pipeline directly (the batch-level `transforms` function above is
# not used by this call).
# NOTE(review): `_transforms` contains RandomResizedCrop and is the only transform
# passed in — confirm make_dataset does not apply the random crop to validation.
ds_train, ds_val = make_dataset(transform=_transforms)


In [8]:
import evaluate
# Load the "accuracy" metric once; compute_metrics below calls its .compute().
accuracy = evaluate.load("accuracy")
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level accuracy metric.

    Args:
        eval_pred: a two-item iterable ``(predictions, labels)``. The predictions
            are reduced with ``argmax`` along axis 1 to pick one class per row.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes and the
        reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

# Report the number of training and validation examples, one per line.
print(len(ds_train), len(ds_val), sep="\n")


1238
309


In [9]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

# Label metadata passed through to the model: the number of classes plus the
# two-way name/index lookup built from the image directory names.
label_config = {
    "num_labels": len(labels),
    "id2label": id2label,
    "label2id": label2id,
}

# Load the pretrained checkpoint as an image classifier over those labels.
model = AutoModelForImageClassification.from_pretrained(checkpoint, **label_config)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from transformers import DefaultDataCollator
# Batches are assembled with the library's default collator.
data_collator = DefaultDataCollator()

training_args = TrainingArguments(
    # --- where results go ---
    output_dir="my_awesome_food_model",
    push_to_hub=True,
    # --- data handling ---
    remove_unused_columns=False,
    # --- optimisation schedule ---
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    # 16 samples per device step, accumulated over 4 steps before each update.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    # --- evaluation / checkpointing: once per epoch, keep the most accurate model ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # --- logging ---
    logging_steps=10,
)

# Wire the model, datasets, collator and metric function into a Trainer.
# The image processor is supplied through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=image_processor,
)

# Run training; as the last expression of the cell, the result is displayed.
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
0,4.9389,4.873051,0.029126
2,4.7206,4.783671,0.158576


TrainOutput(global_step=57, training_loss=4.809672506232011, metrics={'train_runtime': 412.2545, 'train_samples_per_second': 9.009, 'train_steps_per_second': 0.138, 'total_flos': 2.8148866758977126e+17, 'train_loss': 4.809672506232011, 'epoch': 2.92})

In [None]:
# Persist the trained model through the Trainer.
# NOTE(review): Trainer.save_model is documented as taking an output directory —
# confirm that a name ending in '.pth' is intended rather than a plain directory name.
trainer.save_model('trained_model.pth')

In [11]:
# Comma-separated text file; only the first two fields of each line are used.
text_file_path = "/home/zjf/repos/proj_595/data/action_effect_sentence_phrase.txt"

# Parallel lists: PNs[i] is the first field of the i-th usable line and ESs[i] its
# second field.
PNs = []
ESs = []
with open(text_file_path, 'r') as f:
    # Iterate the file lazily instead of materialising every line with readlines().
    for line in f:
        ele = line.split(',')
        if len(ele) < 2:
            # Blank lines (e.g. a trailing newline at end of file) or lines without
            # a comma have no second field; skip them instead of raising IndexError.
            continue
        # strip() removes all surrounding whitespace, including the trailing newline
        # that strip(' ') left in place when the second field ended the line.
        PNs.append(ele[0].strip())
        ESs.append(ele[1].strip())

In [None]:
from transformers import CLIPProcessor, CLIPModel

# Both the CLIP model and its processor are loaded from the same checkpoint.
# NOTE(review): this rebinds `model`, which earlier cells used for the ViT classifier.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

# Tokenise two sample prompts into a padded batch of PyTorch tensors.
demo_prompts = ["a photo of a cat", "a photo of a dog"]
inputs_text = processor(text=demo_prompts, return_tensors="pt", padding=True)


In [13]:
# Display the tokenised prompt batch produced by the CLIP processor.
inputs_text

{'input_ids': tensor([[49406,   320,  1125,   539,   320,  2368, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}

In [15]:
from transformers import AutoTokenizer, CLIPTextModel
# Text-only CLIP encoder and its tokenizer, loaded from the same checkpoint.
clip_text_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPTextModel.from_pretrained(clip_text_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(clip_text_checkpoint)

# Encode two sample sentences as a padded batch and run them through the encoder.
# NOTE(review): the forward pass runs with autograd enabled (the displayed output
# carries a grad_fn) — consider torch.no_grad() if only embeddings are needed.
example_sentences = ["a photo of a cat", "a photo of a dog"]
inputs = tokenizer(example_sentences, padding=True, return_tensors="pt")
outputs = model(**inputs)


In [18]:
# Pull the two fields of interest off the text encoder's output object.
last_hidden_state = outputs.last_hidden_state
# One pooled vector per input sentence (taken at the EOS token).
pooled_output = outputs.pooler_output  # pooled (EOS token) states


In [20]:
# Display the pooled sentence embeddings.
pooled_output

tensor([[-0.5152,  0.1658,  0.8876,  ..., -0.0675, -0.4551, -1.7960],
        [ 0.0426,  0.0189,  1.2740,  ..., -0.4217, -0.4393, -1.3016]],
       grad_fn=<IndexBackward0>)