In [1]:
# !pip install Levenshtein -q
!pip install -q --upgrade git+https://github.com/huggingface/transformers.git
# !pip install -q datasets lightning
# !pip install -q peft accelerate bitsandbytes
# !pip install -q --upgrade wandb

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
# log in to hugging face
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Importing os, numpy and pandas for data manipulation
import os
import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()
import time

# For data visualization, we will use matplotlib, wordcloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# For data preprocessing, we will use Counter, train_test_split, Levenshtein distance, Python Image Library and OneHotEncoder
from collections import Counter
# import Levenshtein as lev
from PIL import Image
import cv2
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# For saving and loading the preprocessed data, we will use pickle
import pickle

# For Building the model, we will use PyTorch and its functions
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# For taking the image from the URL, we will use requests
import requests

# For evaluation, we will need sklearn.metrics.average_precision_score
from sklearn.metrics import average_precision_score

# Importing json for results formatting which will be uploaded for evaluation
import json


In [4]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Check if the directory exists and copy if it does not exist
import os
import shutil

# List of files to copy
files = ["valid.json", "valid.zip"]

# Source directory
src_dir = "drive/MyDrive/matsuo-ken-2024-DL/VQA/VQA"

# Destination directory
dest_dir = "./data"

# Ensure the destination directory exists
if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

# Copy files one by one
for file in files:
    src_file = os.path.join(src_dir, file)
    dest_file = os.path.join(dest_dir, file)

    if not os.path.exists(dest_file):
        shutil.copy(src_file, dest_file)
        print(f"{file} copied.")
    else:
        print(f"{file} already exists, not copying.")

valid.json already exists, not copying.
valid.zip already exists, not copying.


In [6]:
import os
import zipfile

def unzip_file(zip_path, extract_to, filename):
    if os.path.isfile(zip_path) and not os.path.isdir(extract_to + filename):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            print(f"Extracted {zip_path} to {extract_to}")
    else:
        if not os.path.isfile(zip_path):
            print(f"{zip_path} not found.")
        else:
            print(f"{extract_to} already exists.")

# Specify the paths for your zip files and target directories
valid_zip_path = './data/valid.zip'
valid_extract_to = './data/'

# Unzip the files
unzip_file(valid_zip_path, valid_extract_to, 'valid')

./data/ already exists.


In [7]:
INPUT_PATH = './data'
VALIDATION_PATH = INPUT_PATH + '/valid'
ANNOTATIONS_VAL_PATH = INPUT_PATH + '/valid.json'
OUTPUT_PATH = '/content/drive/MyDrive/matsuo-ken-2024-DL/VQA/OPENAI_CLIP/'
ANSWER_SPACE = 0 # Will be configured later when we build the vocab using the methodology described in the paper
MODEL_REPO_ID = "google/paligemma-3b-pt-224"
MAX_LENGTH = 512

In [8]:
def load_df(df_path):
    df = pd.read_json(df_path)
    df = df[['image', 'question', 'answers']]
    return df

In [9]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(MODEL_REPO_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
PROMPT = "Answer: "

In [11]:
import re

def process_pred(pred):
    pred = re.sub(r"(?:(?<=>) | (?=</s_))", "", pred)
    if "unanswer" in pred:
        pred = "unanswerable"
    return pred

In [12]:
FINETUNED_MODEL_ID = "howarudo/paligemma-3b-pt-224-vqa-continue-ft"
EPOCH = "7"

In [13]:
# from transformers import AutoConfig
# AutoConfig.from_pretrained("google/paligemma-3b-pt-224/config.json")

In [14]:
from transformers import PaliGemmaForConditionalGeneration

df = pd.read_json("./data/valid.json")
df = df[['image', 'question']]
# FINETUNED_MODEL_ID = "howarudo/paligemma-3b-pt-224-vqa-continue"EPOCH = "5"
# del model
torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PaliGemmaForConditionalGeneration.from_pretrained(FINETUNED_MODEL_ID + "-" + EPOCH).to(device)
model.eval()
TEST_BATCH_SIZE = 16

model_answers = []

class TestDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_path = VALIDATION_PATH + '/' + row['image']
        try:
            image = Image.open(image_path).convert("RGB")
        except OSError:
            cvimg = cv2.imread(image_path)
            image = Image.fromarray(cvimg)
        question = PROMPT + row['question']

        return image, question

def test_collate_fn(batch):
    images, questions = zip(*batch)
    inputs = processor(text=list(questions), images=list(images), return_tensors="pt", padding=True, tokenize_newline_separately=False)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    return inputs

data_loader = DataLoader(TestDataset(df), batch_size=TEST_BATCH_SIZE, collate_fn=test_collate_fn)
torch.cuda.empty_cache()
model_answers = []
with torch.no_grad():
    for inputs in tqdm(data_loader, desc="Processing batches"):
        generated_ids = model.generate(**inputs, max_new_tokens=MAX_LENGTH)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        model_answers.extend([process_pred(ans.split("\n")[1]) for ans in generated_text])
        torch.cuda.empty_cache()

adapter_config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.3M [00:00<?, ?B/s]

Processing batches: 100%|██████████| 311/311 [46:58<00:00,  9.06s/it]


In [15]:
model_answers

['no',
 'unanswerable',
 'chicken carbonara',
 'yes',
 'blue',
 'yes',
 'snickers',
 'brown',
 'puppets',
 'dark green',
 'unanswerable',
 'pink yellow blue',
 'unanswerable',
 'unanswerable',
 'chicken noodle',
 'light pink mood light',
 'green beans',
 '10',
 'purple',
 'dark chocolate peanut almond',
 'dalmatian',
 'unanswerable',
 'mountain dew',
 'glass',
 'unanswerable',
 'mocha nut fudge',
 'unanswerable',
 'chardonnay',
 'unanswerable',
 'coin',
 'unanswerable',
 'can opener',
 'meatloaf',
 '1996',
 'unanswerable',
 'unanswerable',
 'unanswerable',
 'mint tea',
 'no',
 'classroom',
 'unanswerable',
 'black',
 'unanswerable',
 'red green yellow',
 'jung',
 'yes',
 'tosttos',
 'yes',
 'weed',
 'unanswerable',
 'beef pot roast',
 'unanswerable',
 'red',
 'daisy',
 '70% isopropyl alcohol',
 'bacon egg hash browns',
 'no writing',
 'heater',
 'digital tv for pc 2',
 'kona blend',
 'unanswerable',
 'roof',
 'cinnamon',
 'unanswerable',
 'unanswerable',
 'salad dressing',
 'orange jui

In [16]:
submission = np.array(model_answers)
np.save("submission.npy", submission)
# save to drive
DATA_PATH = '/content/drive/MyDrive/matsuo-ken-2024-DL/VQA/sub/pali_results/'
# add timestamp to name
import time
time_stamp = time.strftime("%Y%m%d-%H%M%S")
# torch.save(model.state_dict(), DATA_PATH + 'model_' + time_stamp + '_.pth')
np.save(DATA_PATH + 'submission_' + "epoch_new2_" + EPOCH + "_" + time_stamp, submission)

In [17]:
from google.colab import runtime
runtime.unassign()

In [None]:
1+1

2