In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR, ReduceLROnPlateau
import torchvision.transforms as T
from transformers import BlipProcessor, BlipForConditionalGeneration, ViTForImageClassification, ViTFeatureExtractor, CLIPProcessor, CLIPModel, AutoFeatureExtractor, get_cosine_schedule_with_warmup, ViTModel
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import albumentations as A
from albumentations.pytorch import ToTensorV2

In [2]:
import warnings
warnings.filterwarnings('ignore')

import random

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# デバイスの設定
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

train_dir = 'train'
test_dir = 'test'

Using device: cuda


In [None]:
# CLIPモデルとプロセッサのロード
clip_model_name = "openai/clip-vit-large-patch14"
clip_model = CLIPModel.from_pretrained(clip_model_name).to(device)
clip_processor = CLIPProcessor.from_pretrained(clip_model_name)
clip_model.eval()

In [4]:
# 'hold'フォルダの擬似ラベル用テキストプロンプト
hold_text_prompts = [
    "a person holding an open fan",               # クラス 0
    "a person holding a closed fan"               # クラス 1
]

# 'not-hold'フォルダの擬似ラベル用テキストプロンプト
not_hold_text_prompts = [
    "a fan is present but not held by the person",  # クラス 2
    "no fan is present in the image"                # クラス 3
]

In [5]:
def generate_pseudo_labels(image_dir, image_files, text_prompts, class_offset=0):
    """
    CLIPを用いて擬似ラベルを生成する関数
    
    Args:
        image_dir (str): 画像ディレクトリのパス
        image_files (list): 画像ファイル名のリスト
        text_prompts (list): クラスごとのテキストプロンプトのリスト
        class_offset (int): クラス番号のオフセット（'not-hold'フォルダ用に2を設定）
    
    Returns:
        list: 擬似ラベルのリスト
    """
    pseudo_labels = []
    for img_file in tqdm(image_files, desc=f"Generating pseudo labels for {os.path.basename(image_dir)}"):
        img_path = os.path.join(image_dir, img_file)
        image = Image.open(img_path).convert('RGB')
        inputs = clip_processor(text=text_prompts, images=image, return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            outputs = clip_model(**inputs)
            logits_per_image = outputs.logits_per_image  # [1, num_texts]
            probs = logits_per_image.softmax(dim=1)      # [1, num_texts]
            pred = torch.argmax(probs, dim=1).item()
        pseudo_labels.append(pred + class_offset)
    return pseudo_labels


In [None]:
# 'hold'フォルダ内の画像ファイルのリストを取得
hold_dir = os.path.join(train_dir, 'hold')
hold_files = [f for f in os.listdir(hold_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

# 擬似ラベルの生成
hold_pseudo_labels = generate_pseudo_labels(hold_dir, hold_files, hold_text_prompts, class_offset=0)


In [None]:
# 'not-hold'フォルダ内の画像ファイルのリストを取得
not_hold_dir = os.path.join(train_dir, 'not-hold')
not_hold_files = [f for f in os.listdir(not_hold_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

# 擬似ラベルの生成（クラス番号を2,3に調整）
not_hold_pseudo_labels = generate_pseudo_labels(not_hold_dir, not_hold_files, not_hold_text_prompts, class_offset=2)

In [None]:
# 'hold'のデータフレーム作成
hold_df = pd.DataFrame({
    'file': ['hold/' + f for f in hold_files],
    'label': hold_pseudo_labels
})

# 'not-hold'のデータフレーム作成
not_hold_df = pd.DataFrame({
    'file': ['not-hold/' + f for f in not_hold_files],
    'label': not_hold_pseudo_labels
})

# 訓練データ全体を結合
train_df = pd.concat([hold_df, not_hold_df], ignore_index=True)

# データをシャッフル
train_df = train_df.sample(frac=1).reset_index(drop=True)

# 擬似ラベルをCSVに保存
train_df.to_csv('pseudo_labels.csv', index=False)
print("Pseudo labels saved to 'pseudo_labels.csv'")

In [3]:
class BlipDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor, transform=None, class_captions=None, max_length=16):
        """
        Args:
            dataframe (pd.DataFrame): ファイル名とラベルを含むデータフレーム
            image_dir (str): 画像ディレクトリのパス
            processor (BlipProcessor): BLIPのプロセッサ
            transform (albumentations.Compose): 画像の変換パイプライン
            class_captions (dict): クラスラベルとキャプションの対応辞書
        """
        self.dataframe = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform
        self.processor = processor
        self.class_captions = class_captions
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        img_name = self.dataframe.loc[idx, 'file']
        label = self.dataframe.loc[idx, 'label']
        caption = self.class_captions[label]
        img_path = os.path.join(self.image_dir, img_name)
        image = Image.open(img_path).convert('RGB')
        image = np.array(image)
    
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
    
        inputs = self.processor(
            images=image,
            text=caption,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
            truncation=True
            )
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        return inputs


In [4]:
# BLIPモデルとプロセッサのロード
blip_model_name = "Salesforce/blip-image-captioning-base"
blip_processor = BlipProcessor.from_pretrained(blip_model_name)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_model_name).to(device)

In [5]:
train_transforms = A.Compose([
    A.Resize(256, 256),
    A.RandomResizedCrop(224, 224, scale=(0.8, 1.0), p=1.0),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=10, p=0.3),
    A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1, p=0.3),
])
val_transforms = A.Compose([
    A.Resize(224, 224),
])
# クラスラベルと対応するキャプション
class_captions = {
    0: "a person holding an open fan",
    1: "a person holding a closed fan",
    2: "a fan is present but not held by the person",
    3: "no fan is present in the image"
}

# 擬似ラベルの読み込み
train_df = pd.read_csv('pseudo_labels.csv')

# 訓練データと検証データに分割
train_data, val_data = train_test_split(train_df, test_size=0.1, stratify=train_df['label'], random_state=42)

# データセットの作成
train_dataset = BlipDataset(train_data, train_dir, blip_processor, transform=train_transforms, class_captions=class_captions)
val_dataset = BlipDataset(val_data, train_dir, blip_processor, transform=val_transforms, class_captions=class_captions)

# データローダーの作成
batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [6]:
# オプティマイザの設定
optimizer = AdamW(blip_model.parameters(), lr=5e-5)

# ファインチューニングの実施
best_val_loss = float('inf')
epochs = 5
blip_model.train()

for epoch in range(epochs):
    blip_model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
    
        # 生成キャプションをターゲットとして損失を計算
        outputs = blip_model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            attention_mask=attention_mask,
                            labels=input_ids)
    
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")
    
    # 検証ステップ（オプション）
    blip_model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
    
            outputs = blip_model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                attention_mask=attention_mask,
                                labels=input_ids)
    
            loss = outputs.loss
            val_loss += loss.item()
    
    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_val_loss:.4f}")
    
    # ベストモデルの保存（検証損失が低い場合）
    if epoch == 0 or avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        blip_model.save_pretrained("finetuned_blip")
        blip_processor.save_pretrained("finetuned_blip_processor")
        print("Best model saved.")

Training Epoch 1:   0%|          | 0/3794 [00:00<?, ?it/s]

Epoch 1, Loss: 1.4054


Validation Epoch 1:   0%|          | 0/422 [00:00<?, ?it/s]

Epoch 1, Validation Loss: 1.3634
Best model saved.


Training Epoch 2:   0%|          | 0/3794 [00:00<?, ?it/s]

Epoch 2, Loss: 1.3665


Validation Epoch 2:   0%|          | 0/422 [00:00<?, ?it/s]

Epoch 2, Validation Loss: 1.3623
Best model saved.


Training Epoch 3:   0%|          | 0/3794 [00:00<?, ?it/s]

Epoch 3, Loss: 1.3632


Validation Epoch 3:   0%|          | 0/422 [00:00<?, ?it/s]

Epoch 3, Validation Loss: 1.3630


Training Epoch 4:   0%|          | 0/3794 [00:00<?, ?it/s]

Epoch 4, Loss: 2.8642


Validation Epoch 4:   0%|          | 0/422 [00:00<?, ?it/s]

Epoch 4, Validation Loss: 1.4122


Training Epoch 5:   0%|          | 0/3794 [00:00<?, ?it/s]

Epoch 5, Loss: 1.3884


Validation Epoch 5:   0%|          | 0/422 [00:00<?, ?it/s]

Epoch 5, Validation Loss: 1.3623


In [7]:
# ファインチューニング済みモデルのロード
blip_model = BlipForConditionalGeneration.from_pretrained("finetuned_blip").to(device)
blip_processor = BlipProcessor.from_pretrained("finetuned_blip_processor")
blip_model.eval()

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-11): 12 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=768, out_features=2304, bias=True)
            (projection): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((768,), eps=1e-0

In [8]:
class TestBlipDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor, transform=None):
        """
        Args:
            dataframe (pd.DataFrame): ファイル名を含むデータフレーム
            image_dir (str): 画像ディレクトリのパス
            processor (BlipProcessor): BLIPのプロセッサ
            transform (albumentations.Compose): 画像の変換パイプライン
        """
        self.dataframe = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform
        self.processor = processor
    
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        img_name = self.dataframe.loc[idx, 'file']
        img_path = os.path.join(self.image_dir, img_name)
        image = Image.open(img_path).convert('RGB')
        image = np.array(image)
    
        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']
    
        inputs = self.processor(images=image, return_tensors="pt")
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        return inputs, img_name


In [10]:
# テストデータのファイルリストを取得
test_files = [f for f in os.listdir(test_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
test_df = pd.DataFrame({'file': test_files})

# テストデータセットとデータローダーの作成
test_dataset = TestBlipDataset(test_df, test_dir, blip_processor, transform=val_transforms)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# ファインチューニング済みのBLIPモデルとプロセッサのロード
blip_model = BlipForConditionalGeneration.from_pretrained("finetuned_blip").to(device)
blip_processor = BlipProcessor.from_pretrained("finetuned_blip_processor")
blip_model.eval()

# キャプション生成
captions = []
image_names = []

for batch in tqdm(test_loader, desc="Generating captions for test data"):
    inputs, img_names = batch
    # input_ids = inputs['input_ids'].to(device)
    pixel_values = inputs['pixel_values'].to(device)
    attention_mask = inputs.get('attention_mask', None)
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)
    
    with torch.no_grad():
        generated_ids = blip_model.generate(pixel_values=pixel_values, attention_mask=attention_mask, max_length=20)
        generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)
    
    captions.extend(generated_caption)
    image_names.extend(img_names)


Generating captions for test data:   0%|          | 0/2144 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [11]:
cap_df = pd.DataFrame({
    'file': image_names,
    'caption': captions
})
cap_df['caption'].value_counts()

a person holding an open fan                   18324
a fan is present but not held by the person    15846
no fan is present in the image                   129
Name: caption, dtype: int64

In [13]:
def map_caption_to_label(caption):
    caption = caption.lower()
    # 各クラスのキーワードを定義
    if "open fan" in caption and "holding" in caption and "not" not in caption:
        return 1  # "a person holding an open fan"
    elif "closed fan" in caption and "holding" in caption and "not" not in caption:
        return 1  # "a person holding a closed fan"
    elif "fan" in caption and "not held" in caption:
        return 0  # "a fan is present but not held by the person"
    elif "no fan" in caption:
        return 0  # "no fan is present in the image"
    else:
        # デフォルトで最も近いクラスを推定
        print(f"Unknown caption: {caption}")
        return 3

# キャプションからラベルへのマッピング
test_labels = [map_caption_to_label(c) for c in captions]

In [16]:
submission = pd.DataFrame({
    'file': image_names,
    'label': test_labels
})

submission.to_csv('submission.csv', index=False, header=False)
print("Submission file 'submission.csv' has been saved.")


Submission file 'submission.csv' has been saved.


In [17]:
!signate submit --competition-id=1506 submission.csv --note "BLIP Train pseudo labels" 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[32mYou have successfully submitted your predictions.We will send you the submission result to your email address.[0m
