In [1]:
import os

print(os.getcwd())

/home/cute/.jupyter/abnormal_detection


In [2]:
# 필요한 라이브러리 및 패키지 설치
!pip install git+https://github.com/huggingface/transformers.git@main
!pip install -q datasets
!pip install nltk

Collecting git+https://github.com/huggingface/transformers.git@main
  Cloning https://github.com/huggingface/transformers.git (to revision main) to /tmp/pip-req-build-hx6088me
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-hx6088me
  Resolved https://github.com/huggingface/transformers.git to commit 50290cf7a0234c1b30bfdbf08fbb714fae3a2f19
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [3]:
from torchvision.datasets import CocoCaptions
from torch.utils.data import Dataset, DataLoader

# COCO 데이터셋 다운로드 및 로드
coco_dataset = CocoCaptions(root="content/drive/MyDrive/abnormal_dataset", annFile="content/drive/MyDrive/abnormal_dataset.json", transform=None)
test_dataset = CocoCaptions(root="content/drive/MyDrive/test_abnormal_dataset", annFile="content/drive/MyDrive/test_abnormal_dataset/test_abnormal_dataset.json", transform=None)

# Hugging Face의 transformers 라이브러리에서 AutoProcessor 및 BlipForConditionalGeneration 불러오기
from transformers import AutoProcessor, BlipForConditionalGeneration

# Processor 및 모델 초기화
train_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
valid_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
test_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [1]:
import os
import gc
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
import nltk
from torchvision.datasets import CocoCaptions

# 필요한 라이브러리 설치
nltk.download("punkt")

# 데이터셋 경로 설정
train_root = "content/drive/MyDrive/abnormal_dataset"
train_annFile = "content/drive/MyDrive/abnormal_dataset.json"
test_root = "content/drive/MyDrive/test_abnormal_dataset"
test_annFile = "content/drive/MyDrive/test_abnormal_dataset/test_abnormal_dataset.json"

# COCO 데이터셋 로드
coco_train_dataset = CocoCaptions(root=train_root, annFile=train_annFile, transform=None)
coco_test_dataset = CocoCaptions(root=test_root, annFile=test_annFile, transform=None)

# Processor 및 모델 초기화
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# 이미지 캡션 데이터셋 클래스 정의
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor, return_image_name=False):
        self.dataset = dataset
        self.processor = processor
        self.return_image_name = return_image_name

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, captions = self.dataset[idx]
        text = captions[0]
        encoding = self.processor(images=image, text=text, padding="max_length", return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        if self.return_image_name:
            image_name = self.dataset.ids[idx]  # 이미지 파일 이름 반환
            return encoding, image_name
        else:
            return encoding

# 데이터셋 분할
total_size = len(coco_train_dataset)
train_size = int(0.8 * total_size)
valid_size = total_size - train_size

# 학습, 검증, 테스트 데이터셋 생성
train_dataset, valid_dataset = random_split(coco_train_dataset, [train_size, valid_size])
train_dataset = ImageCaptioningDataset(train_dataset, processor)
valid_dataset = ImageCaptioningDataset(valid_dataset, processor)
test_dataset = ImageCaptioningDataset(coco_test_dataset, processor, return_image_name=True)

# 데이터로더 초기화
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=4)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1)

# AdamW 옵티마이저 및 장치 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 학습 루프
for epoch in range(2):
    print(f"Training Epoch: {epoch + 1}")
    total_loss = 0.0
    model.train()
    for batch in train_dataloader:
        torch.cuda.empty_cache()
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    # 검증 루프
    model.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        for val_batch in valid_dataloader:
            val_input_ids = val_batch.pop("input_ids").to(device)
            val_pixel_values = val_batch.pop("pixel_values").to(device)
            val_outputs = model(input_ids=val_input_ids, pixel_values=val_pixel_values, labels=val_input_ids)
            total_val_loss += val_outputs.loss.item()

    print(f"Epoch {epoch + 1} - Training Loss: {total_loss / len(train_dataloader)}, Validation Loss: {total_val_loss / len(valid_dataloader)}")

# Test loop
model.eval()
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")
with torch.no_grad():
    # 이미지와 생성된 캡션을 저장할 파일 열기
    with open("content/drive/MyDrive/blip_finetuning_weight/test_predictions.txt", "w") as predictions_file:
        for test_batch, image_name in test_dataloader:
            # 각 배치의 크기를 확인하여 디버깅
            print(f"Batch input_ids shape: {test_batch['input_ids'].shape}")
            print(f"Batch pixel_values shape: {test_batch['pixel_values'].shape}")

            input_ids = test_batch.pop("input_ids").to(device)
            pixel_values = test_batch.pop("pixel_values").to(device)

            # 문제 확인: input_ids가 비정상적으로 설정된 경우 디버그 메시지 출력
            if input_ids.shape[1] == 0:
                print(f"Skipping batch {image_name} due to empty input_ids.")
                continue

            # generate() 호출 시 max_new_tokens 설정
            try:
                outputs = model.generate(input_ids=input_ids, pixel_values=pixel_values, max_new_tokens=50)
            except RuntimeError as e:
                print(f"RuntimeError: {e} for batch {image_name}")
                continue

            # 생성된 캡션 디코딩
            generated_captions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # 이미지 이름과 생성된 캡션을 파일에 저장하고 출력
            for caption in generated_captions:
                predictions_file.write(f"Image: {image_name}\nGenerated Caption: {caption}\n\n")
                print(f"Image: {image_name}\nGenerated Caption: {caption}\n")

print("Test complete.")


[nltk_data] Downloading package punkt to /home/cute/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Training Epoch: 1


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1 - Training Loss: 9.018897201372608, Validation Loss: 7.220662077990445
Training Epoch: 2
Epoch 2 - Training Loss: 6.3532005826087845, Validation Loss: 5.5812688610770484


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (512). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Batch input_ids shape: torch.Size([1, 512])
Batch pixel_values shape: torch.Size([1, 3, 384, 384])
Image: tensor([3])
Generated Caption: a warehouse is on fire.i

Batch input_ids shape: torch.Size([1, 512])
Batch pixel_values shape: torch.Size([1, 3, 384, 384])
Image: tensor([6])
Generated Caption: there were people behind the white car, and there was a big fire behind the people.

Batch input_ids shape: torch.Size([1, 512])
Batch pixel_values shape: torch.Size([1, 3, 384, 384])
Image: tensor([9])
Generated Caption: a fire broke out next to a house on the side of a mountain.

Batch input_ids shape: torch.Size([1, 512])
Batch pixel_values shape: torch.Size([1, 3, 384, 384])
Image: tensor([14])
Generated Caption: the car is on fire.

Batch input_ids shape: torch.Size([1, 512])
Batch pixel_values shape: torch.Size([1, 3, 384, 384])
Image: tensor([15])
Generated Caption: fire is coming down from the mountain behind the truck passing by.

Batch input_ids shape: torch.Size([1, 512])
Batch pi

In [6]:
# 최최최최종 !!!!!!!! 

import os
import gc
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
import nltk
from torchvision.datasets import CocoCaptions

# 필요한 라이브러리 설치
nltk.download("punkt")

# 데이터셋 경로 설정
train_root = "content/drive/MyDrive/abnormal_dataset"
train_annFile = "content/drive/MyDrive/abnormal_dataset.json"
test_root = "content/drive/MyDrive/test_abnormal_dataset"
test_annFile = "content/drive/MyDrive/test_abnormal_dataset/test_abnormal_dataset.json"

# COCO 데이터셋 로드
coco_train_dataset = CocoCaptions(root=train_root, annFile=train_annFile, transform=None)
coco_test_dataset = CocoCaptions(root=test_root, annFile=test_annFile, transform=None)

# Processor 및 모델 초기화
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# 이미지 캡션 데이터셋 클래스 정의
class ImageCaptioningDataset(Dataset):
    def __init__(self, dataset, processor, return_image_name=False):
        self.dataset = dataset
        self.processor = processor
        self.return_image_name = return_image_name

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, captions = self.dataset[idx]
        text = captions[0]

        # COCO 데이터셋에서 실제 이미지 파일 이름을 가져옴
        if hasattr(self.dataset, 'ids'):
            # 파일 경로를 추출하고 파일 이름만 반환
            image_name = self.dataset.ids[idx]
            image_file_name = os.path.basename(self.dataset.coco.imgs[image_name]['file_name'])  # 파일 이름 추출
        else:
            image_file_name = f"image_{idx}.jpg"

        # 입력 인코딩 생성
        encoding = self.processor(images=image, text=text, padding="max_length", return_tensors="pt")
        encoding = {k: v.squeeze() for k, v in encoding.items()}

        if self.return_image_name:
            return encoding, image_file_name
        else:
            return encoding

# 데이터셋 분할
total_size = len(coco_train_dataset)
train_size = int(0.8 * total_size)
valid_size = total_size - train_size

# 학습, 검증, 테스트 데이터셋 생성
train_dataset, valid_dataset = random_split(coco_train_dataset, [train_size, valid_size])
train_dataset = ImageCaptioningDataset(train_dataset, processor)
valid_dataset = ImageCaptioningDataset(valid_dataset, processor)
test_dataset = ImageCaptioningDataset(coco_test_dataset, processor, return_image_name=True)

# 데이터로더 초기화
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=4)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, batch_size=4)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1)

# AdamW 옵티마이저 및 장치 설정
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 학습 루프
for epoch in range(1):
    print(f"Training Epoch: {epoch + 1}")
    total_loss = 0.0
    model.train()
    for batch in train_dataloader:
        torch.cuda.empty_cache()
        input_ids = batch.pop("input_ids").to(device)
        pixel_values = batch.pop("pixel_values").to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, labels=input_ids)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    # 검증 루프
    model.eval()
    with torch.no_grad():
        total_val_loss = 0.0
        for val_batch in valid_dataloader:
            val_input_ids = val_batch.pop("input_ids").to(device)
            val_pixel_values = val_batch.pop("pixel_values").to(device)
            val_outputs = model(input_ids=val_input_ids, pixel_values=val_pixel_values, labels=val_input_ids)
            total_val_loss += val_outputs.loss.item()

    print(f"Epoch {epoch + 1} - Training Loss: {total_loss / len(train_dataloader)}, Validation Loss: {total_val_loss / len(valid_dataloader)}")

# Test loop
model.eval()
tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip-image-captioning-base")
with torch.no_grad():
    # 이미지와 생성된 캡션을 저장할 파일 열기
    with open("content/drive/MyDrive/blip_finetuning_weight/test_predictions.txt", "w") as predictions_file:
        for test_batch, image_name in test_dataloader:
            input_ids = test_batch.pop("input_ids").to(device)
            pixel_values = test_batch.pop("pixel_values").to(device)

            # generate() 호출 시 max_new_tokens 설정
            outputs = model.generate(input_ids=input_ids, pixel_values=pixel_values, max_new_tokens=50)

            # 생성된 캡션 디코딩
            generated_captions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            # 이미지 이름과 생성된 캡션을 파일에 저장하고 출력
            for caption in generated_captions:
                predictions_file.write(f"Image: {image_name}\nGenerated Caption: {caption}\n\n")
                print(f"Image: {image_name}\nGenerated Caption: {caption}\n")

print("Test complete.")

[nltk_data] Downloading package punkt to /home/cute/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Training Epoch: 1
Epoch 1 - Training Loss: 9.04211517338339, Validation Loss: 7.257724892009389
Image: ('fire00000004.png',)
Generated Caption: a warehouse is on fire.er

Image: ('fire00000007.png',)
Generated Caption: there were people behind the white car, and there was a big fire behind the people.e

Image: ('fire00000010.png',)
Generated Caption: a fire broke out next to a house on the side of a mountain.r

Image: ('fire00000015.png',)
Generated Caption: the car is on fire.

Image: ('fire00000016.png',)
Generated Caption: fire is coming down from the mountain behind the truck passing by.er

Image: ('fire00000022.png',)
Generated Caption: a fire is burning behind the red fire truck.e

Image: ('fire00000024.png',)
Generated Caption: a forest fire is burning behind the yellow car.e

Image: ('fire00000025.png',)
Generated 