In [None]:
# List /content; the next cell reads Sample.zip from this directory.
!ls /content/

sample_data  Sample_data  Sample.zip


In [None]:
import zipfile
import os

zip_path = "/content/Sample.zip"
extract_path = "/content/Sample_data"

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=extract_path)

# Paths inside the extracted tree: the source-image root and the label JSON.
image_root = os.path.join(extract_path, "원천데이터")
label_json_path = os.path.join(extract_path, *("라벨링데이터", "라벨링데이터.json"))

In [None]:
import os
import json

BASE_PATH = "/content/Sample_data"
IMG_ROOT = os.path.join(BASE_PATH, "원천데이터", "image(단일)")
LABEL_ROOT = os.path.join(BASE_PATH, "라벨링데이터", "라벨링데이터", "image(단일)")
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _find_label_json(image_base, label_files, label_lookup):
    """Return the label file name that belongs to ``image_base``, or None.

    An exact ``<image_base>.json`` match wins. Otherwise fall back to the
    first name (in sorted order) that starts with ``image_base``. Checking
    the exact name first keeps ``IMG_1`` from being paired with
    ``IMG_10.json`` when both labels exist.

    Args:
        image_base: Image file name without its extension.
        label_files: Sorted list of ``.json`` file names in the label dir.
        label_lookup: The same names as a set, for the exact-match check.
    """
    exact_name = image_base + ".json"
    if exact_name in label_lookup:
        return exact_name
    for name in label_files:
        if name.startswith(image_base):
            return name
    return None


def _load_korean_caption(json_path):
    """Join the ``korean`` field of every annotation in a label file.

    Returns the space-joined, stripped caption; an empty string when the
    file has no annotation carrying a ``korean`` key.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    annotations = data.get("annotations", [])
    korean_sentences = [
        anno.get("korean", "") for anno in annotations if "korean" in anno
    ]
    return " ".join(korean_sentences).strip()


samples = []

# Sorted listings make the sample order reproducible across runs.
categories = sorted(os.listdir(IMG_ROOT))
for category in categories:
    img_dir = os.path.join(IMG_ROOT, category)
    label_dir = os.path.join(LABEL_ROOT, category)

    # Only categories present on both the image and the label side are usable.
    if not (os.path.isdir(img_dir) and os.path.isdir(label_dir)):
        continue

    # List the label directory once per category instead of once per image.
    label_files = sorted(f for f in os.listdir(label_dir) if f.endswith(".json"))
    label_lookup = set(label_files)

    image_files = sorted(
        f for f in os.listdir(img_dir) if f.lower().endswith(IMAGE_EXTENSIONS)
    )

    for image_file in image_files:
        image_base = os.path.splitext(image_file)[0]
        label_file = _find_label_json(image_base, label_files, label_lookup)
        if label_file is None:
            continue

        caption = _load_korean_caption(os.path.join(label_dir, label_file))
        # Images whose label yields no Korean text are skipped.
        if caption:
            samples.append(
                {"image_path": os.path.join(img_dir, image_file), "caption": caption}
            )

print("✅ 최종 수집된 학습 샘플 수:", len(samples))

✅ 최종 수집된 학습 샘플 수: 434


In [None]:
# ✅ 1. PyTorch Dataset 정의
from torch.utils.data import Dataset
from PIL import Image


class ImageCaptionDataset(Dataset):
    """Dataset that turns ``{"image_path", "caption"}`` records into model inputs.

    Each item is the processor output for one image/caption pair plus a
    ``labels`` tensor, with the leading batch dimension removed.

    Args:
        samples: Sequence of dicts with ``image_path`` and ``caption`` keys.
        processor: Callable taking ``images``/``text`` and returning a mapping
            of tensors (called with ``return_tensors="pt"``).
        max_length: Length every caption is padded or truncated to.
    """

    def __init__(self, samples, processor, max_length=128):
        self.samples = samples
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        """Number of image/caption pairs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the processed tensors (including ``labels``) for sample ``idx``."""
        sample = self.samples[idx]
        # Close the file handle deterministically; convert() returns a
        # separate, fully loaded image.
        with Image.open(sample["image_path"]) as raw_image:
            image = raw_image.convert("RGB")

        inputs = self.processor(
            images=image,
            text=sample["caption"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # The Trainer needs labels to compute the loss. Padding positions are
        # set to -100 (the cross-entropy ignore index) so the loss covers
        # only real caption tokens.
        labels = inputs["input_ids"].clone()
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = -100
        inputs["labels"] = labels

        # Drop only the batch dimension added by return_tensors="pt".
        return {k: v.squeeze(0) for k, v in inputs.items()}

In [None]:
# ✅ 2. 모델 & 프로세서 로드 + 데이터셋 생성
from transformers import BlipProcessor, BlipForConditionalGeneration

# The processor and the model are loaded from the same pretrained checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

# Wrap the collected image/caption samples for training.
dataset = ImageCaptionDataset(samples=samples, processor=processor)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
# ✅ 3. TrainingArguments 및 Trainer 설정
from transformers import TrainingArguments, Trainer

import torch  # cell-local: only used to detect whether a CUDA GPU is present

training_args = TrainingArguments(
    output_dir="./blip-finetuned",  # directory checkpoints are written to
    per_device_train_batch_size=4,  # batch size per device
    num_train_epochs=3,  # number of training epochs
    logging_dir="./logs",  # directory for logs
    logging_steps=10,  # log every N steps
    save_strategy="epoch",  # save a checkpoint at the end of each epoch
    save_total_limit=2,  # keep at most this many checkpoints
    remove_unused_columns=False,  # keep every key the dataset returns
    # float16 mixed precision only when a CUDA GPU is present; a hard-coded
    # True fails on a runtime without CUDA.
    fp16=torch.cuda.is_available(),
    report_to="wandb",  # log to Weights & Biases (use "none" to disable)
    run_name="blip-korean-captioning",  # wandb run name
)

trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

In [None]:
# ✅ 4. Start training
# The training arguments set report_to="wandb"; the captured output below
# shows this call prompting for a W&B API key on first use.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33measying1020[0m ([33measying1020-kookmin-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,6.1602
20,2.8927
30,2.2924
40,2.0391
50,1.8145
60,1.6471
70,1.5032
80,1.349
90,1.2776
100,1.1365


TrainOutput(global_step=327, training_loss=1.0515554330400005, metrics={'train_runtime': 312.4053, 'train_samples_per_second': 4.168, 'train_steps_per_second': 1.047, 'total_flos': 7.726375951969812e+17, 'train_loss': 1.0515554330400005, 'epoch': 3.0})

In [None]:
# ✅ 5. Save the model and the processor once training has finished
for artifact in (model, processor):
    artifact.save_pretrained("./blip-finetuned")
print("✅ 모델과 processor 저장 완료!")

✅ 모델과 processor 저장 완료!


In [None]:
from PIL import Image

# Inference smoke test on a single image.
# Training may have left the model in training mode; switch to eval mode so
# dropout is disabled while generating.
model.eval()
with Image.open(
    "/content/Sample_data/원천데이터/image(단일)/apple/IMG_0002969_apple(apple).jpg"
) as raw_image:
    test_image = raw_image.convert("RGB")
inputs = processor(images=test_image, return_tensors="pt").to(model.device)
# Captions were padded/truncated to 128 tokens during fine-tuning, so cap
# generation at the same length instead of 1000.
output = model.generate(**inputs, max_length=128)
caption = processor.decode(output[0], skip_special_tokens=True)
print("📸 생성된 설명:", caption)

📸 생성된 설명: 이 사진의 주제는 사과입니다. 사과는 과실류에 속합니다. 사과가 사진에. 사과1은 사진의 중간에. 사과2는 사진의 중간에. 사ᄀ


In [None]:
# Install the translation packages.
# NOTE(review): only deep_translator is imported by the cells visible in this
# notebook; googletrans looks unused here — confirm before removing it.
!pip install googletrans
!pip install deep_translator



In [None]:
import torch
import json
import os
from PIL import Image, UnidentifiedImageError
from transformers import BlipProcessor, BlipForConditionalGeneration
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer, util

# Reload the fine-tuned BLIP processor and model from disk.
processor = BlipProcessor.from_pretrained("./blip-finetuned")
model = BlipForConditionalGeneration.from_pretrained("./blip-finetuned")
# Use the GPU when one is available; generate_description sends its inputs
# to model.device, so no other change is needed.
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Translator. The source language is auto-detected rather than pinned to
# English: the fine-tuned model was trained on Korean captions, so its output
# is not guaranteed to be English.
translator = GoogleTranslator(source="auto", target="ko")

# Sentence-embedding model used to score caption similarity.
similarity_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)


# Image loading
def preprocess_image(image_path):
    """Open ``image_path`` and return its contents as an RGB image.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the pixel data has been converted, instead of whenever the image object
    happens to be garbage-collected.

    Raises whatever ``Image.open`` / ``convert`` raise for missing or
    unreadable files.
    """
    with Image.open(image_path) as raw_image:
        return raw_image.convert("RGB")


# Generate a BLIP caption for one image
def generate_description(image_path):
    """Caption the image at ``image_path`` with the loaded BLIP model.

    Returns the decoded caption, or the sentinel string
    ``"Image recognition failed."`` when the file cannot be read as an image
    or the caption is empty or has three or fewer whitespace-separated words.

    NOTE(review): the original comment described the output as English, but
    the model is loaded from the fine-tuned checkpoint — confirm which
    language callers should expect.
    """
    try:
        image = preprocess_image(image_path)
    except (UnidentifiedImageError, OSError):
        # OSError also covers truncated or otherwise unreadable files, so one
        # bad file does not abort a whole batch.
        print(f"[❗️에러] '{image_path}'는 유효한 이미지 파일이 아닙니다.")
        return "Image recognition failed."

    inputs = processor(images=image, return_tensors="pt").to(model.device)
    # top_p / temperature were dropped: they only apply when do_sample=True,
    # which was never set, so decoding is unchanged. Pass do_sample=True
    # together with them if sampling is wanted.
    output = model.generate(**inputs, max_length=50)
    blip_description = processor.decode(output[0], skip_special_tokens=True)

    if not blip_description.strip() or len(blip_description.split()) <= 3:
        return "Image recognition failed."

    return blip_description


# Translate a description into Korean
def translate_to_korean(text):
    """Translate ``text`` with the module-level ``translator``.

    Any exception raised by the translator is printed and replaced by the
    sentinel string ``"Translation failed."`` so callers always get a value.
    """
    try:
        return translator.translate(text)
    except Exception as err:
        print(f"[❗️번역 에러] {err}")
        return "Translation failed."


# User-centred correction logic
def adjust_description(blip_desc, brief_desc, threshold=0.7):
    """Combine the user's brief description with the BLIP caption.

    Both texts are embedded with ``similarity_model`` and compared by cosine
    similarity. The user's text is always the main description; the BLIP
    caption is appended as a reference note only when the two disagree.

    Args:
        blip_desc: Caption produced by BLIP.
        brief_desc: Description supplied by the user.
        threshold: Similarity below which the BLIP caption is appended.
            Defaults to 0.7, the previously hard-coded value.

    Returns:
        ``brief_desc`` when similarity >= ``threshold``; otherwise
        ``"<brief_desc> (참고: <blip_desc>)"``.
    """
    blip_emb = similarity_model.encode(blip_desc, convert_to_tensor=True)
    brief_emb = similarity_model.encode(brief_desc, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(blip_emb, brief_emb).item()

    # Low similarity: show the BLIP caption as a reference only.
    if similarity_score < threshold:
        return f"{brief_desc} (참고: {blip_desc})"
    return brief_desc


# Paths
image_folder = "/content/images"
result_folder = "/content/result"

# Load the user-written brief descriptions (keyed by image file name).
with open("/content/descriptions.json", "r", encoding="utf-8") as file:
    brief_descriptions = json.load(file)

# Save a copy of the input JSON next to the results.
os.makedirs(result_folder, exist_ok=True)
input_json_path = os.path.join(result_folder, "input_descriptions.json")
with open(input_json_path, "w", encoding="utf-8") as file:
    json.dump(brief_descriptions, file, ensure_ascii=False, indent=4)

# Build one result entry per file in image_folder. The listing is sorted so
# the key order of the output JSON is the same on every run.
results = {}
for image_name in sorted(os.listdir(image_folder)):
    image_path = os.path.join(image_folder, image_name)

    if os.path.isdir(image_path):
        continue

    # User description, with a placeholder when none was provided.
    brief_desc = brief_descriptions.get(image_name, "Description not available.")

    # Generate the BLIP caption, then translate it.
    blip_description_en = generate_description(image_path)
    blip_description_ko = translate_to_korean(blip_description_en)

    # Correction centred on the user's description.
    final_description = adjust_description(blip_description_ko, brief_desc)

    # Stored structure
    results[image_name] = {
        "BLIP_Description": blip_description_ko,
        "Brief_Description": brief_desc,
        "Final_Description": final_description,
    }

# Save the output JSON.
output_json_path = os.path.join(result_folder, "output_descriptions.json")
with open(output_json_path, "w", encoding="utf-8") as file:
    json.dump(results, file, ensure_ascii=False, indent=4)

print("✅ Input JSON and Output JSON creation completed with user-centered correction!")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



✅ Input JSON and Output JSON creation completed with user-centered correction!


In [None]:
# Run this if the model is still in its trained state in memory: it re-saves
# the weights to ./blip-finetuned.
# NOTE(review): only the model is saved here, not the processor.
model.save_pretrained("./blip-finetuned")