In [53]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'




In [54]:
import pandas as pd
import numpy as np
import torch
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from sklearn.ensemble import IsolationForest
from tqdm import tqdm

In [55]:
from transformers import ViTForImageClassification, ViTFeatureExtractor

model_name = "google/vit-base-patch16-224-in21k"
model = ViTForImageClassification.from_pretrained(model_name, num_labels=2)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name,size=512)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
# 데이터 로딩 클래스 정의
class TrainDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.df = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df['img_path'].iloc[idx]
        images = Image.open(img_path).convert("RGB")
        label = self.df.iloc[idx, 2]  # 라벨
        if self.transform:
            images = self.transform(images)
        
        return {"pixel_values":images,"labels":label,"interpolate_pos_encoding=True":True}
# 이미지 전처리
transform = transforms.Compose([
    #transforms.Resize((224, 224)), # ViT 입력 크기에 맞춰 조정
    #transforms.Resize((512, 512)), # ViT 입력 크기에 맞춰 조정
    transforms.ToTensor(),
])    

In [57]:
train_ds = TrainDataset(csv_file='./bigdata/train.csv', transform=transform)

In [58]:
from transformers import TrainingArguments

epochs = 2000
lr = 0.00006
batch_size = 32


training_args = TrainingArguments(
    "./bigdata/vit-classification-001",
    do_train=True,
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    save_total_limit=3,
    save_strategy="steps",
    save_steps=600,
    logging_steps=600,
)

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
)

In [59]:
trainer.train()#resume_from_checkpoint = True)

ValueError: Input image size (512*512) doesn't match model (224*224).

In [None]:
model = ViTForImageClassification.from_pretrained("./bigdata/vit-classification-001/checkpoint-13800", num_labels=2)

In [None]:
print(test_ds[0]["pixel_values"].shape)
print(train_ds[0]["pixel_values"].shape)

In [None]:
with torch.no_grad():
    model.eval()
    result = []
    for images in test_ds:
        
        
        outputs = model(torch.unsqueeze(images["pixel_values"], 0))
        logits = outputs.logits
        print(logits)
#         # Second, apply argmax on the class dimension
#         #pred_seg = upsampled_logits.argmax(dim=1)[0]
#         masks = torch.sigmoid(upsampled_logits).cpu().numpy()
#         # print(masks.shape)
#         # masks = np.squeeze(masks, axis=1)
#         masks = (masks > 0.40).astype(np.uint8) # Threshold = 0.35
#         for i in range(len(images)):
#             mask_rle = rle_encode(masks[i][0])
#             if mask_rle == '': # 예측된 건물 픽셀이 아예 없는 경우 -1
#                 result.append(-1)
#             else:
#                 result.append(mask_rle)

In [18]:
# 데이터 로딩 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        self.df = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df['img_path'].iloc[idx]
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image

In [19]:
from transformers import ViTModel
model = ViTModel.from_pretrained("./bigdata/vit-classification-001/checkpoint-13800")
model.eval() # 추론 모드 설정


Some weights of the model checkpoint at ./bigdata/vit-classification-001/checkpoint-13800 were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at ./bigdata/vit-classification-001/checkpoint-13800 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0): ViTLayer(
        (attention): ViTAttention(
          (attention): ViTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
      

In [21]:
# ViT 특성 추출을 위한 함수
def get_embeddings(dataset, model, device):
    embeddings = []
    model.to(device)
    with torch.no_grad():
        for images in dataset:
            # 이미지를 배치 단위로 처리
            inputs = {"pixel_values":images,"interpolate_pos_encoding":True}
            if self.transform:
                images = self.transform(images)
            outputs = model(**inputs)
            
            # [CLS] 토큰의 출력을 사용하여 특성 벡터 추출
            embeddings.append(outputs.last_hidden_state[:,0,:].cpu().numpy())
    return np.concatenate(embeddings, axis=0)

# 데이터 로더 준비
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = CustomDataset(csv_file='./bigdata/train.csv', transform=transform)

# 임베딩 추출
train_embeddings = get_embeddings(train_loader, model, device)

100%|█████████████████████████████████████████████| 7/7 [00:02<00:00,  2.63it/s]


In [22]:
# Isolation Forest 모델 학습
clf = IsolationForest(random_state=42)
clf.fit(train_embeddings)

# 테스트 데이터에 대해 이상 탐지 수행
test_data = CustomDataset(csv_file='./bigdata/test.csv', transform=transform)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


In [23]:
test_embeddings = get_embeddings(test_loader, model, device)
test_pred = clf.predict(test_embeddings)

# 결과 변환 및 저장
test_pred = np.where(test_pred == -1, 1, 0) # Isolation Forest의 예측 결과 변환
submit = pd.read_csv('./bigdata/sample_submission.csv')
submit['label'] = test_pred
submit.to_csv('./bigdata/finetuned_vit_isolation_forest_submit.csv', index=False)

100%|█████████████████████████████████████████████| 4/4 [00:01<00:00,  3.15it/s]
