## data slicing

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # nvidia-smi로 비어있는 gpu 확인하고 여기서 선택할것!

In [1]:
import cv2
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torchvision import transforms

from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

#import segmentation_models_pytorch as smp
import matplotlib.pyplot as plt
from IPython.display import clear_output 
import time
import os
import json
import random
from transformers import Mask2FormerForUniversalSegmentation, AutoImageProcessor




random_seed = 42
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed) # if use multi-GPU
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False
np.random.seed(random_seed)
random.seed(random_seed)

#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
print(device)

cpu


In [2]:
# RLE 디코딩 함수
def rle_decode(mask_rle, shape):
    if mask_rle == -1:
        return np.zeros(shape, dtype=np.uint8)
    s = mask_rle.split()
    starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
    starts -= 1
    ends = starts + lengths
    img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
    for lo, hi in zip(starts, ends):
        img[lo:hi] = 1
    return img.reshape(shape)

# RLE 인코딩 함수
def rle_encode(mask):
    pixels = mask.flatten()
    pixels = np.concatenate([[0], pixels, [0]])
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
    runs[1::2] -= runs[::2]
    return ' '.join(str(x) for x in runs)

## custom cells

In [4]:
# # RLE 디코딩 함수 # 원시함수임 현성이가 짠 코드쓸것!
# def rle_decode(mask_rle, shape):
#     s = mask_rle.split()
#     starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
#     starts -= 1
#     ends = starts + lengths
#     img = np.zeros(shape[0]*shape[1], dtype=np.uint8)
#     for lo, hi in zip(starts, ends):
#         img[lo:hi] = 1
#     return img.reshape(shape)

# # RLE 인코딩 함수
# def rle_encode(mask):
#     pixels = mask.flatten()
#     pixels = np.concatenate([[0], pixels, [0]])
#     runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
#     runs[1::2] -= runs[::2]
#     return ' '.join(str(x) for x in runs)

In [3]:
from transformers import Mask2FormerModel
id2label = {1:'building'}
label2id = {'building':1}
pretrained_model_name = "facebook/mask2former-swin-tiny-cityscapes-semantic"
#temp = "segformer-b3-15000steps/checkpoint-18000"
#pretrained_model_name = "nvidia/segformer-b5-finetuned-ade-640-640" 
model = Mask2FormerModel.from_pretrained(
    pretrained_model_name,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

feature_extractor = AutoImageProcessor.from_pretrained(
    pretrained_model_name,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
    )

Some weights of the model checkpoint at facebook/mask2former-swin-tiny-cityscapes-semantic were not used when initializing Mask2FormerModel: ['class_predictor.weight', 'criterion.empty_weight', 'class_predictor.bias']
- This IS expected if you are initializing Mask2FormerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Mask2FormerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
class SatelliteDataset(Dataset):
    def __init__(self, csv_file, transform=None, infer=False):
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.infer = infer
        
        print("full dataset size : ",len(self.data))
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #img_path = self.data.iloc[idx, 1]    # default : ./train_img/TRAIN_0000.png
        img_path = "../data"+self.data.iloc[idx, 1][1:]
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if self.infer:
            if self.transform:
                image = self.transform(image=image)['image']
                dic = {"pixel_values":image}

            return dic
        assert False , "SatelliteDataset class must be used as test dataset obj"

In [23]:
class TV_SatelliteDataset(Dataset):
    def __init__(self, csv_file="../data/jhs_stride_160.csv", transform=None, is_train = True, stride=200):
        self.is_train = is_train
        self.transform = transform
        self.stride = stride
        self.size = 224
        self.cutter = int(241920*0.8) #tv cutter
        
        if self.is_train:
            self.data = pd.read_csv(csv_file)[:self.cutter]
        else:
            self.data = pd.read_csv(csv_file)[self.cutter:]


        
        print("Full dataset size:", len(self.data))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        augmented = self.transform(image=cv2.cvtColor(cv2.imread(".."+self.data.iloc[idx, 1][1:]), cv2.COLOR_BGR2RGB), mask=rle_decode(self.data.iloc[idx, 2], (224, 224)))
        #return augmented['image'],augmented['mask'].type(torch.LongTensor)
        return {"pixel_values":augmented['image'],"labels":augmented['mask'].type(torch.LongTensor)},10
        

        
        


In [24]:
aug1 = A.Compose(
    [   
        #A.Resize(224, 224),
        A.OneOf([
            A.CLAHE(p = 0.1),
            A.RandomBrightnessContrast(contrast_limit=0.1, brightness_by_max=False),
            A.GaussNoise(var_limit=(0.0, 25.0), p = 0.1)
        ]),

        # A.RandomCrop(width=224, height=224, p = 0.1),

        A.Affine(shear=(-10, 10)),
        #A.HorizontalFlip(),
        #A.VerticalFlip(),
        #A.RandomRotate90(),

        A.Normalize(),
        ToTensorV2()
    ]
)

aug2 = A.Compose(
    [   
        #A.Resize(224, 224),
        
        A.VerticalFlip(),
        

        A.Normalize(),
        ToTensorV2()
    ]
)

aug3 = A.Compose(
    [   
        #A.Resize(224, 224),
        
        A.HorizontalFlip(),

        A.Normalize(),
        ToTensorV2()
    ]
)

aug4 = A.Compose(
    [   
        #A.Resize(224, 224),
        
        A.Rotate(),

        A.Normalize(),
        ToTensorV2()
    ]
)

transform = A.Compose(
    [   
        #A.Resize(224, 224),
        A.Normalize(),
        ToTensorV2()
    ]
)


# aug1_dataset = TV_SatelliteDataset(transform=aug1, is_train=True)
# aug2_dataset = TV_SatelliteDataset(transform=aug2, is_train=True)
# aug3_dataset = TV_SatelliteDataset(transform=aug3, is_train=True)
# aug4_dataset = TV_SatelliteDataset(transform=aug4, is_train=True)
train_ds = TV_SatelliteDataset(transform=transform, is_train=True)

# train_ds = ConcatDataset([train_ds,aug1_dataset,aug2_dataset,aug3_dataset,aug4_dataset])
val_ds = TV_SatelliteDataset(transform=transform, is_train=False)


test_ds = SatelliteDataset(csv_file='../data/test.csv', transform=transform, infer=True)

Full dataset size: 193536
Full dataset size: 48384
full dataset size :  60640


In [25]:
train_dataloader = DataLoader(train_ds, batch_size=1, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True, num_workers=4)

In [26]:
# model 초기화
model.to(device)

# loss function과 optimizer 정의
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# validation
best_validation_loss = 200.0

# training loop
for epoch in range(2):  # 10 에폭 동안 학습합니다.
    model.train()
    epoch_loss = 0
    val_loss = 0
    for images, masks in tqdm(train_dataloader):
        #images = images.float().to(device)
        #masks = masks.float().to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = outputs.masks_queries_logits
        #loss = criterion(outputs, masks.unsqueeze(1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    model.eval()
    with torch.no_grad():
        for images, masks in tqdm(val_dataloader):
            images = images.float().to(device)
            masks = masks.float().to(device)
            outputs = model(images)
            
            loss = criterion(outputs, masks.unsqueeze(1))

            val_loss += loss.item()
    
    if best_validation_loss > val_loss/len(val_dataloader):
        best_validation_loss = val_loss/len(val_dataloader)
        torch.save(model, f'../best_model/best_model.pth')
        print('Model saved!')

    print(f'Epoch {epoch+1}, Train_Loss: {epoch_loss/len(train_dataloader)}, Val_Loss: {val_loss/len(val_dataloader)}')

  0%|          | 0/193536 [00:00<?, ?it/s]


AttributeError: 'dict' object has no attribute 'shape'

## huggingface cells

In [9]:
train_ds[0]["pixel_values"].shape # our data

torch.Size([3, 224, 224])

In [10]:
train_ds[0]["labels"].shape # our data

torch.Size([224, 224])

In [11]:
test_ds[0]["pixel_values"].shape # our data

torch.Size([3, 224, 224])

In [12]:
from transformers import TrainingArguments

epochs = 4
lr = 0.00006
batch_size = 2

hub_model_id = "mask2former-swin-tiny-cityscapes-semantic"

training_args = TrainingArguments(
    "mask2former-swin-tiny-cityscapes-semantic",
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=3000,
    eval_steps=3000,
    logging_steps=1,
    eval_accumulation_steps=5,
    load_best_model_at_end=True,
    #push_to_hub=True,
    #hub_model_id=hub_model_id,
    #hub_strategy="end",
    seed=random_seed
)


In [13]:
from transformers import Trainer

trainer = Trainer(
    model=model.to('cpu'),
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    #compute_metrics=compute_metrics,
)


In [14]:
#trainer.train(resume_from_checkpoint = True)
trainer.train()



RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED

In [None]:
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=False, num_workers=4)

In [None]:
torch.save(model, f'../best_model/huggingface_model_0.pth')

In [None]:
model = SegformerForSemanticSegmentation.from_pretrained(
    "./segformer-b4-15000steps/checkpoint-27000",
    id2label=id2label,
    label2id=label2id
)

In [None]:
#model = torch.load('../best_model/huggingface_model_0.pth')
model.to(device)

SegformerForSemanticSegmentation(
  (segformer): SegformerModel(
    (encoder): SegformerEncoder(
      (patch_embeddings): ModuleList(
        (0): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(3, 64, kernel_size=(7, 7), stride=(4, 4), padding=(3, 3))
          (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        )
        (1): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        )
        (2): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(128, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((320,), eps=1e-05, elementwise_affine=True)
        )
        (3): SegformerOverlapPatchEmbeddings(
          (proj): Conv2d(320, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)

In [None]:
with torch.no_grad():
    model.eval()
    result = []
    for images in tqdm(test_dataloader):
        images = images["pixel_values"].float().to(device)
        
        outputs = model(images)
        logits = outputs.logits
        # masks = torch.sigmoid(outputs).cpu().numpy()
        # print(masks.shape)
        # masks = np.squeeze(masks, axis=1)
        # masks = (masks > 0.35).astype(np.uint8) # Threshold = 0.35
        upsampled_logits = nn.functional.interpolate(
                logits,
                size=(224,224), # (height, width)
                mode='bilinear',
                align_corners=False
                )

        # Second, apply argmax on the class dimension
        #pred_seg = upsampled_logits.argmax(dim=1)[0]
        masks = torch.sigmoid(upsampled_logits).cpu().numpy()
        # print(masks.shape)
        # masks = np.squeeze(masks, axis=1)
        masks = (masks > 0.40).astype(np.uint8) # Threshold = 0.35
        for i in range(len(images)):
            mask_rle = rle_encode(masks[i][0])
            if mask_rle == '': # 예측된 건물 픽셀이 아예 없는 경우 -1
                result.append(-1)
            else:
                result.append(mask_rle)

100%|███████████████████████████████████████| 3790/3790 [04:38<00:00, 13.60it/s]


In [None]:
submit = pd.read_csv('../data/sample_submission.csv')
submit['mask_rle'] = result

In [None]:
submit.to_csv('../submit/b4_48batch_27000steps_040.csv', index=False)